Skip to content

Commit d99336a

Browse files
skomskijasnell
authored andcommitted
src: replace naive search in Buffer::IndexOf
Adds the string search implementation from v8 which uses naive search if pattern length < 8 or to a specific badness then uses Boyer-Moore-Horspool Added benchmark shows the expected improvements Added option to use ucs2 encoding with Buffer::IndexOf Reviewed-By: James M Snell <[email protected]> Reviewed-By: Trevor Norris <[email protected]> PR-URL: #2539
1 parent 3eaa593 commit d99336a

File tree

8 files changed

+4935
-60
lines changed

8 files changed

+4935
-60
lines changed

benchmark/buffers/buffer-indexof.js

+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
var common = require('../common.js');
2+
var fs = require('fs');
3+
4+
var bench = common.createBenchmark(main, {
5+
search: ['@', 'SQ', '10x', '--l', 'Alice', 'Gryphon', 'Panther',
6+
'Ou est ma chatte?', 'found it very', 'among mad people',
7+
'neighbouring pool', 'Soo--oop', 'aaaaaaaaaaaaaaaaa',
8+
'venture to go near the house till she had brought herself down to',
9+
'</i> to the Caterpillar'],
10+
encoding: ['undefined', 'utf8', 'ucs2', 'binary'],
11+
type: ['buffer', 'string'],
12+
iter: [1]
13+
});
14+
15+
function main(conf) {
16+
var iter = (conf.iter) * 100000;
17+
var aliceBuffer = fs.readFileSync(__dirname + '/../fixtures/alice.html');
18+
var search = conf.search;
19+
var encoding = conf.encoding;
20+
21+
if (encoding === 'undefined') {
22+
encoding = undefined;
23+
}
24+
25+
if (encoding === 'ucs2') {
26+
aliceBuffer = new Buffer(aliceBuffer.toString(), encoding);
27+
}
28+
29+
if (conf.type === 'buffer') {
30+
search = new Buffer(new Buffer(search).toString(), encoding);
31+
}
32+
33+
bench.start();
34+
for (var i = 0; i < iter; i++) {
35+
aliceBuffer.indexOf(search, 0, encoding);
36+
}
37+
bench.end(iter);
38+
}

benchmark/fixtures/alice.html

+3,865
Large diffs are not rendered by default.

lib/buffer.js

+39-6
Original file line numberDiff line numberDiff line change
@@ -414,20 +414,53 @@ Buffer.prototype.compare = function compare(b) {
414414
return binding.compare(this, b);
415415
};
416416

417+
function slowIndexOf(buffer, val, byteOffset, encoding) {
418+
var loweredCase = false;
419+
for (;;) {
420+
switch (encoding) {
421+
case 'utf8':
422+
case 'utf-8':
423+
case 'ucs2':
424+
case 'ucs-2':
425+
case 'utf16le':
426+
case 'utf-16le':
427+
case 'binary':
428+
return binding.indexOfString(buffer, val, byteOffset, encoding);
417429

418-
Buffer.prototype.indexOf = function indexOf(val, byteOffset) {
430+
case 'base64':
431+
case 'ascii':
432+
case 'hex':
433+
return binding.indexOfBuffer(
434+
buffer, Buffer(val, encoding), byteOffset, encoding);
435+
436+
default:
437+
if (loweredCase) {
438+
throw new TypeError('Unknown encoding: ' + encoding);
439+
}
440+
441+
encoding = ('' + encoding).toLowerCase();
442+
loweredCase = true;
443+
}
444+
}
445+
}
446+
447+
Buffer.prototype.indexOf = function indexOf(val, byteOffset, encoding) {
419448
if (byteOffset > 0x7fffffff)
420449
byteOffset = 0x7fffffff;
421450
else if (byteOffset < -0x80000000)
422451
byteOffset = -0x80000000;
423452
byteOffset >>= 0;
424453

425-
if (typeof val === 'string')
426-
return binding.indexOfString(this, val, byteOffset);
427-
if (val instanceof Buffer)
428-
return binding.indexOfBuffer(this, val, byteOffset);
429-
if (typeof val === 'number')
454+
if (typeof val === 'string') {
455+
if (encoding === undefined) {
456+
return binding.indexOfString(this, val, byteOffset, encoding);
457+
}
458+
return slowIndexOf(this, val, byteOffset, encoding);
459+
} else if (val instanceof Buffer) {
460+
return binding.indexOfBuffer(this, val, byteOffset, encoding);
461+
} else if (typeof val === 'number') {
430462
return binding.indexOfNumber(this, val, byteOffset);
463+
}
431464

432465
throw new TypeError('val must be string, number or Buffer');
433466
};

node.gyp

+1
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,7 @@
169169
'src/util.h',
170170
'src/util-inl.h',
171171
'src/util.cc',
172+
'src/string_search.cc',
172173
'deps/http_parser/http_parser.h',
173174
'deps/v8/include/v8.h',
174175
'deps/v8/include/v8-debug.h',

src/node_buffer.cc

+124-54
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include "env.h"
55
#include "env-inl.h"
66
#include "string_bytes.h"
7+
#include "string_search.h"
78
#include "util.h"
89
#include "util-inl.h"
910
#include "v8-profiler.h"
@@ -792,87 +793,156 @@ void Compare(const FunctionCallbackInfo<Value> &args) {
792793
}
793794

794795

795-
int32_t IndexOf(const char* haystack,
796-
size_t h_length,
797-
const char* needle,
798-
size_t n_length) {
799-
CHECK_GE(h_length, n_length);
800-
// TODO(trevnorris): Implement Boyer-Moore string search algorithm.
801-
for (size_t i = 0; i < h_length - n_length + 1; i++) {
802-
if (haystack[i] == needle[0]) {
803-
if (memcmp(haystack + i, needle, n_length) == 0)
804-
return i;
805-
}
806-
}
807-
return -1;
808-
}
809-
810-
811796
void IndexOfString(const FunctionCallbackInfo<Value>& args) {
812797
ASSERT(args[1]->IsString());
813798
ASSERT(args[2]->IsNumber());
814799

800+
enum encoding enc = ParseEncoding(args.GetIsolate(),
801+
args[3],
802+
UTF8);
803+
815804
THROW_AND_RETURN_UNLESS_BUFFER(Environment::GetCurrent(args), args[0]);
816805
SPREAD_ARG(args[0], ts_obj);
817806

818-
node::Utf8Value str(args.GetIsolate(), args[1]);
819-
int32_t offset_i32 = args[2]->Int32Value();
820-
uint32_t offset;
807+
Local<String> needle = args[1].As<String>();
808+
const char* haystack = ts_obj_data;
809+
const size_t haystack_length = ts_obj_length;
810+
const size_t needle_length = needle->Utf8Length();
811+
812+
813+
if (needle_length == 0 || haystack_length == 0) {
814+
return args.GetReturnValue().Set(-1);
815+
}
816+
817+
int64_t offset_i64 = args[2]->IntegerValue();
818+
size_t offset = 0;
821819

822-
if (offset_i32 < 0) {
823-
if (offset_i32 + static_cast<int32_t>(ts_obj_length) < 0)
820+
if (offset_i64 < 0) {
821+
if (offset_i64 + static_cast<int64_t>(haystack_length) < 0) {
824822
offset = 0;
825-
else
826-
offset = static_cast<uint32_t>(ts_obj_length + offset_i32);
823+
} else {
824+
offset = static_cast<size_t>(haystack_length + offset_i64);
825+
}
827826
} else {
828-
offset = static_cast<uint32_t>(offset_i32);
827+
offset = static_cast<size_t>(offset_i64);
829828
}
830829

831-
if (str.length() == 0 ||
832-
ts_obj_length == 0 ||
833-
(offset != 0 && str.length() + offset <= str.length()) ||
834-
str.length() + offset > ts_obj_length)
830+
if (haystack_length < offset || needle_length + offset > haystack_length) {
835831
return args.GetReturnValue().Set(-1);
832+
}
836833

837-
int32_t r =
838-
IndexOf(ts_obj_data + offset, ts_obj_length - offset, *str, str.length());
839-
args.GetReturnValue().Set(r == -1 ? -1 : static_cast<int32_t>(r + offset));
840-
}
834+
size_t result = haystack_length;
835+
836+
if (enc == UCS2) {
837+
String::Value needle_value(needle);
838+
if (*needle_value == nullptr)
839+
return args.GetReturnValue().Set(-1);
840+
841+
if (haystack_length < 2 || needle_value.length() < 1) {
842+
return args.GetReturnValue().Set(-1);
843+
}
844+
845+
result = SearchString(reinterpret_cast<const uint16_t*>(haystack),
846+
haystack_length / 2,
847+
reinterpret_cast<const uint16_t*>(*needle_value),
848+
needle_value.length(),
849+
offset / 2);
850+
result *= 2;
851+
} else if (enc == UTF8) {
852+
String::Utf8Value needle_value(needle);
853+
if (*needle_value == nullptr)
854+
return args.GetReturnValue().Set(-1);
855+
856+
result = SearchString(reinterpret_cast<const uint8_t*>(haystack),
857+
haystack_length,
858+
reinterpret_cast<const uint8_t*>(*needle_value),
859+
needle_length,
860+
offset);
861+
} else if (enc == BINARY) {
862+
uint8_t* needle_data = static_cast<uint8_t*>(malloc(needle_length));
863+
if (needle_data == nullptr) {
864+
return args.GetReturnValue().Set(-1);
865+
}
866+
needle->WriteOneByte(
867+
needle_data, 0, needle_length, String::NO_NULL_TERMINATION);
868+
869+
result = SearchString(reinterpret_cast<const uint8_t*>(haystack),
870+
haystack_length,
871+
needle_data,
872+
needle_length,
873+
offset);
874+
free(needle_data);
875+
}
841876

877+
args.GetReturnValue().Set(
878+
result == haystack_length ? -1 : static_cast<int>(result));
879+
}
842880

843881
void IndexOfBuffer(const FunctionCallbackInfo<Value>& args) {
844882
ASSERT(args[1]->IsObject());
845883
ASSERT(args[2]->IsNumber());
846884

885+
enum encoding enc = ParseEncoding(args.GetIsolate(),
886+
args[3],
887+
UTF8);
888+
847889
THROW_AND_RETURN_UNLESS_BUFFER(Environment::GetCurrent(args), args[0]);
848890
SPREAD_ARG(args[0], ts_obj);
849891
SPREAD_ARG(args[1], buf);
850-
const int32_t offset_i32 = args[2]->Int32Value();
851-
uint32_t offset;
852892

853893
if (buf_length > 0)
854894
CHECK_NE(buf_data, nullptr);
855895

856-
if (offset_i32 < 0) {
857-
if (offset_i32 + static_cast<int32_t>(ts_obj_length) < 0)
896+
const char* haystack = ts_obj_data;
897+
const size_t haystack_length = ts_obj_length;
898+
const char* needle = buf_data;
899+
const size_t needle_length = buf_length;
900+
901+
if (needle_length == 0 || haystack_length == 0) {
902+
return args.GetReturnValue().Set(-1);
903+
}
904+
905+
int64_t offset_i64 = args[2]->IntegerValue();
906+
size_t offset = 0;
907+
908+
if (offset_i64 < 0) {
909+
if (offset_i64 + static_cast<int64_t>(haystack_length) < 0)
858910
offset = 0;
859911
else
860-
offset = static_cast<uint32_t>(ts_obj_length + offset_i32);
912+
offset = static_cast<size_t>(haystack_length + offset_i64);
861913
} else {
862-
offset = static_cast<uint32_t>(offset_i32);
914+
offset = static_cast<size_t>(offset_i64);
863915
}
864916

865-
if (buf_length == 0 ||
866-
ts_obj_length == 0 ||
867-
(offset != 0 && buf_length + offset <= buf_length) ||
868-
buf_length + offset > ts_obj_length)
917+
if (haystack_length < offset || needle_length + offset > haystack_length) {
869918
return args.GetReturnValue().Set(-1);
919+
}
870920

871-
int32_t r =
872-
IndexOf(ts_obj_data + offset, ts_obj_length - offset, buf_data, buf_length);
873-
args.GetReturnValue().Set(r == -1 ? -1 : static_cast<int32_t>(r + offset));
874-
}
921+
size_t result = haystack_length;
875922

923+
if (enc == UCS2) {
924+
if (haystack_length < 2 || needle_length < 2) {
925+
return args.GetReturnValue().Set(-1);
926+
}
927+
result = SearchString(
928+
reinterpret_cast<const uint16_t*>(haystack),
929+
haystack_length / 2,
930+
reinterpret_cast<const uint16_t*>(needle),
931+
needle_length / 2,
932+
offset / 2);
933+
result *= 2;
934+
} else {
935+
result = SearchString(
936+
reinterpret_cast<const uint8_t*>(haystack),
937+
haystack_length,
938+
reinterpret_cast<const uint8_t*>(needle),
939+
needle_length,
940+
offset);
941+
}
942+
943+
args.GetReturnValue().Set(
944+
result == haystack_length ? -1 : static_cast<int>(result));
945+
}
876946

877947
void IndexOfNumber(const FunctionCallbackInfo<Value>& args) {
878948
ASSERT(args[1]->IsNumber());
@@ -882,25 +952,25 @@ void IndexOfNumber(const FunctionCallbackInfo<Value>& args) {
882952
SPREAD_ARG(args[0], ts_obj);
883953

884954
uint32_t needle = args[1]->Uint32Value();
885-
int32_t offset_i32 = args[2]->Int32Value();
886-
uint32_t offset;
955+
int64_t offset_i64 = args[2]->IntegerValue();
956+
size_t offset;
887957

888-
if (offset_i32 < 0) {
889-
if (offset_i32 + static_cast<int32_t>(ts_obj_length) < 0)
958+
if (offset_i64 < 0) {
959+
if (offset_i64 + static_cast<int64_t>(ts_obj_length) < 0)
890960
offset = 0;
891961
else
892-
offset = static_cast<uint32_t>(ts_obj_length + offset_i32);
962+
offset = static_cast<size_t>(ts_obj_length + offset_i64);
893963
} else {
894-
offset = static_cast<uint32_t>(offset_i32);
964+
offset = static_cast<size_t>(offset_i64);
895965
}
896966

897967
if (ts_obj_length == 0 || offset + 1 > ts_obj_length)
898968
return args.GetReturnValue().Set(-1);
899969

900970
void* ptr = memchr(ts_obj_data + offset, needle, ts_obj_length - offset);
901971
char* ptr_char = static_cast<char*>(ptr);
902-
args.GetReturnValue().Set(
903-
ptr ? static_cast<int32_t>(ptr_char - ts_obj_data) : -1);
972+
args.GetReturnValue().Set(ptr ? static_cast<int>(ptr_char - ts_obj_data)
973+
: -1);
904974
}
905975

906976

src/string_search.cc

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
#include "string_search.h"
2+
3+
namespace node {
4+
namespace stringsearch {
5+
6+
int StringSearchBase::kBadCharShiftTable[kUC16AlphabetSize];
7+
int StringSearchBase::kGoodSuffixShiftTable[kBMMaxShift + 1];
8+
int StringSearchBase::kSuffixTable[kBMMaxShift + 1];
9+
}
10+
} // namespace node::stringsearch

0 commit comments

Comments
 (0)