Skip to content

Commit 37725e9

Browse files
committed
lib: introduce string matcher function generator
This commit introduces a case-insensitive string matcher function generator. It generates (readable) functions that can match strings faster than converting a string to all lowercase/uppercase and then checking against a set of strings. Currently this is used for Buffer.isEncoding() to help improve performance in the slow case where the encoding does not initially match one of the lowercase encoding names.
1 parent 9209bf6 commit 37725e9

File tree

4 files changed

+280
-21
lines changed

4 files changed

+280
-21
lines changed
+53
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
'use strict';
2+
3+
const common = require('../common.js');
4+
const v8 = require('v8');
5+
6+
// Encoding names spelled entirely in lowercase.
const lowerCaseNames = [
  'hex', 'utf8', 'utf-8', 'ascii', 'binary',
  'base64', 'ucs2', 'ucs-2', 'utf16le', 'utf-16le'
];

// The same names spelled entirely in uppercase.
const upperCaseNames = [
  'HEX', 'UTF8', 'UTF-8', 'ASCII', 'BINARY',
  'BASE64', 'UCS2', 'UCS-2', 'UTF16LE', 'UTF-16LE'
];

// Names that do not appear in either list above, in any letter case.
const otherNames = [
  'utf9', 'utf-7', 'utf17le', 'utf-17le', 'Unicode-FTW', 'new gnu gun'
];

// One benchmark configuration is produced per `encoding` value; the order of
// the values is lowercase, then uppercase, then the remaining names.
const bench = common.createBenchmark(main, {
  encoding: lowerCaseNames.concat(upperCaseNames, otherNames),
  n: [1e8]
});
37+
38+
/**
 * Benchmark body: times repeated Buffer.isEncoding() calls for a single
 * encoding name.
 *
 * @param {Object} conf - Benchmark configuration. `conf.encoding` is the name
 *   passed to Buffer.isEncoding() and `conf.n` is the iteration count (coerced
 *   to a number with unary plus).
 */
function main(conf) {
  const name = conf.encoding;
  const iterations = +conf.n;

  // Call once, ask V8 to optimize Buffer.isEncoding on its next call, then
  // make that call, all before the timer starts.
  Buffer.isEncoding(name);
  v8.setFlagsFromString('--allow_natives_syntax');
  // The %-call goes through eval so that it is only parsed here, after the
  // flag above has been set.
  eval('%OptimizeFunctionOnNextCall(Buffer.isEncoding)');
  Buffer.isEncoding(name);

  bench.start();
  let done = 0;
  while (done < iterations) {
    Buffer.isEncoding(name);
    done++;
  }
  bench.end(iterations);
}

lib/buffer.js

+41-20
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
/* eslint-disable require-buffer */
22
'use strict';
33

4+
const makeStringMatcher = require('internal/util').makeStringMatcher;
45
const binding = process.binding('buffer');
56
const bindingObj = {};
67

@@ -17,6 +18,18 @@ var poolSize, poolOffset, allocPool;
1718
binding.setupBufferJS(Buffer.prototype, bindingObj);
1819
const flags = bindingObj.flags;
1920
const kNoZeroFill = 0;
21+
// Lowercase encoding names handed to makeStringMatcher() to build
// `isEncodingCI`, the case-insensitive matcher that Buffer.isEncoding() falls
// back to when none of its exact lowercase comparisons succeed.
const ENCODINGS = [
22+
'hex',
23+
'utf8',
24+
'utf-8',
25+
'ascii',
26+
'binary',
27+
'base64',
28+
'ucs2',
29+
'ucs-2',
30+
'utf16le',
31+
'utf-16le'
32+
];
2033

2134
function createBuffer(size) {
2235
const ui8 = new Uint8Array(size);
@@ -184,29 +197,37 @@ Buffer.compare = function compare(a, b) {
184197
};
185198

186199

200+
const isEncodingCI = makeStringMatcher(ENCODINGS, 'isEncodingCI');
187201
Buffer.isEncoding = function(encoding) {
188-
var loweredCase = false;
189-
for (;;) {
190-
switch (encoding) {
191-
case 'hex':
192-
case 'utf8':
193-
case 'utf-8':
194-
case 'ascii':
195-
case 'binary':
196-
case 'base64':
197-
case 'ucs2':
198-
case 'ucs-2':
199-
case 'utf16le':
200-
case 'utf-16le':
202+
if (typeof encoding !== 'string')
203+
encoding = '' + encoding;
204+
switch (encoding.length) {
205+
case 3:
206+
if (encoding === 'hex')
201207
return true;
202-
203-
default:
204-
if (loweredCase)
205-
return false;
206-
encoding = ('' + encoding).toLowerCase();
207-
loweredCase = true;
208-
}
208+
break;
209+
case 4:
210+
if (encoding === 'utf8' || encoding === 'ucs2')
211+
return true;
212+
break;
213+
case 5:
214+
if (encoding === 'utf-8' || encoding === 'ascii' || encoding === 'ucs-2')
215+
return true;
216+
break;
217+
case 6:
218+
if (encoding === 'binary' || encoding === 'base64')
219+
return true;
220+
break;
221+
case 7:
222+
if (encoding === 'utf16le')
223+
return true;
224+
break;
225+
case 8:
226+
if (encoding === 'utf-16le')
227+
return true;
228+
break;
209229
}
230+
return isEncodingCI(encoding);
210231
};
211232

212233

lib/internal/util.js

+182
Original file line numberDiff line numberDiff line change
@@ -96,3 +96,185 @@ exports.isError = function isError(e) {
9696
exports.objectToString = function objectToString(o) {
9797
return Object.prototype.toString.call(o);
9898
};
99+
100+
exports.makeStringMatcher = function(strings, fnName, returnLowered) {
101+
if (!Array.isArray(strings) &&
102+
(typeof strings !== 'object' || strings === null)) {
103+
throw new Error('"strings" argument must be an array or an object');
104+
}
105+
106+
if (typeof fnName !== 'string') {
107+
returnLowered = fnName;
108+
fnName = undefined;
109+
}
110+
111+
var minLen = Infinity;
112+
var maxLen = -1;
113+
var replaces;
114+
115+
// A tree-like object that stores paths to strings, character by character
116+
var paths = Object.create(null);
117+
118+
if (!Array.isArray(strings)) {
119+
if (returnLowered) {
120+
// Allow an object that maps allowed inputs to desired return values
121+
// This is useful to normalize outputs
122+
// (e.g. { 'utf8': 'utf8', 'utf-8': 'utf8' })
123+
replaces = strings;
124+
}
125+
strings = Object.keys(strings);
126+
} else {
127+
strings = strings.slice();
128+
}
129+
130+
strings.sort(function(a, b) {
131+
// Sort the allowed inputs by length first, then by normal string comparison
132+
a = ('' + a).toLowerCase();
133+
b = ('' + b).toLowerCase();
134+
if (a.length === b.length) {
135+
if (a < b)
136+
return -1;
137+
else if (a > b)
138+
return 1;
139+
else
140+
return 0;
141+
}
142+
return a.length - b.length;
143+
}).forEach(function(string) {
144+
// Populate our tree-like object, grouping strings by length
145+
// (e.g. for `strings` of ['utf8', 'ucs2'] this would look like:
146+
// {
147+
// 4: {
148+
// 'u': {
149+
// 'c': {
150+
// 's': {
151+
// '2': 'ucs2'
152+
// }
153+
// },
154+
// 't': {
155+
// 'f': {
156+
// '8': 'utf8'
157+
// }
158+
// }
159+
// }
160+
// })
161+
string = ('' + string).toLowerCase();
162+
minLen = Math.min(string.length, minLen);
163+
maxLen = Math.max(string.length, maxLen);
164+
if (paths[string.length] === undefined)
165+
paths[string.length] = Object.create(null);
166+
var p = paths[string.length];
167+
for (var i = 0; i < string.length; ++i) {
168+
var chr = string[i];
169+
if (p[chr] === undefined) {
170+
if (i + 1 < string.length)
171+
p = p[chr] = Object.create(null);
172+
else
173+
p[chr] = (replaces ? replaces[string] : string);
174+
} else
175+
p = p[chr];
176+
}
177+
});
178+
179+
var code = "'use strict';\n";
180+
181+
if (maxLen > -1) {
182+
code += 'switch (input.length) {\n';
183+
var indent = ' ';
184+
Object.keys(paths).forEach(function(len) {
185+
len = +len;
186+
if (len === 0)
187+
return;
188+
code += indent + `case ${len}:\n`;
189+
if (len === 0) {
190+
// Zero length strings are a simple case that can be easily handled
191+
if (returnLowered)
192+
code += indent + " return '';\n";
193+
else
194+
code += indent + ' return true;\n';
195+
return;
196+
}
197+
198+
var p = paths[len];
199+
var depth = 0;
200+
var i;
201+
202+
// Create a finite stack up front for tracking our traversal of the
203+
// `paths` tree object
204+
var stack = new Array(len);
205+
for (i = 0; i < len; ++i)
206+
stack[i] = { p: null, keys: null };
207+
208+
indent += ' ';
209+
while (true) {
210+
stack[depth].p = p;
211+
var keys = stack[depth].keys;
212+
if (keys === null) {
213+
// We need to refresh our key list to start descending the current
214+
// path in the tree
215+
keys = stack[depth].keys = Object.keys(p);
216+
code += indent + `switch (input.charCodeAt(${depth})) {\n`;
217+
}
218+
if (keys.length === 0) {
219+
// There's nothing left to process at this node in the tree
220+
indent = indent.slice(0, -2);
221+
if (depth === 0) {
222+
// If we've reached the top of the stack and have no nodes left,
223+
// that means we are done with all strings of the current length
224+
break;
225+
}
226+
code += indent + '}\n';
227+
code += indent + 'break;\n';
228+
indent = indent.slice(0, -2);
229+
// Remove the current node from its parent, because it is currently
230+
// empty
231+
--depth;
232+
delete stack[depth].p[stack[depth].keys[0]];
233+
stack[depth].keys.shift();
234+
p = stack[depth].p;
235+
if (stack[depth].keys.length > 0)
236+
indent = indent.slice(0, -2);
237+
continue;
238+
}
239+
var chr = keys[0];
240+
var lowerCode = chr.charCodeAt(0);
241+
var upperCode = chr.toUpperCase().charCodeAt(0);
242+
indent += ' ';
243+
var commentChr = JSON.stringify(chr);
244+
code += indent + `case ${lowerCode}: // ${commentChr} \n`;
245+
if (lowerCode !== upperCode) {
246+
commentChr = JSON.stringify(chr.toUpperCase());
247+
code += indent + `case ${upperCode}: // ${commentChr} \n`;
248+
}
249+
if (depth + 1 === len) {
250+
// We're at a leaf node (the end of a string), this is where we can
251+
// return whatever type of output that was requested
252+
if (returnLowered)
253+
code += indent + ` return ${JSON.stringify(p[chr])};\n`;
254+
else
255+
code += indent + ' return true;\n';
256+
// Remove the current (leaf) node
257+
keys.shift();
258+
delete p[chr];
259+
} else {
260+
indent += ' ';
261+
p = p[chr];
262+
++depth;
263+
// We're descending the tree another level, so make sure to force a
264+
// re-rendering of the keys at the current node in case we are
265+
// descending again after the first string
266+
stack[depth].keys = null;
267+
}
268+
}
269+
code += indent + '}\n';
270+
code += indent + 'break;\n';
271+
indent = indent.slice(0, -2);
272+
});
273+
code += '}\n';
274+
}
275+
code += `return ${returnLowered ? 'undefined' : 'false'};\n`;
276+
if (fnName)
277+
return (new Function(`return function ${fnName}(input) {\n${code}};`))();
278+
else
279+
return new Function('input', code);
280+
};

test/parallel/test-buffer.js

+4-1
Original file line numberDiff line numberDiff line change
@@ -997,7 +997,10 @@ Buffer(Buffer(0), 0, 0);
997997
'ucs2',
998998
'ucs-2',
999999
'utf16le',
1000-
'utf-16le' ].forEach(function(enc) {
1000+
'utf-16le',
1001+
'HEX',
1002+
'UTF-8',
1003+
'BaSe64' ].forEach(function(enc) {
10011004
assert.equal(Buffer.isEncoding(enc), true);
10021005
});
10031006

0 commit comments

Comments
 (0)