lib: introduce string matcher function generator

mscdex · mscdex · commit 37725e9baefd · 2016-02-17T15:57:25.000-05:00
This commit introduces a case-insensitive string matcher function
generator. It generates (readable) functions that can match strings
faster than converting a string to all lowercase/uppercase and then
checking against a set of strings.

Currently this is used for Buffer.isEncoding() to help improve
performance in the slow case where the encoding is not an exact match
for one of the lowercase encoding names.
diff --git a/benchmark/buffers/buffer-isencoding.js b/benchmark/buffers/buffer-isencoding.js
@@ -0,0 +1,53 @@
+'use strict';
+
+const common = require('../common.js');
+const v8 = require('v8');
+
+const bench = common.createBenchmark(main, {
+  encoding: [
+    'hex',  // lowercase names: accepted by the exact-compare switch
+    'utf8',
+    'utf-8',
+    'ascii',
+    'binary',
+    'base64',
+    'ucs2',
+    'ucs-2',
+    'utf16le',
+    'utf-16le',
+    'HEX',  // uppercase names: accepted only via the case-insensitive matcher
+    'UTF8',
+    'UTF-8',
+    'ASCII',
+    'BINARY',
+    'BASE64',
+    'UCS2',
+    'UCS-2',
+    'UTF16LE',
+    'UTF-16LE',
+    'utf9',  // remaining entries are not valid encoding names
+    'utf-7',
+    'utf17le',
+    'utf-17le',
+    'Unicode-FTW',
+    'new gnu gun'
+  ],
+  n: [1e8]  // iteration count, read by main() as conf.n
+});
+
+function main(conf) {
+  var encoding = conf.encoding;
+  var n = +conf.n;  // unary plus converts the configured value to a number
+
+  // Force optimization before the timed loop: call, request, then call again
+  Buffer.isEncoding(encoding);  // first call, made before the request
+  v8.setFlagsFromString('--allow_natives_syntax');  // permits the % syntax
+  eval('%OptimizeFunctionOnNextCall(Buffer.isEncoding)');
+  Buffer.isEncoding(encoding);  // the "next call" named by the line above
+
+  bench.start();
+  for (let i = 0; i < n; i++) {
+    Buffer.isEncoding(encoding);  // the only work inside the timed region
+  }
+  bench.end(n);
+}
diff --git a/lib/buffer.js b/lib/buffer.js
@@ -1,6 +1,7 @@
 /* eslint-disable require-buffer */
 'use strict';
 
+const makeStringMatcher = require('internal/util').makeStringMatcher;
 const binding = process.binding('buffer');
 const bindingObj = {};
 
@@ -17,6 +18,18 @@ var poolSize, poolOffset, allocPool;
 binding.setupBufferJS(Buffer.prototype, bindingObj);
 const flags = bindingObj.flags;
 const kNoZeroFill = 0;
+const ENCODINGS = [  // all lowercase; source list for isEncodingCI
+  'hex',
+  'utf8',
+  'utf-8',
+  'ascii',
+  'binary',
+  'base64',
+  'ucs2',
+  'ucs-2',
+  'utf16le',
+  'utf-16le'
+];
 
 function createBuffer(size) {
   const ui8 = new Uint8Array(size);
@@ -184,29 +197,37 @@ Buffer.compare = function compare(a, b) {
 };
 
 
+const isEncodingCI = makeStringMatcher(ENCODINGS, 'isEncodingCI');
 Buffer.isEncoding = function(encoding) {
-  var loweredCase = false;
-  for (;;) {
-    switch (encoding) {
-      case 'hex':
-      case 'utf8':
-      case 'utf-8':
-      case 'ascii':
-      case 'binary':
-      case 'base64':
-      case 'ucs2':
-      case 'ucs-2':
-      case 'utf16le':
-      case 'utf-16le':
+  if (typeof encoding !== 'string')
+    encoding = '' + encoding;  // coerce so the length switch below works
+  switch (encoding.length) {  // fast path: exact match on lowercase names
+    case 3:
+      if (encoding === 'hex')
         return true;
-
-      default:
-        if (loweredCase)
-          return false;
-        encoding = ('' + encoding).toLowerCase();
-        loweredCase = true;
-    }
+      break;
+    case 4:
+      if (encoding === 'utf8' || encoding === 'ucs2')
+        return true;
+      break;
+    case 5:
+      if (encoding === 'utf-8' || encoding === 'ascii' || encoding === 'ucs-2')
+        return true;
+      break;
+    case 6:
+      if (encoding === 'binary' || encoding === 'base64')
+        return true;
+      break;
+    case 7:
+      if (encoding === 'utf16le')
+        return true;
+      break;
+    case 8:
+      if (encoding === 'utf-16le')
+        return true;
+      break;
   }
+  return isEncodingCI(encoding);  // slow path: case-insensitive match
 };
 
 
diff --git a/lib/internal/util.js b/lib/internal/util.js
@@ -96,3 +96,210 @@ exports.isError = function isError(e) {
 exports.objectToString = function objectToString(o) {
   return Object.prototype.toString.call(o);
 };
+
+/**
+ * Generates a function that case-insensitively tests whether its string
+ * argument is one of a fixed set of strings, without lowercasing or
+ * uppercasing the input first.
+ *
+ * The generated source switches on `input.length` and then on the character
+ * code at each position, accepting both the lowercase and the uppercase code
+ * of every expected character.
+ *
+ * @param {Array|Object} strings - Allowed strings. For a non-array object,
+ *   the keys are the allowed strings and, when `returnLowered` is truthy,
+ *   the values are what the generated function returns on a match.
+ * @param {string} [fnName] - Name for the generated function. If the second
+ *   argument is not a string it is used as `returnLowered` instead.
+ * @param {boolean} [returnLowered] - If truthy, the generated function
+ *   returns the lowercased match (or the mapped value) on success and
+ *   `undefined` otherwise; if falsy it returns `true` or `false`.
+ * @returns {Function} Matcher taking a single string argument `input`.
+ * @throws {Error} If `strings` is neither an array nor a non-null object.
+ */
+exports.makeStringMatcher = function(strings, fnName, returnLowered) {
+  if (!Array.isArray(strings) &&
+      (typeof strings !== 'object' || strings === null)) {
+    throw new Error('"strings" argument must be an array or an object');
+  }
+
+  // Support the two-argument form: makeStringMatcher(strings, returnLowered)
+  if (typeof fnName !== 'string') {
+    returnLowered = fnName;
+    fnName = undefined;
+  }
+
+  var maxLen = -1;
+  var replaces;
+
+  // A tree-like object that stores paths to strings, character by character
+  var paths = Object.create(null);
+
+  if (!Array.isArray(strings)) {
+    if (returnLowered) {
+      // Allow an object that maps allowed inputs to desired return values
+      // This is useful to normalize outputs
+      // (e.g. { 'utf8': 'utf8', 'utf-8': 'utf8' })
+      replaces = strings;
+    }
+    strings = Object.keys(strings);
+  } else {
+    strings = strings.slice();
+  }
+
+  strings.sort(function(a, b) {
+    // Sort the allowed inputs by length first, then by normal string comparison
+    a = ('' + a).toLowerCase();
+    b = ('' + b).toLowerCase();
+    if (a.length === b.length) {
+      if (a < b)
+        return -1;
+      else if (a > b)
+        return 1;
+      else
+        return 0;
+    }
+    return a.length - b.length;
+  }).forEach(function(string) {
+    // Populate our tree-like object, grouping strings by length
+    // (e.g. for `strings` of ['utf8', 'ucs2'] this would look like:
+    //  {
+    //    4: {
+    //      'u': {
+    //        'c': {
+    //          's': {
+    //            '2': 'ucs2'
+    //          }
+    //        },
+    //        't': {
+    //          'f': {
+    //            '8': 'utf8'
+    //          }
+    //        }
+    //      }
+    //    }
+    //  })
+    // `string` keeps its original spelling because it is the key into
+    // `replaces`; only the lowercased copy is used to build the tree
+    var lowered = ('' + string).toLowerCase();
+    maxLen = Math.max(lowered.length, maxLen);
+    if (paths[lowered.length] === undefined)
+      paths[lowered.length] = Object.create(null);
+    var p = paths[lowered.length];
+    for (var i = 0; i < lowered.length; ++i) {
+      var chr = lowered[i];
+      if (p[chr] === undefined) {
+        if (i + 1 < lowered.length)
+          p = p[chr] = Object.create(null);
+        else
+          p[chr] = (replaces ? replaces[string] : lowered);
+      } else
+        p = p[chr];
+    }
+  });
+
+  var code = "'use strict';\n";
+
+  if (maxLen > -1) {
+    code += 'switch (input.length) {\n';
+    var indent = '  ';
+    Object.keys(paths).forEach(function(len) {
+      len = +len;
+      code += indent + `case ${len}:\n`;
+      if (len === 0) {
+        // Zero length strings are a simple case that can be easily handled.
+        // The tree holds no leaf for them, so look the mapped value up here
+        if (returnLowered) {
+          var emptyResult = JSON.stringify(replaces ? replaces[''] : '');
+          code += indent + `  return ${emptyResult};\n`;
+        } else {
+          code += indent + '  return true;\n';
+        }
+        return;
+      }
+
+      var p = paths[len];
+      var depth = 0;
+      var i;
+
+      // Create a finite stack up front for tracking our traversal of the
+      // `paths` tree object
+      var stack = new Array(len);
+      for (i = 0; i < len; ++i)
+        stack[i] = { p: null, keys: null };
+
+      indent += '  ';
+      while (true) {
+        stack[depth].p = p;
+        var keys = stack[depth].keys;
+        if (keys === null) {
+          // We need to refresh our key list to start descending the current
+          // path in the tree
+          keys = stack[depth].keys = Object.keys(p);
+          code += indent + `switch (input.charCodeAt(${depth})) {\n`;
+        }
+        if (keys.length === 0) {
+          // There's nothing left to process at this node in the tree
+          indent = indent.slice(0, -2);
+          if (depth === 0) {
+            // If we've reached the top of the stack and have no nodes left,
+            // that means we are done with all strings of the current length
+            break;
+          }
+          code += indent + '}\n';
+          code += indent + 'break;\n';
+          indent = indent.slice(0, -2);
+          // Remove the current node from its parent, because it is currently
+          // empty
+          --depth;
+          delete stack[depth].p[stack[depth].keys[0]];
+          stack[depth].keys.shift();
+          p = stack[depth].p;
+          if (stack[depth].keys.length > 0)
+            indent = indent.slice(0, -2);
+          continue;
+        }
+        var chr = keys[0];
+        var lowerCode = chr.charCodeAt(0);
+        var upperCode = chr.toUpperCase().charCodeAt(0);
+        indent += '  ';
+        var commentChr = JSON.stringify(chr);
+        code += indent + `case ${lowerCode}: // ${commentChr} \n`;
+        if (lowerCode !== upperCode) {
+          commentChr = JSON.stringify(chr.toUpperCase());
+          code += indent + `case ${upperCode}: // ${commentChr} \n`;
+        }
+        if (depth + 1 === len) {
+          // We're at a leaf node (the end of a string), this is where we can
+          // return whatever type of output that was requested
+          if (returnLowered)
+            code += indent + `  return ${JSON.stringify(p[chr])};\n`;
+          else
+            code += indent + '  return true;\n';
+          // Remove the current (leaf) node
+          keys.shift();
+          delete p[chr];
+        } else {
+          indent += '  ';
+          p = p[chr];
+          ++depth;
+          // We're descending the tree another level, so make sure to force a
+          // re-rendering of the keys at the current node in case we are
+          // descending again after the first string
+          stack[depth].keys = null;
+        }
+      }
+      code += indent + '}\n';
+      code += indent + 'break;\n';
+      indent = indent.slice(0, -2);
+    });
+    code += '}\n';
+  }
+  code += `return ${returnLowered ? 'undefined' : 'false'};\n`;
+  // `fnName` is spliced into the generated source verbatim, so it must be a
+  // trusted, valid identifier
+  if (fnName)
+    return (new Function(`return function ${fnName}(input) {\n${code}};`))();
+  else
+    return new Function('input', code);
+};
diff --git a/test/parallel/test-buffer.js b/test/parallel/test-buffer.js
@@ -997,7 +997,10 @@ Buffer(Buffer(0), 0, 0);
   'ucs2',
   'ucs-2',
   'utf16le',
-  'utf-16le' ].forEach(function(enc) {
+  'utf-16le',
+  'HEX',
+  'UTF-8',
+  'BaSe64' ].forEach(function(enc) {
     assert.equal(Buffer.isEncoding(enc), true);
   });