Skip to content

Commit 24ef1e6

Browse files
mscdex authored and targos committed
string_decoder: align UTF-8 handling with V8
V8 5.5 changed how invalid characters are handled and it now appears to follow the WHATWG Encoding standard, where all of an invalid character's bytes are replaced by a single replacement character (\ufffd) instead of replacing each invalid byte with separate replacement characters.

Example: the byte sequence 0xF0,0xB8,0x41 is decoded as '\ufffdA' in V8 5.5, but is decoded as '\ufffd\ufffdA' in previous versions of V8.

PR-URL: #9618
Reviewed-By: Ali Ijaz Sheikh <[email protected]>
Reviewed-By: Ben Noordhuis <[email protected]>
1 parent 007386e commit 24ef1e6

File tree

3 files changed

+16
-28
lines changed

3 files changed

+16
-28
lines changed

lib/string_decoder.js

+11 -11
Original file line number | Diff line number | Diff line change
@@ -81,7 +81,7 @@ StringDecoder.prototype.fillLast = function(buf) {
8181
};
8282

8383
// Checks the type of a UTF-8 byte, whether it's ASCII, a leading byte, or a
84-
// continuation byte.
84+
// continuation byte. If an invalid byte is detected, -2 is returned.
8585
function utf8CheckByte(byte) {
8686
if (byte <= 0x7F)
8787
return 0;
@@ -91,7 +91,7 @@ function utf8CheckByte(byte) {
9191
return 3;
9292
else if (byte >> 3 === 0x1E)
9393
return 4;
94-
return -1;
94+
return (byte >> 6 === 0x02 ? -1 : -2);
9595
}
9696

9797
// Checks at most 3 bytes at the end of a Buffer in order to detect an
@@ -107,15 +107,15 @@ function utf8CheckIncomplete(self, buf, i) {
107107
self.lastNeed = nb - 1;
108108
return nb;
109109
}
110-
if (--j < i)
110+
if (--j < i || nb === -2)
111111
return 0;
112112
nb = utf8CheckByte(buf[j]);
113113
if (nb >= 0) {
114114
if (nb > 0)
115115
self.lastNeed = nb - 2;
116116
return nb;
117117
}
118-
if (--j < i)
118+
if (--j < i || nb === -2)
119119
return 0;
120120
nb = utf8CheckByte(buf[j]);
121121
if (nb >= 0) {
@@ -133,25 +133,25 @@ function utf8CheckIncomplete(self, buf, i) {
133133
// Validates as many continuation bytes for a multi-byte UTF-8 character as
134134
// needed or are available. If we see a non-continuation byte where we expect
135135
// one, we "replace" the validated continuation bytes we've seen so far with
136-
// UTF-8 replacement characters ('\ufffd'), to match v8's UTF-8 decoding
136+
// a single UTF-8 replacement character ('\ufffd'), to match v8's UTF-8 decoding
137137
// behavior. The continuation byte check is included three times in the case
138138
// where all of the continuation bytes for a character exist in the same buffer.
139139
// It is also done this way as a slight performance increase instead of using a
140140
// loop.
141141
function utf8CheckExtraBytes(self, buf, p) {
142142
if ((buf[0] & 0xC0) !== 0x80) {
143143
self.lastNeed = 0;
144-
return '\ufffd'.repeat(p);
144+
return '\ufffd';
145145
}
146146
if (self.lastNeed > 1 && buf.length > 1) {
147147
if ((buf[1] & 0xC0) !== 0x80) {
148148
self.lastNeed = 1;
149-
return '\ufffd'.repeat(p + 1);
149+
return '\ufffd';
150150
}
151151
if (self.lastNeed > 2 && buf.length > 2) {
152152
if ((buf[2] & 0xC0) !== 0x80) {
153153
self.lastNeed = 2;
154-
return '\ufffd'.repeat(p + 2);
154+
return '\ufffd';
155155
}
156156
}
157157
}
@@ -184,12 +184,12 @@ function utf8Text(buf, i) {
184184
return buf.toString('utf8', i, end);
185185
}
186186

187-
// For UTF-8, a replacement character for each buffered byte of a (partial)
188-
// character needs to be added to the output.
187+
// For UTF-8, a replacement character is added when ending on a partial
188+
// character.
189189
function utf8End(buf) {
190190
const r = (buf && buf.length ? this.write(buf) : '');
191191
if (this.lastNeed)
192-
return r + '\ufffd'.repeat(this.lastTotal - this.lastNeed);
192+
return r + '\ufffd';
193193
return r;
194194
}
195195

test/parallel/test-string-decoder-end.js

-7
Original file line number | Diff line number | Diff line change
@@ -18,17 +18,13 @@ for (let i = 1; i <= 16; i++) {
1818

1919
encodings.forEach(testEncoding);
2020

21-
console.log('ok');
22-
2321
function testEncoding(encoding) {
2422
bufs.forEach((buf) => {
2523
testBuf(encoding, buf);
2624
});
2725
}
2826

2927
function testBuf(encoding, buf) {
30-
console.error('# %s', encoding, buf);
31-
3228
// write one byte at a time.
3329
let s = new SD(encoding);
3430
let res1 = '';
@@ -46,9 +42,6 @@ function testBuf(encoding, buf) {
4642
// .toString() on the buffer
4743
const res3 = buf.toString(encoding);
4844

49-
console.log('expect=%j', res3);
50-
console.log('res1=%j', res1);
51-
console.log('res2=%j', res2);
5245
assert.strictEqual(res1, res3, 'one byte at a time should match toString');
5346
assert.strictEqual(res2, res3, 'all bytes at once should match toString');
5447
}

test/parallel/test-string-decoder.js

+5 -10
Original file line number | Diff line number | Diff line change
@@ -8,8 +8,6 @@ const StringDecoder = require('string_decoder').StringDecoder;
88
let decoder = new StringDecoder();
99
assert.strictEqual(decoder.encoding, 'utf8');
1010

11-
process.stdout.write('scanning ');
12-
1311
// UTF-8
1412
test('utf-8', Buffer.from('$', 'utf-8'), '$');
1513
test('utf-8', Buffer.from('¢', 'utf-8'), '¢');
@@ -42,32 +40,30 @@ test('utf-8', Buffer.from('C9B5A941', 'hex'), '\u0275\ufffdA');
4240
test('utf-8', Buffer.from('E2', 'hex'), '\ufffd');
4341
test('utf-8', Buffer.from('E241', 'hex'), '\ufffdA');
4442
test('utf-8', Buffer.from('CCCCB8', 'hex'), '\ufffd\u0338');
45-
test('utf-8', Buffer.from('F0B841', 'hex'), '\ufffd\ufffdA');
43+
test('utf-8', Buffer.from('F0B841', 'hex'), '\ufffdA');
4644
test('utf-8', Buffer.from('F1CCB8', 'hex'), '\ufffd\u0338');
4745
test('utf-8', Buffer.from('F0FB00', 'hex'), '\ufffd\ufffd\0');
4846
test('utf-8', Buffer.from('CCE2B8B8', 'hex'), '\ufffd\u2e38');
49-
test('utf-8', Buffer.from('E2B8CCB8', 'hex'), '\ufffd\ufffd\u0338');
47+
test('utf-8', Buffer.from('E2B8CCB8', 'hex'), '\ufffd\u0338');
5048
test('utf-8', Buffer.from('E2FBCC01', 'hex'), '\ufffd\ufffd\ufffd\u0001');
51-
test('utf-8', Buffer.from('EDA0B5EDB08D', 'hex'), // CESU-8 of U+1D40D
52-
'\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd');
5349
test('utf-8', Buffer.from('CCB8CDB9', 'hex'), '\u0338\u0379');
50+
// CESU-8 of U+1D40D
51+
test('utf-8', Buffer.from('EDA0B5EDB08D', 'hex'), '\ufffd\ufffd');
5452

5553
// UCS-2
5654
test('ucs2', Buffer.from('ababc', 'ucs2'), 'ababc');
5755

5856
// UTF-16LE
5957
test('utf16le', Buffer.from('3DD84DDC', 'hex'), '\ud83d\udc4d'); // thumbs up
6058

61-
console.log(' crayon!');
62-
6359
// Additional UTF-8 tests
6460
decoder = new StringDecoder('utf8');
6561
assert.strictEqual(decoder.write(Buffer.from('E1', 'hex')), '');
6662
assert.strictEqual(decoder.end(), '\ufffd');
6763

6864
decoder = new StringDecoder('utf8');
6965
assert.strictEqual(decoder.write(Buffer.from('E18B', 'hex')), '');
70-
assert.strictEqual(decoder.end(), '\ufffd\ufffd');
66+
assert.strictEqual(decoder.end(), '\ufffd');
7167

7268
decoder = new StringDecoder('utf8');
7369
assert.strictEqual(decoder.write(Buffer.from('\ufffd')), '\ufffd');
@@ -131,7 +127,6 @@ function test(encoding, input, expected, singleSequence) {
131127
output += decoder.write(input.slice(write[0], write[1]));
132128
});
133129
output += decoder.end();
134-
process.stdout.write('.');
135130
if (output !== expected) {
136131
const message =
137132
'Expected "' + unicodeEscape(expected) + '", ' +

0 commit comments

Comments
 (0)