Skip to content

Commit 3be9b22

Browse files
addaleaxtargos
authored andcommitted
encoding: make TextDecoder handle BOM correctly
Do not accept the BOM if it comes from a different encoding, and only discard the BOM after it has actually been read (including when it is spread over multiple chunks in streaming mode). Fixes: #25315 PR-URL: #30132 Reviewed-By: Gus Caplan <[email protected]>
1 parent 2b162a8 commit 3be9b22

File tree

5 files changed

+49
-34
lines changed

5 files changed

+49
-34
lines changed

lib/internal/encoding.js

+12-15
Original file line numberDiff line numberDiff line change
@@ -484,25 +484,22 @@ function makeTextDecoderJS() {
484484
this[kFlags] |= CONVERTER_FLAGS_FLUSH;
485485
}
486486

487-
if (!this[kBOMSeen] && !(this[kFlags] & CONVERTER_FLAGS_IGNORE_BOM)) {
488-
if (this[kEncoding] === 'utf-8') {
489-
if (input.length >= 3 &&
490-
input[0] === 0xEF && input[1] === 0xBB && input[2] === 0xBF) {
491-
input = input.slice(3);
492-
}
493-
} else if (this[kEncoding] === 'utf-16le') {
494-
if (input.length >= 2 && input[0] === 0xFF && input[1] === 0xFE) {
495-
input = input.slice(2);
496-
}
487+
let result = this[kFlags] & CONVERTER_FLAGS_FLUSH ?
488+
this[kHandle].end(input) :
489+
this[kHandle].write(input);
490+
491+
if (result.length > 0 &&
492+
!this[kBOMSeen] &&
493+
!(this[kFlags] & CONVERTER_FLAGS_IGNORE_BOM)) {
494+
// If the very first result in the stream is a BOM, and we are not
495+
// explicitly told to ignore it, then we discard it.
496+
if (result[0] === '\ufeff') {
497+
result = result.slice(1);
497498
}
498499
this[kBOMSeen] = true;
499500
}
500501

501-
if (this[kFlags] & CONVERTER_FLAGS_FLUSH) {
502-
return this[kHandle].end(input);
503-
}
504-
505-
return this[kHandle].write(input);
502+
return result;
506503
}
507504
}
508505

src/node_buffer.cc

+4-4
Original file line numberDiff line numberDiff line change
@@ -219,10 +219,10 @@ size_t Length(Local<Object> obj) {
219219
}
220220

221221

222-
inline MaybeLocal<Uint8Array> New(Environment* env,
223-
Local<ArrayBuffer> ab,
224-
size_t byte_offset,
225-
size_t length) {
222+
MaybeLocal<Uint8Array> New(Environment* env,
223+
Local<ArrayBuffer> ab,
224+
size_t byte_offset,
225+
size_t length) {
226226
CHECK(!env->buffer_prototype_object().IsEmpty());
227227
Local<Uint8Array> ui = Uint8Array::New(ab, byte_offset, length);
228228
Maybe<bool> mb =

src/node_i18n.cc

+27-10
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ using v8::NewStringType;
9595
using v8::Object;
9696
using v8::ObjectTemplate;
9797
using v8::String;
98+
using v8::Uint8Array;
9899
using v8::Value;
99100

100101
namespace i18n {
@@ -227,25 +228,41 @@ class ConverterObject : public BaseObject, Converter {
227228
const char* source = input.data();
228229
size_t source_length = input.length();
229230

230-
if (converter->unicode_ && !converter->ignoreBOM_ && !converter->bomSeen_) {
231-
int32_t bomOffset = 0;
232-
ucnv_detectUnicodeSignature(source, source_length, &bomOffset, &status);
233-
source += bomOffset;
234-
source_length -= bomOffset;
235-
converter->bomSeen_ = true;
236-
}
237-
238231
UChar* target = *result;
239232
ucnv_toUnicode(converter->conv,
240233
&target, target + (limit * sizeof(UChar)),
241234
&source, source + source_length,
242235
nullptr, flush, &status);
243236

244237
if (U_SUCCESS(status)) {
245-
if (limit > 0)
238+
bool omit_initial_bom = false;
239+
if (limit > 0) {
246240
result.SetLength(target - &result[0]);
241+
if (result.length() > 0 &&
242+
converter->unicode_ &&
243+
!converter->ignoreBOM_ &&
244+
!converter->bomSeen_) {
245+
// If the very first result in the stream is a BOM, and we are not
246+
// explicitly told to ignore it, then we mark it for discarding.
247+
if (result[0] == 0xFEFF) {
248+
omit_initial_bom = true;
249+
}
250+
converter->bomSeen_ = true;
251+
}
252+
}
247253
ret = ToBufferEndian(env, &result);
248-
args.GetReturnValue().Set(ret.ToLocalChecked());
254+
if (omit_initial_bom && !ret.IsEmpty()) {
255+
// Peform `ret = ret.slice(2)`.
256+
CHECK(ret.ToLocalChecked()->IsUint8Array());
257+
Local<Uint8Array> orig_ret = ret.ToLocalChecked().As<Uint8Array>();
258+
ret = Buffer::New(env,
259+
orig_ret->Buffer(),
260+
orig_ret->ByteOffset() + 2,
261+
orig_ret->ByteLength() - 2)
262+
.FromMaybe(Local<Uint8Array>());
263+
}
264+
if (!ret.IsEmpty())
265+
args.GetReturnValue().Set(ret.ToLocalChecked());
249266
return;
250267
}
251268

src/node_internals.h

+5-1
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,11 @@ v8::MaybeLocal<v8::Object> New(Environment* env,
161161
char* data,
162162
size_t length,
163163
bool uses_malloc);
164-
164+
// Creates a Buffer instance over an existing Uint8Array.
165+
v8::MaybeLocal<v8::Uint8Array> New(Environment* env,
166+
v8::Local<v8::ArrayBuffer> ab,
167+
size_t byte_offset,
168+
size_t length);
165169
// Construct a Buffer from a MaybeStackBuffer (and also its subclasses like
166170
// Utf8Value and TwoByteValue).
167171
// If |buf| is invalidated, an empty MaybeLocal is returned, and nothing is

test/wpt/status/encoding.json

+1-4
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,7 @@
2222
"fail": "iso-2022-jp decoder state handling bug: https://encoding.spec.whatwg.org/#iso-2022-jp-decoder"
2323
},
2424
"textdecoder-byte-order-marks.any.js": {
25-
"fail": "Mismatching BOM should not be ignored"
26-
},
27-
"textdecoder-copy.any.js": {
28-
"fail": "Should not have output BOM: https://encoding.spec.whatwg.org/#concept-td-serialize"
25+
"requires": ["small-icu"]
2926
},
3027
"textdecoder-fatal-single-byte.any.js": {
3128
"requires": ["full-icu"],

0 commit comments

Comments
 (0)