Skip to content

Commit 4e65198

Browse files
addaleaxtargos
authored andcommitted
src: allow UTF-16 in generic StringBytes decode call
This allows removing a number of special cases in other parts of the code, at least one of which was incorrect in expecting aligned input when that was not guaranteed. Fixes: #22358 PR-URL: #22622 Reviewed-By: Tobias Nießen <[email protected]>
1 parent f064d44 commit 4e65198

File tree

6 files changed

+40
-96
lines changed

6 files changed

+40
-96
lines changed

src/node.h

-1
Original file line numberDiff line numberDiff line change
@@ -418,7 +418,6 @@ NODE_DEPRECATED("Use FatalException(isolate, ...)",
418418
return FatalException(v8::Isolate::GetCurrent(), try_catch);
419419
})
420420

421-
// Don't call with encoding=UCS2.
422421
NODE_EXTERN v8::Local<v8::Value> Encode(v8::Isolate* isolate,
423422
const char* buf,
424423
size_t len,

src/node_buffer.cc

-59
Original file line numberDiff line numberDiff line change
@@ -467,65 +467,6 @@ void StringSlice(const FunctionCallbackInfo<Value>& args) {
467467
}
468468

469469

470-
template <>
471-
void StringSlice<UCS2>(const FunctionCallbackInfo<Value>& args) {
472-
Isolate* isolate = args.GetIsolate();
473-
Environment* env = Environment::GetCurrent(isolate);
474-
475-
THROW_AND_RETURN_UNLESS_BUFFER(env, args.This());
476-
SPREAD_BUFFER_ARG(args.This(), ts_obj);
477-
478-
if (ts_obj_length == 0)
479-
return args.GetReturnValue().SetEmptyString();
480-
481-
SLICE_START_END(args[0], args[1], ts_obj_length)
482-
length /= 2;
483-
484-
const char* data = ts_obj_data + start;
485-
const uint16_t* buf;
486-
bool release = false;
487-
488-
// Node's "ucs2" encoding expects LE character data inside a Buffer, so we
489-
// need to reorder on BE platforms. See https://nodejs.org/api/buffer.html
490-
// regarding Node's "ucs2" encoding specification.
491-
const bool aligned = (reinterpret_cast<uintptr_t>(data) % sizeof(*buf) == 0);
492-
if (IsLittleEndian() && !aligned) {
493-
// Make a copy to avoid unaligned accesses in v8::String::NewFromTwoByte().
494-
// This applies ONLY to little endian platforms, as misalignment will be
495-
// handled by a byte-swapping operation in StringBytes::Encode on
496-
// big endian platforms.
497-
uint16_t* copy = new uint16_t[length];
498-
for (size_t i = 0, k = 0; i < length; i += 1, k += 2) {
499-
// Assumes that the input is little endian.
500-
const uint8_t lo = static_cast<uint8_t>(data[k + 0]);
501-
const uint8_t hi = static_cast<uint8_t>(data[k + 1]);
502-
copy[i] = lo | hi << 8;
503-
}
504-
buf = copy;
505-
release = true;
506-
} else {
507-
buf = reinterpret_cast<const uint16_t*>(data);
508-
}
509-
510-
Local<Value> error;
511-
MaybeLocal<Value> ret =
512-
StringBytes::Encode(isolate,
513-
buf,
514-
length,
515-
&error);
516-
517-
if (release)
518-
delete[] buf;
519-
520-
if (ret.IsEmpty()) {
521-
CHECK(!error.IsEmpty());
522-
isolate->ThrowException(error);
523-
return;
524-
}
525-
args.GetReturnValue().Set(ret.ToLocalChecked());
526-
}
527-
528-
529470
// bytesCopied = copy(buffer, target[, targetStart][, sourceStart][, sourceEnd])
530471
void Copy(const FunctionCallbackInfo<Value> &args) {
531472
Environment* env = Environment::GetCurrent(args);

src/string_bytes.cc

+32-25
Original file line numberDiff line numberDiff line change
@@ -625,7 +625,6 @@ MaybeLocal<Value> StringBytes::Encode(Isolate* isolate,
625625
size_t buflen,
626626
enum encoding encoding,
627627
Local<Value>* error) {
628-
CHECK_NE(encoding, UCS2);
629628
CHECK_BUFLEN_IN_RANGE(buflen);
630629

631630
if (!buflen && encoding != BUFFER) {
@@ -703,6 +702,37 @@ MaybeLocal<Value> StringBytes::Encode(Isolate* isolate,
703702
return ExternOneByteString::New(isolate, dst, dlen, error);
704703
}
705704

705+
case UCS2: {
706+
if (IsBigEndian()) {
707+
uint16_t* dst = node::UncheckedMalloc<uint16_t>(buflen / 2);
708+
if (dst == nullptr) {
709+
*error = node::ERR_MEMORY_ALLOCATION_FAILED(isolate);
710+
return MaybeLocal<Value>();
711+
}
712+
for (size_t i = 0, k = 0; k < buflen / 2; i += 2, k += 1) {
713+
// The input is in *little endian*, because that's what Node.js
714+
// expects, so the high byte comes after the low byte.
715+
const uint8_t hi = static_cast<uint8_t>(buf[i + 1]);
716+
const uint8_t lo = static_cast<uint8_t>(buf[i + 0]);
717+
dst[k] = static_cast<uint16_t>(hi) << 8 | lo;
718+
}
719+
return ExternTwoByteString::New(isolate, dst, buflen / 2, error);
720+
}
721+
if (reinterpret_cast<uintptr_t>(buf) % 2 != 0) {
722+
// Unaligned data still means we can't directly pass it to V8.
723+
char* dst = node::UncheckedMalloc(buflen);
724+
if (dst == nullptr) {
725+
*error = node::ERR_MEMORY_ALLOCATION_FAILED(isolate);
726+
return MaybeLocal<Value>();
727+
}
728+
memcpy(dst, buf, buflen);
729+
return ExternTwoByteString::New(
730+
isolate, reinterpret_cast<uint16_t*>(dst), buflen / 2, error);
731+
}
732+
return ExternTwoByteString::NewFromCopy(
733+
isolate, reinterpret_cast<const uint16_t*>(buf), buflen / 2, error);
734+
}
735+
706736
default:
707737
CHECK(0 && "unknown encoding");
708738
break;
@@ -742,30 +772,7 @@ MaybeLocal<Value> StringBytes::Encode(Isolate* isolate,
742772
enum encoding encoding,
743773
Local<Value>* error) {
744774
const size_t len = strlen(buf);
745-
MaybeLocal<Value> ret;
746-
if (encoding == UCS2) {
747-
// In Node, UCS2 means utf16le. The data must be in little-endian
748-
// order and must be aligned on 2-bytes. This returns an empty
749-
// value if it's not aligned and ensures the appropriate byte order
750-
// on big endian architectures.
751-
const bool be = IsBigEndian();
752-
if (len % 2 != 0)
753-
return ret;
754-
std::vector<uint16_t> vec(len / 2);
755-
for (size_t i = 0, k = 0; i < len; i += 2, k += 1) {
756-
const uint8_t hi = static_cast<uint8_t>(buf[i + 0]);
757-
const uint8_t lo = static_cast<uint8_t>(buf[i + 1]);
758-
vec[k] = be ?
759-
static_cast<uint16_t>(hi) << 8 | lo
760-
: static_cast<uint16_t>(lo) << 8 | hi;
761-
}
762-
ret = vec.empty() ?
763-
static_cast< Local<Value> >(String::Empty(isolate))
764-
: StringBytes::Encode(isolate, &vec[0], vec.size(), error);
765-
} else {
766-
ret = StringBytes::Encode(isolate, buf, len, encoding, error);
767-
}
768-
return ret;
775+
return Encode(isolate, buf, len, encoding, error);
769776
}
770777

771778
} // namespace node

src/string_bytes.h

-1
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,6 @@ class StringBytes {
9090
int* chars_written = nullptr);
9191

9292
// Take the bytes in the src, and turn it into a Buffer or String.
93-
// Don't call with encoding=UCS2.
9493
static v8::MaybeLocal<v8::Value> Encode(v8::Isolate* isolate,
9594
const char* buf,
9695
size_t buflen,

src/string_decoder.cc

-10
Original file line numberDiff line numberDiff line change
@@ -30,16 +30,6 @@ MaybeLocal<String> MakeString(Isolate* isolate,
3030
data,
3131
v8::NewStringType::kNormal,
3232
length);
33-
} else if (encoding == UCS2) {
34-
#ifdef DEBUG
35-
CHECK_EQ(reinterpret_cast<uintptr_t>(data) % 2, 0);
36-
CHECK_EQ(length % 2, 0);
37-
#endif
38-
ret = StringBytes::Encode(
39-
isolate,
40-
reinterpret_cast<const uint16_t*>(data),
41-
length / 2,
42-
&error);
4333
} else {
4434
ret = StringBytes::Encode(
4535
isolate,

test/parallel/test-string-decoder.js

+8
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,14 @@ decoder = new StringDecoder('utf16le');
143143
assert.strictEqual(decoder.write(Buffer.from('3DD84D', 'hex')), '\ud83d');
144144
assert.strictEqual(decoder.end(), '');
145145

146+
// Regression test for https://github.com/nodejs/node/issues/22358
147+
// (unaligned UTF-16 access).
148+
decoder = new StringDecoder('utf16le');
149+
assert.strictEqual(decoder.write(Buffer.alloc(1)), '');
150+
assert.strictEqual(decoder.write(Buffer.alloc(20)), '\0'.repeat(10));
151+
assert.strictEqual(decoder.write(Buffer.alloc(48)), '\0'.repeat(24));
152+
assert.strictEqual(decoder.end(), '');
153+
146154
common.expectsError(
147155
() => new StringDecoder(1),
148156
{

0 commit comments

Comments
 (0)