Skip to content

Commit 21fb98e

Browse files
authored
src: use simdutf for converting externalized builtins to UTF-16
Remove the dependency on ICU for this part, as well as the hacky way of converting embedder main sources to UTF-8 via V8 APIs. Allow `UnionBytes` to own the memory its pointing to in order to simplify the code on the `BuiltinLoader` side. PR-URL: #46119 Reviewed-By: Yagiz Nizipli <[email protected]> Reviewed-By: James M Snell <[email protected]> Reviewed-By: Rafael Gonzaga <[email protected]> Reviewed-By: Darshan Sen <[email protected]>
1 parent 757c104 commit 21fb98e

10 files changed

+106
-105
lines changed

configure.py

+1-5
Original file line numberDiff line numberDiff line change
@@ -2024,11 +2024,7 @@ def make_bin_override():
20242024
for builtin in shareable_builtins:
20252025
builtin_id = 'node_shared_builtin_' + builtin.replace('/', '_') + '_path'
20262026
if getattr(options, builtin_id):
2027-
if options.with_intl == 'none':
2028-
option_name = '--shared-builtin-' + builtin + '-path'
2029-
error(option_name + ' is incompatible with --with-intl=none' )
2030-
else:
2031-
output['defines'] += [builtin_id.upper() + '=' + getattr(options, builtin_id)]
2027+
output['defines'] += [builtin_id.upper() + '=' + getattr(options, builtin_id)]
20322028
else:
20332029
output['variables']['node_builtin_shareable_builtins'] += [shareable_builtins[builtin]]
20342030

node.gyp

+1
Original file line numberDiff line numberDiff line change
@@ -973,6 +973,7 @@
973973
'deps/googletest/googletest.gyp:gtest_main',
974974
'deps/histogram/histogram.gyp:histogram',
975975
'deps/uvwasi/uvwasi.gyp:uvwasi',
976+
'deps/simdutf/simdutf.gyp:simdutf',
976977
],
977978

978979
'includes': [

src/api/environment.cc

+2-13
Original file line numberDiff line numberDiff line change
@@ -467,21 +467,10 @@ MaybeLocal<Value> LoadEnvironment(
467467
Environment* env,
468468
const char* main_script_source_utf8) {
469469
CHECK_NOT_NULL(main_script_source_utf8);
470-
Isolate* isolate = env->isolate();
471470
return LoadEnvironment(
472-
env,
473-
[&](const StartExecutionCallbackInfo& info) -> MaybeLocal<Value> {
474-
// This is a slightly hacky way to convert UTF-8 to UTF-16.
475-
Local<String> str =
476-
String::NewFromUtf8(isolate,
477-
main_script_source_utf8).ToLocalChecked();
478-
auto main_utf16 = std::make_unique<String::Value>(isolate, str);
479-
480-
// TODO(addaleax): Avoid having a global table for all scripts.
471+
env, [&](const StartExecutionCallbackInfo& info) -> MaybeLocal<Value> {
481472
std::string name = "embedder_main_" + std::to_string(env->thread_id());
482-
builtins::BuiltinLoader::Add(
483-
name.c_str(), UnionBytes(**main_utf16, main_utf16->length()));
484-
env->set_main_utf16(std::move(main_utf16));
473+
builtins::BuiltinLoader::Add(name.c_str(), main_script_source_utf8);
485474
Realm* realm = env->principal_realm();
486475

487476
return realm->ExecuteBootstrapper(name.c_str());

src/env-inl.h

-5
Original file line numberDiff line numberDiff line change
@@ -805,11 +805,6 @@ void Environment::RemoveCleanupHook(CleanupQueue::Callback fn, void* arg) {
805805
cleanup_queue_.Remove(fn, arg);
806806
}
807807

808-
void Environment::set_main_utf16(std::unique_ptr<v8::String::Value> str) {
809-
CHECK(!main_utf16_);
810-
main_utf16_ = std::move(str);
811-
}
812-
813808
void Environment::set_process_exit_handler(
814809
std::function<void(Environment*, ExitCode)>&& handler) {
815810
process_exit_handler_ = std::move(handler);

src/env.h

-6
Original file line numberDiff line numberDiff line change
@@ -961,7 +961,6 @@ class Environment : public MemoryRetainer {
961961

962962
#endif // HAVE_INSPECTOR
963963

964-
inline void set_main_utf16(std::unique_ptr<v8::String::Value>);
965964
inline void set_process_exit_handler(
966965
std::function<void(Environment*, ExitCode)>&& handler);
967966

@@ -1144,11 +1143,6 @@ class Environment : public MemoryRetainer {
11441143

11451144
std::unique_ptr<Realm> principal_realm_ = nullptr;
11461145

1147-
// Keeps the main script source alive is one was passed to LoadEnvironment().
1148-
// We should probably find a way to just use plain `v8::String`s created from
1149-
// the source passed to LoadEnvironment() directly instead.
1150-
std::unique_ptr<v8::String::Value> main_utf16_;
1151-
11521146
// Used by allocate_managed_buffer() and release_managed_buffer() to keep
11531147
// track of the BackingStore for a given pointer.
11541148
std::unordered_map<char*, std::unique_ptr<v8::BackingStore>>

src/node_builtins.cc

+14-12
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#include "env-inl.h"
44
#include "node_external_reference.h"
55
#include "node_internals.h"
6+
#include "simdutf.h"
67
#include "util-inl.h"
78

89
namespace node {
@@ -35,7 +36,6 @@ BuiltinLoader BuiltinLoader::instance_;
3536

3637
BuiltinLoader::BuiltinLoader() : config_(GetConfig()), has_code_cache_(false) {
3738
LoadJavaScriptSource();
38-
#if defined(NODE_HAVE_I18N_SUPPORT)
3939
#ifdef NODE_SHARED_BUILTIN_CJS_MODULE_LEXER_LEXER_PATH
4040
AddExternalizedBuiltin(
4141
"internal/deps/cjs-module-lexer/lexer",
@@ -52,7 +52,6 @@ BuiltinLoader::BuiltinLoader() : config_(GetConfig()), has_code_cache_(false) {
5252
AddExternalizedBuiltin("internal/deps/undici/undici",
5353
STRINGIFY(NODE_SHARED_BUILTIN_UNDICI_UNDICI_PATH));
5454
#endif // NODE_SHARED_BUILTIN_UNDICI_UNDICI_PATH
55-
#endif // NODE_HAVE_I18N_SUPPORT
5655
}
5756

5857
BuiltinLoader* BuiltinLoader::GetInstance() {
@@ -240,7 +239,6 @@ MaybeLocal<String> BuiltinLoader::LoadBuiltinSource(Isolate* isolate,
240239
#endif // NODE_BUILTIN_MODULES_PATH
241240
}
242241

243-
#if defined(NODE_HAVE_I18N_SUPPORT)
244242
void BuiltinLoader::AddExternalizedBuiltin(const char* id,
245243
const char* filename) {
246244
std::string source;
@@ -252,16 +250,20 @@ void BuiltinLoader::AddExternalizedBuiltin(const char* id,
252250
return;
253251
}
254252

255-
icu::UnicodeString utf16 = icu::UnicodeString::fromUTF8(
256-
icu::StringPiece(source.data(), source.length()));
257-
auto source_utf16 = std::make_unique<icu::UnicodeString>(utf16);
258-
Add(id,
259-
UnionBytes(reinterpret_cast<const uint16_t*>((*source_utf16).getBuffer()),
260-
utf16.length()));
261-
// keep source bytes for builtin alive while BuiltinLoader exists
262-
GetInstance()->externalized_source_bytes_.push_back(std::move(source_utf16));
253+
Add(id, source);
254+
}
255+
256+
bool BuiltinLoader::Add(const char* id, std::string_view utf8source) {
257+
size_t expected_u16_length =
258+
simdutf::utf16_length_from_utf8(utf8source.data(), utf8source.length());
259+
auto out = std::make_shared<std::vector<uint16_t>>(expected_u16_length);
260+
size_t u16_length = simdutf::convert_utf8_to_utf16le(
261+
utf8source.data(),
262+
utf8source.length(),
263+
reinterpret_cast<char16_t*>(out->data()));
264+
out->resize(u16_length);
265+
return Add(id, UnionBytes(out));
263266
}
264-
#endif // NODE_HAVE_I18N_SUPPORT
265267

266268
// Returns Local<Function> of the compiled module if return_code_cache
267269
// is false (we are only compiling the function).

src/node_builtins.h

+1-8
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,6 @@
88
#include <memory>
99
#include <set>
1010
#include <string>
11-
#if defined(NODE_HAVE_I18N_SUPPORT)
12-
#include <unicode/unistr.h>
13-
#endif // NODE_HAVE_I18N_SUPPORT
1411
#include <vector>
1512
#include "node_mutex.h"
1613
#include "node_union_bytes.h"
@@ -71,6 +68,7 @@ class NODE_EXTERN_PRIVATE BuiltinLoader {
7168
static v8::Local<v8::String> GetConfigString(v8::Isolate* isolate);
7269
static bool Exists(const char* id);
7370
static bool Add(const char* id, const UnionBytes& source);
71+
static bool Add(const char* id, std::string_view utf8source);
7472

7573
static bool CompileAllBuiltins(v8::Local<v8::Context> context);
7674
static void RefreshCodeCache(const std::vector<CodeCacheInfo>& in);
@@ -136,18 +134,13 @@ class NODE_EXTERN_PRIVATE BuiltinLoader {
136134
static void HasCachedBuiltins(
137135
const v8::FunctionCallbackInfo<v8::Value>& args);
138136

139-
#if defined(NODE_HAVE_I18N_SUPPORT)
140137
static void AddExternalizedBuiltin(const char* id, const char* filename);
141-
#endif // NODE_HAVE_I18N_SUPPORT
142138

143139
static BuiltinLoader instance_;
144140
BuiltinCategories builtin_categories_;
145141
BuiltinSourceMap source_;
146142
BuiltinCodeCacheMap code_cache_;
147143
UnionBytes config_;
148-
#if defined(NODE_HAVE_I18N_SUPPORT)
149-
std::list<std::unique_ptr<icu::UnicodeString>> externalized_source_bytes_;
150-
#endif // NODE_HAVE_I18N_SUPPORT
151144

152145
// Used to synchronize access to the code cache map
153146
Mutex code_cache_mutex_;

src/node_union_bytes.h

+9-55
Original file line numberDiff line numberDiff line change
@@ -11,57 +11,20 @@
1111

1212
namespace node {
1313

14-
class NonOwningExternalOneByteResource
15-
: public v8::String::ExternalOneByteStringResource {
16-
public:
17-
explicit NonOwningExternalOneByteResource(const uint8_t* data, size_t length)
18-
: data_(data), length_(length) {}
19-
~NonOwningExternalOneByteResource() override = default;
20-
21-
const char* data() const override {
22-
return reinterpret_cast<const char*>(data_);
23-
}
24-
size_t length() const override { return length_; }
25-
26-
NonOwningExternalOneByteResource(const NonOwningExternalOneByteResource&) =
27-
delete;
28-
NonOwningExternalOneByteResource& operator=(
29-
const NonOwningExternalOneByteResource&) = delete;
30-
31-
private:
32-
const uint8_t* data_;
33-
size_t length_;
34-
};
35-
36-
class NonOwningExternalTwoByteResource
37-
: public v8::String::ExternalStringResource {
38-
public:
39-
explicit NonOwningExternalTwoByteResource(const uint16_t* data, size_t length)
40-
: data_(data), length_(length) {}
41-
~NonOwningExternalTwoByteResource() override = default;
42-
43-
const uint16_t* data() const override { return data_; }
44-
size_t length() const override { return length_; }
45-
46-
NonOwningExternalTwoByteResource(const NonOwningExternalTwoByteResource&) =
47-
delete;
48-
NonOwningExternalTwoByteResource& operator=(
49-
const NonOwningExternalTwoByteResource&) = delete;
50-
51-
private:
52-
const uint16_t* data_;
53-
size_t length_;
54-
};
55-
5614
// Similar to a v8::String, but it's independent from Isolates
5715
// and can be materialized in Isolates as external Strings
58-
// via ToStringChecked. The data pointers are owned by the caller.
16+
// via ToStringChecked.
5917
class UnionBytes {
6018
public:
6119
UnionBytes(const uint16_t* data, size_t length)
6220
: one_bytes_(nullptr), two_bytes_(data), length_(length) {}
6321
UnionBytes(const uint8_t* data, size_t length)
6422
: one_bytes_(data), two_bytes_(nullptr), length_(length) {}
23+
template <typename T> // T = uint8_t or uint16_t
24+
explicit UnionBytes(std::shared_ptr<std::vector</*const*/ T>> data)
25+
: UnionBytes(data->data(), data->size()) {
26+
owning_ptr_ = data;
27+
}
6528

6629
UnionBytes(const UnionBytes&) = default;
6730
UnionBytes& operator=(const UnionBytes&) = default;
@@ -77,23 +40,14 @@ class UnionBytes {
7740
CHECK_NOT_NULL(one_bytes_);
7841
return one_bytes_;
7942
}
80-
v8::Local<v8::String> ToStringChecked(v8::Isolate* isolate) const {
81-
if (is_one_byte()) {
82-
NonOwningExternalOneByteResource* source =
83-
new NonOwningExternalOneByteResource(one_bytes_data(), length_);
84-
return v8::String::NewExternalOneByte(isolate, source).ToLocalChecked();
85-
} else {
86-
NonOwningExternalTwoByteResource* source =
87-
new NonOwningExternalTwoByteResource(two_bytes_data(), length_);
88-
return v8::String::NewExternalTwoByte(isolate, source).ToLocalChecked();
89-
}
90-
}
91-
size_t length() { return length_; }
43+
v8::Local<v8::String> ToStringChecked(v8::Isolate* isolate) const;
44+
size_t length() const { return length_; }
9245

9346
private:
9447
const uint8_t* one_bytes_;
9548
const uint16_t* two_bytes_;
9649
size_t length_;
50+
std::shared_ptr<void> owning_ptr_;
9751
};
9852

9953
} // namespace node

src/util.cc

+62
Original file line numberDiff line numberDiff line change
@@ -515,4 +515,66 @@ void SetConstructorFunction(Isolate* isolate,
515515
that->Set(name, tmpl);
516516
}
517517

518+
namespace {
519+
520+
class NonOwningExternalOneByteResource
521+
: public v8::String::ExternalOneByteStringResource {
522+
public:
523+
explicit NonOwningExternalOneByteResource(const UnionBytes& source)
524+
: source_(source) {}
525+
~NonOwningExternalOneByteResource() override = default;
526+
527+
const char* data() const override {
528+
return reinterpret_cast<const char*>(source_.one_bytes_data());
529+
}
530+
size_t length() const override { return source_.length(); }
531+
532+
NonOwningExternalOneByteResource(const NonOwningExternalOneByteResource&) =
533+
delete;
534+
NonOwningExternalOneByteResource& operator=(
535+
const NonOwningExternalOneByteResource&) = delete;
536+
537+
private:
538+
const UnionBytes source_;
539+
};
540+
541+
class NonOwningExternalTwoByteResource
542+
: public v8::String::ExternalStringResource {
543+
public:
544+
explicit NonOwningExternalTwoByteResource(const UnionBytes& source)
545+
: source_(source) {}
546+
~NonOwningExternalTwoByteResource() override = default;
547+
548+
const uint16_t* data() const override { return source_.two_bytes_data(); }
549+
size_t length() const override { return source_.length(); }
550+
551+
NonOwningExternalTwoByteResource(const NonOwningExternalTwoByteResource&) =
552+
delete;
553+
NonOwningExternalTwoByteResource& operator=(
554+
const NonOwningExternalTwoByteResource&) = delete;
555+
556+
private:
557+
const UnionBytes source_;
558+
};
559+
560+
} // anonymous namespace
561+
562+
Local<String> UnionBytes::ToStringChecked(Isolate* isolate) const {
563+
if (UNLIKELY(length() == 0)) {
564+
// V8 requires non-null data pointers for empty external strings,
565+
// but we don't guarantee that. Solve this by not creating an
566+
// external string at all in that case.
567+
return String::Empty(isolate);
568+
}
569+
if (is_one_byte()) {
570+
NonOwningExternalOneByteResource* source =
571+
new NonOwningExternalOneByteResource(*this);
572+
return String::NewExternalOneByte(isolate, source).ToLocalChecked();
573+
} else {
574+
NonOwningExternalTwoByteResource* source =
575+
new NonOwningExternalTwoByteResource(*this);
576+
return String::NewExternalTwoByte(isolate, source).ToLocalChecked();
577+
}
578+
}
579+
518580
} // namespace node

test/cctest/test_util.cc

+16-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1-
#include "util-inl.h"
21
#include "debug_utils-inl.h"
32
#include "env-inl.h"
43
#include "gtest/gtest.h"
4+
#include "simdutf.h"
5+
#include "util-inl.h"
56

67
using node::Calloc;
78
using node::Malloc;
@@ -298,3 +299,17 @@ TEST(UtilTest, SPrintF) {
298299
const std::string with_zero = std::string("a") + '\0' + 'b';
299300
EXPECT_EQ(SPrintF("%s", with_zero), with_zero);
300301
}
302+
303+
TEST(UtilTest, SimdutfEndiannessDoesNotMeanEndianness) {
304+
// In simdutf, "LE" does *not* refer to Little Endian, it refers
305+
// to 16-byte code units that are stored using *host* endianness.
306+
// This is weird and confusing naming, and so we add this assertion
307+
// here to verify that this is actually the case (so that CI tells
308+
// us if it changed, because for most people Little Endian is
309+
// host endianness, so locally everything would work fine).
310+
const char utf8source[] = "\xe7\x8c\xab";
311+
char16_t u16output;
312+
size_t u16len = simdutf::convert_utf8_to_utf16le(utf8source, 3, &u16output);
313+
EXPECT_EQ(u16len, 1u);
314+
EXPECT_EQ(u16output, 0x732B);
315+
}

0 commit comments

Comments
 (0)