11
11
#include < string>
12
12
#include < string_view>
13
13
#include < vector>
14
+ #include " embedded_data.h"
14
15
#include " executable_wrapper.h"
15
16
#include " simdutf.h"
16
17
#include " uv.h"
@@ -396,11 +397,14 @@ const std::string& GetCode(uint16_t index) {
396
397
397
398
#ifdef NODE_JS2C_USE_STRING_LITERALS
398
399
const char * string_literal_def_template = " static const %s *%s_raw = " ;
400
+ constexpr std::string_view latin1_string_literal_start =
401
+ " reinterpret_cast<const uint8_t*>(\" " ;
399
402
constexpr std::string_view ascii_string_literal_start =
400
403
" reinterpret_cast<const uint8_t*>(R\" JS2C1b732aee(" ;
401
404
constexpr std::string_view utf16_string_literal_start =
402
405
" reinterpret_cast<const uint16_t*>(uR\" JS2C1b732aee(" ;
403
- constexpr std::string_view string_literal_end = " )JS2C1b732aee\" );" ;
406
+ constexpr std::string_view latin1_string_literal_end = " \" );" ;
407
+ constexpr std::string_view utf_string_literal_end = " )JS2C1b732aee\" );" ;
404
408
#else
405
409
const char * array_literal_def_template = " static const %s %s_raw[] = " ;
406
410
constexpr std::string_view array_literal_start = " {\n " ;
@@ -424,9 +428,15 @@ constexpr std::string_view array_literal_end = "\n};\n\n";
424
428
// If NODE_JS2C_USE_STRING_LITERALS is defined, the data is output as C++
425
429
// raw strings (i.e. R"JS2C1b732aee(...)JS2C1b732aee") rather than as an
426
430
// array. This speeds up compilation for gcc/clang.
431
// Classifies the contents of a source file by the widest code point it
// contains, which decides how the generated definition is emitted.
enum class CodeType {
  // Code points are all within 0-127.
  kAscii,
  // Code points are all within 0-255.
  kLatin1,
  // Everything else; GetDefinition() routes such content to the uint16_t
  // instantiation of GetDefinitionImpl().
  kTwoByte,
};
427
436
template <typename T>
428
437
Fragment GetDefinitionImpl (const std::vector<char >& code,
429
- const std::string& var) {
438
+ const std::string& var,
439
+ CodeType type) {
430
440
constexpr bool is_two_byte = std::is_same_v<T, uint16_t >;
431
441
static_assert (is_two_byte || std::is_same_v<T, char >);
432
442
@@ -440,11 +450,14 @@ Fragment GetDefinitionImpl(const std::vector<char>& code,
440
450
441
451
#ifdef NODE_JS2C_USE_STRING_LITERALS
442
452
const char * literal_def_template = string_literal_def_template;
443
- size_t def_size = 512 + code.size ();
453
+ // For code that contains Latin-1 characters, be conservative and assume
454
+ // they all need escaping: one "\" and three digits.
455
+ size_t unit = type == CodeType::kLatin1 ? 4 : 1 ;
456
+ size_t def_size = 512 + code.size () * unit;
444
457
#else
445
458
const char * literal_def_template = array_literal_def_template;
446
459
constexpr size_t unit =
447
- (is_two_byte ? 5 : 3 ) + 1 ; // 0-65536 or 0-127 and a ","
460
+ (is_two_byte ? 5 : 3 ) + 1 ; // 0-65536 or 0-255 and a ","
448
461
size_t def_size = 512 + count * unit;
449
462
#endif
450
463
@@ -456,16 +469,56 @@ Fragment GetDefinitionImpl(const std::vector<char>& code,
456
469
assert (cur != 0 );
457
470
458
471
#ifdef NODE_JS2C_USE_STRING_LITERALS
459
- constexpr std::string_view start_string_view =
460
- is_two_byte ? utf16_string_literal_start : ascii_string_literal_start;
472
+ std::string_view start_string_view;
473
+ switch (type) {
474
+ case CodeType::kAscii :
475
+ start_string_view = ascii_string_literal_start;
476
+ break ;
477
+ case CodeType::kLatin1 :
478
+ start_string_view = latin1_string_literal_start;
479
+ break ;
480
+ case CodeType::kTwoByte :
481
+ start_string_view = utf16_string_literal_start;
482
+ break ;
483
+ }
461
484
462
485
memcpy (
463
486
result.data () + cur, start_string_view.data (), start_string_view.size ());
464
487
cur += start_string_view.size ();
465
488
466
- memcpy (result.data () + cur, code.data (), code.size ());
467
- cur += code.size ();
489
+ if (type != CodeType::kLatin1 ) {
490
+ memcpy (result.data () + cur, code.data (), code.size ());
491
+ cur += code.size ();
492
+ } else {
493
+ const uint8_t * ptr = reinterpret_cast <const uint8_t *>(code.data ());
494
+ for (size_t i = 0 ; i < count; ++i) {
495
+ // Avoid using snprintf on large chunks of data because it's much slower.
496
+ // It's fine to use it on small amount of data though.
497
+ uint8_t ch = ptr[i];
498
+ if (ch > 127 ) {
499
+ Debug (" In %s, found non-ASCII Latin-1 character at %zu: %d\n " ,
500
+ var.c_str (),
501
+ i,
502
+ ch);
503
+ }
504
+ const std::string& str = GetOctalCode (ch);
505
+ memcpy (result.data () + cur, str.c_str (), str.size ());
506
+ cur += str.size ();
507
+ }
508
+ }
468
509
510
+ std::string_view string_literal_end;
511
+ switch (type) {
512
+ case CodeType::kAscii :
513
+ string_literal_end = utf_string_literal_end;
514
+ break ;
515
+ case CodeType::kLatin1 :
516
+ string_literal_end = latin1_string_literal_end;
517
+ break ;
518
+ case CodeType::kTwoByte :
519
+ string_literal_end = utf_string_literal_end;
520
+ break ;
521
+ }
469
522
memcpy (result.data () + cur,
470
523
string_literal_end.data (),
471
524
string_literal_end.size ());
@@ -476,10 +529,10 @@ Fragment GetDefinitionImpl(const std::vector<char>& code,
476
529
array_literal_start.size ());
477
530
cur += array_literal_start.size ();
478
531
479
- const std::vector<T>* codepoints;
480
-
481
- std::vector<uint16_t > utf16_codepoints;
532
+ // Avoid using snprintf on large chunks of data because it's much slower.
533
+ // It's fine to use it on small amount of data though.
482
534
if constexpr (is_two_byte) {
535
+ std::vector<uint16_t > utf16_codepoints;
483
536
utf16_codepoints.resize (count);
484
537
size_t utf16_count = simdutf::convert_utf8_to_utf16 (
485
538
code.data (),
@@ -488,19 +541,25 @@ Fragment GetDefinitionImpl(const std::vector<char>& code,
488
541
assert (utf16_count != 0 );
489
542
utf16_codepoints.resize (utf16_count);
490
543
Debug (" static size %zu\n " , utf16_count);
491
- codepoints = &utf16_codepoints;
544
+ for (size_t i = 0 ; i < utf16_count; ++i) {
545
+ const std::string& str = GetCode (utf16_codepoints[i]);
546
+ memcpy (result.data () + cur, str.c_str (), str.size ());
547
+ cur += str.size ();
548
+ }
492
549
} else {
493
- // The code is ASCII, so no need to translate.
494
- codepoints = &code;
495
- }
496
-
497
- for (size_t i = 0 ; i < codepoints->size (); ++i) {
498
- // Avoid using snprintf on large chunks of data because it's much slower.
499
- // It's fine to use it on small amount of data though.
500
- const std::string& str = GetCode (static_cast <uint16_t >((*codepoints)[i]));
501
-
502
- memcpy (result.data () + cur, str.c_str (), str.size ());
503
- cur += str.size ();
550
+ const uint8_t * ptr = reinterpret_cast <const uint8_t *>(code.data ());
551
+ for (size_t i = 0 ; i < count; ++i) {
552
+ uint16_t ch = static_cast <uint16_t >(ptr[i]);
553
+ if (ch > 127 ) {
554
+ Debug (" In %s, found non-ASCII Latin-1 character at %zu: %d\n " ,
555
+ var.c_str (),
556
+ i,
557
+ ch);
558
+ }
559
+ const std::string& str = GetCode (ch);
560
+ memcpy (result.data () + cur, str.c_str (), str.size ());
561
+ cur += str.size ();
562
+ }
504
563
}
505
564
506
565
memcpy (
@@ -520,17 +579,80 @@ Fragment GetDefinitionImpl(const std::vector<char>& code,
520
579
return result;
521
580
}
522
581
523
- Fragment GetDefinition (const std::string& var, const std::vector<char >& code) {
524
- Debug (" GetDefinition %s, code size %zu " , var.c_str (), code.size ());
525
- bool is_one_byte = simdutf::validate_ascii (code.data (), code.size ());
526
- Debug (" with %s\n " , is_one_byte ? " 1-byte chars" : " 2-byte chars" );
582
+ bool Simplify (const std::vector<char >& code,
583
+ const std::string& var,
584
+ std::vector<char >* simplified) {
585
+ // Allowlist files to avoid false positives.
586
+ // TODO(joyeecheung): this could be removed if undici updates itself
587
+ // to replace "’" with "'" though we could still keep this skeleton in
588
+ // place for future hot fixes that are verified by humans.
589
+ if (var != " internal_deps_undici_undici" ) {
590
+ return false ;
591
+ }
527
592
528
- if (is_one_byte) {
529
- Debug (" static size %zu\n " , code.size ());
530
- return GetDefinitionImpl<char >(code, var);
531
- } else {
532
- return GetDefinitionImpl<uint16_t >(code, var);
593
+ size_t code_size = code.size ();
594
+ simplified->reserve (code_size);
595
+ const uint8_t * ptr = reinterpret_cast <const uint8_t *>(code.data ());
596
+ size_t simplified_count = 0 ;
597
+ for (size_t i = 0 ; i < code_size; ++i) {
598
+ switch (ptr[i]) {
599
+ case 226 : { // ’ [ 226, 128, 153 ] -> '
600
+ if (i + 2 < code_size && ptr[i + 1 ] == 128 && ptr[i + 2 ] == 153 ) {
601
+ simplified->push_back (' \' ' );
602
+ i += 2 ;
603
+ simplified_count++;
604
+ break ;
605
+ }
606
+ }
607
+ default : {
608
+ simplified->push_back (code[i]);
609
+ break ;
610
+ }
611
+ }
533
612
}
613
+
614
+ if (simplified_count > 0 ) {
615
+ Debug (" Simplified %d characters, " , simplified_count);
616
+ Debug (" old size %d, new size %d\n " , code_size, simplified->size ());
617
+ return true ;
618
+ }
619
+ return false ;
620
+ }
621
+
622
+ Fragment GetDefinition (const std::string& var, const std::vector<char >& code) {
623
+ Debug (" GetDefinition %s, code size %zu\n " , var.c_str (), code.size ());
624
+ bool is_ascii = simdutf::validate_ascii (code.data (), code.size ());
625
+
626
+ if (is_ascii) {
627
+ Debug (" ASCII-only, static size %zu\n " , code.size ());
628
+ return GetDefinitionImpl<char >(code, var, CodeType::kAscii );
629
+ }
630
+
631
+ std::vector<char > latin1 (code.size ());
632
+ auto result = simdutf::convert_utf8_to_latin1_with_errors (
633
+ code.data (), code.size (), latin1.data ());
634
+ if (!result.error ) {
635
+ latin1.resize (result.count );
636
+ Debug (" Latin-1-only, old size %zu, new size %zu\n " ,
637
+ code.size (),
638
+ latin1.size ());
639
+ return GetDefinitionImpl<char >(latin1, var, CodeType::kLatin1 );
640
+ }
641
+
642
+ // Since V8 only supports Latin-1 and UTF16 as underlying representation
643
+ // we have to encode all files containing two-byte characters as UTF16.
644
+ // While some files do need two-byte characters, some just
645
+ // unintentionally have them. Replace certain characters that are known
646
+ // to have sane one-byte equivalent to save space.
647
+ std::vector<char > simplified;
648
+ if (Simplify (code, var, &simplified)) { // Changed.
649
+ Debug (" %s is simplified, re-generate definition\n " , var.c_str ());
650
+ return GetDefinition (var, simplified);
651
+ }
652
+
653
+ // Simplification did not turn the code into 1-byte string. Just
654
+ // use the original.
655
+ return GetDefinitionImpl<uint16_t >(code, var, CodeType::kTwoByte );
534
656
}
535
657
536
658
int AddModule (const std::string& filename,
0 commit comments