Skip to content

Commit 56d065e

Browse files
perf(es/fast-lexer): replace PHF with static keyword lookup table (#10181)
**Description:** Replaces the perfect hash function (PHF) keyword lookup with a more performant static keyword lookup table. The new implementation: (1) removes the dependency on the `phf` crate; (2) adds a custom static keyword lookup mechanism; (3) introduces a more cache-friendly keyword search algorithm; and (4) adds support for the `module` keyword.
1 parent 11727a6 commit 56d065e

File tree

5 files changed

+198
-114
lines changed

5 files changed

+198
-114
lines changed

Cargo.lock

-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/swc_ecma_fast_parser/Cargo.toml

-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ swc_ecma_ast = { version = "8.0.0", path = "../swc_ecma_ast" }
2121
assume = { workspace = true }
2222
memchr = { workspace = true }
2323
num-bigint = { workspace = true }
24-
phf = { workspace = true, features = ["macros"] }
2524
wide = { workspace = true }
2625

2726
[dev-dependencies]

crates/swc_ecma_fast_parser/src/lexer/cursor.rs

+25-24
Original file line numberDiff line numberDiff line change
@@ -108,30 +108,31 @@ impl<'a> Cursor<'a> {
108108
where
109109
F: FnMut(u8) -> bool,
110110
{
111-
const BATCH_SIZE: u32 = 32;
112-
113-
// Process in batches if we have more than BATCH_SIZE bytes
114-
while self.pos + BATCH_SIZE <= self.len {
115-
let mut should_stop = false;
116-
117-
// Check all bytes in the batch
118-
for i in 0..BATCH_SIZE {
119-
// SAFETY: We've verified bounds above
120-
let byte = unsafe { *self.input.get_unchecked((self.pos + i) as usize) };
121-
if !predicate(byte) {
122-
should_stop = true;
123-
break;
124-
}
125-
}
126-
127-
if should_stop {
128-
// Found stopping byte, switch to byte-by-byte
129-
break;
130-
}
131-
132-
// Skip the entire batch
133-
self.pos += BATCH_SIZE;
134-
}
111+
// Warning: Do not scalarize if we do not use SIMD
112+
// const BATCH_SIZE: u32 = 32;
113+
114+
// // Process in batches if we have more than BATCH_SIZE bytes
115+
// while self.pos + BATCH_SIZE <= self.len {
116+
// let mut should_stop = false;
117+
118+
// // Check all bytes in the batch
119+
// for i in 0..BATCH_SIZE {
120+
// // SAFETY: We've verified bounds above
121+
// let byte = unsafe { *self.input.get_unchecked((self.pos + i) as
122+
// usize) }; if !predicate(byte) {
123+
// should_stop = true;
124+
// break;
125+
// }
126+
// }
127+
128+
// if should_stop {
129+
// // Found stopping byte, switch to byte-by-byte
130+
// break;
131+
// }
132+
133+
// // Skip the entire batch
134+
// self.pos += BATCH_SIZE;
135+
// }
135136

136137
// Byte-by-byte for the remainder
137138
while let Some(byte) = self.peek() {

crates/swc_ecma_fast_parser/src/lexer/identifier.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ impl Lexer<'_> {
101101
let ident_start = start_pos.0;
102102
let ident_end = self.cursor.position();
103103
let ident_bytes = unsafe { self.cursor.slice_unchecked(ident_start, ident_end) };
104+
// SAFETY: We've verified the bytes are valid UTF-8
104105
let ident_str = unsafe { std::str::from_utf8_unchecked(ident_bytes) };
105106
let had_line_break_bool: bool = self.had_line_break.into();
106107

@@ -110,7 +111,6 @@ impl Lexer<'_> {
110111
// Only process if first byte is an ASCII lowercase letter (all keywords start
111112
// with a-z)
112113
if len > 0 && ident_bytes[0] >= b'a' && ident_bytes[0] <= b'z' {
113-
// Fallback path: Check in the PHF map if this is a keyword
114114
// Only runs for potential keywords not in our direct lookup tables
115115
if let Some(token_type) = keyword_to_token_type(ident_str) {
116116
return Ok(Token::new(

crates/swc_ecma_fast_parser/src/token.rs

+172-87
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
use std::fmt;
88

99
use num_bigint::BigInt as BigIntValue;
10-
use phf::phf_map;
1110
use swc_atoms::Atom;
1211
use swc_common::Span;
1312

@@ -138,6 +137,7 @@ pub enum TokenType {
138137
While = 135,
139138
With = 136,
140139
Yield = 137,
140+
Module = 138,
141141

142142
// TypeScript-related keywords (starting from 150)
143143
Abstract = 150,
@@ -465,6 +465,7 @@ impl TokenType {
465465
TokenType::Shebang => "#!",
466466
TokenType::EOF => "EOF",
467467
TokenType::Invalid => "invalid token",
468+
TokenType::Module => "module",
468469
}
469470
}
470471
}
@@ -601,93 +602,177 @@ impl fmt::Debug for Token {
601602
}
602603
}
603604

604-
// Compile-time keyword to token type mapping using PHF
605-
static KEYWORDS: phf::Map<&'static str, TokenType> = phf_map! {
606-
// JavaScript keywords
607-
"await" => TokenType::Await,
608-
"break" => TokenType::Break,
609-
"case" => TokenType::Case,
610-
"catch" => TokenType::Catch,
611-
"class" => TokenType::Class,
612-
"const" => TokenType::Const,
613-
"continue" => TokenType::Continue,
614-
"debugger" => TokenType::Debugger,
615-
"default" => TokenType::Default,
616-
"delete" => TokenType::Delete,
617-
"do" => TokenType::Do,
618-
"else" => TokenType::Else,
619-
"export" => TokenType::Export,
620-
"extends" => TokenType::Extends,
621-
"false" => TokenType::False,
622-
"finally" => TokenType::Finally,
623-
"for" => TokenType::For,
624-
"function" => TokenType::Function,
625-
"if" => TokenType::If,
626-
"import" => TokenType::Import,
627-
"in" => TokenType::In,
628-
"instanceof" => TokenType::InstanceOf,
629-
"let" => TokenType::Let,
630-
"new" => TokenType::New,
631-
"null" => TokenType::Null,
632-
"return" => TokenType::Return,
633-
"super" => TokenType::Super,
634-
"switch" => TokenType::Switch,
635-
"this" => TokenType::This,
636-
"throw" => TokenType::Throw,
637-
"true" => TokenType::True,
638-
"try" => TokenType::Try,
639-
"typeof" => TokenType::TypeOf,
640-
"var" => TokenType::Var,
641-
"void" => TokenType::Void,
642-
"while" => TokenType::While,
643-
"with" => TokenType::With,
644-
"yield" => TokenType::Yield,
645-
646-
// TypeScript-related keywords
647-
"abstract" => TokenType::Abstract,
648-
"any" => TokenType::Any,
649-
"as" => TokenType::As,
650-
"asserts" => TokenType::Asserts,
651-
"assert" => TokenType::Assert,
652-
"async" => TokenType::Async,
653-
"bigint" => TokenType::Bigint,
654-
"boolean" => TokenType::Boolean,
655-
"constructor" => TokenType::Constructor,
656-
"declare" => TokenType::Declare,
657-
"enum" => TokenType::Enum,
658-
"from" => TokenType::From,
659-
"get" => TokenType::Get,
660-
"global" => TokenType::Global,
661-
"implements" => TokenType::Implements,
662-
"interface" => TokenType::Interface,
663-
"intrinsic" => TokenType::Intrinsic,
664-
"is" => TokenType::Is,
665-
"keyof" => TokenType::Keyof,
666-
"namespace" => TokenType::Namespace,
667-
"never" => TokenType::Never,
668-
"number" => TokenType::Number,
669-
"object" => TokenType::Object,
670-
"of" => TokenType::Of,
671-
"package" => TokenType::Package,
672-
"private" => TokenType::Private,
673-
"protected" => TokenType::Protected,
674-
"public" => TokenType::Public,
675-
"readonly" => TokenType::Readonly,
676-
"require" => TokenType::Require,
677-
"set" => TokenType::Set,
678-
"static" => TokenType::Static,
679-
"string" => TokenType::String,
680-
"symbol" => TokenType::Symbol,
681-
"type" => TokenType::Type,
682-
"undefined" => TokenType::Undefined,
683-
"unique" => TokenType::Unique,
684-
"unknown" => TokenType::Unknown,
685-
"using" => TokenType::Using,
605+
/// Pairs a keyword's source text (field `0`) with the `TokenType` it maps to
/// (field `1`).
struct KeywordEntry(&'static str, TokenType);
606+
607+
/// A static array of KeywordEntry tuples, each containing a keyword
608+
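/// string and its corresponding TokenType. `KEYWORD_TABLE` refers to entries
/// by `u8` index and reserves 255 as its empty-slot marker, so this array must
/// hold fewer than 255 entries.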
609+
static KEYWORD_LOOKUP: [KeywordEntry; 78] = [
610+
KeywordEntry("await", TokenType::Await),
611+
KeywordEntry("break", TokenType::Break),
612+
KeywordEntry("case", TokenType::Case),
613+
KeywordEntry("catch", TokenType::Catch),
614+
KeywordEntry("class", TokenType::Class),
615+
KeywordEntry("const", TokenType::Const),
616+
KeywordEntry("continue", TokenType::Continue),
617+
KeywordEntry("debugger", TokenType::Debugger),
618+
KeywordEntry("default", TokenType::Default),
619+
KeywordEntry("delete", TokenType::Delete),
620+
KeywordEntry("do", TokenType::Do),
621+
KeywordEntry("else", TokenType::Else),
622+
KeywordEntry("export", TokenType::Export),
623+
KeywordEntry("extends", TokenType::Extends),
624+
KeywordEntry("false", TokenType::False),
625+
KeywordEntry("finally", TokenType::Finally),
626+
KeywordEntry("for", TokenType::For),
627+
KeywordEntry("function", TokenType::Function),
628+
KeywordEntry("if", TokenType::If),
629+
KeywordEntry("import", TokenType::Import),
630+
KeywordEntry("in", TokenType::In),
631+
KeywordEntry("instanceof", TokenType::InstanceOf),
632+
KeywordEntry("let", TokenType::Let),
633+
KeywordEntry("new", TokenType::New),
634+
KeywordEntry("null", TokenType::Null),
635+
KeywordEntry("return", TokenType::Return),
636+
KeywordEntry("super", TokenType::Super),
637+
KeywordEntry("switch", TokenType::Switch),
638+
KeywordEntry("this", TokenType::This),
639+
KeywordEntry("throw", TokenType::Throw),
640+
KeywordEntry("true", TokenType::True),
641+
KeywordEntry("try", TokenType::Try),
642+
KeywordEntry("typeof", TokenType::TypeOf),
643+
KeywordEntry("var", TokenType::Var),
644+
KeywordEntry("void", TokenType::Void),
645+
KeywordEntry("while", TokenType::While),
646+
KeywordEntry("with", TokenType::With),
647+
KeywordEntry("yield", TokenType::Yield),
648+
KeywordEntry("module", TokenType::Module),
649+
KeywordEntry("abstract", TokenType::Abstract),
650+
KeywordEntry("any", TokenType::Any),
651+
KeywordEntry("as", TokenType::As),
652+
KeywordEntry("asserts", TokenType::Asserts),
653+
KeywordEntry("assert", TokenType::Assert),
654+
KeywordEntry("async", TokenType::Async),
655+
KeywordEntry("bigint", TokenType::Bigint),
656+
KeywordEntry("boolean", TokenType::Boolean),
657+
KeywordEntry("constructor", TokenType::Constructor),
658+
KeywordEntry("declare", TokenType::Declare),
659+
KeywordEntry("enum", TokenType::Enum),
660+
KeywordEntry("from", TokenType::From),
661+
KeywordEntry("get", TokenType::Get),
662+
KeywordEntry("global", TokenType::Global),
663+
KeywordEntry("implements", TokenType::Implements),
664+
KeywordEntry("interface", TokenType::Interface),
665+
KeywordEntry("intrinsic", TokenType::Intrinsic),
666+
KeywordEntry("is", TokenType::Is),
667+
KeywordEntry("keyof", TokenType::Keyof),
668+
KeywordEntry("namespace", TokenType::Namespace),
669+
KeywordEntry("never", TokenType::Never),
670+
KeywordEntry("number", TokenType::Number),
671+
KeywordEntry("object", TokenType::Object),
672+
KeywordEntry("of", TokenType::Of),
673+
KeywordEntry("package", TokenType::Package),
674+
KeywordEntry("private", TokenType::Private),
675+
KeywordEntry("protected", TokenType::Protected),
676+
KeywordEntry("public", TokenType::Public),
677+
KeywordEntry("readonly", TokenType::Readonly),
678+
KeywordEntry("require", TokenType::Require),
679+
KeywordEntry("set", TokenType::Set),
680+
KeywordEntry("static", TokenType::Static),
681+
KeywordEntry("string", TokenType::String),
682+
KeywordEntry("symbol", TokenType::Symbol),
683+
KeywordEntry("type", TokenType::Type),
684+
KeywordEntry("undefined", TokenType::Undefined),
685+
KeywordEntry("unique", TokenType::Unique),
686+
KeywordEntry("unknown", TokenType::Unknown),
687+
KeywordEntry("using", TokenType::Using),
688+
];
689+
690+
/// Size of the word-length dimension of `KEYWORD_TABLE` (a word of length
/// `len` is stored under index `len - 1`).
const MAX_KEYWORD_LEN: usize = 16;
691+
692+
/// Number of slots in each `KEYWORD_TABLE` bucket, i.e. how many keywords may
/// share the same length and first letter.
const MAX_KEYWORD_SLOT_LEN: usize = 4;
693+
694+
/// Static keyword table for fast keyword lookup
695+
static KEYWORD_TABLE: [[[u8; MAX_KEYWORD_SLOT_LEN]; 26]; MAX_KEYWORD_LEN] = {
696+
// Initialize the table with 255 (u8) at each position
697+
let mut table = [[[255u8; MAX_KEYWORD_SLOT_LEN]; 26]; MAX_KEYWORD_LEN];
698+
699+
// Iterate over the keyword lookup table
700+
let mut i = 0;
701+
while i < KEYWORD_LOOKUP.len() {
702+
let word = KEYWORD_LOOKUP[i].0;
703+
let len = word.len();
704+
705+
// Check if the length of the word is within the valid range
706+
if len > 0 && len <= 16 {
707+
let first_char = word.as_bytes()[0];
708+
let len_idx = len - 1;
709+
let char_idx = (first_char - b'a') as usize;
710+
711+
// Find an empty slot in the table for the current word
712+
let mut slot_idx = 0;
713+
while slot_idx < MAX_KEYWORD_SLOT_LEN && table[len_idx][char_idx][slot_idx] != 255 {
714+
slot_idx += 1;
715+
}
716+
717+
// If an empty slot is found, store the index of the keyword entry in the table
718+
if slot_idx < MAX_KEYWORD_SLOT_LEN {
719+
table[len_idx][char_idx][slot_idx] = i as u8;
720+
}
721+
}
722+
i += 1;
723+
}
724+
725+
// Return the initialized table
726+
table
686727
};
687728

729+
/// Attempts to find a keyword in the static keyword table and returns its
730+
/// corresponding TokenType.
731+
///
732+
/// This function takes a word as input and checks if it matches any of the
733+
/// keywords stored in the KEYWORD_TABLE. If a match is found, it returns the
734+
/// TokenType associated with the keyword. Otherwise, it returns None.
735+
fn find_keyword_from_table(word: &str) -> Option<TokenType> {
736+
// Determine the length of the word to check if it's within the valid range
737+
let len = word.len();
738+
if len > 0 && len <= 16 {
739+
// SAFETY: word len is within 1..=16 bounds
740+
let first_byte = *unsafe { word.as_bytes().get_unchecked(0) };
741+
let len_idx = len - 1;
742+
let byte_idx = (first_byte - b'a') as usize;
743+
744+
let mut slot_idx = 0;
745+
while slot_idx < MAX_KEYWORD_SLOT_LEN {
746+
// Retrieve the index of the keyword entry from the table
747+
let idx = *unsafe {
748+
KEYWORD_TABLE
749+
.get_unchecked(len_idx)
750+
.get_unchecked(byte_idx)
751+
.get_unchecked(slot_idx)
752+
};
753+
// If the index is 255, it means we've reached the end of the slot
754+
if idx == 255 {
755+
break;
756+
}
757+
758+
// SAFETY: idx is within bounds
759+
let entry = unsafe { KEYWORD_LOOKUP.get_unchecked(idx as usize) };
760+
761+
// Check if the word matches the keyword in the entry
762+
if entry.0 == word {
763+
return Some(entry.1);
764+
}
765+
766+
slot_idx += 1;
767+
}
768+
}
769+
770+
None
771+
}
772+
688773
/// Convert a keyword string to TokenType
689-
/// Uses a PHF map for O(1) time complexity with zero runtime overhead
690-
/// Optimized with fast-path checks for common keywords
774+
/// Utilizes the first byte and word length to quickly locate the keyword in the
775+
/// table. Optimized with fast-path checks for common keywords.
691776
#[inline(always)]
692777
pub fn keyword_to_token_type(word: &str) -> Option<TokenType> {
693778
// Fast path for the most common keywords
@@ -772,8 +857,8 @@ pub fn keyword_to_token_type(word: &str) -> Option<TokenType> {
772857
_ => {}
773858
}
774859

775-
// Fallback to the PHF map for less common keywords
776-
KEYWORDS.get(word).copied()
860+
// Fallback to KEYWORD_TABLE for less common keywords
861+
find_keyword_from_table(word)
777862
}
778863

779864
#[cfg(test)]

0 commit comments

Comments
 (0)