Skip to content

Commit dbce3b4

Browse files
committed
Auto merge of rust-lang#122013 - Swatinem:unicode-gen-fastpath, r=scottmcm
Add a lower bound check to `unicode-table-generator` output

This adds a dedicated check for the lower bound (if it is outside of the ASCII range) to the output of the `unicode-table-generator` tool. This generalizes the ASCII-only fast-path, but only for the `Grapheme_Extend` property for now, as that is the only one with a lower bound outside of ASCII.
2 parents 54692c3 + 488598c commit dbce3b4

File tree

4 files changed

+32
-4
lines changed

4 files changed

+32
-4
lines changed

library/core/src/char/methods.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -927,7 +927,7 @@ impl char {
927927
#[must_use]
928928
#[inline]
929929
pub(crate) fn is_grapheme_extended(self) -> bool {
930-
self > '\x7f' && unicode::Grapheme_Extend(self)
930+
unicode::Grapheme_Extend(self)
931931
}
932932

933933
/// Returns `true` if this `char` has one of the general categories for numbers.

library/core/src/unicode/unicode_data.rs

+4
Original file line numberDiff line numberDiff line change
@@ -315,7 +315,11 @@ pub mod grapheme_extend {
315315
15, 0, 7, 1, 17, 2, 7, 1, 2, 1, 5, 100, 1, 160, 7, 0, 1, 61, 4, 0, 4, 0, 7, 109, 7, 0, 96,
316316
128, 240, 0,
317317
];
318+
#[inline]
318319
pub fn lookup(c: char) -> bool {
320+
(c as u32) >= 0x300 && lookup_slow(c)
321+
}
322+
fn lookup_slow(c: char) -> bool {
319323
super::skip_search(
320324
c as u32,
321325
&SHORT_OFFSET_RUNS,

src/tools/unicode-table-generator/src/raw_emitter.rs

+5-1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ impl RawEmitter {
2323
}
2424

2525
fn emit_bitset(&mut self, ranges: &[Range<u32>]) -> Result<(), String> {
26+
let first_code_point = ranges.first().unwrap().start;
2627
let last_code_point = ranges.last().unwrap().end;
2728
// bitset for every bit in the codepoint range
2829
//
@@ -101,7 +102,10 @@ impl RawEmitter {
101102
)
102103
.unwrap();
103104
writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap();
104-
writeln!(&mut self.file, " super::bitset_search(",).unwrap();
105+
if first_code_point > 0x7f {
106+
writeln!(&mut self.file, " (c as u32) >= {first_code_point:#04x} &&").unwrap();
107+
}
108+
writeln!(&mut self.file, " super::bitset_search(").unwrap();
105109
writeln!(&mut self.file, " c as u32,").unwrap();
106110
writeln!(&mut self.file, " &BITSET_CHUNKS_MAP,").unwrap();
107111
writeln!(&mut self.file, " &BITSET_INDEX_CHUNKS,").unwrap();

src/tools/unicode-table-generator/src/skiplist.rs

+22-2
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,9 @@ impl ShortOffsetRunHeader {
2525

2626
impl RawEmitter {
2727
pub fn emit_skiplist(&mut self, ranges: &[Range<u32>]) {
28+
let first_code_point = ranges.first().unwrap().start;
2829
let mut offsets = Vec::<u32>::new();
29-
let points = ranges.iter().flat_map(|r| vec![r.start, r.end]).collect::<Vec<u32>>();
30+
let points = ranges.iter().flat_map(|r| [r.start, r.end]).collect::<Vec<u32>>();
3031
let mut offset = 0;
3132
for pt in points {
3233
let delta = pt - offset;
@@ -86,7 +87,26 @@ impl RawEmitter {
8687
.unwrap();
8788
self.bytes_used += coded_offsets.len();
8889

89-
writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
90+
// The inlining in this code works like the following:
91+
//
92+
// The `skip_search` function is always inlined into the parent `lookup` fn,
93+
// thus the compiler can generate optimal code based on the referenced `static`s.
94+
//
95+
// In the case of ASCII optimization, the lower-bounds check is inlined into
96+
// the caller, and slower-path `skip_search` is outlined into a separate `lookup_slow` fn.
97+
//
98+
// Thus, in both cases, the `skip_search` function is specialized for the `static`s,
99+
// and outlined into the prebuilt `std`.
100+
if first_code_point > 0x7f {
101+
writeln!(&mut self.file, "#[inline]").unwrap();
102+
writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
103+
writeln!(&mut self.file, " (c as u32) >= {first_code_point:#04x} && lookup_slow(c)")
104+
.unwrap();
105+
writeln!(&mut self.file, "}}").unwrap();
106+
writeln!(&mut self.file, "fn lookup_slow(c: char) -> bool {{").unwrap();
107+
} else {
108+
writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
109+
}
90110
writeln!(&mut self.file, " super::skip_search(",).unwrap();
91111
writeln!(&mut self.file, " c as u32,").unwrap();
92112
writeln!(&mut self.file, " &SHORT_OFFSET_RUNS,").unwrap();

0 commit comments

Comments
 (0)