Skip to content

Commit b6bc906

Browse files
Remove separate encoding for a single nonzero-mapping byte
In practice, for the two data sets that still use the bitset encoding (uppercase and lowercase), the separate encoding is not a significant win, so just drop it entirely. Dropping it costs us about 5 bytes, but the complexity it removes is nontrivial.
1 parent 9c1ceec commit b6bc906

File tree

3 files changed

+9
-46
lines changed

3 files changed

+9
-46
lines changed

src/libcore/unicode/unicode_data.rs

+7-15
Original file line numberDiff line numberDiff line change
@@ -10,20 +10,15 @@ fn bitset_search<
1010
>(
1111
needle: u32,
1212
chunk_idx_map: &[u8; N],
13-
last_chunk_idx: u16,
1413
bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1],
1514
bitset_canonical: &[u64; CANONICAL],
1615
bitset_canonicalized: &[(u8, u8); CANONICALIZED],
1716
) -> bool {
1817
let bucket_idx = (needle / 64) as usize;
1918
let chunk_map_idx = bucket_idx / CHUNK_SIZE;
2019
let chunk_piece = bucket_idx % CHUNK_SIZE;
21-
// The last entry of `chunk_idx_map` actually should be at `last_chunk_idx`,
22-
// so we need to remap it
23-
let chunk_idx = if chunk_map_idx < (chunk_idx_map.len() - 1) {
24-
chunk_idx_map[chunk_map_idx]
25-
} else if chunk_map_idx == last_chunk_idx as usize {
26-
chunk_idx_map[chunk_idx_map.len() - 1]
20+
let chunk_idx = if let Some(&v) = chunk_idx_map.get(chunk_map_idx) {
21+
v
2722
} else {
2823
return false;
2924
};
@@ -317,12 +312,12 @@ pub mod grapheme_extend {
317312

318313
#[rustfmt::skip]
319314
pub mod lowercase {
320-
const BITSET_LAST_CHUNK_MAP: u16 = 122;
321-
static BITSET_CHUNKS_MAP: [u8; 119] = [
315+
static BITSET_CHUNKS_MAP: [u8; 123] = [
322316
13, 16, 0, 0, 8, 0, 0, 11, 12, 9, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
323317
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
324318
0, 0, 3, 1, 0, 14, 0, 7, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
325-
0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 6,
319+
0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0, 0,
320+
0, 0, 6,
326321
];
327322
static BITSET_INDEX_CHUNKS: [[u8; 16]; 18] = [
328323
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
@@ -408,7 +403,6 @@ pub mod lowercase {
408403
super::bitset_search(
409404
c as u32,
410405
&BITSET_CHUNKS_MAP,
411-
BITSET_LAST_CHUNK_MAP,
412406
&BITSET_INDEX_CHUNKS,
413407
&BITSET_CANONICAL,
414408
&BITSET_MAPPING,
@@ -449,13 +443,12 @@ pub mod n {
449443

450444
#[rustfmt::skip]
451445
pub mod uppercase {
452-
const BITSET_LAST_CHUNK_MAP: u16 = 124;
453-
static BITSET_CHUNKS_MAP: [u8; 124] = [
446+
static BITSET_CHUNKS_MAP: [u8; 125] = [
454447
12, 15, 5, 5, 0, 5, 5, 2, 4, 11, 5, 14, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
455448
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 8, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
456449
5, 5, 5, 6, 5, 13, 5, 10, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
457450
5, 7, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 16, 5, 5,
458-
5, 5, 9, 3,
451+
5, 5, 9, 5, 3,
459452
];
460453
static BITSET_INDEX_CHUNKS: [[u8; 16]; 17] = [
461454
[41, 41, 5, 33, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 5, 0],
@@ -529,7 +522,6 @@ pub mod uppercase {
529522
super::bitset_search(
530523
c as u32,
531524
&BITSET_CHUNKS_MAP,
532-
BITSET_LAST_CHUNK_MAP,
533525
&BITSET_INDEX_CHUNKS,
534526
&BITSET_CANONICAL,
535527
&BITSET_MAPPING,

src/tools/unicode-table-generator/src/range_search.rs

+2-7
Original file line numberDiff line numberDiff line change
@@ -8,20 +8,15 @@ fn bitset_search<
88
>(
99
needle: u32,
1010
chunk_idx_map: &[u8; N],
11-
last_chunk_idx: u16,
1211
bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1],
1312
bitset_canonical: &[u64; CANONICAL],
1413
bitset_canonicalized: &[(u8, u8); CANONICALIZED],
1514
) -> bool {
1615
let bucket_idx = (needle / 64) as usize;
1716
let chunk_map_idx = bucket_idx / CHUNK_SIZE;
1817
let chunk_piece = bucket_idx % CHUNK_SIZE;
19-
// The last entry of `chunk_idx_map` actually should be at `last_chunk_idx`,
20-
// so we need to remap it
21-
let chunk_idx = if chunk_map_idx < (chunk_idx_map.len() - 1) {
22-
chunk_idx_map[chunk_map_idx]
23-
} else if chunk_map_idx == last_chunk_idx as usize {
24-
chunk_idx_map[chunk_idx_map.len() - 1]
18+
let chunk_idx = if let Some(&v) = chunk_idx_map.get(chunk_map_idx) {
19+
v
2520
} else {
2621
return false;
2722
};

src/tools/unicode-table-generator/src/raw_emitter.rs

-24
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,6 @@ impl RawEmitter {
139139
writeln!(&mut self.file, " super::bitset_search(",).unwrap();
140140
writeln!(&mut self.file, " c as u32,").unwrap();
141141
writeln!(&mut self.file, " &BITSET_CHUNKS_MAP,").unwrap();
142-
writeln!(&mut self.file, " BITSET_LAST_CHUNK_MAP,").unwrap();
143142
writeln!(&mut self.file, " &BITSET_INDEX_CHUNKS,").unwrap();
144143
writeln!(&mut self.file, " &BITSET_CANONICAL,").unwrap();
145144
writeln!(&mut self.file, " &BITSET_MAPPING,").unwrap();
@@ -170,29 +169,6 @@ impl RawEmitter {
170169
chunk_indices.push(chunk_map[chunk]);
171170
}
172171

173-
// If one of the chunks has all of the entries point to the bitset
174-
// word filled with zeros, then pop those off the end -- we know they
175-
// are useless.
176-
let zero_chunk_idx = chunks.iter().position(|chunk| chunk.iter().all(|e| *e == zero_at));
177-
while zero_chunk_idx.is_some() && chunk_indices.last().cloned() == zero_chunk_idx {
178-
chunk_indices.pop();
179-
}
180-
// We do not count the LAST_CHUNK_MAP as adding bytes because it's a
181-
// small constant whose values are inlined directly into the instruction
182-
// stream.
183-
writeln!(
184-
&mut self.file,
185-
"const BITSET_LAST_CHUNK_MAP: u16 = {};",
186-
chunk_indices.len() - 1,
187-
)
188-
.unwrap();
189-
let nonzero = chunk_indices.pop().unwrap();
190-
// Try to pop again, now that we've recorded a non-zero pointing index
191-
// into the LAST_CHUNK_MAP.
192-
while zero_chunk_idx.is_some() && chunk_indices.last().cloned() == zero_chunk_idx {
193-
chunk_indices.pop();
194-
}
195-
chunk_indices.push(nonzero);
196172
writeln!(
197173
&mut self.file,
198174
"static BITSET_CHUNKS_MAP: [u8; {}] = [{}];",

0 commit comments

Comments
 (0)