From 903f67d599cf12d2d202d5177bb3edecb6cdab00 Mon Sep 17 00:00:00 2001 From: Mark Rousskov Date: Thu, 19 Mar 2020 10:55:58 -0400 Subject: [PATCH 01/14] Avoid re-fetching Unicode data If the unicode-downloads folder already exists, we likely just fetched the data, so don't make any further network requests. Unicode versions are released rarely enough that this doesn't matter much in practice. --- .../unicode-table-generator/src/unicode_download.rs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/tools/unicode-table-generator/src/unicode_download.rs b/src/tools/unicode-table-generator/src/unicode_download.rs index 3f6de9ea3bbd7..fa57f650ac082 100644 --- a/src/tools/unicode-table-generator/src/unicode_download.rs +++ b/src/tools/unicode-table-generator/src/unicode_download.rs @@ -11,10 +11,15 @@ static RESOURCES: &[&str] = pub fn fetch_latest() { let directory = Path::new(UNICODE_DIRECTORY); + if directory.exists() { + eprintln!( + "Not refetching unicode data, already exists, please delete {:?} to regenerate", + directory + ); + return; + } if let Err(e) = std::fs::create_dir_all(directory) { - if e.kind() != std::io::ErrorKind::AlreadyExists { - panic!("Failed to create {:?}: {}", UNICODE_DIRECTORY, e); - } + panic!("Failed to create {:?}: {}", UNICODE_DIRECTORY, e); } let output = Command::new("curl").arg(URL_PREFIX.to_owned() + README).output().unwrap(); if !output.status.success() { From 7c4baedb3a090f2dc9e653bd7c03165be84acad3 Mon Sep 17 00:00:00 2001 From: Mark Rousskov Date: Thu, 19 Mar 2020 11:38:41 -0400 Subject: [PATCH 02/14] Dynamically choose best chunk size Try chunk sizes between 1 and 64, selecting the one which minimizes the number of bytes used. 16, the previous constant, turned out to be a rather good choice, with 5/9 of the datasets still using it. 
Alphabetic : 3036 bytes (- 19 bytes) Case_Ignorable : 2136 bytes Cased : 934 bytes Cc : 32 bytes (- 11 bytes) Grapheme_Extend: 1774 bytes Lowercase : 985 bytes N : 1225 bytes (- 41 bytes) Uppercase : 934 bytes White_Space : 97 bytes (- 43 bytes) Total table sizes: 11153 bytes (-114 bytes) --- src/libcore/unicode/mod.rs | 8 +- src/libcore/unicode/unicode_data.rs | 206 ++++++++---------- .../src/raw_emitter.rs | 58 +++-- 3 files changed, 134 insertions(+), 138 deletions(-) diff --git a/src/libcore/unicode/mod.rs b/src/libcore/unicode/mod.rs index b6eaf06aa7f63..d1c68863e1632 100644 --- a/src/libcore/unicode/mod.rs +++ b/src/libcore/unicode/mod.rs @@ -34,16 +34,16 @@ pub use unicode_data::uppercase::lookup as Uppercase; pub use unicode_data::white_space::lookup as White_Space; #[inline(always)] -fn range_search( +fn range_search( needle: u32, chunk_idx_map: &[u8; N], (last_chunk_idx, last_chunk_mapping): (u16, u8), - bitset_chunk_idx: &[[u8; 16]; N1], + bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1], bitset: &[u64; N2], ) -> bool { let bucket_idx = (needle / 64) as usize; - let chunk_map_idx = bucket_idx / 16; - let chunk_piece = bucket_idx % 16; + let chunk_map_idx = bucket_idx / CHUNK_SIZE; + let chunk_piece = bucket_idx % CHUNK_SIZE; let chunk_idx = if chunk_map_idx >= N { if chunk_map_idx == last_chunk_idx as usize { last_chunk_mapping diff --git a/src/libcore/unicode/unicode_data.rs b/src/libcore/unicode/unicode_data.rs index 3e90028613ce1..a89f3481f4957 100644 --- a/src/libcore/unicode/unicode_data.rs +++ b/src/libcore/unicode/unicode_data.rs @@ -5,69 +5,63 @@ pub const UNICODE_VERSION: (u32, u32, u32) = (13, 0, 0); #[rustfmt::skip] pub mod alphabetic { - static BITSET_LAST_CHUNK_MAP: (u16, u8) = (196, 44); - static BITSET_CHUNKS_MAP: [u8; 196] = [ - 6, 32, 10, 18, 19, 23, 21, 12, 7, 5, 0, 20, 14, 50, 50, 50, 50, 50, 50, 37, 50, 50, 50, 50, - 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 49, 50, 30, 8, 50, 50, 50, 50, - 50, 50, 50, 50, 50, 50, 46, 0, 0, 
0, 0, 0, 0, 0, 0, 4, 36, 17, 31, 16, 25, 24, 26, 13, 15, - 45, 27, 0, 0, 50, 11, 0, 0, 0, 40, 0, 0, 0, 0, 0, 0, 0, 0, 39, 1, 50, 50, 50, 50, 50, 48, - 50, 34, 0, 0, 0, 0, 0, 0, 0, 0, 35, 0, 0, 28, 0, 0, 0, 0, 0, 29, 0, 0, 9, 0, 33, 2, 3, 0, 0, - 0, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, - 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 42, 50, 50, 50, - 43, 22, 50, 50, 50, 50, 41, 50, 50, 50, 50, 50, 50, 47, 0, 0, 0, 38, 0, 50, 50, 50, 50, + static BITSET_LAST_CHUNK_MAP: (u16, u8) = (393, 67); + static BITSET_CHUNKS_MAP: [u8; 393] = [ + 8, 60, 56, 38, 16, 33, 34, 24, 35, 50, 41, 49, 37, 39, 20, 66, 9, 0, 6, 0, 0, 0, 36, 18, + 26, 0, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 70, 74, 74, 74, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 74, 74, 74, 74, 73, 74, 74, 52, 15, 13, 21, 74, 74, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 69, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 2, 48, 65, 10, 32, 7, 53, 64, 31, 19, 44, 5, 42, 27, 45, 30, 22, 29, 28, 4, + 74, 68, 46, 0, 0, 0, 0, 0, 74, 74, 17, 0, 0, 0, 0, 0, 0, 0, 74, 43, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 74, 23, 0, 11, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 72, 74, + 74, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 62, 61, 0, 0, 0, 0, 47, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 51, 55, 0, 0, 0, 0, 14, 3, 0, 0, 57, 0, 0, 25, 1, 0, 0, 0, 0, 0, 0, + 0, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 59, 74, 74, 74, 74, 74, 74, 74, + 63, 40, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 54, 74, 74, 74, 74, 
74, 74, 74, 74, 74, 74, + 74, 74, 74, 71, 0, 0, 0, 0, 0, 0, 74, 12, 0, 0, 74, 74, 74, 74, 74, 74, 74, 74, 74, ]; - static BITSET_INDEX_CHUNKS: [[u8; 16]; 51] = [ - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 254, 0, 0, 254, 247, 39, 68], - [0, 0, 0, 0, 0, 0, 0, 0, 111, 135, 113, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 195, 205, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 254, 254, 254, 254, 254, 210, 254, 25, 136, 251, 71, 243], - [0, 0, 182, 52, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 107, 103, 180, 254, 254, 254, 254, 254, 254, 254, 61, 0, 155, 222, 181], - [0, 148, 30, 0, 172, 226, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [48, 80, 254, 169, 206, 123, 189, 139, 95, 179, 145, 86, 211, 204, 254, 56], - [53, 0, 0, 0, 129, 17, 0, 0, 0, 0, 0, 58, 0, 0, 0, 0], - [59, 54, 185, 203, 171, 191, 161, 117, 158, 87, 164, 118, 162, 67, 159, 23], - [62, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [95, 131, 168, 105, 254, 254, 254, 82, 254, 254, 254, 254, 236, 130, 137, 120], - [101, 0, 225, 146, 151, 2, 217, 45, 144, 246, 32, 101, 0, 0, 0, 0], - [119, 253, 224, 175, 193, 254, 227, 195, 0, 0, 0, 0, 0, 0, 0, 0], - [143, 190, 91, 0, 153, 218, 24, 0, 0, 0, 0, 92, 0, 0, 66, 0], - [150, 94, 37, 85, 102, 0, 157, 0, 88, 122, 31, 46, 89, 74, 20, 0], - [154, 34, 254, 110, 0, 84, 0, 0, 0, 0, 233, 19, 216, 108, 237, 21], - [166, 42, 165, 72, 167, 177, 126, 76, 109, 16, 127, 38, 1, 192, 124, 0], - [176, 246, 234, 174, 254, 254, 254, 254, 254, 235, 140, 241, 240, 26, 228, 128], - [213, 239, 254, 77, 209, 64, 142, 238, 63, 0, 0, 0, 0, 0, 0, 0], - [225, 101, 207, 89, 98, 81, 208, 10, 232, 83, 147, 1, 188, 13, 178, 70], - [237, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254], - [253, 254, 254, 254, 254, 254, 254, 254, 254, 214, 231, 99, 79, 78, 183, 27], - [254, 6, 100, 50, 75, 90, 254, 28, 134, 0, 202, 51, 163, 43, 0, 0], - [254, 9, 75, 75, 49, 0, 0, 0, 0, 0, 69, 0, 199, 6, 195, 93], - [254, 41, 254, 8, 0, 0, 141, 33, 145, 4, 97, 0, 55, 
0, 0, 0], - [254, 62, 254, 254, 254, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [254, 121, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [254, 242, 170, 252, 138, 245, 254, 254, 254, 254, 220, 173, 186, 212, 219, 14], - [254, 254, 15, 132, 254, 254, 254, 254, 57, 149, 254, 65, 223, 254, 249, 187], - [254, 254, 196, 114, 201, 44, 0, 0, 254, 254, 254, 254, 95, 47, 0, 0], - [254, 254, 250, 254, 194, 229, 156, 73, 230, 215, 254, 152, 246, 248, 71, 104], - [254, 254, 254, 5, 254, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [254, 254, 254, 22, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [254, 254, 254, 254, 37, 200, 254, 254, 254, 254, 254, 116, 0, 0, 0, 0], - [254, 254, 254, 254, 133, 246, 244, 112, 0, 184, 254, 125, 106, 221, 145, 29], - [254, 254, 254, 254, 254, 254, 254, 0, 254, 254, 254, 254, 254, 254, 254, 254], - [254, 254, 254, 254, 254, 254, 254, 254, 35, 0, 0, 0, 0, 0, 0, 0], - [254, 254, 254, 254, 254, 254, 254, 254, 101, 37, 0, 60, 65, 160, 18, 0], - [254, 254, 254, 254, 254, 254, 254, 254, 254, 7, 0, 0, 0, 0, 0, 0], - [254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 197, 254, 254, 254, 254, 254], - [254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 35, 254, 254, 254, 254], - [254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 84, 254, 254, 254], - [254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 11, 0, 0], - [254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 25, 0], - [254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 198, 115], - [254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 40], - [254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 96], - [254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 125], - [254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254], + static BITSET_INDEX_CHUNKS: [[u8; 8]; 75] = [ + [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 195, 205, 10, 0], [0, 0, 0, 0, 254, 254, 254, 254], + [0, 0, 
0, 58, 0, 0, 0, 0], [0, 0, 0, 92, 0, 0, 66, 0], [0, 0, 69, 0, 199, 6, 195, 93], + [0, 0, 182, 52, 0, 0, 0, 0], [0, 0, 233, 19, 216, 108, 237, 21], + [0, 107, 103, 180, 254, 254, 254, 254], [0, 148, 30, 0, 172, 226, 9, 0], + [0, 184, 254, 125, 106, 221, 145, 29], [0, 254, 0, 0, 254, 247, 39, 68], + [35, 0, 0, 0, 0, 0, 0, 0], [48, 80, 254, 169, 206, 123, 189, 139], + [53, 0, 0, 0, 129, 17, 0, 0], [57, 149, 254, 65, 223, 254, 249, 187], + [59, 54, 185, 203, 171, 191, 161, 117], [62, 0, 0, 0, 0, 0, 0, 0], + [63, 0, 0, 0, 0, 0, 0, 0], [88, 122, 31, 46, 89, 74, 20, 0], + [95, 131, 168, 105, 254, 254, 254, 82], [95, 179, 145, 86, 211, 204, 254, 56], + [101, 0, 225, 146, 151, 2, 217, 45], [101, 37, 0, 60, 65, 160, 18, 0], + [109, 16, 127, 38, 1, 192, 124, 0], [111, 135, 113, 0, 0, 0, 0, 0], + [119, 253, 224, 175, 193, 254, 227, 195], [134, 0, 202, 51, 163, 43, 0, 0], + [143, 190, 91, 0, 153, 218, 24, 0], [144, 246, 32, 101, 0, 0, 0, 0], + [145, 4, 97, 0, 55, 0, 0, 0], [150, 94, 37, 85, 102, 0, 157, 0], + [154, 34, 254, 110, 0, 84, 0, 0], [158, 87, 164, 118, 162, 67, 159, 23], + [166, 42, 165, 72, 167, 177, 126, 76], [176, 246, 234, 174, 254, 254, 254, 254], + [213, 239, 254, 77, 209, 64, 142, 238], [225, 101, 207, 89, 98, 81, 208, 10], + [230, 215, 254, 152, 246, 248, 71, 104], [232, 83, 147, 1, 188, 13, 178, 70], + [237, 254, 254, 254, 254, 254, 254, 254], [253, 254, 254, 254, 254, 254, 254, 254], + [254, 6, 100, 50, 75, 90, 254, 28], [254, 7, 0, 0, 0, 0, 0, 0], + [254, 9, 75, 75, 49, 0, 0, 0], [254, 41, 254, 8, 0, 0, 141, 33], + [254, 62, 254, 254, 254, 3, 0, 0], [254, 121, 36, 0, 0, 0, 0, 0], + [254, 210, 254, 25, 136, 251, 71, 243], [254, 214, 231, 99, 79, 78, 183, 27], + [254, 235, 140, 241, 240, 26, 228, 128], [254, 242, 170, 252, 138, 245, 254, 254], + [254, 254, 15, 132, 254, 254, 254, 254], [254, 254, 196, 114, 201, 44, 0, 0], + [254, 254, 197, 254, 254, 254, 254, 254], [254, 254, 220, 173, 186, 212, 219, 14], + [254, 254, 250, 254, 194, 229, 156, 73], [254, 254, 
254, 5, 254, 12, 0, 0], + [254, 254, 254, 22, 9, 0, 0, 0], [254, 254, 254, 35, 254, 254, 254, 254], + [254, 254, 254, 61, 0, 155, 222, 181], [254, 254, 254, 116, 0, 0, 0, 0], + [254, 254, 254, 254, 37, 200, 254, 254], [254, 254, 254, 254, 84, 254, 254, 254], + [254, 254, 254, 254, 95, 47, 0, 0], [254, 254, 254, 254, 133, 246, 244, 112], + [254, 254, 254, 254, 236, 130, 137, 120], [254, 254, 254, 254, 254, 11, 0, 0], + [254, 254, 254, 254, 254, 254, 25, 0], [254, 254, 254, 254, 254, 254, 198, 115], + [254, 254, 254, 254, 254, 254, 254, 0], [254, 254, 254, 254, 254, 254, 254, 40], + [254, 254, 254, 254, 254, 254, 254, 96], [254, 254, 254, 254, 254, 254, 254, 125], + [254, 254, 254, 254, 254, 254, 254, 254], ]; static BITSET: [u64; 255] = [ 0, 1, 7, 15, 17, 31, 63, 127, 179, 511, 1023, 2047, 2191, 4079, 4087, 8191, 8319, 16384, @@ -297,8 +291,8 @@ pub mod cc { static BITSET_LAST_CHUNK_MAP: (u16, u8) = (0, 0); static BITSET_CHUNKS_MAP: [u8; 0] = [ ]; - static BITSET_INDEX_CHUNKS: [[u8; 16]; 1] = [ - [1, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + static BITSET_INDEX_CHUNKS: [[u8; 5]; 1] = [ + [1, 2, 1, 0, 0], ]; static BITSET: [u64; 3] = [ 0, 4294967295, 9223372036854775808, @@ -460,49 +454,35 @@ pub mod lowercase { #[rustfmt::skip] pub mod n { - static BITSET_LAST_CHUNK_MAP: (u16, u8) = (127, 0); - static BITSET_CHUNKS_MAP: [u8; 127] = [ - 31, 8, 11, 25, 19, 4, 29, 21, 24, 28, 0, 16, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 3, 13, 18, 26, 17, 23, 20, 15, 22, 0, 33, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 7, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 5, 2, 0, 0, 10, 0, 14, 27, 12, 0, 1, + static BITSET_LAST_CHUNK_MAP: (u16, u8) = (254, 0); + static BITSET_CHUNKS_MAP: [u8; 254] = [ + 44, 0, 0, 29, 5, 31, 35, 26, 22, 6, 0, 12, 40, 20, 27, 0, 33, 0, 39, 7, 0, 0, 17, 0, 45, + 42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 41, 43, + 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 10, 16, 21, 0, 37, 34, 18, 36, 32, 15, 25, 24, 13, 0, + 30, 1, 0, 0, 46, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 28, 0, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 14, 0, 3, 0, 0, 0, 0, 4, 15, 0, 0, 11, 0, 38, 0, 8, 0, 0, 0, 0, 2, ]; - static BITSET_INDEX_CHUNKS: [[u8; 16]; 34] = [ - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 47], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 72], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 33, 0, 0, 0, 49], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 36, 0, 43, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 0, 0, 0, 22, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 0, 47, 0, 0, 0, 2], - [0, 0, 0, 0, 0, 0, 0, 0, 25, 0, 0, 31, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 47, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 31, 0, 45, 0, 31, 0, 31, 0, 41, 0, 34], - [0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 37, 44, 4, 0, 0, 0, 0, 52, 23, 3, 0, 13], - [0, 0, 0, 7, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 35, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 54, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 62, 47, 0, 0, 0, 0, 60, 0, 0, 24, 10, 0, 5], - [0, 0, 25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 2, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 53, 0, 0], - [0, 15, 0, 15, 0, 0, 0, 0, 0, 15, 0, 2, 51, 0, 0, 0], - [0, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 26, 0, 0, 0, 15, 25, 0, 0, 0, 0, 0, 0, 0, 0, 11], - [0, 32, 0, 
47, 65, 0, 0, 39, 0, 0, 0, 47, 0, 0, 0, 0], - [0, 46, 2, 0, 0, 71, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 59, 0, 31, 0, 42, 0, 31, 0, 15, 0, 15, 36, 0, 0, 0], - [0, 63, 30, 61, 18, 0, 55, 70, 0, 57, 20, 28, 0, 64, 29, 0], - [0, 66, 38, 0, 56, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 69, 19, 68, 0, 0, 0, 0, 0, 0, 0, 0, 0, 65, 9, 0], - [15, 0, 0, 0, 0, 8, 0, 17, 0, 0, 16, 0, 0, 15, 47, 0], - [40, 0, 0, 15, 2, 0, 0, 48, 0, 15, 0, 0, 0, 0, 0, 47], - [47, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [50, 0, 0, 0, 0, 0, 12, 0, 25, 21, 67, 0, 0, 0, 0, 0], - [73, 27, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + static BITSET_INDEX_CHUNKS: [[u8; 8]; 47] = [ + [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 11], [0, 0, 0, 0, 0, 0, 0, 47], + [0, 0, 0, 0, 0, 0, 0, 72], [0, 0, 0, 0, 0, 2, 0, 0], [0, 0, 0, 0, 0, 31, 0, 45], + [0, 0, 0, 0, 0, 53, 0, 0], [0, 0, 0, 0, 0, 65, 9, 0], [0, 0, 0, 0, 6, 0, 0, 0], + [0, 0, 0, 0, 15, 0, 0, 0], [0, 0, 0, 0, 37, 44, 4, 0], [0, 0, 0, 7, 0, 15, 0, 0], + [0, 0, 0, 33, 0, 0, 0, 49], [0, 0, 0, 35, 0, 15, 0, 0], [0, 0, 0, 36, 0, 43, 0, 0], + [0, 0, 0, 47, 0, 0, 0, 0], [0, 0, 0, 52, 23, 3, 0, 13], [0, 0, 0, 54, 0, 0, 0, 0], + [0, 0, 0, 62, 47, 0, 0, 0], [0, 0, 14, 0, 0, 0, 0, 0], [0, 0, 16, 0, 0, 15, 47, 0], + [0, 0, 25, 0, 0, 0, 0, 0], [0, 2, 15, 0, 0, 0, 0, 0], [0, 15, 0, 0, 0, 0, 0, 47], + [0, 15, 0, 2, 51, 0, 0, 0], [0, 15, 0, 15, 0, 0, 0, 0], [0, 15, 0, 15, 36, 0, 0, 0], + [0, 16, 0, 0, 0, 0, 0, 0], [0, 25, 0, 0, 0, 22, 0, 0], [0, 25, 0, 47, 0, 0, 0, 2], + [0, 26, 0, 0, 0, 15, 25, 0], [0, 31, 0, 31, 0, 41, 0, 34], [0, 32, 0, 47, 65, 0, 0, 39], + [0, 46, 2, 0, 0, 71, 1, 0], [0, 57, 20, 28, 0, 64, 29, 0], [0, 59, 0, 31, 0, 42, 0, 31], + [0, 60, 0, 0, 24, 10, 0, 5], [0, 63, 30, 61, 18, 0, 55, 70], [0, 66, 38, 0, 56, 0, 0, 0], + [0, 69, 19, 68, 0, 0, 0, 0], [15, 0, 0, 0, 0, 8, 0, 17], [25, 0, 0, 31, 0, 0, 0, 0], + [25, 21, 67, 0, 0, 0, 0, 0], [40, 0, 0, 15, 2, 0, 0, 48], [47, 0, 58, 0, 0, 0, 0, 0], + [50, 0, 0, 0, 0, 0, 12, 0], [73, 27, 0, 0, 0, 
0, 0, 0], ]; static BITSET: [u64; 74] = [ 0, 999, 1023, 1026, 3072, 4064, 8191, 65408, 65472, 1048575, 1966080, 2097151, 3932160, @@ -591,16 +571,12 @@ pub mod uppercase { #[rustfmt::skip] pub mod white_space { - static BITSET_LAST_CHUNK_MAP: (u16, u8) = (12, 2); - static BITSET_CHUNKS_MAP: [u8; 9] = [ - 3, 0, 0, 0, 0, 1, 0, 0, 4, + static BITSET_LAST_CHUNK_MAP: (u16, u8) = (32, 2); + static BITSET_CHUNKS_MAP: [u8; 22] = [ + 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, ]; - static BITSET_INDEX_CHUNKS: [[u8; 16]; 5] = [ - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], - [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [4, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + static BITSET_INDEX_CHUNKS: [[u8; 6]; 4] = [ + [0, 0, 0, 0, 0, 0], [0, 0, 5, 2, 0, 0], [1, 0, 0, 0, 0, 0], [4, 0, 3, 0, 0, 0], ]; static BITSET: [u64; 6] = [ 0, 1, 2147483648, 4294967328, 4294983168, 144036023240703, diff --git a/src/tools/unicode-table-generator/src/raw_emitter.rs b/src/tools/unicode-table-generator/src/raw_emitter.rs index 3e60ce13f9223..5d4a4c0e04498 100644 --- a/src/tools/unicode-table-generator/src/raw_emitter.rs +++ b/src/tools/unicode-table-generator/src/raw_emitter.rs @@ -42,6 +42,7 @@ use std::convert::TryFrom; use std::fmt::Write; use std::ops::Range; +#[derive(Clone)] pub struct RawEmitter { pub file: String, pub bytes_used: usize, @@ -65,6 +66,8 @@ impl RawEmitter { if unique_words.len() > u8::max_value() as usize { panic!("cannot pack {} into 8 bits", unique_words.len()); } + // needed for the chunk mapping to work + assert_eq!(unique_words[0], 0, "first word is all zeros"); let word_indices = unique_words .iter() @@ -72,17 +75,42 @@ impl RawEmitter { .enumerate() .map(|(idx, word)| (word, u8::try_from(idx).unwrap())) .collect::>(); + let compressed_words = words.iter().map(|w| word_indices[w]).collect::>(); + + let mut best = None; + for 
length in 1..=64 { + let mut temp = self.clone(); + temp.emit_chunk_map(&compressed_words, length); + if let Some((_, size)) = best { + if temp.bytes_used < size { + best = Some((length, temp.bytes_used)); + } + } else { + best = Some((length, temp.bytes_used)); + } + } + self.emit_chunk_map(&compressed_words, best.unwrap().0); + + writeln!( + &mut self.file, + "static BITSET: [u64; {}] = [{}];", + unique_words.len(), + fmt_list(&unique_words), + ) + .unwrap(); + self.bytes_used += 8 * unique_words.len(); + } - let mut idx = words.iter().map(|w| word_indices[w]).collect::>(); - let chunk_length = 16; - for _ in 0..(chunk_length - (idx.len() % chunk_length)) { - assert_eq!(unique_words[0], 0, "first word is all zeros"); - // pad out bitset index with zero words so we have all chunks of 16 - idx.push(0); + fn emit_chunk_map(&mut self, compressed_words: &[u8], chunk_length: usize) { + let mut compressed_words = compressed_words.to_vec(); + for _ in 0..(chunk_length - (compressed_words.len() % chunk_length)) { + // pad out bitset index with zero words so we have all chunks of + // chunk_length + compressed_words.push(0); } let mut chunks = BTreeSet::new(); - for chunk in idx.chunks(chunk_length) { + for chunk in compressed_words.chunks(chunk_length) { chunks.insert(chunk); } let chunk_map = chunks @@ -92,7 +120,7 @@ impl RawEmitter { .map(|(idx, chunk)| (chunk, idx)) .collect::>(); let mut chunk_indices = Vec::new(); - for chunk in idx.chunks(chunk_length) { + for chunk in compressed_words.chunks(chunk_length) { chunk_indices.push(chunk_map[chunk]); } writeln!( @@ -105,7 +133,6 @@ impl RawEmitter { self.bytes_used += 3; // Strip out the empty pieces, presuming our above pop() made us now // have some trailing zeros. 
- assert_eq!(unique_words[0], 0, "first word is all zeros"); while let Some(0) = chunk_indices.last() { chunk_indices.pop(); } @@ -119,20 +146,13 @@ impl RawEmitter { self.bytes_used += chunk_indices.len(); writeln!( &mut self.file, - "static BITSET_INDEX_CHUNKS: [[u8; 16]; {}] = [{}];", + "static BITSET_INDEX_CHUNKS: [[u8; {}]; {}] = [{}];", + chunk_length, chunks.len(), fmt_list(chunks.iter()), ) .unwrap(); - self.bytes_used += 16 * chunks.len(); - writeln!( - &mut self.file, - "static BITSET: [u64; {}] = [{}];", - unique_words.len(), - fmt_list(&unique_words), - ) - .unwrap(); - self.bytes_used += 8 * unique_words.len(); + self.bytes_used += chunk_length * chunks.len(); } pub fn emit_lookup(&mut self) { From 580a6342ef9d435d241b74e86b99dc1131a526f8 Mon Sep 17 00:00:00 2001 From: Mark Rousskov Date: Thu, 19 Mar 2020 13:15:32 -0400 Subject: [PATCH 03/14] Generate tests for Unicode property data Currently the test file takes a while to compile -- 30 seconds or so -- but since it's not going to be committed, and is just for local testing, that seems fine. --- src/tools/unicode-table-generator/src/main.rs | 101 ++++++++++++++++++ 1 file changed, 101 insertions(+) diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index 839d914baa954..39c288dfc61e8 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -152,9 +152,17 @@ fn main() { std::process::exit(1); }); + // Optional test path, which is a Rust source file testing that the unicode + // property lookups are correct. 
+ let test_path = std::env::args().nth(2); + let unicode_data = load_data(); let ranges_by_property = &unicode_data.ranges; + if let Some(path) = test_path { + std::fs::write(&path, generate_tests(&write_location, &ranges_by_property)).unwrap(); + } + let mut total_bytes = 0; let mut modules = Vec::new(); for (property, ranges) in ranges_by_property { @@ -236,6 +244,99 @@ fn fmt_list(values: impl IntoIterator) -> String { out } +fn generate_tests(data_path: &str, ranges: &[(&str, Vec>)]) -> String { + let mut s = String::new(); + s.push_str("#![allow(incomplete_features, unused)]\n"); + s.push_str("#![feature(const_generics)]\n\n"); + s.push_str(&format!("#[path = \"{}\"]\n", data_path)); + s.push_str("mod unicode_data;\n\n"); + + s.push_str( + " +#[inline(always)] +fn range_search( + needle: u32, + chunk_idx_map: &[u8; N], + (last_chunk_idx, last_chunk_mapping): (u16, u8), + bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1], + bitset: &[u64; N2], +) -> bool { + let bucket_idx = (needle / 64) as usize; + let chunk_map_idx = bucket_idx / CHUNK_SIZE; + let chunk_piece = bucket_idx % CHUNK_SIZE; + let chunk_idx = if chunk_map_idx >= N { + if chunk_map_idx == last_chunk_idx as usize { + last_chunk_mapping + } else { + return false; + } + } else { + chunk_idx_map[chunk_map_idx] + }; + let idx = bitset_chunk_idx[(chunk_idx as usize)][chunk_piece]; + let word = bitset[(idx as usize)]; + (word & (1 << (needle % 64) as u64)) != 0 +} + ", + ); + + s.push_str("\nfn main() {\n"); + + for (property, ranges) in ranges { + s.push_str(&format!(r#" println!("Testing {}");"#, property)); + s.push('\n'); + s.push_str(&format!(" {}();\n", property.to_lowercase())); + let mut is_true = Vec::new(); + let mut is_false = Vec::new(); + for ch_num in 0..(std::char::MAX as u32) { + if std::char::from_u32(ch_num).is_none() { + continue; + } + if ranges.iter().any(|r| r.contains(&ch_num)) { + is_true.push(ch_num); + } else { + is_false.push(ch_num); + } + } + + s.push_str(&format!(" fn {}() {{\n", 
property.to_lowercase())); + generate_asserts(&mut s, property, &is_true, true); + generate_asserts(&mut s, property, &is_false, false); + s.push_str(" }\n\n"); + } + + s.push_str("}"); + s +} + +fn generate_asserts(s: &mut String, property: &str, points: &[u32], truthy: bool) { + for range in ranges_from_set(points) { + if range.end == range.start + 1 { + s.push_str(&format!( + " assert!({}unicode_data::{}::lookup(std::char::from_u32({}).unwrap()), \"{}\");\n", + if truthy { "" } else { "!" }, + property.to_lowercase(), + range.start, + std::char::from_u32(range.start).unwrap(), + )); + } else { + s.push_str(&format!(" for chn in {:?}u32 {{\n", range)); + s.push_str(&format!( + " assert!({}unicode_data::{}::lookup(std::char::from_u32(chn).unwrap()), \"{{:?}}\", chn);\n", + if truthy { "" } else { "!" }, + property.to_lowercase(), + )); + s.push_str(" }\n"); + } + } +} + +fn ranges_from_set(set: &[u32]) -> Vec> { + let mut ranges = set.iter().map(|e| (*e)..(*e + 1)).collect::>>(); + merge_ranges(&mut ranges); + ranges +} + fn merge_ranges(ranges: &mut Vec>) { loop { let mut new_ranges = Vec::new(); From 6c7691a37bf485b28fecb6856e6ede8fa952f99e Mon Sep 17 00:00:00 2001 From: Mark Rousskov Date: Fri, 20 Mar 2020 18:38:08 -0400 Subject: [PATCH 04/14] Pre-pop zero chunks before mapping LAST_CHUNK_MAP This avoids wasting a small amount of space for some of the data sets. The chunk resizing is caused by but not directly related to changes in this commit. 
Alphabetic : 3036 bytes Case_Ignorable : 2133 bytes (- 3 bytes) Cased : 934 bytes Cc : 32 bytes Grapheme_Extend: 1760 bytes (-14 bytes) Lowercase : 985 bytes N : 1220 bytes (- 5 bytes) Uppercase : 934 bytes White_Space : 97 bytes Total table sizes: 11131 bytes (-22 bytes) --- src/libcore/unicode/unicode_data.rs | 160 ++++++++---------- .../src/raw_emitter.rs | 24 ++- 2 files changed, 88 insertions(+), 96 deletions(-) diff --git a/src/libcore/unicode/unicode_data.rs b/src/libcore/unicode/unicode_data.rs index a89f3481f4957..7a72f080e33f1 100644 --- a/src/libcore/unicode/unicode_data.rs +++ b/src/libcore/unicode/unicode_data.rs @@ -134,49 +134,41 @@ pub mod alphabetic { #[rustfmt::skip] pub mod case_ignorable { - static BITSET_LAST_CHUNK_MAP: (u16, u8) = (896, 33); - static BITSET_CHUNKS_MAP: [u8; 125] = [ - 25, 14, 21, 30, 28, 4, 17, 23, 22, 0, 0, 16, 27, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 18, 13, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 3, 6, 9, 0, 7, 11, 32, 31, 26, 29, 0, 0, 0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0, - 10, 0, 8, 0, 19, 0, 12, 0, 1, + static BITSET_LAST_CHUNK_MAP: (u16, u8) = (1792, 51); + static BITSET_CHUNKS_MAP: [u8; 250] = [ + 36, 19, 16, 26, 29, 40, 47, 38, 42, 5, 0, 9, 23, 25, 34, 3, 30, 0, 0, 0, 0, 0, 21, 31, 39, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 0, 15, 22, 28, + 33, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 32, 1, 11, 0, 0, 0, 44, 8, 18, 50, 41, 49, 45, 37, 43, + 46, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 35, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 6, 20, 0, 0, 0, 48, 0, 0, 27, 12, 0, 0, 10, 0, 0, 0, 0, 2, ]; - static BITSET_INDEX_CHUNKS: [[u8; 16]; 34] = [ - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 166], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 22, 47, 57], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 40, 0, 173, 3], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 94, 90, 136, 38], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 96, 104, 7, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 78, 27, 0, 148, 138, 81, 44, 119], - [0, 0, 0, 0, 0, 0, 0, 0, 154, 0, 0, 58, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 167, 99, 77, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 130, 0, 0, 0, 48, 0, 116, 0, 0], - [0, 0, 0, 0, 0, 172, 70, 0, 0, 8, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 60, 0, 0, 0, 0, 0, 67, 0, 0, 24, 0, 0], - [0, 0, 0, 29, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 135, 0, 0, 0, 0, 16, 162, 46, 86, 51, 80, 13, 111], - [0, 0, 12, 0, 0, 43, 163, 92, 35, 82, 0, 71, 175, 14, 83, 131], - [0, 0, 56, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 133, 0, 87, 0, 150, 0, 178, 75, 0, 0, 0, 0, 0, 0, 0], - [20, 5, 61, 0, 120, 0, 0, 0, 32, 156, 176, 1, 126, 91, 69, 88], - [26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [62, 0, 0, 0, 137, 0, 0, 0, 0, 0, 0, 76, 0, 0, 0, 0], - [66, 0, 0, 152, 72, 25, 134, 59, 102, 124, 165, 101, 0, 64, 0, 68], - [73, 33, 0, 181, 125, 85, 122, 139, 123, 100, 123, 169, 155, 54, 4, 18], - [74, 151, 36, 84, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [106, 135, 0, 112, 177, 107, 180, 168, 0, 0, 0, 0, 0, 0, 157, 142], - [109, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [113, 50, 108, 0, 0, 0, 0, 0, 0, 0, 174, 182, 182, 114, 10, 0], - [115, 0, 0, 0, 141, 5, 0, 49, 145, 34, 31, 0, 0, 0, 0, 0], - [118, 0, 42, 144, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [143, 95, 37, 121, 0, 0, 0, 0, 0, 0, 0, 0, 0, 45, 0, 0], - [161, 0, 103, 0, 160, 11, 30, 0, 0, 0, 0, 93, 0, 0, 0, 0], - [164, 55, 155, 53, 127, 52, 2, 28, 117, 21, 
128, 19, 110, 147, 129, 9], - [170, 41, 153, 6, 0, 0, 159, 39, 158, 1, 105, 0, 65, 0, 0, 0], - [171, 149, 132, 17, 98, 89, 146, 23, 140, 0, 0, 63, 127, 97, 0, 0], - [179, 182, 0, 0, 182, 182, 182, 79, 0, 0, 0, 0, 0, 0, 0, 0], + static BITSET_INDEX_CHUNKS: [[u8; 8]; 52] = [ + [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 130], [0, 0, 0, 0, 0, 0, 0, 166], + [0, 0, 0, 0, 0, 0, 157, 142], [0, 0, 0, 0, 0, 22, 47, 57], [0, 0, 0, 0, 0, 45, 0, 0], + [0, 0, 0, 0, 0, 172, 70, 0], [0, 0, 0, 0, 40, 0, 173, 3], [0, 0, 0, 0, 60, 0, 0, 0], + [0, 0, 0, 0, 94, 90, 136, 38], [0, 0, 0, 29, 0, 15, 0, 0], [0, 0, 0, 48, 0, 116, 0, 0], + [0, 0, 0, 76, 0, 0, 0, 0], [0, 0, 0, 93, 0, 0, 0, 0], [0, 0, 0, 96, 104, 7, 0, 0], + [0, 0, 0, 135, 0, 0, 0, 0], [0, 0, 12, 0, 0, 43, 163, 92], [0, 0, 56, 0, 0, 0, 0, 0], + [0, 0, 67, 0, 0, 24, 0, 0], [0, 0, 174, 182, 182, 114, 10, 0], [0, 8, 0, 0, 0, 0, 0, 0], + [0, 133, 0, 87, 0, 150, 0, 178], [16, 162, 46, 86, 51, 80, 13, 111], + [20, 5, 61, 0, 120, 0, 0, 0], [26, 0, 0, 0, 0, 0, 0, 0], [32, 156, 176, 1, 126, 91, 69, 88], + [35, 82, 0, 71, 175, 14, 83, 131], [62, 0, 0, 0, 137, 0, 0, 0], + [66, 0, 0, 152, 72, 25, 134, 59], [73, 33, 0, 181, 125, 85, 122, 139], + [74, 151, 36, 84, 0, 0, 0, 0], [75, 0, 0, 0, 0, 0, 0, 0], + [78, 27, 0, 148, 138, 81, 44, 119], [102, 124, 165, 101, 0, 64, 0, 68], + [106, 135, 0, 112, 177, 107, 180, 168], [109, 0, 0, 0, 0, 0, 0, 0], + [113, 50, 108, 0, 0, 0, 0, 0], [115, 0, 0, 0, 141, 5, 0, 49], + [117, 21, 128, 19, 110, 147, 129, 9], [118, 0, 42, 144, 0, 0, 0, 0], + [123, 100, 123, 169, 155, 54, 4, 18], [140, 0, 0, 63, 127, 97, 0, 0], + [143, 95, 37, 121, 0, 0, 0, 0], [145, 34, 31, 0, 0, 0, 0, 0], [154, 0, 0, 58, 0, 0, 0, 0], + [158, 1, 105, 0, 65, 0, 0, 0], [161, 0, 103, 0, 160, 11, 30, 0], + [164, 55, 155, 53, 127, 52, 2, 28], [167, 99, 77, 0, 0, 0, 0, 0], + [170, 41, 153, 6, 0, 0, 159, 39], [171, 149, 132, 17, 98, 89, 146, 23], + [179, 182, 0, 0, 182, 182, 182, 79], ]; static BITSET: [u64; 183] = [ 0, 1, 2, 3, 4, 8, 13, 
15, 28, 64, 176, 191, 1016, 1792, 2047, 4080, 4096, 8192, 8193, @@ -288,11 +280,12 @@ pub mod cased { #[rustfmt::skip] pub mod cc { - static BITSET_LAST_CHUNK_MAP: (u16, u8) = (0, 0); - static BITSET_CHUNKS_MAP: [u8; 0] = [ + static BITSET_LAST_CHUNK_MAP: (u16, u8) = (2, 1); + static BITSET_CHUNKS_MAP: [u8; 2] = [ + 1, 2, ]; - static BITSET_INDEX_CHUNKS: [[u8; 5]; 1] = [ - [1, 2, 1, 0, 0], + static BITSET_INDEX_CHUNKS: [[u8; 1]; 3] = [ + [0], [1], [2], ]; static BITSET: [u64; 3] = [ 0, 4294967295, 9223372036854775808, @@ -311,46 +304,37 @@ pub mod cc { #[rustfmt::skip] pub mod grapheme_extend { - static BITSET_LAST_CHUNK_MAP: (u16, u8) = (896, 30); - static BITSET_CHUNKS_MAP: [u8; 123] = [ - 4, 15, 21, 27, 25, 3, 18, 23, 17, 0, 0, 14, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 2, 7, 10, 0, 8, 12, 29, 28, 24, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, - 11, 0, 9, 0, 19, 0, 13, + static BITSET_LAST_CHUNK_MAP: (u16, u8) = (1792, 44); + static BITSET_CHUNKS_MAP: [u8; 245] = [ + 0, 8, 15, 22, 26, 33, 40, 32, 35, 3, 0, 7, 21, 23, 30, 0, 20, 0, 0, 0, 0, 0, 12, 0, 27, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 19, 25, 29, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 5, 0, 28, 1, 10, 0, 0, 0, 37, 6, 17, 43, 34, 42, 38, 31, 36, 39, 13, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 14, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 18, 0, 0, + 0, 41, 0, 0, 24, 11, 0, 0, 9, ]; - static 
BITSET_INDEX_CHUNKS: [[u8; 16]; 31] = [ - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 20, 46], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 33, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 77, 74, 106, 31], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 143, 66, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 79, 87, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 107, 37, 70, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 65, 0, 0, 0, 0, 0, 37, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 121, 0, 0, 48, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 134, 82, 64, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 103, 0, 0, 0, 39, 0, 94, 0, 0], - [0, 0, 0, 0, 0, 133, 58, 0, 0, 5, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 49, 0, 0, 0, 0, 0, 55, 0, 0, 18, 0, 0], - [0, 0, 0, 21, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 71, 0, 118, 0, 142, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 9, 0, 0, 0, 129, 7, 26, 67, 0, 59, 140, 11, 68, 104], - [0, 0, 35, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [12, 0, 0, 69, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [13, 0, 50, 0, 96, 0, 0, 0, 27, 123, 139, 1, 100, 75, 57, 72], - [51, 0, 0, 0, 87, 0, 0, 0, 0, 0, 0, 62, 0, 0, 0, 0], - [54, 0, 0, 120, 61, 19, 105, 47, 85, 98, 131, 84, 0, 0, 0, 56], - [60, 28, 0, 141, 99, 45, 111, 109, 97, 83, 97, 136, 132, 44, 108, 22], - [63, 0, 25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [89, 0, 0, 91, 0, 0, 0, 135, 0, 0, 0, 0, 0, 0, 0, 0], - [93, 0, 0, 0, 113, 3, 0, 40, 115, 29, 24, 0, 0, 0, 0, 0], - [114, 78, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 38, 0, 0], - [128, 0, 86, 0, 127, 8, 23, 0, 0, 0, 0, 76, 0, 0, 0, 0], - [130, 42, 122, 41, 112, 43, 2, 36, 95, 15, 101, 14, 90, 117, 102, 6], - [137, 34, 124, 4, 0, 0, 126, 32, 125, 1, 88, 0, 53, 0, 0, 0], - [138, 119, 92, 0, 81, 73, 116, 17, 110, 0, 0, 52, 112, 80, 0, 0], - [142, 143, 0, 0, 143, 143, 143, 66, 0, 0, 0, 0, 0, 0, 0, 0], + static BITSET_INDEX_CHUNKS: [[u8; 8]; 45] = [ + [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 103], [0, 0, 0, 0, 0, 16, 20, 46], + [0, 0, 0, 0, 0, 
38, 0, 0], [0, 0, 0, 0, 0, 133, 58, 0], [0, 0, 0, 0, 33, 0, 0, 0], + [0, 0, 0, 0, 49, 0, 0, 0], [0, 0, 0, 0, 77, 74, 106, 31], [0, 0, 0, 0, 143, 66, 0, 0], + [0, 0, 0, 21, 0, 10, 0, 0], [0, 0, 0, 39, 0, 94, 0, 0], [0, 0, 0, 62, 0, 0, 0, 0], + [0, 0, 0, 71, 0, 118, 0, 142], [0, 0, 0, 76, 0, 0, 0, 0], [0, 0, 0, 79, 87, 0, 0, 0], + [0, 0, 9, 0, 0, 0, 129, 7], [0, 0, 35, 0, 0, 0, 0, 0], [0, 0, 55, 0, 0, 18, 0, 0], + [0, 5, 0, 0, 0, 0, 0, 0], [0, 107, 37, 70, 0, 0, 0, 0], [12, 0, 0, 69, 0, 0, 0, 0], + [13, 0, 50, 0, 96, 0, 0, 0], [26, 67, 0, 59, 140, 11, 68, 104], + [27, 123, 139, 1, 100, 75, 57, 72], [51, 0, 0, 0, 87, 0, 0, 0], + [54, 0, 0, 120, 61, 19, 105, 47], [60, 28, 0, 141, 99, 45, 111, 109], + [63, 0, 25, 0, 0, 0, 0, 0], [65, 0, 0, 0, 0, 0, 37, 0], [85, 98, 131, 84, 0, 0, 0, 56], + [89, 0, 0, 91, 0, 0, 0, 135], [93, 0, 0, 0, 113, 3, 0, 40], + [95, 15, 101, 14, 90, 117, 102, 6], [97, 83, 97, 136, 132, 44, 108, 22], + [110, 0, 0, 52, 112, 80, 0, 0], [114, 78, 30, 0, 0, 0, 0, 0], [115, 29, 24, 0, 0, 0, 0, 0], + [121, 0, 0, 48, 0, 0, 0, 0], [125, 1, 88, 0, 53, 0, 0, 0], [128, 0, 86, 0, 127, 8, 23, 0], + [130, 42, 122, 41, 112, 43, 2, 36], [134, 82, 64, 0, 0, 0, 0, 0], + [137, 34, 124, 4, 0, 0, 126, 32], [138, 119, 92, 0, 81, 73, 116, 17], + [142, 143, 0, 0, 143, 143, 143, 66], ]; static BITSET: [u64; 144] = [ 0, 1, 2, 8, 13, 28, 64, 182, 191, 1016, 2032, 2047, 4096, 14336, 16128, 32640, 32768, @@ -454,8 +438,8 @@ pub mod lowercase { #[rustfmt::skip] pub mod n { - static BITSET_LAST_CHUNK_MAP: (u16, u8) = (254, 0); - static BITSET_CHUNKS_MAP: [u8; 254] = [ + static BITSET_LAST_CHUNK_MAP: (u16, u8) = (253, 2); + static BITSET_CHUNKS_MAP: [u8; 249] = [ 44, 0, 0, 29, 5, 31, 35, 26, 22, 6, 0, 12, 40, 20, 27, 0, 33, 0, 39, 7, 0, 0, 17, 0, 45, 42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 41, 43, @@ -464,7 +448,7 @@ pub mod n { 30, 1, 0, 0, 46, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 28, 0, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 14, 0, 3, 0, 0, 0, 0, 4, 15, 0, 0, 11, 0, 38, 0, 8, 0, 0, 0, 0, 2, + 14, 0, 3, 0, 0, 0, 0, 4, 15, 0, 0, 11, 0, 38, 0, 8, ]; static BITSET_INDEX_CHUNKS: [[u8; 8]; 47] = [ [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 11], [0, 0, 0, 0, 0, 0, 0, 47], diff --git a/src/tools/unicode-table-generator/src/raw_emitter.rs b/src/tools/unicode-table-generator/src/raw_emitter.rs index 5d4a4c0e04498..5f66bcbebaf03 100644 --- a/src/tools/unicode-table-generator/src/raw_emitter.rs +++ b/src/tools/unicode-table-generator/src/raw_emitter.rs @@ -67,7 +67,7 @@ impl RawEmitter { panic!("cannot pack {} into 8 bits", unique_words.len()); } // needed for the chunk mapping to work - assert_eq!(unique_words[0], 0, "first word is all zeros"); + assert_eq!(unique_words[0], 0, "has a zero word"); let word_indices = unique_words .iter() @@ -80,7 +80,7 @@ impl RawEmitter { let mut best = None; for length in 1..=64 { let mut temp = self.clone(); - temp.emit_chunk_map(&compressed_words, length); + temp.emit_chunk_map(word_indices[&0], &compressed_words, length); if let Some((_, size)) = best { if temp.bytes_used < size { best = Some((length, temp.bytes_used)); @@ -89,7 +89,7 @@ impl RawEmitter { best = Some((length, temp.bytes_used)); } } - self.emit_chunk_map(&compressed_words, best.unwrap().0); + self.emit_chunk_map(word_indices[&0], &compressed_words, best.unwrap().0); writeln!( &mut self.file, @@ -101,12 +101,12 @@ impl RawEmitter { self.bytes_used += 8 * unique_words.len(); } - fn emit_chunk_map(&mut self, compressed_words: &[u8], chunk_length: usize) { + fn emit_chunk_map(&mut self, zero_at: u8, compressed_words: &[u8], chunk_length: usize) { let mut compressed_words = compressed_words.to_vec(); for _ in 0..(chunk_length - 
(compressed_words.len() % chunk_length)) { // pad out bitset index with zero words so we have all chunks of // chunkchunk_length - compressed_words.push(0); + compressed_words.push(zero_at); } let mut chunks = BTreeSet::new(); @@ -123,6 +123,14 @@ impl RawEmitter { for chunk in compressed_words.chunks(chunk_length) { chunk_indices.push(chunk_map[chunk]); } + + // If one of the chunks has all of the entries point to the bitset + // word filled with zeros, then pop those off the end -- we know they + // are useless. + let zero_chunk_idx = chunks.iter().position(|chunk| chunk.iter().all(|e| *e == zero_at)); + while zero_chunk_idx.is_some() && chunk_indices.last().cloned() == zero_chunk_idx { + chunk_indices.pop(); + } writeln!( &mut self.file, "static BITSET_LAST_CHUNK_MAP: (u16, u8) = ({}, {});", @@ -131,9 +139,9 @@ impl RawEmitter { ) .unwrap(); self.bytes_used += 3; - // Strip out the empty pieces, presuming our above pop() made us now - // have some trailing zeros. - while let Some(0) = chunk_indices.last() { + // Try to pop again, now that we've recorded a non-zero pointing index + // into the LAST_CHUNK_MAP. + while zero_chunk_idx.is_some() && chunk_indices.last().cloned() == zero_chunk_idx { chunk_indices.pop(); } writeln!( From b0e121d9d588b334eaa1b68a127f5ee0fcda4296 Mon Sep 17 00:00:00 2001 From: Mark Rousskov Date: Sat, 21 Mar 2020 10:16:01 -0400 Subject: [PATCH 05/14] Shrink bitset words through functional mapping Previously, all words in the (deduplicated) bitset would be stored raw -- a full 64 bits (8 bytes). Now, those words that are equivalent to others through a specific mapping are stored separately and "mapped" to the original when loading; this shrinks the table sizes significantly, as each mapped word is stored in 2 bytes (a 4x decrease from the previous). The new encoding is also potentially non-optimal: the "mapped" byte is frequently repeated, as in practice many mapped words use the same base word. 
Currently we only support two forms of mapping: rotation and inversion. Note that these are both guaranteed to map transitively if at all, and supporting mappings for which this is not true may require a more interesting algorithm for choosing the optimal pairing. Updated sizes: Alphabetic : 2622 bytes (- 414 bytes) Case_Ignorable : 1803 bytes (- 330 bytes) Cased : 808 bytes (- 126 bytes) Cc : 32 bytes Grapheme_Extend: 1508 bytes (- 252 bytes) Lowercase : 901 bytes (- 84 bytes) N : 1064 bytes (- 156 bytes) Uppercase : 838 bytes (- 96 bytes) White_Space : 91 bytes (- 6 bytes) Total table sizes: 9667 bytes (-1,464 bytes) --- src/libcore/unicode/mod.rs | 28 +- src/libcore/unicode/unicode_data.rs | 1347 ++++++++++++----- src/tools/unicode-table-generator/src/main.rs | 28 +- .../src/raw_emitter.rs | 240 ++- 4 files changed, 1211 insertions(+), 432 deletions(-) diff --git a/src/libcore/unicode/mod.rs b/src/libcore/unicode/mod.rs index d1c68863e1632..2a41685a48096 100644 --- a/src/libcore/unicode/mod.rs +++ b/src/libcore/unicode/mod.rs @@ -34,12 +34,19 @@ pub use unicode_data::uppercase::lookup as Uppercase; pub use unicode_data::white_space::lookup as White_Space; #[inline(always)] -fn range_search( +fn range_search< + const N: usize, + const CHUNK_SIZE: usize, + const N1: usize, + const CANONICAL: usize, + const CANONICALIZED: usize, +>( needle: u32, chunk_idx_map: &[u8; N], (last_chunk_idx, last_chunk_mapping): (u16, u8), bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1], - bitset: &[u64; N2], + bitset_canonical: &[u64; CANONICAL], + bitset_canonicalized: &[(u8, u8); CANONICALIZED], ) -> bool { let bucket_idx = (needle / 64) as usize; let chunk_map_idx = bucket_idx / CHUNK_SIZE; @@ -53,7 +60,20 @@ fn range_search bool { @@ -127,85 +268,193 @@ pub mod alphabetic { &BITSET_CHUNKS_MAP, BITSET_LAST_CHUNK_MAP, &BITSET_INDEX_CHUNKS, - &BITSET, + &BITSET_CANONICAL, + &BITSET_MAPPING, ) } } #[rustfmt::skip] pub mod case_ignorable { - static BITSET_LAST_CHUNK_MAP: (u16, u8) = (1792, 
51); + static BITSET_LAST_CHUNK_MAP: (u16, u8) = (1792, 24); static BITSET_CHUNKS_MAP: [u8; 250] = [ - 36, 19, 16, 26, 29, 40, 47, 38, 42, 5, 0, 9, 23, 25, 34, 3, 30, 0, 0, 0, 0, 0, 21, 31, 39, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 0, 15, 22, 28, - 33, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 32, 1, 11, 0, 0, 0, 44, 8, 18, 50, 41, 49, 45, 37, 43, - 46, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 35, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 6, 20, 0, 0, 0, 48, 0, 0, 27, 12, 0, 0, 10, 0, 0, 0, 0, 2, + 12, 31, 34, 4, 7, 15, 22, 13, 17, 46, 50, 41, 28, 3, 11, 47, 8, 50, 50, 50, 50, 50, 29, 27, + 14, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 26, 50, 35, 25, 6, 10, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 42, 50, 9, 49, 36, 50, 50, 50, 19, 43, 33, 23, 16, 1, + 20, 51, 18, 21, 37, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 2, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 39, 50, 45, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 32, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 44, 30, 50, 50, 50, 0, 50, 50, 5, 38, 50, 50, 40, 50, 50, 50, 50, 48, ]; static BITSET_INDEX_CHUNKS: [[u8; 8]; 52] = [ - [0, 0, 0, 0, 0, 0, 0, 0], 
[0, 0, 0, 0, 0, 0, 0, 130], [0, 0, 0, 0, 0, 0, 0, 166], - [0, 0, 0, 0, 0, 0, 157, 142], [0, 0, 0, 0, 0, 22, 47, 57], [0, 0, 0, 0, 0, 45, 0, 0], - [0, 0, 0, 0, 0, 172, 70, 0], [0, 0, 0, 0, 40, 0, 173, 3], [0, 0, 0, 0, 60, 0, 0, 0], - [0, 0, 0, 0, 94, 90, 136, 38], [0, 0, 0, 29, 0, 15, 0, 0], [0, 0, 0, 48, 0, 116, 0, 0], - [0, 0, 0, 76, 0, 0, 0, 0], [0, 0, 0, 93, 0, 0, 0, 0], [0, 0, 0, 96, 104, 7, 0, 0], - [0, 0, 0, 135, 0, 0, 0, 0], [0, 0, 12, 0, 0, 43, 163, 92], [0, 0, 56, 0, 0, 0, 0, 0], - [0, 0, 67, 0, 0, 24, 0, 0], [0, 0, 174, 182, 182, 114, 10, 0], [0, 8, 0, 0, 0, 0, 0, 0], - [0, 133, 0, 87, 0, 150, 0, 178], [16, 162, 46, 86, 51, 80, 13, 111], - [20, 5, 61, 0, 120, 0, 0, 0], [26, 0, 0, 0, 0, 0, 0, 0], [32, 156, 176, 1, 126, 91, 69, 88], - [35, 82, 0, 71, 175, 14, 83, 131], [62, 0, 0, 0, 137, 0, 0, 0], - [66, 0, 0, 152, 72, 25, 134, 59], [73, 33, 0, 181, 125, 85, 122, 139], - [74, 151, 36, 84, 0, 0, 0, 0], [75, 0, 0, 0, 0, 0, 0, 0], - [78, 27, 0, 148, 138, 81, 44, 119], [102, 124, 165, 101, 0, 64, 0, 68], - [106, 135, 0, 112, 177, 107, 180, 168], [109, 0, 0, 0, 0, 0, 0, 0], - [113, 50, 108, 0, 0, 0, 0, 0], [115, 0, 0, 0, 141, 5, 0, 49], - [117, 21, 128, 19, 110, 147, 129, 9], [118, 0, 42, 144, 0, 0, 0, 0], - [123, 100, 123, 169, 155, 54, 4, 18], [140, 0, 0, 63, 127, 97, 0, 0], - [143, 95, 37, 121, 0, 0, 0, 0], [145, 34, 31, 0, 0, 0, 0, 0], [154, 0, 0, 58, 0, 0, 0, 0], - [158, 1, 105, 0, 65, 0, 0, 0], [161, 0, 103, 0, 160, 11, 30, 0], - [164, 55, 155, 53, 127, 52, 2, 28], [167, 99, 77, 0, 0, 0, 0, 0], - [170, 41, 153, 6, 0, 0, 159, 39], [171, 149, 132, 17, 98, 89, 146, 23], - [179, 182, 0, 0, 182, 182, 182, 79], + [2, 69, 56, 174, 174, 174, 174, 174], [4, 33, 106, 18, 174, 174, 111, 182], + [16, 174, 174, 174, 174, 174, 174, 174], [27, 109, 122, 128, 90, 63, 51, 61], + [29, 58, 174, 53, 121, 165, 5, 94], [45, 174, 174, 174, 168, 174, 174, 174], + [49, 174, 174, 105, 163, 167, 96, 44], [54, 159, 174, 127, 89, 60, 86, 99], + [55, 104, 30, 59, 174, 174, 174, 174], 
[57, 23, 174, 144, 98, 177, 146, 84], + [72, 88, 117, 71, 174, 47, 174, 50], [75, 15, 174, 79, 123, 76, 126, 0], + [80, 176, 77, 174, 174, 174, 174, 174], [82, 181, 92, 21, 78, 169, 93, 132], + [83, 174, 180, 3, 174, 174, 174, 174], [87, 70, 87, 118, 108, 40, 130, 20], + [100, 174, 174, 46, 91, 67, 174, 174], [102, 66, 31, 142, 174, 174, 174, 174], + [103, 28, 26, 174, 174, 174, 174, 174], [107, 174, 174, 147, 174, 174, 174, 174], + [110, 128, 74, 174, 48, 174, 174, 174], [113, 174, 73, 174, 112, 19, 25, 174], + [116, 41, 108, 39, 91, 38, 129, 24], [119, 164, 95, 134, 68, 141, 13, 22], + [125, 9, 174, 174, 9, 9, 9, 175], [133, 114, 154, 149, 37, 140, 158, 151], + [136, 174, 174, 174, 174, 174, 174, 174], [139, 174, 174, 174, 174, 174, 174, 174], + [153, 131, 17, 174, 85, 174, 174, 174], [174, 1, 174, 161, 174, 12, 174, 124], + [174, 157, 174, 174, 174, 174, 174, 174], [174, 174, 10, 9, 9, 81, 179, 174], + [174, 174, 42, 174, 174, 174, 174, 174], [174, 174, 148, 174, 174, 166, 174, 174], + [174, 174, 172, 174, 174, 34, 115, 64], [174, 174, 174, 15, 174, 174, 174, 174], + [174, 174, 174, 138, 174, 171, 174, 174], [174, 174, 174, 150, 174, 174, 174, 174], + [174, 174, 174, 156, 174, 174, 174, 174], [174, 174, 174, 170, 8, 152, 174, 174], + [174, 174, 174, 173, 174, 162, 174, 174], [174, 174, 174, 174, 65, 62, 97, 32], + [174, 174, 174, 174, 137, 174, 6, 145], [174, 174, 174, 174, 155, 174, 174, 174], + [174, 174, 174, 174, 174, 120, 52, 174], [174, 174, 174, 174, 174, 135, 35, 43], + [174, 174, 174, 174, 174, 160, 174, 174], [174, 174, 174, 174, 174, 174, 11, 101], + [174, 174, 174, 174, 174, 174, 174, 7], [174, 174, 174, 174, 174, 174, 174, 143], + [174, 174, 174, 174, 174, 174, 174, 174], [178, 174, 174, 174, 14, 131, 174, 36], + ]; + static BITSET_CANONICAL: [u64; 128] = [ + 0b1111101111111111111111111111111111111111111111111111111111111111, + 0b0011000000000000000000000000000000000000000000000000000000000000, + 
0b1111100001111111111111111111111111111111111111111111111111111111, + 0b0111000000000000000000000000000000000000000000000000000000000000, + 0b1111111100000000000000000000000000000000000000000000000000000000, + 0b0000000000000001111111111100000000000000000000000000000000000000, + 0b1111111111111100000000000000000000000000000000000000000000000000, + 0b1111100000000000000000000000000000000000000000000000000000000000, + 0b0000000001111111000000000000000000000000000000000000000000000000, + 0b1111111111111111111111111111111111111111111111111111111111111111, + 0b1111111111111111000000000000000000000000000000000000000000000000, + 0b1010000000000000000000000000000000000000000000000000000000000000, + 0b1000000000000000100000000000000000000000000000000000000000000000, + 0b0111111111000000000000000000000000000000000000000000000000000011, + 0b0101100000000000000000000000000000000000000000000000000000000000, + 0b0011111100000000000000000000000000000000000000000000000000000000, + 0b0000000111111111000000000000000000000000000000000000000000000000, + 0b0000000000000000000000100000000000000000000000000000000001100000, + 0b0000000000000000000000000000000000000000000000000000000000001101, + 0b0000000000000000000000000000000000000000000000000000000010111111, + 0b0000000000000000000000000000000000000000000000000010000000000001, + 0b0000000000000000000000000000000000000000000000000011111101000000, + 0b0000000000000000000000000000000000000000000000001001111000000000, + 0b0000000000000000000000000000000000000000001001000000000000000000, + 0b0000000000000000000000000000000000000000010111000000010000000000, + 0b0000000000000000000000000000000000000000101000110000000000000000, + 0b0000000000000000000000000000000000000011011111111111110000000000, + 0b0000000000000000000000000000000000001001100000000000000000000000, + 0b0000000000000000000000000000000000001110011111100000000010000000, + 0b0000000000000000000000000000000000010111111111110000000000111111, + 
0b0000000000000000000000000000000000011111111111110000000000000000, + 0b0000000000000000000000000000000000100000000000000010000001100100, + 0b0000000000000000000000000000000000100000100011111111111001000000, + 0b0000000000000000000000000000000001000000000000000000000001011100, + 0b0000000000000000000000000000000010000010000000000000000000000000, + 0b0000000000000000000000000000000011111111111111111000000000000000, + 0b0000000000000000000000000000000100001100111100000000000000000000, + 0b0000000000000000000000000000001111111111111111111111111111111111, + 0b0000000000000000000000000000110000000000000000000010000000011110, + 0b0000000000000000000000000000110000000000000000000011000001000000, + 0b0000000000000000000000000000110000000000011000000010000000011110, + 0b0000000000000000000000000000110000000000011000000011110111000001, + 0b0000000000000000000000000000111101100000000000000000000000000000, + 0b0000000000000000000000000001101100000000000000000000000000000000, + 0b0000000000000000000000000110000000000000000000001000000000000000, + 0b0000000000000000000001111101101111111001111111111111111101111111, + 0b0000000000000000000001111111100010000000000000000000000000000000, + 0b0000000000000000000011100000000011111000000000000000000000000000, + 0b0000000000000000000011111011110011100000000000000000000000000000, + 0b0000000000000000000100000110000000000000000000000000100001000100, + 0b0000000000000000001000010010000000000000000000000000000000000000, + 0b0000000000000000001110110011110000000000000000000000000000000011, + 0b0000000000000000001111000000000000000000000000000000111111100111, + 0b0000000000000000001111011111111110111111110000000000000000000000, + 0b0000000000000000001111111111111111111111110000000000000000000000, + 0b0000000000000000011111001001000000000011000000001111100000000000, + 0b0000000000000000111111111111111011111000000000000000000000010000, + 0b0000000000000000111111111111111100000000000010001111111111111111, + 
0b0000000000000001000000000000000011111111111111111111100000000001, + 0b0000000000000001111111111111111111111111111111110000000000000000, + 0b0000000000000010000000000000110000000000111111100010000111111110, + 0b0000000000000011101000110100000000000000000000000000000000000000, + 0b0000000000001100000000000000000000000000000011000000000000000000, + 0b0000000000001111111110000000000000000000000000000000000000000100, + 0b0000000000010000000000000000000000000000000000000000000010110110, + 0b0000000000011100000000000000000000000000000111000000000000000000, + 0b0000000000011110000000000000000111000011000000000000000000000000, + 0b0000000000011111000111111100000000000000000000000000000000000001, + 0b0000000000011111111011111000000000000000000000000000000000000111, + 0b0000000000100000000111111111111111111111111111111111111111111111, + 0b0000000000100011000000000000000000000000000000100011100110000110, + 0b0000000001011000001100000000000000100000000000000000000000000010, + 0b0000000001100110011111100000000000000000000000000000000000000000, + 0b0000000001101101111111001111111111111111111111000000000000000000, + 0b0000000010111111001010000000000000000000000000000000000000000000, + 0b0000000011001111111100000000000000000000000000000000000000000000, + 0b0000000100000000000001111111111111111111111111111111111111111111, + 0b0000000110010000101000010000000000000000000000000000000000000000, + 0b0000001010100000000000000000000000000011000000000000000000000000, + 0b0000001100010000001000011111110111111111111101110000000000000000, + 0b0000010000000000010000001000000000000000000000000000000000000000, + 0b0000010000110000111111111111111111111111111111111111111111111111, + 0b0000011111110010000000000000000000000000000000000000000000000000, + 0b0000100000111110001111000000000000000000000000000000000000100000, + 0b0000111000000000000000000000100000000000000000000000000000000000, + 0b0000111000000100000000011000011100000000000000000000000000000000, + 
0b0001000000000000000000000000000000000000000000000000000000000010, + 0b0001000000000000000000000000000000000000000000000000000000000110, + 0b0001000000000001000000000000000000000000000000000001000000001000, + 0b0001010000000000000000000000000000000000000000000000000000000111, + 0b0001011111010000000000000000000000000000000000000000000000001111, + 0b0001100000000000000000000000000000000000000000000000000000000011, + 0b0001111111110010000000000000000000000000000000000000000000000000, + 0b0001111111111111111111111111111111111110111111111110000011011111, + 0b0010010000111111111110000000000000000000000000000000000000000000, + 0b0010011001111000000000000000000000000000000000000000000000000011, + 0b0011001111001000000000000000000000000000000000000000000000000111, + 0b0011111110110000000000000000000000000000000000000000000000000000, + 0b0100000000000000000000000000000000000100000000000100000010000000, + 0b0100000000000000000000000000110000000000000000000010000000011110, + 0b0100000011010011100000000000000000000000000000000000000000000000, + 0b0110000000000000111000000000000011100000000000001110000000000011, + 0b0110011011111101111000000000000000000000000000000000000000000000, + 0b0111100111111000000000000000000000000000000000000000011111111110, + 0b1000000000000010111111111101111100000000000000000000000000000000, + 0b1000000000000011111111111111111100000000000000000000000000110000, + 0b1000010111111000000000000000000000000000000000000000000000000000, + 0b1000011100000000000000000000000000000000000000001111000001101110, + 0b1001000000000000000000000000000000000000000000000000000000000010, + 0b1001111111111000000111111110010101111111010000000000000000000000, + 0b1010011111111000000000000000000000000000000000000000000000000000, + 0b1011000000111100000000000000000000000000000000000000000000000000, + 0b1011010001111110000000000000000000000000000000000000000000000000, + 0b1011111101111111000000000000000000000000000000000000000000000000, + 
0b1011111111110111100000000000000000000000000000000000000000000000, + 0b1011111111111111111111111111111111111111111111100000000000000000, + 0b1100000000000000000000000000000000000000000000000000000000010001, + 0b1100000110011101000000000000000000000000000000000000000000000000, + 0b1111110000000000000000000000110000000000000000000010000110111110, + 0b1111111100000000000000000000000000000000000000000000000000000010, + 0b1111111111111000000000111000000000000000000000000000000000000000, + 0b1111111111111111000000000000000000000000000000101000000000000000, + 0b1111111111111111000000001000000000000000000000000000000000000000, + 0b1111111111111111111100000000000000000000000000000000000000000000, + 0b1111111111111111111111111111111100000000000000000000000000000000, + 0b1111111111111111111111111111111100000000000000000000000000000010, + 0b1111111111111111111111111111111111111000000000000000000000000000, + 0b1111111111111111111111111111111111111111111110000000000000000000, ]; - static BITSET: [u64; 183] = [ - 0, 1, 2, 3, 4, 8, 13, 15, 28, 64, 176, 191, 1016, 1792, 2047, 4080, 4096, 8192, 8193, - 16192, 30720, 32704, 32768, 40448, 131008, 262016, 2097152, 2359296, 6030336, 8323072, - 10682368, 58719232, 159383552, 234881024, 243138688, 402587711, 536805376, 536879204, - 546307648, 805306369, 1073741824, 1073741916, 2113929216, 2181038080, 3221225472, - 3758096384, 4026531840, 4294934528, 4294967296, 4512022528, 5368709120, 17179869183, - 51539615774, 51539619904, 51545907230, 51545914817, 66035122176, 115964116992, 412316860416, - 412316893184, 1030792151040, 2199023255648, 8641373536127, 8763880767488, 15397323538432, - 17303886364672, 18004502906948, 26388279066624, 36421322670080, 65128884076547, - 65970697670631, 68168642985984, 70093866270720, 70368739983360, 136957967529984, - 140737488355328, 263882790666240, 281470547525648, 281470682333183, 281474976710655, - 281474976710656, 281474976710657, 281479271675905, 562675075514368, 562949953355776, - 563001509683710, 
844424930131968, 985162418487296, 1023920203366400, 2251799813685248, - 3377699721314304, 4494803534348292, 4503599627370678, 6755399441055744, 7881299349733376, - 8444256867844096, 8725724278030336, 8760633772212225, 8989057312882695, 9042383626829823, - 9851624185018758, 24822575045541890, 28848986089586688, 30958948903026688, - 35747322042253312, 53805701016846336, 58529202969772032, 72066390130950143, - 112767012056334336, 143833713099145216, 189151184399892480, 216172782113783808, - 220713756545974272, 288301294651703296, 302022650010533887, 504262420777140224, - 558446353793941504, 572520102629474304, 593978171557150752, 1008806350890729472, - 1009933895770046464, 1152921504606846976, 1152921504606846978, 1152921504606846982, - 1153202979583561736, 1441151880758558727, 1715871458028158991, 1729382256910270467, - 2301902359539744768, 2305843009196908767, 2305843009213693952, 2612078987781865472, - 2771965570646540291, 3458764513820540928, 3731232291276455943, 4539628424389459968, - 4589168020290535424, 4611404543450677248, 4611686018494513280, 4611686069967003678, - 4671217976001691648, 6341068275337658368, 6917775322003857411, 7421334051581067264, - 8070450532247928832, 8788774672813524990, 9205357638345293827, 9222809086901354496, - 9223372036854775808, 9223372036854775935, 9223512774343131136, 9224216320050987008, - 9224497932466651184, 9653465801268658176, 9727775195120332910, 10376293541461622786, - 11526998316797657088, 11529215046068469760, 12103423998558208000, 12699025049277956096, - 13005832773892571136, 13798747783286489088, 13832665517980123136, 13835058055282032640, - 13835058055282163729, 13951307220663664640, 17870283321406128128, 17906312118425092095, - 18158513697557839871, 18158513749097456062, 18374686479671623680, 18374686479671623682, - 18444496122186563584, 18445618173802708992, 18446462598732840960, 18446462598733004800, - 18446463148488654848, 18446726481523507200, 18446744069414584320, 18446744069414584322, - 18446744073575333888, 
18446744073709027328, 18446744073709551615, + static BITSET_MAPPING: [(u8, u8); 55] = [ + (0, 134), (0, 135), (0, 136), (0, 137), (0, 140), (0, 146), (0, 147), (0, 149), (0, 155), + (0, 164), (0, 166), (0, 181), (0, 182), (0, 185), (0, 130), (0, 131), (0, 133), (1, 4), + (1, 34), (1, 41), (1, 47), (1, 52), (1, 55), (1, 60), (2, 137), (2, 148), (2, 165), + (2, 173), (2, 181), (3, 6), (3, 12), (3, 29), (3, 33), (3, 51), (4, 12), (4, 46), (4, 7), + (5, 26), (5, 32), (5, 33), (6, 62), (6, 63), (7, 53), (7, 59), (8, 19), (8, 32), (9, 128), + (10, 128), (11, 33), (12, 1), (13, 57), (14, 9), (15, 33), (16, 22), (17, 23), ]; pub fn lookup(c: char) -> bool { @@ -214,57 +463,92 @@ pub mod case_ignorable { &BITSET_CHUNKS_MAP, BITSET_LAST_CHUNK_MAP, &BITSET_INDEX_CHUNKS, - &BITSET, + &BITSET_CANONICAL, + &BITSET_MAPPING, ) } } #[rustfmt::skip] pub mod cased { - static BITSET_LAST_CHUNK_MAP: (u16, u8) = (124, 6); + static BITSET_LAST_CHUNK_MAP: (u16, u8) = (124, 11); static BITSET_CHUNKS_MAP: [u8; 123] = [ - 13, 18, 0, 0, 12, 0, 0, 9, 14, 10, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 1, 2, 0, 16, 0, 8, 0, 0, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0, - 0, 0, 0, 7, + 18, 0, 17, 17, 5, 17, 17, 9, 4, 7, 17, 3, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 13, 14, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 16, 15, 17, 2, 17, 8, 17, 17, 6, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 12, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 1, 17, 17, 17, 17, 10, ]; static BITSET_INDEX_CHUNKS: [[u8; 16]; 19] = [ - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 8, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 42, 43, 62, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 10, 0, 50, 62, 58, 20], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 62, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 42, 44, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 62, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 31, 0, 62, 62, 62, 0, 62, 62, 62, 62, 54, 26, 27, 24], - [0, 0, 39, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 51, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 51, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 51, 25], - [0, 22, 19, 37, 62, 62, 36, 61, 62, 62, 18, 12, 0, 30, 49, 38], - [0, 29, 9, 0, 34, 52, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [46, 55, 62, 17, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [62, 6, 42, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [62, 56, 33, 60, 28, 57, 62, 62, 62, 62, 48, 35, 40, 45, 47, 5], - [62, 62, 59, 62, 41, 53, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [2, 2, 41, 2, 44, 5, 54, 51, 51, 51, 51, 51, 51, 51, 51, 51], + [2, 47, 33, 0, 28, 39, 2, 2, 2, 2, 8, 35, 49, 50, 1, 14], + [2, 59, 10, 24, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51], + [45, 46, 2, 20, 18, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51], + [51, 29, 62, 51, 34, 38, 57, 51, 51, 51, 51, 51, 51, 51, 51, 51], + [51, 51, 6, 32, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 6, 53], + [51, 51, 6, 55, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51], + [51, 51, 11, 17, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51], + [51, 51, 13, 13, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51], + [51, 51, 31, 51, 2, 2, 2, 51, 2, 2, 2, 2, 4, 26, 27, 25], + [51, 51, 51, 51, 2, 52, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51], + [51, 51, 51, 51, 10, 9, 60, 51, 51, 51, 51, 51, 51, 51, 51, 51], + [51, 51, 51, 51, 51, 51, 51, 51, 51, 2, 51, 51, 51, 51, 51, 51], + [51, 51, 51, 51, 51, 51, 51, 51, 51, 19, 56, 51, 7, 2, 40, 23], + [51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 10, 36, 2, 
51], + [51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 12, 61, 51, 51], + [51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 15, 51, 51, 51], + [51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51], + [51, 58, 22, 48, 2, 2, 42, 3, 2, 2, 21, 16, 51, 30, 37, 43], + ]; + static BITSET_CANONICAL: [u64; 42] = [ + 0b1111111111111111111111111111111111111111111111111111111111101111, + 0b1111111111111111111111011111111111111111111111111111110111111111, + 0b1111111111111111111111111111111111111111111111111111111111111111, + 0b1111111111111111111111111111111111111111111111111111111111110000, + 0b1111111111111111111111111111111100111111001111111111111111111111, + 0b1111111111111111111111111111111100000000011111111111111111111111, + 0b1111111111111111111111111111111100000000000000000000000000000000, + 0b1111111111111111111111111111110000000000000000000000000000000000, + 0b1111111111111111111111110011111111111111111111111111111111111111, + 0b1111111111111111000000111111111111111111111111110000001111111111, + 0b1111111111111111000000000000000000000000000000000000000000000000, + 0b1111111111000000000000000000000000000000000000000000000000000000, + 0b0000011111111111111111111111111000000000000000000000000000000000, + 0b0000000000000111111111111111111111111111111111111111111111111111, + 0b0000000000000000000000000000000000000000000000000000111111110111, + 0b0000000000000000000000000000000000000000111110000000000001111111, + 0b0000000000000000000000000001111100000000000000000000000000000011, + 0b0000000000000000000000111111111111111111111111111111111111111111, + 0b0000000000000000001000001011111111111111111111111111111111111111, + 0b0000000000000000001111111111111111111111111111111111111111111111, + 0b0000000000001100011110000001111111111111111111111111111111111111, + 0b0000000111111111111111111111111111111111111011111111111111111111, + 0b0000010000100000000001000000000000000000000000000000000000000000, + 0b0000011101100000000000000000000000000000000000000000011111111100, + 
0b0000111111111111111111111111111111111111000011111111111111111111, + 0b0001111111011100000111111111111100001111110011110001111111011100, + 0b0011111111111111111111111111111110101010111111110011111100111111, + 0b0101111111011111111111111111111111111111111111111111111111111111, + 0b0111101111111111111111111111111111011111110111111110011110111111, + 0b1000000000000010000000000000000000000000000000000000000000000000, + 0b1011110011001111000000000000000000000000000000000000000000100000, + 0b1110011111111111111111111111111111111111111111110000000111111111, + 0b1110011111111111111111111111111111111111111111110010000010111111, + 0b1110101111111111110111100110010011011111111111111111111111111111, + 0b1111001000011111101111010101000000111110001011111111110010000100, + 0b1111011111111111111111111111111111110111111111111111111111111101, + 0b1111111111111111000000011111111111110111111111111111111111111111, + 0b1111111111111111111111111111101111111111111111111101011101000000, + 0b1111111111111111111111111111111100000000000000000100001111100000, + 0b1111111111111111111111111111111111111111111111011111110001011111, + 0b1111111111111111111111111111111111111111111111110111100011111111, + 0b1111111111111111111111111111111111111111111111111111110000000011, ]; - static BITSET: [u64; 63] = [ - 0, 15, 24, 511, 1023, 4087, 65535, 16253055, 134217726, 536805376, 1073741823, 4294967295, - 133143986179, 4398046511103, 36009005809663, 70368744177663, 2251799813685247, - 3509778554814463, 144115188074807295, 297241973452963840, 531424756029720572, - 576460743713488896, 576460743847706622, 1152921504591118335, 2295745090394464220, - 4557642822898941951, 4611686017001275199, 6908521828386340863, 8935141660164089791, - 9223934986808197120, 13605092999309557792, 16717361816799216127, 16717361816799223999, - 17005555242810474495, 17446871633794956420, 17870283321271910397, 17870283321406128127, - 18410715276682199039, 18428729675200069631, 18428729675200069632, 18437736874452713471, - 
18446462598732840959, 18446462598732840960, 18446464797621878783, 18446466996779287551, - 18446603336221163519, 18446603336221196287, 18446741874686295551, 18446743249075830783, - 18446744056529672000, 18446744056529682432, 18446744069414584320, 18446744069414601696, - 18446744069422972927, 18446744070475743231, 18446744071562067967, 18446744073707454463, - 18446744073709419615, 18446744073709517055, 18446744073709550595, 18446744073709551599, - 18446744073709551600, 18446744073709551615, + static BITSET_MAPPING: [(u8, u8); 21] = [ + (0, 55), (0, 50), (0, 44), (0, 43), (0, 27), (0, 17), (1, 14), (1, 12), (1, 6), (2, 128), + (3, 128), (4, 32), (5, 169), (6, 32), (7, 30), (8, 157), (9, 17), (10, 16), (11, 10), + (12, 32), (13, 157), ]; pub fn lookup(c: char) -> bool { @@ -273,7 +557,8 @@ pub mod cased { &BITSET_CHUNKS_MAP, BITSET_LAST_CHUNK_MAP, &BITSET_INDEX_CHUNKS, - &BITSET, + &BITSET_CANONICAL, + &BITSET_MAPPING, ) } } @@ -287,8 +572,12 @@ pub mod cc { static BITSET_INDEX_CHUNKS: [[u8; 1]; 3] = [ [0], [1], [2], ]; - static BITSET: [u64; 3] = [ - 0, 4294967295, 9223372036854775808, + static BITSET_CANONICAL: [u64; 3] = [ + 0b0000000000000000000000000000000000000000000000000000000000000000, + 0b0000000000000000000000000000000011111111111111111111111111111111, + 0b1000000000000000000000000000000000000000000000000000000000000000, + ]; + static BITSET_MAPPING: [(u8, u8); 0] = [ ]; pub fn lookup(c: char) -> bool { @@ -297,73 +586,163 @@ pub mod cc { &BITSET_CHUNKS_MAP, BITSET_LAST_CHUNK_MAP, &BITSET_INDEX_CHUNKS, - &BITSET, + &BITSET_CANONICAL, + &BITSET_MAPPING, ) } } #[rustfmt::skip] pub mod grapheme_extend { - static BITSET_LAST_CHUNK_MAP: (u16, u8) = (1792, 44); + static BITSET_LAST_CHUNK_MAP: (u16, u8) = (1792, 20); static BITSET_CHUNKS_MAP: [u8; 245] = [ - 0, 8, 15, 22, 26, 33, 40, 32, 35, 3, 0, 7, 21, 23, 30, 0, 20, 0, 0, 0, 0, 0, 12, 0, 27, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 19, 25, 29, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 5, 0, 28, 1, 10, 0, 0, 0, 37, 6, 17, 43, 34, 42, 38, 31, 36, 39, 13, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 14, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 18, 0, 0, - 0, 41, 0, 0, 24, 11, 0, 0, 9, + 42, 34, 28, 23, 6, 11, 18, 10, 13, 40, 42, 35, 22, 3, 9, 42, 21, 42, 42, 42, 42, 42, 30, + 42, 2, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 24, 5, 8, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 36, 42, 7, 41, 31, 42, 42, 42, 15, 37, 27, 19, 12, + 0, 16, 44, 14, 17, 29, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 42, 39, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 26, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 38, 25, 42, 42, 42, 1, 42, 42, 4, 32, 42, 42, 33, ]; static BITSET_INDEX_CHUNKS: [[u8; 8]; 45] = [ - [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 103], [0, 0, 0, 0, 0, 16, 20, 46], - [0, 0, 0, 0, 0, 38, 0, 0], [0, 0, 0, 0, 0, 133, 58, 0], [0, 0, 0, 0, 33, 0, 0, 0], - [0, 0, 0, 0, 49, 0, 0, 0], [0, 0, 0, 0, 77, 74, 106, 31], [0, 0, 0, 0, 143, 66, 0, 0], - [0, 0, 0, 21, 0, 10, 0, 0], [0, 0, 0, 39, 0, 94, 0, 0], [0, 0, 0, 62, 0, 0, 0, 0], - [0, 0, 0, 71, 0, 118, 0, 142], [0, 
0, 0, 76, 0, 0, 0, 0], [0, 0, 0, 79, 87, 0, 0, 0], - [0, 0, 9, 0, 0, 0, 129, 7], [0, 0, 35, 0, 0, 0, 0, 0], [0, 0, 55, 0, 0, 18, 0, 0], - [0, 5, 0, 0, 0, 0, 0, 0], [0, 107, 37, 70, 0, 0, 0, 0], [12, 0, 0, 69, 0, 0, 0, 0], - [13, 0, 50, 0, 96, 0, 0, 0], [26, 67, 0, 59, 140, 11, 68, 104], - [27, 123, 139, 1, 100, 75, 57, 72], [51, 0, 0, 0, 87, 0, 0, 0], - [54, 0, 0, 120, 61, 19, 105, 47], [60, 28, 0, 141, 99, 45, 111, 109], - [63, 0, 25, 0, 0, 0, 0, 0], [65, 0, 0, 0, 0, 0, 37, 0], [85, 98, 131, 84, 0, 0, 0, 56], - [89, 0, 0, 91, 0, 0, 0, 135], [93, 0, 0, 0, 113, 3, 0, 40], - [95, 15, 101, 14, 90, 117, 102, 6], [97, 83, 97, 136, 132, 44, 108, 22], - [110, 0, 0, 52, 112, 80, 0, 0], [114, 78, 30, 0, 0, 0, 0, 0], [115, 29, 24, 0, 0, 0, 0, 0], - [121, 0, 0, 48, 0, 0, 0, 0], [125, 1, 88, 0, 53, 0, 0, 0], [128, 0, 86, 0, 127, 8, 23, 0], - [130, 42, 122, 41, 112, 43, 2, 36], [134, 82, 64, 0, 0, 0, 0, 0], - [137, 34, 124, 4, 0, 0, 126, 32], [138, 119, 92, 0, 81, 73, 116, 17], - [142, 143, 0, 0, 143, 143, 143, 66], + [4, 24, 87, 13, 138, 138, 89, 143], [5, 52, 41, 138, 138, 138, 138, 138], + [11, 138, 115, 138, 138, 138, 138, 138], [20, 86, 8, 102, 67, 47, 37, 45], + [32, 138, 138, 138, 6, 138, 138, 138], [35, 138, 138, 83, 130, 127, 71, 111], + [40, 123, 138, 100, 66, 31, 77, 75], [42, 138, 138, 138, 138, 138, 117, 138], + [55, 65, 94, 54, 138, 138, 138, 36], [58, 138, 138, 60, 138, 138, 138, 0], + [62, 129, 68, 142, 59, 82, 69, 105], [64, 53, 64, 97, 95, 30, 74, 17], + [76, 138, 138, 33, 78, 50, 138, 138], [80, 49, 22, 138, 138, 138, 138, 138], + [81, 21, 19, 138, 138, 138, 138, 138], [84, 138, 138, 118, 138, 138, 138, 138], + [88, 102, 57, 138, 34, 138, 138, 138], [91, 138, 56, 138, 90, 15, 18, 138], + [93, 28, 85, 27, 78, 29, 103, 25], [98, 131, 61, 138, 51, 112, 9, 16], + [101, 7, 138, 138, 7, 7, 7, 139], [106, 138, 138, 44, 138, 138, 138, 138], + [122, 138, 12, 138, 63, 138, 138, 138], [128, 43, 138, 39, 99, 125, 3, 70], + [138, 73, 117, 120, 138, 138, 138, 138], [138, 
121, 138, 138, 138, 138, 138, 138], + [138, 138, 116, 138, 138, 138, 138, 138], [138, 138, 119, 138, 138, 126, 138, 138], + [138, 138, 135, 138, 138, 138, 92, 14], [138, 138, 138, 1, 138, 138, 138, 138], + [138, 138, 138, 2, 138, 114, 138, 101], [138, 138, 138, 109, 138, 10, 138, 138], + [138, 138, 138, 134, 138, 138, 138, 138], [138, 138, 138, 137, 138, 136, 138, 138], + [138, 138, 138, 138, 7, 139, 138, 138], [138, 138, 138, 138, 48, 46, 72, 23], + [138, 138, 138, 138, 108, 138, 138, 138], [138, 138, 138, 138, 133, 138, 138, 138], + [138, 138, 138, 138, 138, 96, 38, 138], [138, 138, 138, 138, 138, 107, 132, 110], + [138, 138, 138, 138, 138, 124, 138, 138], [138, 138, 138, 138, 138, 138, 138, 113], + [138, 138, 138, 138, 138, 138, 138, 138], [138, 138, 138, 141, 6, 138, 138, 138], + [140, 138, 138, 138, 79, 104, 138, 26], + ]; + static BITSET_CANONICAL: [u64; 102] = [ + 0b1111101111111111111111111111111111111111111111111111111111111111, + 0b0000000000011000000000000000000000000000000000000000000000000000, + 0b0000000000000011100000000000000000000000000000000000000000000000, + 0b0000000000000001111111111100000000000000000000000000000000000000, + 0b1111111100000000000000000000000000000000000000000000000000000000, + 0b1111100001111111111111111111111111111111111111111111111111111111, + 0b0000000001111111000000000000000000000000000000000000000000000000, + 0b1111111111111111111111111111111111111111111111111111111111111111, + 0b1111111111111111000000000000000000000000000000000000000000000000, + 0b0111111111000000000000000000000000000000000000000000000000000011, + 0b0000011111000000000000000000000000000000000000000000000000000000, + 0b0000000000000000111111000000000000000000000000000000000000000000, + 0b0000000000000000000000100000000000000000000000000000000001100000, + 0b0000000000000000000000000000000000000000000000000000000000001101, + 0b0000000000000000000000000000000000000000000000000000000010110110, + 
0b0000000000000000000000000000000000000000000000000000000010111111, + 0b0000000000000000000000000000000000000000000000001001111000000000, + 0b0000000000000000000000000000000000000000100000000010000000000001, + 0b0000000000000000000000000000000000000000101000110000000000000000, + 0b0000000000000000000000000000000000000011011111111111110000000000, + 0b0000000000000000000000000000000000001001100000000000000000000000, + 0b0000000000000000000000000000000000001110011111100000000010000000, + 0b0000000000000000000000000000000000100000000000000010000001100100, + 0b0000000000000000000000000000000000100000000011111111111001000000, + 0b0000000000000000000000000000000001000000000000000000000001011100, + 0b0000000000000000000000000000000010000000010111001000010000000000, + 0b0000000000000000000000000000000100001100111100000000000000000000, + 0b0000000000000000000000000000110000000000011000000011000001000100, + 0b0000000000000000000000000000110000000000011000000011110111000001, + 0b0000000000000000000000000000110000000000100000000010000000011110, + 0b0000000000000000000000000000110000000000111000000010000000011110, + 0b0000000000000000000000000000110000000000111111100010000111111110, + 0b0000000000000000000001111101101111111001111111111111111101111111, + 0b0000000000000000000001111111100010000000000000000000000000000000, + 0b0000000000000000000011111011110011100000000000000000000000000000, + 0b0000000000000000000100000110000000000000000000000000100001000100, + 0b0000000000000000001000010010000000000000000000000000000000000000, + 0b0000000000000000001110110011110000000000000000000000000000000011, + 0b0000000000000000001111000000000000000000000000000000111111100111, + 0b0000000000000000001111011001111110011111110000000000000000000000, + 0b0000000000000000001111101110111111111011110000000000000000000000, + 0b0000000000000000111111111111111011111000000000000000000000010000, + 0b0000000000000000111111111111111100000000000000001111111111111111, + 
0b0000000000000001000000000000000011111111111111111111100000000000, + 0b0000000000000001111111111111111111111111111111110000000000000000, + 0b0000000000000011101000110100000000000000000000000000000000000000, + 0b0000000000001100000000000000000000000000000011000000000000000000, + 0b0000000000001111111110000000000000000000000000000000000000000100, + 0b0000000000011100000000000000000000000000000111000000000000000000, + 0b0000000000011110000000000000000111000011000000000000000000000000, + 0b0000000000011111000111111100000000000000100000000000000000000001, + 0b0000000000011111111011111000000000000000000000000000000000000111, + 0b0000000000100000000111111111111111111111111111111111111111111111, + 0b0000000000100011000000000000000000000000000000100011100110000110, + 0b0000000001000000001100000000000000000000000000000000000000000010, + 0b0000000001100110011111100000000000000000000000000000000000000000, + 0b0000000001101101111111001111111111111111111111000000000000000000, + 0b0000000010111111001010000000000000000000000000000000000000000000, + 0b0000000011001111111100000000000000000000000000000000000000000000, + 0b0000001010100000000000000000000000000011000000000000000000000000, + 0b0000001100010000001000011111110111111111111101110000000000000000, + 0b0000011001111000000000000000000000000000000000000000000000000011, + 0b0000011111110010000000000000000000000000000000000000000000000000, + 0b0000111000000100000000011000011100000000000000000000000000000000, + 0b0001000000000000000000000000000000000000000000000000000000000110, + 0b0001000000000000000000000000000000000000000000000001000000001000, + 0b0001010000000000000000000000000000000000000000000000000000000111, + 0b0001011111110000000000000000000000000000000000000000000000001111, + 0b0001111111110010000000000000000000000000000000000000000000000000, + 0b0001111111111111111111111111111111111110111111111110000011011111, + 0b0010000000001111111110000000000000000000000000000000000000000000, + 
0b0011001111001000000000000000000000000000000000000000000000000111, + 0b0011111110110000000000000000000000000000000000000000000000000000, + 0b0011111111110111100000000000000000000000000000000000000000000000, + 0b0100000000000000000000000000000000000000000000000000000000000100, + 0b0100000000000000000000000000110000000000100000000010000000011110, + 0b0100000011010011100000000000000000000000000000000000000000000000, + 0b0101000000000000000000000000000000000000000000000000000000000010, + 0b0101100000000000000000000000000000000000000000000000000000000011, + 0b0101100000000001000000000000000000000000000000000000000000000000, + 0b0110011011111101111000000000000000000000000000000000000000000000, + 0b0111100111111000000000000000000000000000000000000000011111111110, + 0b0111111111111110000000000000000000000000000000000000000000000000, + 0b1000000000000011111111111111111100000000000000000000000000110000, + 0b1000011100000000000000000000000000000000000000001111000001101110, + 0b1001000000000000000000000000000000000000000000000000000000000010, + 0b1001111111111000000111111110010101111111010000000000000000000000, + 0b1010010111111001000000000000000000000000000000000000000000000000, + 0b1010011111111000000000000000000000000000000000000000000000000000, + 0b1011000000111100100000000000000000000000000000000000000000000000, + 0b1011010001111110000000000000000000000000000000000000000000000000, + 0b1011111101111111000000000000000000000000000000000000000000000000, + 0b1011111111111111111111111111111111111111111111100000000000000000, + 0b1100000000000000000000000000000000000000000000000000000000010001, + 0b1100000110011101000000000000000000000000000000000000000000000000, + 0b1101000000000000000000000000000000000000000000000000000000000010, + 0b1111100000000111110000111010000000000000000000000000000000000000, + 0b1111110000000000000000000000110000000000000000000010000110111110, + 0b1111111100000000000000000000000000000000000000000000000000000010, + 
0b1111111111111111000000000000000000000000000000100000000000000000, + 0b1111111111111111111111111111101111111111111110000000000000000000, + 0b1111111111111111111111111111111100000000000000000000000000000000, ]; - static BITSET: [u64; 144] = [ - 0, 1, 2, 8, 13, 28, 64, 182, 191, 1016, 2032, 2047, 4096, 14336, 16128, 32640, 32768, - 40448, 131008, 262016, 491520, 8323072, 8396801, 10682368, 58719232, 100663296, 134152192, - 159383552, 234881024, 243138688, 536879204, 537919040, 805306369, 1073741824, 1073741916, - 1610612736, 2153546752, 3221225472, 3758096384, 4294967296, 4512022528, 51545911364, - 51545914817, 51548004382, 51554295838, 51556262398, 68719476736, 137438953472, 412316860416, - 1030792151040, 2199023255648, 8641373536127, 8763880767488, 17303886364672, 18004502906948, - 26388279066624, 36421322670080, 65128884076547, 65970697670631, 67755789254656, - 69200441769984, 70093866270720, 263882790666240, 277076930199552, 281470547525648, - 281470681808895, 281474976710655, 281479271675904, 562675075514368, 562949953355776, - 844424930131968, 985162418487296, 1023920203366400, 2251799813685248, 3377699721314304, - 4494803534348292, 6755399441055744, 7881299349733376, 8444256867844096, 8725724278030336, - 8760633780600833, 8989057312882695, 9042383626829823, 9851624185018758, 18067175067615234, - 28848986089586688, 30958948903026688, 35747322042253312, 53805701016846336, - 58529202969772032, 189151184399892480, 220713756545974272, 466122561432846339, - 504262420777140224, 558446353793941504, 572520102629474304, 1009933895770046464, - 1152921504606846982, 1152921504606851080, 1441151880758558727, 1724878657282899983, - 2301902359539744768, 2305843009196908767, 2305843009213693952, 2310337812748042240, - 3731232291276455943, 4589168020290535424, 4609293481125347328, 4611686018427387908, - 4611686069975392286, 4671217976001691648, 5764607523034234882, 6341068275337658371, - 6341349750314369024, 7421334051581067264, 8788774672813524990, 9205357638345293827, - 
9222809086901354496, 9223372036854775808, 9223372036854775935, 9224497932466651184, - 9727775195120332910, 10376293541461622786, 11526998316797657088, 11959590285459062784, - 12103423998558208000, 12699165786766311424, 13005832773892571136, 13798747783286489088, - 13835058055282032640, 13835058055282163729, 13951307220663664640, 14987979559889010690, - 17872468738205286400, 17906312118425092095, 18158513697557839871, 18158513749097456062, - 18374686479671623680, 18374686479671623682, 18446462598732840960, 18446462598732972032, - 18446744056529158144, 18446744069414584320, 18446744073709551615, + static BITSET_MAPPING: [(u8, u8); 42] = [ + (0, 134), (0, 135), (0, 137), (0, 140), (0, 146), (0, 149), (0, 164), (0, 166), (0, 170), + (0, 171), (0, 185), (0, 131), (0, 133), (1, 38), (1, 42), (1, 43), (1, 50), (1, 56), + (1, 61), (2, 19), (2, 28), (2, 42), (2, 46), (3, 26), (3, 32), (3, 33), (3, 42), (4, 15), + (4, 46), (4, 7), (5, 152), (5, 173), (5, 181), (6, 19), (6, 20), (6, 32), (7, 128), + (8, 128), (9, 57), (10, 58), (11, 30), (12, 23), ]; pub fn lookup(c: char) -> bool { @@ -372,57 +751,106 @@ pub mod grapheme_extend { &BITSET_CHUNKS_MAP, BITSET_LAST_CHUNK_MAP, &BITSET_INDEX_CHUNKS, - &BITSET, + &BITSET_CANONICAL, + &BITSET_MAPPING, ) } } #[rustfmt::skip] pub mod lowercase { - static BITSET_LAST_CHUNK_MAP: (u16, u8) = (122, 6); + static BITSET_LAST_CHUNK_MAP: (u16, u8) = (122, 10); static BITSET_CHUNKS_MAP: [u8; 118] = [ - 12, 16, 0, 0, 10, 0, 0, 11, 13, 8, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 2, 1, 0, 17, 0, 9, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, + 5, 1, 16, 16, 8, 16, 16, 6, 4, 9, 16, 0, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 12, 13, 16, 16, 16, 
16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 14, 15, 16, 2, 16, 7, 16, 16, + 17, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 11, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 3, ]; static BITSET_INDEX_CHUNKS: [[u8; 16]; 18] = [ - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 62, 71, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 9, 0, 50, 42, 44, 28], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 69, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 68, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 53, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 35], - [0, 0, 3, 0, 71, 71, 71, 0, 46, 46, 48, 46, 24, 37, 38, 23], - [0, 29, 27, 57, 39, 51, 52, 43, 41, 70, 26, 11, 0, 34, 64, 32], - [0, 40, 8, 0, 33, 60, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [22, 13, 54, 66, 25, 15, 56, 63, 30, 19, 12, 55, 58, 61, 65, 4], - [59, 36, 46, 21, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [59, 49, 45, 47, 18, 69, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [67, 5, 0, 31, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [4, 31, 40, 19, 16, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66], + [4, 42, 69, 41, 18, 3, 10, 66, 66, 66, 66, 66, 66, 66, 66, 66], + [55, 68, 66, 6, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66], + [62, 15, 47, 54, 22, 60, 49, 0, 26, 61, 70, 48, 64, 65, 1, 11], + [66, 35, 71, 66, 28, 51, 9, 66, 66, 66, 66, 66, 66, 66, 66, 66], + [66, 63, 24, 50, 34, 44, 45, 38, 36, 57, 23, 14, 66, 29, 53, 27], + [66, 66, 10, 66, 2, 2, 2, 66, 40, 40, 5, 40, 21, 32, 33, 20], + [66, 66, 66, 7, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66], + [66, 66, 66, 46, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 30], + [66, 66, 66, 59, 66, 66, 66, 66, 66, 66, 66, 
66, 66, 66, 66, 66], + [66, 66, 66, 66, 56, 8, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66], + [66, 66, 66, 66, 66, 66, 66, 66, 66, 3, 66, 66, 66, 66, 66, 66], + [66, 66, 66, 66, 66, 66, 66, 66, 66, 17, 13, 66, 43, 37, 39, 25], + [66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 4, 52, 2, 66], + [66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 12, 66, 66, 66], + [66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 58, 66, 66], + [66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66], + [66, 66, 66, 67, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66], ]; - static BITSET: [u64; 72] = [ - 0, 15, 16, 511, 3063, 65535, 16253055, 134217726, 536805376, 984263338, 4294967295, - 133143986179, 274877905920, 1099509514240, 4398046445568, 17592185782272, 36009005809663, - 46912496118442, 187649984473770, 281474972516352, 2251799813685247, 2339875276368554, - 4503599560261632, 61925590106570972, 71777214282006783, 72057592964186127, - 144115188074807295, 297241973452963840, 522417556774978824, 576460743713488896, - 1152921487426978047, 1152921504590069760, 1814856824841797631, 3607524039012697088, - 4362299189061746720, 4539628424389459968, 4601013482110844927, 4611405638684049471, - 4674456033467236607, 6172933889249159850, 9223934986808197120, 10663022717737544362, - 10808545280696953514, 12261519110656315968, 12294970652241842346, 12297829382473033730, - 12297829382473034410, 12297829382473045332, 12297829382829550250, 12297829383904690175, - 12298110845996498944, 15324248332066007893, 16596095761559859497, 16717361816799215616, - 16987577794709946364, 17293822586148356092, 18158513701852807104, 18410715274543104000, - 18428729675466407935, 18446462598732840960, 18446462598732858304, 18446462598737002495, - 18446464797621878783, 18446673704966422527, 18446726481523572736, 18446739675663105535, - 18446739675663106031, 18446742974197923840, 18446744056529682432, 18446744069414584320, - 18446744073709529733, 18446744073709551615, + static BITSET_CANONICAL: [u64; 58] = [ + 
0b1111111111111111110000000000000000000000000011111111111111111111, + 0b1111111111111111111111000000000000000000000000001111110111111111, + 0b1111111111111111111111111111111111111111111111111111111111111111, + 0b1111111111111111111111111111111100000000000000000000000000000000, + 0b1111111111111111000000000000000000000000000000000000000000000000, + 0b1010101010101010101010101010101010111111111010101010101010101010, + 0b0000111111111111111111111111111111111111000000000000000000000000, + 0b0000000000000111111111111111111111111111111111111111111111111111, + 0b0000000000000000000000000000000000000000000000000000000000001111, + 0b0000000000000000000000000000000000000000000000000000000000010000, + 0b0000000000000000000000000000000000000000000000000000000111111111, + 0b0000000000000000000000000000000000000000000000000000101111110111, + 0b0000000000000000000000000000000000000000111110000000000001111111, + 0b0000000000000000000000000000000000111010101010101010101010101010, + 0b0000000000000000000000000001111100000000000000000000000000000011, + 0b0000000000000000000000001111111111111111110111111100000000000000, + 0b0000000000000000001000001011111111111111111111111111111111111111, + 0b0000000000000000001010101010101010101010101010101010101010101010, + 0b0000000000000000101010101010101010101010101010101010101010101010, + 0b0000000000001000010100000001101010101010101010101010101010101010, + 0b0000000011011100000000001111111100000000110011110000000011011100, + 0b0000000011111111000000001111111100000000001111110000000011111111, + 0b0000000011111111111111111111111111000000000000000000000000001111, + 0b0000000111111111111111111111111111111111111011111111111111111111, + 0b0000010000100000000001000000000000000000000000000000000000000000, + 0b0000011101000000000000000000000000000000000000000000010100001000, + 0b0000111111111111111111111111110000000000000000000000000011111111, + 0b0001100100101111101010101010101010101010111000110111111111111111, + 
0b0011001000010000100000000000000000000000000010001100010000000000, + 0b0011110010001010000000000000000000000000000000000000000000100000, + 0b0011111100000000000000000000000000000000000000000000000000000000, + 0b0011111111011010000101010110001001111111111111111111111111111111, + 0b0011111111111111000000001111111100000000111111110000000000111111, + 0b0100000011011111000000001111111100000000111111110000000011111111, + 0b0101010110101010101010101010101010101010101010101010101010101010, + 0b1000000000000010000000000000000000000000000000000000000000000000, + 0b1001001111111010101010101010101010101010101010101010101010101010, + 0b1001010111111111101010101010101010101010101010101010101010101010, + 0b1010101000101001101010101010101010110101010101010101001001000000, + 0b1010101010100000100000101010101010101010101110100101000010101010, + 0b1010101010101010101010101010101010101010101010101010101010101010, + 0b1010101010101010101010101010101010101010101010101101010101010100, + 0b1010101010101010101010101010101011111111111111111111111111111111, + 0b1010101010101011101010101010100000000000000000000000000000000000, + 0b1101010010101010101010101010101010101010101010101010101101010101, + 0b1110011001010001001011010010101001001110001001000011000100101001, + 0b1110011111111111111111111111111111111111111111110000000000000000, + 0b1110101111000000000000000000000000001111111111111111111111111100, + 0b1111000000000000000000000000001111110111111111111111111111111100, + 0b1111110000000000000000000000000011111111111111111111111111000000, + 0b1111111101111111111111111111111110000000000000000000000000000000, + 0b1111111111111111000000000000000000000000000000000100001111000000, + 0b1111111111111111000000011111111111110111111111111111111111111111, + 0b1111111111111111111100000000000000000000000000010000000000000000, + 0b1111111111111111111111000000000000000000000000001111111111101111, + 0b1111111111111111111111110000000000000000000000000000000000000000, + 
0b1111111111111111111111111111110000000000000000000000000000000000, + 0b1111111111111111111111111111111111111111111111111010101010000101, + ]; + static BITSET_MAPPING: [(u8, u8); 14] = [ + (0, 173), (0, 188), (0, 190), (0, 130), (0, 134), (0, 141), (1, 12), (1, 6), (2, 128), + (3, 32), (4, 16), (5, 173), (6, 142), (7, 157), ]; pub fn lookup(c: char) -> bool { @@ -431,58 +859,108 @@ pub mod lowercase { &BITSET_CHUNKS_MAP, BITSET_LAST_CHUNK_MAP, &BITSET_INDEX_CHUNKS, - &BITSET, + &BITSET_CANONICAL, + &BITSET_MAPPING, ) } } #[rustfmt::skip] pub mod n { - static BITSET_LAST_CHUNK_MAP: (u16, u8) = (253, 2); + static BITSET_LAST_CHUNK_MAP: (u16, u8) = (253, 40); static BITSET_CHUNKS_MAP: [u8; 249] = [ - 44, 0, 0, 29, 5, 31, 35, 26, 22, 6, 0, 12, 40, 20, 27, 0, 33, 0, 39, 7, 0, 0, 17, 0, 45, - 42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 41, 43, - 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 10, 16, 21, 0, 37, 34, 18, 36, 32, 15, 25, 24, 13, 0, - 30, 1, 0, 0, 46, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 28, 0, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 14, 0, 3, 0, 0, 0, 0, 4, 15, 0, 0, 11, 0, 38, 0, 8, + 5, 41, 41, 21, 37, 23, 11, 18, 16, 35, 41, 27, 2, 45, 46, 41, 9, 41, 15, 34, 41, 41, 29, + 41, 1, 3, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, + 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, + 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 4, 6, 20, 41, 41, 41, 41, 41, 41, + 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, + 41, 41, 41, 41, 41, 41, 
41, 41, 41, 41, 41, 41, 33, 32, 28, 25, 41, 13, 10, 26, 12, 8, 30, + 19, 17, 43, 41, 7, 38, 41, 41, 0, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, + 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 22, 41, 24, + 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, + 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, + 41, 41, 41, 31, 41, 39, 41, 41, 41, 41, 36, 30, 41, 41, 44, 41, 14, 41, 42, ]; static BITSET_INDEX_CHUNKS: [[u8; 8]; 47] = [ - [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 11], [0, 0, 0, 0, 0, 0, 0, 47], - [0, 0, 0, 0, 0, 0, 0, 72], [0, 0, 0, 0, 0, 2, 0, 0], [0, 0, 0, 0, 0, 31, 0, 45], - [0, 0, 0, 0, 0, 53, 0, 0], [0, 0, 0, 0, 0, 65, 9, 0], [0, 0, 0, 0, 6, 0, 0, 0], - [0, 0, 0, 0, 15, 0, 0, 0], [0, 0, 0, 0, 37, 44, 4, 0], [0, 0, 0, 7, 0, 15, 0, 0], - [0, 0, 0, 33, 0, 0, 0, 49], [0, 0, 0, 35, 0, 15, 0, 0], [0, 0, 0, 36, 0, 43, 0, 0], - [0, 0, 0, 47, 0, 0, 0, 0], [0, 0, 0, 52, 23, 3, 0, 13], [0, 0, 0, 54, 0, 0, 0, 0], - [0, 0, 0, 62, 47, 0, 0, 0], [0, 0, 14, 0, 0, 0, 0, 0], [0, 0, 16, 0, 0, 15, 47, 0], - [0, 0, 25, 0, 0, 0, 0, 0], [0, 2, 15, 0, 0, 0, 0, 0], [0, 15, 0, 0, 0, 0, 0, 47], - [0, 15, 0, 2, 51, 0, 0, 0], [0, 15, 0, 15, 0, 0, 0, 0], [0, 15, 0, 15, 36, 0, 0, 0], - [0, 16, 0, 0, 0, 0, 0, 0], [0, 25, 0, 0, 0, 22, 0, 0], [0, 25, 0, 47, 0, 0, 0, 2], - [0, 26, 0, 0, 0, 15, 25, 0], [0, 31, 0, 31, 0, 41, 0, 34], [0, 32, 0, 47, 65, 0, 0, 39], - [0, 46, 2, 0, 0, 71, 1, 0], [0, 57, 20, 28, 0, 64, 29, 0], [0, 59, 0, 31, 0, 42, 0, 31], - [0, 60, 0, 0, 24, 10, 0, 5], [0, 63, 30, 61, 18, 0, 55, 70], [0, 66, 38, 0, 56, 0, 0, 0], - [0, 69, 19, 68, 0, 0, 0, 0], [15, 0, 0, 0, 0, 8, 0, 17], [25, 0, 0, 31, 0, 0, 0, 0], - [25, 21, 67, 0, 0, 0, 0, 0], [40, 0, 0, 15, 2, 0, 0, 48], [47, 0, 58, 0, 0, 0, 0, 0], - [50, 0, 0, 0, 0, 0, 12, 0], [73, 27, 0, 0, 0, 0, 0, 0], + [7, 23, 67, 67, 67, 67, 67, 67], [32, 67, 67, 67, 67, 67, 66, 67], + [50, 67, 
67, 67, 67, 49, 67, 18], [52, 20, 8, 67, 67, 67, 67, 67], + [52, 67, 67, 53, 67, 67, 67, 67], [54, 67, 38, 67, 67, 67, 67, 67], + [58, 67, 67, 50, 48, 67, 67, 31], [67, 22, 67, 67, 67, 50, 52, 67], + [67, 24, 67, 54, 0, 67, 67, 26], [67, 30, 48, 67, 67, 46, 14, 67], + [67, 37, 72, 60, 67, 42, 64, 67], [67, 39, 67, 53, 67, 28, 67, 53], + [67, 40, 67, 67, 51, 65, 67, 63], [67, 41, 13, 3, 57, 67, 56, 1], + [67, 43, 25, 67, 36, 67, 67, 67], [67, 45, 19, 44, 67, 67, 67, 67], + [67, 48, 50, 67, 67, 67, 67, 67], [67, 50, 67, 48, 33, 67, 67, 67], + [67, 50, 67, 50, 62, 67, 67, 67], [67, 50, 67, 50, 67, 67, 67, 67], + [67, 50, 67, 67, 67, 67, 67, 54], [67, 52, 67, 54, 67, 67, 67, 48], + [67, 52, 67, 67, 67, 21, 67, 67], [67, 53, 67, 53, 67, 27, 67, 11], + [67, 67, 17, 67, 67, 67, 67, 67], [67, 67, 52, 67, 67, 67, 67, 67], + [67, 67, 67, 2, 54, 67, 67, 67], [67, 67, 67, 12, 67, 67, 67, 9], + [67, 67, 67, 34, 6, 15, 67, 59], [67, 67, 67, 35, 67, 67, 67, 67], + [67, 67, 67, 54, 67, 67, 67, 67], [67, 67, 67, 62, 67, 68, 67, 67], + [67, 67, 67, 67, 10, 5, 55, 67], [67, 67, 67, 67, 50, 67, 67, 67], + [67, 67, 67, 67, 67, 0, 61, 67], [67, 67, 67, 67, 67, 4, 67, 67], + [67, 67, 67, 67, 67, 48, 67, 67], [67, 67, 67, 67, 67, 53, 67, 29], + [67, 67, 67, 67, 67, 67, 67, 16], [67, 67, 67, 67, 67, 67, 67, 47], + [67, 67, 67, 67, 67, 67, 67, 54], [67, 67, 67, 67, 67, 67, 67, 67], + [67, 67, 67, 67, 71, 67, 67, 67], [67, 67, 67, 70, 67, 50, 67, 67], + [67, 67, 67, 73, 67, 50, 67, 67], [67, 67, 69, 67, 67, 50, 54, 67], + [67, 69, 67, 67, 67, 67, 67, 67], + ]; + static BITSET_CANONICAL: [u64; 48] = [ + 0b1111111111000000000000000000000000000000000000000000000000000000, + 0b1111111111111111111111111111111111111111111111001111111111111111, + 0b1111110000000000000000000000000000000000000000000000000000000000, + 0b1111100000000000000000000000000000000000000000000000000000000000, + 0b0001111111111111111111100000000000000000000000000000000000000000, + 
0b0000000111111111111111111111111111111111111111111111111111111111, + 0b0000000000000000000000000000111100000000000000000000000000000000, + 0b1111111111111111111111111111111111111111111111111111111111111111, + 0b1111111111111110000000000000000000000000000000000000001111111111, + 0b0000001111111111000000111111111100000000000000000000000000000000, + 0b0000000000001111111111111111111111111111111111111111111110000000, + 0b0000000000000111111111111100000000000000000000000000000000000000, + 0b0000000000000001110000000000000000000000000000000000000000000000, + 0b0000000000000000111111111000000000000000000000000000000000000000, + 0b0000000000000000000000000000000000000000000000000000001111100111, + 0b0000000000000000000000000000000000000000000000000000010000000010, + 0b0000000000000000000000000000000000000000000111111111111111111111, + 0b0000000000000000000000000000000000000000011111111111111111111111, + 0b0000000000000000000000000000000000000111111111110000000000000000, + 0b0000000000000000000000000000000000001111111111111111111111111111, + 0b0000000000000000000000000000000011111111111111101111111100000000, + 0b0000000000000000000000000000001111111011111111110000000000000000, + 0b0000000000000000000111111111111111111111111111110000000000000000, + 0b0000000000000000011111111111111111111111111111111111111111111111, + 0b0000000000000000111111111111111111111111111111000000000000000000, + 0b0000000000011110111011111111111111111111111111111111111111111111, + 0b0000000000011111111111111111111000000011111111110000000000000000, + 0b0000000011111100111111111100000000000000000000000000000000000000, + 0b0000000111111111111111111100000001111111000000000000000000000000, + 0b0000001111110000111111111100000000000000000000000000000000000000, + 0b0000001111110001000000000000000000000000000000000000000000000000, + 0b0000001111111111000000000000000000000011111111110000000000000000, + 0b0000011100000000000000111111111000000000000000000000000010000000, + 
0b0000111111111111000000000000000000000000000000000000000000000000, + 0b0000111111111111111111111111111000000000000000000000000000000000, + 0b0010000000000000000000000000000000000000000000000000000000000000, + 0b0011111111111111101111111111111111111111111111111111111111111110, + 0b0110000000000000000000000000000000000000000000000000000111111111, + 0b0111001000001100000000000000000000000000000000000000000000000000, + 0b0111111100000000111111111100000000000000000000000000000000000000, + 0b0111111111111111111111111111111100000000000000000000000000000000, + 0b1111111000000000000000000000000011111111000000000000000000000000, + 0b1111111100000000000000000000000011111111000000000000000000000000, + 0b1111111111111110000000000000000000000000000000000000000000000000, + 0b1111111111111111111111000000000000000000000000000000000000000000, + 0b1111111111111111111111111111111100000000000000000000000000000000, + 0b1111111111111111111111111111111111111111111111110000000000000000, + 0b1111111111111111111111111111111111111111111111111100000000000000, ]; - static BITSET: [u64; 74] = [ - 0, 999, 1023, 1026, 3072, 4064, 8191, 65408, 65472, 1048575, 1966080, 2097151, 3932160, - 4063232, 8388607, 67043328, 67044351, 134152192, 264241152, 268435455, 3758096384, - 4294901504, 17112694784, 64424509440, 549218942976, 4393751543808, 35184372023296, - 140737488355327, 272678883688448, 279275953455104, 280925220896768, 281200098803712, - 281474976448512, 492581209243648, 2251524935778304, 2251795518717952, 4503595332403200, - 4503599627370368, 8708132091985919, 9007190731849728, 17732923532771328, 71212894229889024, - 144114915328655360, 144115183780888576, 144115188075855871, 284007976623144960, - 284008251501051904, 287948901175001088, 287948901242044416, 287953294926544896, - 504407547722072192, 1152640029630136320, 1152921496016912384, 2305840810190438400, - 2305843009213693952, 3458764513820540928, 4611615649683210238, 6917529027641082367, - 8217943420044312576, 9151595642915651584, 
9223372032559808512, 17870283321406128128, - 18158513697557839872, 18302628889911885824, 18374686483949813760, 18428729675200069632, - 18446181123756130304, 18446181123756131327, 18446739675663040512, 18446744069414584320, - 18446744073709355007, 18446744073709486080, 18446744073709535232, 18446744073709551615, + static BITSET_MAPPING: [(u8, u8); 26] = [ + (0, 10), (0, 16), (0, 26), (0, 39), (0, 42), (0, 48), (0, 58), (1, 186), (1, 172), (2, 28), + (2, 54), (3, 22), (3, 48), (4, 23), (4, 55), (5, 140), (5, 176), (6, 49), (6, 50), (7, 128), + (8, 47), (9, 32), (10, 172), (11, 26), (12, 47), (13, 32), ]; pub fn lookup(c: char) -> bool { @@ -491,55 +969,97 @@ pub mod n { &BITSET_CHUNKS_MAP, BITSET_LAST_CHUNK_MAP, &BITSET_INDEX_CHUNKS, - &BITSET, + &BITSET_CANONICAL, + &BITSET_MAPPING, ) } } #[rustfmt::skip] pub mod uppercase { - static BITSET_LAST_CHUNK_MAP: (u16, u8) = (124, 6); + static BITSET_LAST_CHUNK_MAP: (u16, u8) = (124, 3); static BITSET_CHUNKS_MAP: [u8; 123] = [ - 12, 15, 0, 0, 11, 0, 0, 8, 5, 9, 0, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 1, 0, 13, 0, 7, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, - 0, 0, 4, + 12, 16, 4, 4, 2, 4, 4, 11, 8, 0, 4, 14, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 6, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 5, 4, 13, 4, 10, 4, 4, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 7, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 15, 4, 4, + 4, 4, 9, ]; static BITSET_INDEX_CHUNKS: [[u8; 16]; 17] = [ - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 33, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 18, 
9, 0, 38, 46, 44, 28], - [0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 51, 23, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 60, 62, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 54, 0, 0, 0, 0, 0, 43, 43, 40, 43, 56, 22, 34, 35], - [0, 0, 57, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 66, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 66, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 66, 30], - [0, 10, 0, 11, 50, 37, 36, 45, 47, 5, 0, 0, 0, 49, 17, 53], - [14, 0, 60, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [21, 52, 43, 25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [24, 39, 42, 41, 59, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [58, 65, 29, 16, 48, 63, 31, 19, 55, 61, 64, 32, 27, 20, 15, 3], + [8, 8, 2, 57, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8], + [8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8], + [8, 8, 4, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 60], + [8, 8, 8, 8, 1, 49, 59, 8, 8, 8, 8, 8, 8, 8, 8, 8], + [8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8], + [8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 54, 8, 8, 8], + [8, 8, 8, 8, 8, 8, 8, 8, 8, 17, 13, 8, 31, 37, 35, 23], + [8, 8, 8, 8, 8, 8, 8, 8, 8, 63, 8, 8, 8, 8, 8, 8], + [8, 8, 8, 8, 42, 20, 66, 8, 8, 8, 8, 8, 8, 8, 8, 8], + [8, 8, 8, 8, 64, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8], + [8, 8, 22, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8], + [8, 8, 45, 8, 8, 8, 8, 8, 34, 34, 65, 34, 47, 19, 27, 28], + [8, 51, 8, 14, 41, 30, 29, 36, 38, 10, 8, 8, 8, 40, 16, 44], + [15, 8, 1, 11, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8], + [18, 43, 34, 21, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8], + [55, 0, 24, 52, 39, 50, 25, 53, 46, 56, 5, 26, 3, 62, 61, 7], + [58, 32, 6, 33, 48, 12, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8], ]; - static BITSET: [u64; 67] = [ - 0, 8, 1023, 1024, 8383, 21882, 65535, 1048575, 8388607, 89478485, 134217726, 2139095039, - 4294967295, 17179869183, 1099511627775, 2199023190016, 4398046445568, 17575006099264, - 23456248059221, 70368743129088, 140737484161024, 140737488355327, 280378317225728, - 
281470681743392, 281474976710655, 1169903278445909, 2251799813685247, 9007198986305536, - 9007199254741748, 17977448100528131, 18014398509481983, 288230371856744511, - 576460735123554305, 576460743713488896, 1080863910568919040, 1080897995681042176, - 1274187559846268630, 3122495741643543722, 6148633210533183488, 6148914689804861440, - 6148914690880001365, 6148914691236506283, 6148914691236516865, 6148914691236517205, - 6151773421467674709, 6184099063146390672, 7638198793012598101, 7783721355972007253, - 8863084067199903664, 9242793810247811072, 12273810184460391765, 13839347594782259332, - 13845730589451223040, 16613872850358272000, 16717361816799215616, 17293822586282573568, - 18374966856193736448, 18428729675200069632, 18442240474149289983, 18446274948748367189, - 18446462598732840960, 18446462598737035263, 18446466996779287551, 18446726481523637343, - 18446742974197924863, 18446742974197940223, 18446744069414584320, + static BITSET_CANONICAL: [u64; 51] = [ + 0b1111111111111111111111110000000000000000000000000011111111111111, + 0b1111111111111111000000000000000000000000000000000000000000000000, + 0b1111111111000000000000000000000000000000000000000000000000000000, + 0b0000000000011111111111111111111111110000000000000000000000000000, + 0b1111111111111111111111111111111100000000000000000000000000000000, + 0b1111111111111111111111110000000000000000000000000000001111111111, + 0b0101010101010101010101010101010101010101010101010101010000000001, + 0b0000000000000000000000000000000000000000000000000000010000000000, + 0b0000000000000000000000000000000000000000000000000000000000000000, + 0b0000000000000000000000000000000000000000000000000010000010111111, + 0b0000000000000000000000000000000000000000000000000101010101111010, + 0b0000000000000000000000000000000000000000000011111111111111111111, + 0b0000000000000000000000000000000000000000011111111111111111111111, + 0b0000000000000000000000000000000000000101010101010101010101010101, + 
0b0000000000000000000000000000000001111111011111111111111111111111, + 0b0000000000000000000000001111111111111111111111111111111111111111, + 0b0000000000000000000011111111101111111111111111101101011101000000, + 0b0000000000000000000101010101010101010101010101010101010101010101, + 0b0000000000000000011111111111111111111111111111111111111111111111, + 0b0000000000000000111111110000000010101010000000000011111100000000, + 0b0000000000000000111111111111111100000000000000000000000000100000, + 0b0000000000000100001010000000010101010101010101010101010101010101, + 0b0000000000000111111111111111111111111111111111111111111111111111, + 0b0000000000100000000000000000000000000000000000000000001011110100, + 0b0000000000111111110111100110010011010000000000000000000000000011, + 0b0000001111111111111111111111111100000000000000000000000000111111, + 0b0000011111111111111111111111110000000000000000000000000000000001, + 0b0000111100000000000000000000000000000000000000000000000000000000, + 0b0000111100000000000111110000000000001111000000000000111100000000, + 0b0001000110101110110100101101010110110001110110111100111011010110, + 0b0010101101010101010101010101010101010101010101010101010010101010, + 0b0101010101010100010101010101010000000000000000000000000000000000, + 0b0101010101010101010101010101010100000000000000000000000000000000, + 0b0101010101010101010101010101010101010101010101010010101010101011, + 0b0101010101010101010101010101010101010101010101010101010101010101, + 0b0101010101011111011111010101010101010101010001010010100001010101, + 0b0101010111010010010101010101010101001010101010101010010010010000, + 0b0110101000000000010101010101010101010101010101010101010101010101, + 0b0110110000000101010101010101010101010101010101010101010101010101, + 0b0111101100000000000000000000000000011111110111111110011110110000, + 0b1000000001000101000000000000000000000000000000000000000000000000, + 0b1010101001010101010101010101010101010101010101010101010101010101, + 
0b1100000000001111001111010101000000111110001001110011100010000100, + 0b1100000000100101111010101001110100000000000000000000000000000000, + 0b1110011010010000010101010101010101010101000111001000000000000000, + 0b1110011111111111111111111111111111111111111111110000000000000000, + 0b1111000000000000000000000000001111111111111111111111111100000000, + 0b1111111100000000111111110000000000111111000000001111111100000000, + 0b1111111111111110010101010101010101010101010101010101010101010101, + 0b1111111111111111000000111111111111111111111111110000001111111111, + 0b1111111111111111111100000000000000000000000000011111110001011111, + ]; + static BITSET_MAPPING: [(u8, u8); 16] = [ + (0, 179), (0, 130), (0, 134), (0, 147), (0, 12), (0, 8), (1, 16), (1, 128), (2, 10), + (2, 128), (3, 52), (3, 58), (4, 32), (5, 24), (6, 20), (7, 57), ]; pub fn lookup(c: char) -> bool { @@ -548,22 +1068,30 @@ pub mod uppercase { &BITSET_CHUNKS_MAP, BITSET_LAST_CHUNK_MAP, &BITSET_INDEX_CHUNKS, - &BITSET, + &BITSET_CANONICAL, + &BITSET_MAPPING, ) } } #[rustfmt::skip] pub mod white_space { - static BITSET_LAST_CHUNK_MAP: (u16, u8) = (32, 2); + static BITSET_LAST_CHUNK_MAP: (u16, u8) = (32, 3); static BITSET_CHUNKS_MAP: [u8; 22] = [ - 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, + 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 1, ]; static BITSET_INDEX_CHUNKS: [[u8; 6]; 4] = [ - [0, 0, 0, 0, 0, 0], [0, 0, 5, 2, 0, 0], [1, 0, 0, 0, 0, 0], [4, 0, 3, 0, 0, 0], + [1, 1, 1, 1, 1, 1], [1, 1, 4, 0, 1, 1], [3, 1, 2, 1, 1, 1], [5, 1, 1, 1, 1, 1], + ]; + static BITSET_CANONICAL: [u64; 5] = [ + 0b0000000000000000000000000000000010000000000000000000000000000000, + 0b0000000000000000000000000000000000000000000000000000000000000000, + 0b0000000000000000000000000000000100000000000000000000000000100000, + 0b0000000000000000000000000000000100000000000000000011111000000000, + 0b0000000000000000100000110000000000000000000000000000011111111111, ]; - static BITSET: [u64; 6] = [ - 0, 1, 
2147483648, 4294967328, 4294983168, 144036023240703, + static BITSET_MAPPING: [(u8, u8); 1] = [ + (0, 33), ]; pub fn lookup(c: char) -> bool { @@ -572,7 +1100,8 @@ pub mod white_space { &BITSET_CHUNKS_MAP, BITSET_LAST_CHUNK_MAP, &BITSET_INDEX_CHUNKS, - &BITSET, + &BITSET_CANONICAL, + &BITSET_MAPPING, ) } } diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index 39c288dfc61e8..5e8865fc9e3b5 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -254,12 +254,19 @@ fn generate_tests(data_path: &str, ranges: &[(&str, Vec>)]) -> String s.push_str( " #[inline(always)] -fn range_search( +fn range_search< + const N: usize, + const CHUNK_SIZE: usize, + const N1: usize, + const CANONICAL: usize, + const CANONICALIZED: usize, +>( needle: u32, chunk_idx_map: &[u8; N], (last_chunk_idx, last_chunk_mapping): (u16, u8), bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1], - bitset: &[u64; N2], + bitset_canonical: &[u64; CANONICAL], + bitset_canonicalized: &[(u8, u8); CANONICALIZED], ) -> bool { let bucket_idx = (needle / 64) as usize; let chunk_map_idx = bucket_idx / CHUNK_SIZE; @@ -273,8 +280,21 @@ fn range_search [u8; 16]). +//! further group these by some constant N (between 1 and 64 per group), and +//! again deduplicate and store in an array (u8 -> [u8; N]). The constant is +//! chosen to be optimal in bytes-in-memory for the given dataset. //! //! The indices into this array represent ranges of 64*16 = 1024 codepoints. //! @@ -37,9 +38,9 @@ //! down considerably. 
use crate::fmt_list; -use std::collections::{BTreeSet, HashMap}; +use std::collections::{BTreeMap, BTreeSet, HashMap}; use std::convert::TryFrom; -use std::fmt::Write; +use std::fmt::{self, Write}; use std::ops::Range; #[derive(Clone)] @@ -61,6 +62,10 @@ impl RawEmitter { } fn emit_bitset(&mut self, words: &[u64]) { + let mut words = words.to_vec(); + // Ensure that there's a zero word in the dataset, used for padding and + // such. + words.push(0); let unique_words = words.iter().cloned().collect::>().into_iter().collect::>(); if unique_words.len() > u8::max_value() as usize { @@ -68,13 +73,9 @@ impl RawEmitter { } // needed for the chunk mapping to work assert_eq!(unique_words[0], 0, "has a zero word"); + let canonicalized = Canonicalized::canonicalize(&unique_words); - let word_indices = unique_words - .iter() - .cloned() - .enumerate() - .map(|(idx, word)| (word, u8::try_from(idx).unwrap())) - .collect::>(); + let word_indices = canonicalized.unique_mapping.clone(); let compressed_words = words.iter().map(|w| word_indices[w]).collect::>(); let mut best = None; @@ -91,14 +92,32 @@ impl RawEmitter { } self.emit_chunk_map(word_indices[&0], &compressed_words, best.unwrap().0); + struct Bits(u64); + impl fmt::Debug for Bits { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "0b{:064b}", self.0) + } + } + + writeln!( + &mut self.file, + "static BITSET_CANONICAL: [u64; {}] = [{}];", + canonicalized.canonical_words.len(), + fmt_list(canonicalized.canonical_words.iter().map(|v| Bits(*v))), + ) + .unwrap(); + self.bytes_used += 8 * canonicalized.canonical_words.len(); writeln!( &mut self.file, - "static BITSET: [u64; {}] = [{}];", - unique_words.len(), - fmt_list(&unique_words), + "static BITSET_MAPPING: [(u8, u8); {}] = [{}];", + canonicalized.canonicalized_words.len(), + fmt_list(&canonicalized.canonicalized_words), ) .unwrap(); - self.bytes_used += 8 * unique_words.len(); + // 8 bit index into shifted words, 7 bits for shift + optional flip + 
// We only need it for the words that we removed by applying a shift and + // flip to them. + self.bytes_used += 2 * canonicalized.canonicalized_words.len(); } fn emit_chunk_map(&mut self, zero_at: u8, compressed_words: &[u8], chunk_length: usize) { @@ -170,7 +189,8 @@ impl RawEmitter { writeln!(&mut self.file, " &BITSET_CHUNKS_MAP,").unwrap(); writeln!(&mut self.file, " BITSET_LAST_CHUNK_MAP,").unwrap(); writeln!(&mut self.file, " &BITSET_INDEX_CHUNKS,").unwrap(); - writeln!(&mut self.file, " &BITSET,").unwrap(); + writeln!(&mut self.file, " &BITSET_CANONICAL,").unwrap(); + writeln!(&mut self.file, " &BITSET_MAPPING,").unwrap(); writeln!(&mut self.file, " )").unwrap(); writeln!(&mut self.file, "}}").unwrap(); } @@ -196,3 +216,193 @@ pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range]) { emitter.blank_line(); emitter.emit_lookup(); } + +struct Canonicalized { + canonical_words: Vec, + canonicalized_words: Vec<(u8, u8)>, + + /// Maps an input unique word to the associated index (u8) which is into + /// canonical_words or canonicalized_words (in order). 
+ unique_mapping: HashMap, +} + +impl Canonicalized { + fn canonicalize(unique_words: &[u64]) -> Self { + #[derive(Copy, Clone, Debug)] + enum Mapping { + Rotate(u32), + Invert, + RotateAndInvert(u32), + } + + // key is the word being mapped to + let mut mappings: BTreeMap> = BTreeMap::new(); + for &a in unique_words { + 'b: for &b in unique_words { + // skip self + if a == b { + continue; + } + + // All possible distinct rotations + for rotation in 1..64 { + if a.rotate_right(rotation) == b { + mappings.entry(b).or_default().push((a, Mapping::Rotate(rotation))); + // We're not interested in further mappings between a and b + continue 'b; + } + } + + if (!a) == b { + mappings.entry(b).or_default().push((a, Mapping::Invert)); + // We're not interested in further mappings between a and b + continue 'b; + } + + // All possible distinct rotations, inverted + for rotation in 1..64 { + if (!a.rotate_right(rotation)) == b { + mappings + .entry(b) + .or_default() + .push((a, Mapping::RotateAndInvert(rotation))); + // We're not interested in further mappings between a and b + continue 'b; + } + } + } + } + // These are the bitset words which will be represented "raw" (as a u64) + let mut canonical_words = Vec::new(); + // These are mapped words, which will be represented by an index into + // the canonical_words and a Mapping; u16 when encoded. + let mut canonicalized_words = Vec::new(); + let mut unique_mapping = HashMap::new(); + + #[derive(Debug, PartialEq, Eq)] + enum UniqueMapping { + Canonical(usize), + Canonicalized(usize), + } + + while let Some((&to, _)) = mappings.iter().max_by_key(|m| m.1.len()) { + // Get the mapping with the most entries. Currently, no mapping can + // only exist transitively (i.e., there is no A, B, C such that A + // does not map to C and but A maps to B maps to C), so this is + // guaranteed to be acceptable. + // + // In the future, we may need a more sophisticated algorithm to + // identify which keys to prefer as canonical. 
+ let mapped_from = mappings.remove(&to).unwrap(); + for (from, how) in &mapped_from { + // Remove the entries which mapped to this one. + // Noting that it should be associated with the Nth canonical word. + // + // We do not assert that this is present, because there may be + // no mappings to the `from` word; that's fine. + mappings.remove(from); + assert_eq!( + unique_mapping + .insert(*from, UniqueMapping::Canonicalized(canonicalized_words.len())), + None + ); + canonicalized_words.push((canonical_words.len(), *how)); + + // Remove the now-canonicalized word from other mappings, + // to ensure that we deprioritize them in the next iteration of + // the while loop. + for (_, mapped) in &mut mappings { + let mut i = 0; + while i != mapped.len() { + if mapped[i].0 == *from { + mapped.remove(i); + } else { + i += 1; + } + } + } + } + assert!( + unique_mapping + .insert(to, UniqueMapping::Canonical(canonical_words.len())) + .is_none() + ); + canonical_words.push(to); + + // Remove the now-canonical word from other mappings, to ensure that + // we deprioritize them in the next iteration of the while loop. + for (_, mapped) in &mut mappings { + let mut i = 0; + while i != mapped.len() { + if mapped[i].0 == to { + mapped.remove(i); + } else { + i += 1; + } + } + } + } + + // Any words which we couldn't shrink, just stick into the canonical + // words. + // + // FIXME: work harder -- there are more possibilities for mapping + // functions (e.g., multiplication, shifting instead of rotation, etc.) + // We'll probably always have some slack though so this loop will still + // be needed. 
+ for &w in unique_words { + if !unique_mapping.contains_key(&w) { + assert!( + unique_mapping + .insert(w, UniqueMapping::Canonical(canonical_words.len())) + .is_none() + ); + canonical_words.push(w); + } + } + assert_eq!(canonicalized_words.len() + canonical_words.len(), unique_words.len()); + assert_eq!(unique_mapping.len(), unique_words.len()); + + let unique_mapping = unique_mapping + .into_iter() + .map(|(key, value)| { + ( + key, + match value { + UniqueMapping::Canonicalized(idx) => { + u8::try_from(canonical_words.len() + idx).unwrap() + } + UniqueMapping::Canonical(idx) => u8::try_from(idx).unwrap(), + }, + ) + }) + .collect::>(); + + let mut distinct_indices = BTreeSet::new(); + for &w in unique_words { + let idx = unique_mapping.get(&w).unwrap(); + assert!(distinct_indices.insert(idx)); + } + + let canonicalized_words = canonicalized_words + .into_iter() + .map(|v| { + ( + u8::try_from(v.0).unwrap(), + match v.1 { + Mapping::RotateAndInvert(amount) => { + assert!(amount < (1 << 7)); + 1 << 7 | (amount as u8) + } + Mapping::Rotate(amount) => { + assert!(amount < (1 << 7)); + amount as u8 + } + Mapping::Invert => 1 << 7, + }, + ) + }) + .collect::>(); + Canonicalized { unique_mapping, canonical_words, canonicalized_words } + } +} From 7b29b70d6ea52e9324f9328bed9beb6cf516c1ce Mon Sep 17 00:00:00 2001 From: Mark Rousskov Date: Sat, 21 Mar 2020 12:11:47 -0400 Subject: [PATCH 06/14] Add a right shift mapping This saves less bytes - by far - and is likely not the best operator to choose. But for now, it works -- a better choice may arise later. 
Alphabetic : 2538 bytes (- 84 bytes) Case_Ignorable : 1773 bytes (- 30 bytes) Cased : 790 bytes (- 18 bytes) Cc : 26 bytes (- 6 bytes) Grapheme_Extend: 1490 bytes (- 18 bytes) Lowercase : 865 bytes (- 36 bytes) N : 1040 bytes (- 24 bytes) Uppercase : 778 bytes (- 60 bytes) White_Space : 85 bytes (- 6 bytes) Total table sizes: 9385 bytes (-282 bytes) --- src/libcore/unicode/mod.rs | 4 +- src/libcore/unicode/unicode_data.rs | 1603 ++++++++--------- src/tools/unicode-table-generator/src/main.rs | 13 +- .../src/raw_emitter.rs | 27 +- 4 files changed, 814 insertions(+), 833 deletions(-) diff --git a/src/libcore/unicode/mod.rs b/src/libcore/unicode/mod.rs index 2a41685a48096..39532166a0b66 100644 --- a/src/libcore/unicode/mod.rs +++ b/src/libcore/unicode/mod.rs @@ -66,12 +66,12 @@ fn range_search< } else { let (real_idx, mapping) = bitset_canonicalized[idx - CANONICAL]; let mut word = bitset_canonical[real_idx as usize]; - let should_invert = mapping & (1 << 7) != 0; + let should_invert = mapping & (1 << 6) != 0; if should_invert { word = !word; } // Unset the inversion bit - let rotate_by = mapping & !(1 << 7); + let rotate_by = mapping & !(1 << 6); word = word.rotate_left(rotate_by as u32); word }; diff --git a/src/libcore/unicode/unicode_data.rs b/src/libcore/unicode/unicode_data.rs index 555a0437f7bcb..bae6d8ea95365 100644 --- a/src/libcore/unicode/unicode_data.rs +++ b/src/libcore/unicode/unicode_data.rs @@ -5,261 +5,248 @@ pub const UNICODE_VERSION: (u32, u32, u32) = (13, 0, 0); #[rustfmt::skip] pub mod alphabetic { - static BITSET_LAST_CHUNK_MAP: (u16, u8) = (393, 11); + static BITSET_LAST_CHUNK_MAP: (u16, u8) = (393, 13); static BITSET_CHUNKS_MAP: [u8; 393] = [ - 73, 17, 1, 52, 39, 48, 49, 58, 50, 26, 0, 29, 51, 57, 60, 12, 63, 72, 66, 72, 72, 72, 55, - 53, 42, 72, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 6, 4, 4, 23, - 38, 36, 61, 4, 4, 4, 4, 
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 9, 72, 72, 72, - 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 70, 28, 14, 64, 47, 65, 3, 16, 46, 40, - 32, 67, 30, 43, 24, 54, 35, 45, 44, 68, 4, 10, 31, 72, 72, 72, 72, 72, 4, 4, 59, 72, 72, 72, - 72, 72, 72, 72, 4, 33, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 4, - 34, 72, 62, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 21, 72, 72, 72, 72, 72, 72, 72, 72, - 72, 72, 72, 72, 72, 72, 72, 72, 72, 15, 18, 72, 72, 72, 72, 25, 72, 72, 72, 72, 72, 72, 72, - 72, 72, 72, 72, 27, 22, 72, 72, 72, 72, 37, 69, 72, 72, 19, 72, 72, 41, 71, 72, 72, 72, 72, - 72, 72, 72, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 20, 4, 4, 4, 4, - 4, 4, 4, 13, 56, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 7, - 72, 72, 72, 72, 72, 72, 4, 74, 72, 72, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 61, 18, 2, 35, 46, 39, 38, 74, 37, 25, 70, 34, 36, 73, 66, 5, 52, 58, 54, 58, 58, 58, 69, + 64, 43, 58, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 10, 6, 6, 23, + 47, 49, 65, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 11, 58, 58, 58, + 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 57, 33, 17, 51, 40, 53, 4, 16, 41, 45, + 30, 55, 28, 42, 27, 0, 67, 71, 1, 56, 6, 12, 31, 58, 58, 58, 58, 58, 6, 6, 63, 58, 58, 58, + 58, 58, 58, 58, 6, 29, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 6, + 68, 58, 50, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 9, 6, 6, 20, 58, 58, 58, 58, 58, 58, 58, 58, + 58, 58, 58, 58, 58, 58, 58, 58, 58, 14, 22, 58, 58, 58, 58, 26, 58, 58, 58, 58, 58, 58, 58, + 58, 58, 58, 58, 32, 24, 58, 58, 58, 58, 48, 60, 58, 58, 19, 58, 58, 44, 59, 58, 58, 58, 
58, + 58, 58, 58, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 21, 6, 6, 6, 6, + 6, 6, 6, 15, 72, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, + 58, 58, 58, 58, 58, 58, 6, 62, 58, 58, 6, 6, 6, 6, 6, 6, 6, 6, 6, ]; static BITSET_INDEX_CHUNKS: [[u8; 8]; 75] = [ - [0, 16, 16, 16, 16, 16, 16, 16], [16, 16, 7, 16, 196, 175, 130, 69], - [16, 16, 11, 16, 16, 16, 16, 16], [16, 16, 12, 96, 227, 48, 244, 244], - [16, 16, 16, 16, 16, 16, 16, 16], [16, 16, 16, 16, 16, 16, 16, 226], - [16, 16, 16, 16, 16, 16, 16, 229], [16, 16, 16, 16, 16, 16, 16, 241], - [16, 16, 16, 16, 16, 16, 16, 244], [16, 16, 16, 16, 16, 16, 19, 97], - [16, 16, 16, 16, 16, 16, 249, 244], [16, 16, 16, 16, 16, 252, 244, 244], - [16, 16, 16, 16, 17, 108, 113, 101], [16, 16, 16, 16, 24, 16, 16, 16], - [16, 16, 16, 16, 201, 4, 207, 94], [16, 16, 16, 16, 240, 159, 16, 16], - [16, 16, 16, 16, 243, 51, 244, 244], [16, 16, 16, 62, 244, 129, 170, 193], - [16, 16, 16, 213, 244, 244, 244, 244], [16, 16, 16, 232, 16, 29, 244, 244], - [16, 16, 16, 253, 16, 16, 16, 16], [16, 16, 16, 254, 242, 244, 244, 244], - [16, 16, 205, 145, 221, 222, 5, 31], [16, 16, 208, 237, 16, 16, 16, 16], - [16, 45, 16, 28, 244, 244, 117, 224], [16, 102, 42, 244, 244, 244, 244, 244], - [16, 179, 116, 182, 181, 36, 219, 245], [16, 199, 143, 200, 114, 184, 16, 16], - [16, 203, 16, 249, 112, 185, 234, 183], [16, 204, 177, 87, 74, 73, 202, 37], - [16, 223, 15, 54, 210, 81, 16, 38], [16, 231, 16, 16, 16, 212, 244, 244], - [16, 242, 210, 210, 53, 244, 244, 244], [16, 250, 244, 244, 244, 244, 244, 244], - [22, 240, 244, 25, 217, 133, 216, 244], [22, 244, 173, 14, 125, 228, 167, 49], - [52, 247, 16, 142, 163, 103, 195, 115], [55, 244, 244, 244, 107, 187, 244, 244], - [59, 123, 16, 217, 171, 16, 1, 153], [61, 56, 152, 
161, 191, 155, 134, 98], - [80, 20, 248, 50, 239, 70, 236, 244], [93, 111, 95, 244, 244, 244, 244, 244], - [100, 0, 172, 192, 157, 16, 9, 218], [110, 244, 160, 251, 136, 47, 244, 244], - [119, 154, 82, 244, 127, 168, 35, 244], [120, 4, 40, 22, 244, 244, 244, 244], - [124, 84, 240, 77, 88, 244, 6, 244], [128, 41, 16, 233, 244, 24, 244, 244], - [131, 79, 137, 99, 135, 64, 132, 34], [139, 46, 138, 68, 140, 148, 105, 71], - [147, 4, 178, 146, 16, 16, 16, 16], [173, 22, 10, 239, 86, 75, 214, 238], - [176, 166, 16, 126, 4, 2, 234, 90], [188, 244, 244, 244, 244, 244, 244, 244], - [190, 27, 85, 244, 57, 244, 244, 244], [197, 198, 16, 72, 164, 63, 118, 180], - [206, 16, 16, 16, 16, 16, 16, 16], [215, 76, 121, 186, 194, 30, 149, 67], - [225, 32, 106, 43, 186, 156, 104, 244], [231, 244, 244, 244, 244, 244, 244, 244], - [243, 109, 141, 91, 16, 16, 16, 235], [243, 150, 190, 78, 165, 162, 16, 58], - [244, 16, 244, 244, 16, 3, 44, 65], [244, 122, 209, 244, 144, 174, 242, 244], - [244, 151, 16, 229, 21, 169, 190, 39], [244, 244, 8, 230, 211, 92, 206, 33], - [244, 244, 13, 26, 244, 244, 244, 244], [244, 244, 66, 244, 158, 223, 218, 83], - [244, 244, 244, 23, 244, 244, 189, 244], [244, 244, 244, 60, 244, 244, 244, 244], - [244, 244, 244, 244, 16, 16, 16, 16], [244, 244, 244, 244, 218, 18, 238, 244], - [244, 244, 244, 244, 244, 244, 244, 244], [244, 246, 89, 220, 16, 16, 16, 16], - [253, 244, 244, 244, 244, 244, 244, 244], + [0, 252, 121, 172, 14, 172, 172, 172], [13, 51, 125, 172, 79, 35, 166, 172], + [15, 15, 7, 15, 221, 27, 76, 138], [15, 15, 10, 15, 15, 15, 15, 15], + [15, 15, 11, 108, 247, 194, 172, 172], [15, 15, 15, 15, 8, 96, 91, 104], + [15, 15, 15, 15, 15, 15, 15, 15], [15, 15, 15, 15, 15, 15, 15, 172], + [15, 15, 15, 15, 15, 15, 15, 193], [15, 15, 15, 15, 15, 15, 15, 210], + [15, 15, 15, 15, 15, 15, 15, 214], [15, 15, 15, 15, 15, 15, 47, 238], + [15, 15, 15, 15, 15, 15, 188, 172], [15, 15, 15, 15, 15, 181, 172, 172], + [15, 15, 15, 15, 192, 45, 15, 15], [15, 15, 15, 15, 
207, 15, 15, 15], + [15, 15, 15, 15, 209, 153, 172, 172], [15, 15, 15, 15, 215, 5, 232, 110], + [15, 15, 15, 145, 172, 77, 33, 218], [15, 15, 15, 176, 15, 170, 172, 172], + [15, 15, 15, 187, 179, 172, 172, 172], [15, 15, 15, 191, 15, 15, 15, 15], + [15, 15, 15, 213, 172, 172, 172, 172], [15, 15, 182, 251, 15, 15, 15, 15], + [15, 15, 230, 61, 235, 236, 237, 234], [15, 22, 88, 19, 20, 189, 244, 248], + [15, 103, 161, 172, 172, 172, 172, 172], [15, 158, 15, 171, 172, 172, 87, 245], + [15, 177, 118, 151, 205, 126, 15, 164], [15, 178, 172, 172, 172, 172, 172, 172], + [15, 179, 205, 205, 195, 172, 172, 172], [15, 200, 15, 15, 15, 175, 172, 172], + [15, 224, 63, 225, 90, 17, 15, 15], [15, 228, 15, 188, 92, 16, 204, 18], + [15, 229, 25, 119, 133, 134, 1, 165], [26, 37, 15, 80, 5, 4, 204, 115], + [30, 211, 40, 208, 120, 132, 239, 180], [59, 5, 23, 60, 15, 15, 15, 15], + [67, 157, 68, 139, 66, 58, 99, 136], [75, 128, 69, 106, 71, 143, 74, 167], + [78, 254, 15, 212, 172, 207, 172, 172], [82, 122, 192, 130, 117, 172, 6, 172], + [94, 172, 44, 196, 70, 156, 172, 172], [105, 226, 31, 217, 48, 15, 28, 243], + [111, 93, 109, 172, 172, 172, 172, 172], [127, 102, 190, 154, 208, 137, 186, 172], + [147, 149, 53, 43, 216, 50, 72, 107], [148, 12, 15, 202, 32, 15, 233, 52], + [150, 172, 172, 172, 97, 183, 172, 172], [152, 206, 15, 64, 41, 101, 220, 89], + [172, 15, 172, 172, 15, 3, 159, 142], [172, 54, 15, 214, 113, 34, 0, 163], + [172, 83, 241, 172, 62, 29, 179, 172], [172, 172, 24, 185, 242, 112, 231, 168], + [172, 172, 55, 197, 172, 172, 172, 172], [172, 172, 141, 172, 46, 177, 243, 123], + [172, 172, 172, 124, 172, 172, 203, 172], [172, 172, 172, 172, 15, 15, 15, 15], + [172, 172, 172, 172, 172, 172, 172, 172], [172, 172, 172, 172, 243, 9, 180, 172], + [172, 172, 172, 199, 172, 172, 172, 172], [172, 249, 116, 2, 15, 15, 15, 15], + [191, 172, 172, 172, 172, 172, 172, 172], [200, 172, 172, 172, 172, 172, 172, 172], + [201, 172, 172, 172, 172, 172, 172, 172], [209, 56, 0, 129, 38, 42, 
15, 198], + [209, 95, 65, 114, 15, 15, 15, 250], [211, 172, 30, 85, 81, 174, 36, 155], + [211, 192, 172, 146, 202, 73, 184, 172], [222, 223, 15, 135, 39, 144, 86, 21], + [226, 15, 15, 15, 15, 15, 15, 15], [227, 5, 162, 211, 172, 172, 172, 172], + [231, 15, 15, 15, 15, 15, 15, 15], [240, 131, 84, 173, 219, 253, 57, 140], + [246, 169, 98, 160, 173, 49, 100, 172], ]; - static BITSET_CANONICAL: [u64; 186] = [ - 0b1111111111111111111111111111111111111111111111111111111111111110, - 0b1111111111111111111111111111111111111111111111111111100111111111, - 0b1111111111111111111111111111111111111111111111111110000000000000, + static BITSET_CANONICAL: [u64; 172] = [ + 0b0111111111111111111111111111111111111111111111111111111111111111, + 0b1111111111001111111111111111111111111111111111111111111111111111, + 0b1111111101111111111111111111111111111111011111111111111111111111, 0b1111111111111111111111111111111111111111111111111000011111111111, + 0b1111111111111111111111111111111111111111111111111110000000000000, 0b1111111111111111111111111111111111111111111111110000000000000000, - 0b1111111111111111111111011111111111111111111111111111110111111111, 0b1100000011111111111111111111111111111111111111111111111111111111, 0b1111111111111111111111111111111111111111111111111111110000000011, + 0b1111111111111111111111111111111100111111001111111111111111111111, + 0b1111111111111111000000111111111111111111111111110000001111111111, + 0b1111111111111111000000000000001111111111111111111111111111111111, + 0b1111111111111111000000000000000000111111111111111111111111111111, + 0b1000111111110000011111111111111111111111111111111111111111111111, + 0b0111111101111111111111111111111111111111111111111111110111111111, + 0b0000000000000000000001111111111111100111111111111111111111111111, + 0b1111111111111111111111111111111111111111111111111111111111111111, + 0b1111111111111111111111111111111111111111111111111111111111011011, + 0b1111111111111111111111111111111111111111111111011111110001011111, + 
0b1111111111111111111111111111111111111111111110000000000000000000, + 0b1111111111111111111111111111111111111111011111111111111100111101, + 0b1111111111111111111111111111111111111111001111011111111111111111, + 0b1111111111111111111111111111111101111111011111110111111101111111, + 0b1111111111111111111111111111111100111101011111110011110111111111, + 0b1111111111111111111111111111111100111100000000001111111111111111, 0b1111111111111111111111111111111100011111111111111111111111111111, + 0b1111111111111111111111111111111100000111111111111111111111111110, + 0b1111111111111111111111111111111100000111111111110000000000000000, + 0b1111111111111111111111111111111100000010011111111111111111111111, 0b1111111111111111111111111111111100000000000000000111111111111111, + 0b1111111111111111111111111111111100000000000000000100001111100000, + 0b1111111111111111111111111111111100000000000000000000000000000000, + 0b1111111111111111111111111111111011100000011111111111111111111111, + 0b1111111111111111111111111111110011111111100000000000000000000000, + 0b1111111111111111111111111111101111111111111111111101011101000000, + 0b1111111111111111111111111100000000000111111111111111111111111110, + 0b1111111111111111111111011011111100000000000000000000000011001011, + 0b1111111111111111111111001111111100000000000000000000000000000000, + 0b1111111111111111110000000000000011111110111111111111111111111111, + 0b1111111111111111011111110111111100000000011111100111111001111110, + 0b1111111111111111001000001011111111111111111111111111111111111111, 0b1111111111111111000001111111111111111111111111111111111111111111, - 0b1111111111111111000000000000001111111111111111111111111111111111, - 0b1111111111111111000000000000000000111111111111111111111111111111, + 0b1111111111111111000001111111111111111111111111111111110000000000, + 0b1111111111111111000000111111111111110111111111111111111111111111, + 0b1111111111111111000000111111100011111111111100000000000011111111, + 
0b1111111111111111000000011111111110111111111111111011110101111111, + 0b1111111111111111000000001111000000000000000001110000000000000000, + 0b1111111111111111000000001000000000011111111111111111111111111111, + 0b1111111111111111000000000000111111111111111111111111111111111111, + 0b1111111111111110111111111111111111111111111111111111111111100000, + 0b1111111111111110000111111111111111111111111111111111111011111111, + 0b1111111111111110000000000000111111111111111000011101111111111111, + 0b1111111111111100000000000000000000000000000000000000000000000001, + 0b1111111111100000000000000000000000000000000000000000011111111100, + 0b1111111111011111111111111111111100000000000000000000000000000000, + 0b1111111111011111000000000000000000000000000000000000000000000000, 0b1111111111000000000000000000000000000000000000000000000000000000, + 0b1111110001111111111111111111111100000000000000000011111111111111, + 0b1111110000000000111100111111111111111111111111111111111111111111, + 0b1111110000000000000000000000111110000000111100000101110111011111, + 0b1111100101111111111111111111111111111111111111111111111111111111, + 0b1111011111111111111111111111111111111111111111110010000010111111, + 0b1111011111111111111111111111111111110111111111111111111111111101, + 0b1111001111111111101111010101000000111110001011111111110010000100, + 0b1110101111111111110111100110010011011111111111111111111111111111, + 0b1110100011111100000000000000000000000000000000000000000000101111, + 0b1110011111111111111111111111111111111111111111110000000111111111, + 0b1110011111111111111111111111111111111111111111011101111111111111, + 0b1110001111111111111111011111111111111111111111011101111111101111, + 0b1110001111101111111111011111111111111111111111011101111111101111, + 0b1110001111101101111111011111111111111111111110111011111111101110, + 0b1110001111101101111111011111111111111111111110011001111111101111, + 0b1110001111101101111111011111111111111111111110011001111111101110, + 
0b1110001111000101111111011111111111111111111110011001111111101111, + 0b1110000011111111111111111111100000000000000000000000000000001111, + 0b1100001111111111110001110001100011010110001111011100011111101100, + 0b1100001101101101111111011111111111111111111110011000011111101110, + 0b1011111111111111000000000000000000000000000000000000000111111111, + 0b1011110011011111000000000000000000000000000000000000000000100000, + 0b1011011111111111111111110111111111111111111111111110111111111111, + 0b1011010001111111111111111111111111111111111111111111101101111111, + 0b1001110000000000111000011111111000011111111011111111111111111111, + 0b1001100110111111111111111111111111111111011011111111001001111111, + 0b1001000110111111111111111111111111111111111111111111110100111111, + 0b1000000000000010000000000000000000000000000000000000000000000000, + 0b1000000000000000000000001000000000000000000000000000000000000000, 0b1000000000000000000000000000000011111111111111111111111111111111, - 0b0000000111111111111111111111111111111111111111111111111111111100, - 0b1111111111111111111111111111111111111111111111111111111111111111, - 0b1111111111111111111111111111111100111111001111111111111111111111, - 0b1111111111111111000000111111111111111111111111110000001111111111, - 0b1111111111111111000000000000111111111111111111111111111111111111, + 0b0111111101111111011111110111111100000000011111111111111111111111, + 0b0111111100111111111111111111111111111111111111111111111111111111, + 0b0111111100111101111111111111111111111111111111110011110111111111, + 0b0111110000000000111111111111111100000000000000001000000000000000, + 0b0111101111111111111111111111111111011111110111111110011110111111, + 0b0101111111011111111111111111111111111111111111111111111111111111, + 0b0101111101111111111111011111111111100000111110000000000001111111, + 0b0101111011110111111101111001011010101010100101101110101010000100, + 0b0100000010011111111111111111111111111111111110111111111111111111, + 
0b0011111111111111111111111111111111111100000000001110000000000000, + 0b0011111111111111111111111111111110101010111111110011111100111111, + 0b0011111110000000000111111111111111111111111111111111111111111111, + 0b0011101111111111111111111010111111111111111111111111011111010110, + 0b0010111111111011111111111111111111111100011111111111111111101110, + 0b0001111111111111111111111111111111111110111111111111111100000011, + 0b0001111111111111111111111111111100000000000001111111111111111111, 0b0001111111111111111111111111111100000000000000000000000000000000, + 0b0001111111111111000001111111111111111111111111111111111111111111, + 0b0001111111011100000111111111111100001111110011110001111111011100, + 0b0001111100111110000000111111111000000000000000000000000011100000, + 0b0001111000000000000000000000111100000000000000010001101110111111, + 0b0001000000000011000000000000111110110000100000000101100110011111, + 0b0000111111111111111111111111111111111111000011111111111111111111, + 0b0000111111111111111110111110111000001111111111111111101111111111, + 0b0000111111111111000000000000000000000000000000000000000011111111, + 0b0000101011110111111111101001011011111111111111111111111111101111, + 0b0000011111111111111111111111111111111111111111110000011111111111, 0b0000011111111111111111111111111000000000000000000000000000000000, - 0b0000000111111111111111111111111111111111111111111111111111111111, + 0b0000010001101111110111100000000000000000000000000000000000000000, + 0b0000010000110000000001111111111111111111111111111111110000000000, + 0b0000010000100000000001000000000000000000000000000000000000000000, + 0b0000001111111111111111111111111100000000001111111111111111111111, + 0b0000000111111111111111111111111111111111111111111111111111111100, + 0b0000000111111111110001111111111111111111111111111111111111111111, + 0b0000000111111111000011111111111101111111111111111111111111111111, + 0b0000000100111111111111111111111111111111111111111111111111111111, + 
0b0000000001111111111111111111111100000000001111111111111111111111, + 0b0000000001111111111111111111111100000000000000000000000000011111, 0b0000000001111111111111111111111100000000000000000000000000000000, - 0b0000000000011111111111111111111111111111111111111111111111111111, + 0b0000000001111111111111101111111111111111111111001111111111111111, + 0b0000000001000111111111111111111111111111111111110000000011110000, + 0b0000000000111111111111111111111111111110111011111111000001101111, + 0b0000000000111111000000000000000001011110000000100001100110000111, + 0b0000000000111100111111111111111100111000000000000000000000000101, + 0b0000000000110111111111111111111100000000000000000000000000000000, + 0b0000000000011111111111111111111001111111111111111111111111111111, + 0b0000000000011111001111111111111111111111111111110000000000000000, + 0b0000000000001111111111111111111100000000000011111101111111111111, + 0b0000000000001101110111111111111100000000000011111111111111111111, + 0b0000000000001100011110000001111111111111111111111111111111111111, + 0b0000000000001100000000000000000011111111010111111000000001111111, + 0b0000000000000111111111111111111100000000001111111111111111111111, + 0b0000000000000111100001111111111111111111111111110000000010110110, + 0b0000000000000110000000000000111101000000011000000001110111011111, + 0b0000000000000011111111111011111111111111111111111111111111111111, + 0b0000000000000011000110111111111111111111111111111111111111111111, + 0b0000000000000011000000000000101100000000000000000000000000000000, + 0b0000000000000010000000000000111110110000110000000001100110011111, + 0b0000000000000000100000001111111111111111111111111111111111111111, + 0b0000000000000000010100000001111100000000000000111111111111000011, 0b0000000000000000001111111111111111111111111111110000000000000000, - 0b0000000000000000000000111111111111111111111111111111111111111111, - 0b0000000000000000000000000000000000000000000000000000000000010001, - 
0b0000000000000000000000000000000000000000000000000000000010110011, - 0b0000000000000000000000000000000000000000000000000000100010001111, - 0b0000000000000000000000000000000000000000000000000000111111101111, - 0b0000000000000000000000000000000000000000000000000000111111110111, - 0b0000000000000000000000000000000000000000000000000010000001111111, - 0b0000000000000000000000000000000000000000001111101111111100001111, - 0b0000000000000000000000000000000000000000100000010001110111000111, - 0b0000000000000000000000000000000000000001011110110111111111111111, - 0b0000000000000000000000000000000000000111111111111111111111111111, - 0b0000000000000000000000000000000000010000100000000000000111111111, - 0b0000000000000000000000000000000000010100000000001100000000011110, - 0b0000000000000000000000000000000000011100111111001111110011111100, - 0b0000000000000000000000000000000000100000111111111111111111111111, - 0b0000000000000000000000000000000000111111111111110011111111111111, - 0b0000000000000000000000000000000001000011111111110000000111111111, - 0b0000000000000000000000000000000011110000000000000010000001011111, - 0b0000000000000000000000000000000011111111111111111000000011111111, - 0b0000000000000000000000000000001110000000000000000000011110111011, - 0b0000000000000000000000000000111100000111011000000001110111011111, - 0b0000000000000000000000000000111111100000100000010001100110011111, - 0b0000000000000000000000000000111111111111111111111111111111111111, - 0b0000000000000000000000000001101011111100111111111111111111111111, - 0b0000000000000000000000000001111111111111111111111111111011111111, - 0b0000000000000000000000001111111100000000001111111111111111111111, - 0b0000000000000000000000001111111111111111111111111111111110111111, - 0b0000000000000000000000001111111111111111111111111111111111111111, - 0b0000000000000000000000011111111111111111111111110000000000000000, - 0b0000000000000000000001111101101111111001111111111111111101111111, - 
0b0000000000000000000001111111111100000001111111111111111111111111, - 0b0000000000000000000001111111111111100111111111111111111111111111, - 0b0000000000000000000001111111111111111111111111111111111111111111, - 0b0000000000000000000011000000000011111111111111110001111111111111, - 0b0000000000000000000011111111111111111111111111111111111111111111, 0b0000000000000000000111111111111111111100111111111111111111111111, - 0b0000000000000000010100000001111100000000000000111111111111000011, - 0b0000000000000000100000001111111111111111111111111111111111111111, - 0b0000000000000010000000000000111110110000110000000001100110011111, - 0b0000000000000011000000000000101100000000000000000000000000000000, - 0b0000000000000011000110111111111111111111111111111111111111111111, - 0b0000000000000011111111111011111111111111111111111111111111111111, - 0b0000000000000110000000000000111101000000011000000001110111011111, - 0b0000000000000111100001111111111111111111111111110000000010110110, - 0b0000000000000111111111111111111100000000001111111111111111111111, - 0b0000000000001100000000000000000011111111010111111000000001111111, - 0b0000000000001100011110000001111111111111111111111111111111111111, - 0b0000000000001101110111111111111100000000000011111111111111111111, - 0b0000000000001111111111111111111100000000000011111101111111111111, - 0b0000000000011111001111111111111111111111111111110000000000000000, - 0b0000000000011111111111111111111001111111111111111111111111111111, - 0b0000000000110111111111111111111100000000000000000000000000000000, - 0b0000000000111100111111111111111100111000000000000000000000000101, - 0b0000000000111111000000000000000001011110000000100001100110000111, - 0b0000000000111111111111111111111111111110111011111111000001101111, - 0b0000000001000111111111111111111111111111111111110000000011110000, - 0b0000000001111111111111101111111111111111111111001111111111111111, - 0b0000000001111111111111111111111100000000000000000000000000011111, - 
0b0000000001111111111111111111111100000000001111111111111111111111, - 0b0000000100111111111111111111111111111111111111111111111111111111, - 0b0000000111111111000011111111111101111111111111111111111111111111, - 0b0000000111111111110001111111111111111111111111111111111111111111, - 0b0000001111111111111111111111111100000000001111111111111111111111, - 0b0000010000100000000001000000000000000000000000000000000000000000, - 0b0000010000110000000001111111111111111111111111111111110000000000, - 0b0000010001101111110111100000000000000000000000000000000000000000, - 0b0000011111111111111111111111111111111111111111110000011111111111, - 0b0000101011110111111111101001011011111111111111111111111111101111, - 0b0000111111111111000000000000000000000000000000000000000011111111, - 0b0000111111111111111110111110111000001111111111111111101111111111, - 0b0000111111111111111111111111111111111111000011111111111111111111, - 0b0000111111111111111111111111111111111111111111111111100001111111, - 0b0001000000000011000000000000111110110000100000000101100110011111, - 0b0001111000000000000000000000111100000000000000010001101110111111, - 0b0001111100111110000000111111111000000000000000000000000011100000, - 0b0001111111011100000111111111111100001111110011110001111111011100, - 0b0001111111111111000001111111111111111111111111111111111111111111, - 0b0001111111111111111111111111111100000000000001111111111111111111, - 0b0001111111111111111111111111111111111110111111111111111100000011, - 0b0010111111111011111111111111111111111100011111111111111111101110, - 0b0011101111111111111111111010111111111111111111111111011111010110, - 0b0011111110000000000111111111111111111111111111111111111111111111, - 0b0011111111111111111111111111111110101010111111110011111100111111, - 0b0011111111111111111111111111111111111100000000001110000000000000, - 0b0100000010011111111111111111111111111111111110111111111111111111, - 0b0101111011110111111101111001011010101010100101101110101010000100, - 
0b0101111101111111111111011111111111100000111110000000000001111111, - 0b0101111111011111111111111111111111111111111111111111111111111111, - 0b0111101111111111111111111111111111011111110111111110011110111111, - 0b0111110000000000111111111111111100000000000000001000000000000000, - 0b0111111100111101111111111111111111111111111111110011110111111111, - 0b0111111100111111111111111111111111111111111111111111111111111111, - 0b0111111101111111011111110111111100000000011111111111111111111111, - 0b0111111101111111111111111111111111111111111111111111110111111111, - 0b0111111111100111111111111111111111111111111111111111111111111111, - 0b1000000000000000000000001000000000000000000000000000000000000000, - 0b1000000000000010000000000000000000000000000000000000000000000000, - 0b1000111111110000011111111111111111111111111111111111111111111111, - 0b1001000110111111111111111111111111111111111111111111110100111111, - 0b1001100110111111111111111111111111111111011011111111001001111111, - 0b1001110000000000111000011111111000011111111011111111111111111111, - 0b1011010001111111111111111111111111111111111111111111101101111111, - 0b1011011111111111111111110111111111111111111111111110111111111111, - 0b1011110011011111000000000000000000000000000000000000000000100000, - 0b1011111111111111000000000000000000000000000000000000000111111111, - 0b1100001101101101111111011111111111111111111110011000011111101110, - 0b1100001111111111110001110001100011010110001111011100011111101100, - 0b1110000011111111111111111111100000000000000000000000000000001111, - 0b1110001111000101111111011111111111111111111110011001111111101111, - 0b1110001111101101111111011111111111111111111110011001111111101110, - 0b1110001111101101111111011111111111111111111110011001111111101111, - 0b1110001111101101111111011111111111111111111110111011111111101110, - 0b1110001111101111111111011111111111111111111111011101111111101111, - 0b1110001111111111111111011111111111111111111111011101111111101111, - 
0b1110011111111111111111111111111111111111111111011101111111111111, - 0b1110011111111111111111111111111111111111111111110000000111111111, - 0b1110100011111100000000000000000000000000000000000000000000101111, - 0b1110101111111111110111100110010011011111111111111111111111111111, - 0b1111001111111111101111010101000000111110001011111111110010000100, - 0b1111011111111111111111111111111111110111111111111111111111111101, - 0b1111011111111111111111111111111111111111111111110010000010111111, - 0b1111100101111111111111111111111111111111111111111111111111111111, - 0b1111110000000000000000000000111110000000111100000101110111011111, - 0b1111110000000000111100111111111111111111111111111111111111111111, - 0b1111110001111111111111111111111100000000000000000011111111111111, - 0b1111111111011111000000000000000000000000000000000000000000000000, - 0b1111111111011111111111111111111100000000000000000000000000000000, - 0b1111111111100000000000000000000000000000000000000000011111111100, - 0b1111111111111100000000000000000000000000000000000000000000000001, - 0b1111111111111110000000000000111111111111111000011101111111111111, - 0b1111111111111110000111111111111111111111111111111111111011111111, - 0b1111111111111110111111111111111111111111111111111111111111100000, - 0b1111111111111111000000001000000000011111111111111111111111111111, - 0b1111111111111111000000001111000000000000000001110000000000000000, - 0b1111111111111111000000011111111110111111111111111011110101111111, - 0b1111111111111111000000111111100011111111111100000000000011111111, - 0b1111111111111111000000111111111111110111111111111111111111111111, - 0b1111111111111111000001111111111111111111111111111111110000000000, - 0b1111111111111111001000001011111111111111111111111111111111111111, - 0b1111111111111111011111110111111100000000011111100111111001111110, - 0b1111111111111111110000000000000011111110111111111111111111111111, - 0b1111111111111111111111001111111100000000000000000000000000000000, - 
0b1111111111111111111111011011111100000000000000000000000011001011, - 0b1111111111111111111111111100000000000111111111111111111111111110, - 0b1111111111111111111111111111101111111111111111111101011101000000, - 0b1111111111111111111111111111110011111111100000000000000000000000, - 0b1111111111111111111111111111111011100000011111111111111111111111, - 0b1111111111111111111111111111111100000000000000000000000000000000, - 0b1111111111111111111111111111111100000000000000000100001111100000, - 0b1111111111111111111111111111111100000010011111111111111111111111, - 0b1111111111111111111111111111111100000111111111110000000000000000, - 0b1111111111111111111111111111111100000111111111111111111111111110, - 0b1111111111111111111111111111111100111100000000001111111111111111, - 0b1111111111111111111111111111111100111101011111110011110111111111, - 0b1111111111111111111111111111111101111111011111110111111101111111, - 0b1111111111111111111111111111111111111111001111011111111111111111, - 0b1111111111111111111111111111111111111111011111111111111100111101, - 0b1111111111111111111111111111111111111111111110000000000000000000, - 0b1111111111111111111111111111111111111111111111011111110001011111, - 0b1111111111111111111111111111111111111111111111111111111111011011, + 0b0000000000000000000011000000000011111111111111110001111111111111, + 0b0000000000000000000001111111111100000001111111111111111111111111, + 0b0000000000000000000001111101101111111001111111111111111101111111, + 0b0000000000000000000000011111111111111111111111110000000000000000, + 0b0000000000000000000000001111111111111111111111111111111110111111, + 0b0000000000000000000000001111111100000000001111111111111111111111, + 0b0000000000000000000000000001111111111111111111111111111011111111, + 0b0000000000000000000000000001101011111100111111111111111111111111, + 0b0000000000000000000000000000111111100000100000010001100110011111, + 0b0000000000000000000000000000111100000111011000000001110111011111, + 
0b0000000000000000000000000000001110000000000000000000011110111011, + 0b0000000000000000000000000000000011111111111111111000000011111111, + 0b0000000000000000000000000000000011110000000000000010000001011111, + 0b0000000000000000000000000000000001000011111111110000000111111111, + 0b0000000000000000000000000000000000100000111111111111111111111111, + 0b0000000000000000000000000000000000011100111111001111110011111100, + 0b0000000000000000000000000000000000010100000000001100000000011110, + 0b0000000000000000000000000000000000010000100000000000000111111111, + 0b0000000000000000000000000000000000000001011110110111111111111111, + 0b0000000000000000000000000000000000000000100000010001110111000111, + 0b0000000000000000000000000000000000000000001111101111111100001111, + 0b0000000000000000000000000000000000000000000000000010000001111111, + 0b0000000000000000000000000000000000000000000000000000100010001111, + 0b0000000000000000000000000000000000000000000000000000000010110011, ]; - static BITSET_MAPPING: [(u8, u8); 69] = [ - (0, 128), (0, 142), (0, 175), (0, 176), (0, 63), (0, 60), (0, 59), (0, 54), (0, 52), - (0, 51), (0, 48), (0, 47), (0, 31), (0, 21), (0, 4), (1, 53), (1, 43), (1, 37), (1, 36), - (1, 29), (1, 21), (1, 7), (2, 128), (2, 144), (2, 51), (2, 32), (3, 181), (3, 49), (3, 33), - (3, 17), (4, 128), (4, 48), (4, 176), (4, 16), (5, 14), (5, 12), (5, 6), (6, 136), (6, 160), - (6, 3), (7, 54), (7, 38), (8, 163), (8, 32), (9, 177), (9, 32), (10, 149), (10, 16), - (11, 16), (11, 133), (12, 162), (12, 32), (13, 10), (13, 128), (14, 160), (14, 1), - (15, 135), (15, 62), (16, 128), (17, 32), (18, 17), (19, 16), (20, 32), (21, 31), (22, 135), - (23, 137), (24, 139), (25, 48), (26, 150), + static BITSET_MAPPING: [(u8, u8); 83] = [ + (0, 191), (0, 65), (0, 188), (0, 187), (0, 186), (0, 185), (0, 184), (0, 182), (0, 181), + (0, 180), (0, 178), (0, 79), (0, 175), (0, 174), (0, 173), (0, 169), (0, 165), (0, 164), + (0, 162), (0, 161), (0, 160), (0, 158), (0, 155), (0, 151), (0, 
150), (0, 149), (0, 148), + (0, 147), (0, 144), (0, 112), (0, 143), (0, 113), (0, 141), (0, 140), (0, 139), (0, 138), + (0, 137), (0, 136), (0, 135), (0, 134), (0, 132), (0, 131), (0, 130), (0, 129), (0, 61), + (0, 60), (0, 55), (0, 53), (0, 52), (0, 49), (0, 48), (0, 32), (0, 22), (0, 5), (0, 1), + (1, 129), (1, 58), (1, 57), (1, 50), (1, 42), (1, 28), (1, 21), (2, 180), (2, 30), (2, 24), + (2, 18), (3, 132), (3, 33), (3, 17), (4, 80), (4, 32), (5, 112), (5, 16), (6, 96), (6, 3), + (7, 38), (8, 32), (9, 17), (10, 69), (11, 32), (12, 187), (13, 179), (14, 141), ]; pub fn lookup(c: char) -> bool { @@ -276,185 +263,181 @@ pub mod alphabetic { #[rustfmt::skip] pub mod case_ignorable { - static BITSET_LAST_CHUNK_MAP: (u16, u8) = (1792, 24); + static BITSET_LAST_CHUNK_MAP: (u16, u8) = (1792, 2); static BITSET_CHUNKS_MAP: [u8; 250] = [ - 12, 31, 34, 4, 7, 15, 22, 13, 17, 46, 50, 41, 28, 3, 11, 47, 8, 50, 50, 50, 50, 50, 29, 27, - 14, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, - 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, - 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 26, 50, 35, 25, 6, 10, 50, 50, 50, 50, 50, 50, 50, - 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, - 50, 50, 50, 50, 50, 50, 50, 50, 50, 42, 50, 9, 49, 36, 50, 50, 50, 19, 43, 33, 23, 16, 1, - 20, 51, 18, 21, 37, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 2, 50, 50, 50, 50, 50, 50, 50, - 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 39, 50, 45, 50, - 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, - 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 32, 50, 50, 50, 50, 50, 50, 50, 50, - 50, 44, 30, 50, 50, 50, 0, 50, 50, 5, 38, 50, 50, 40, 50, 50, 50, 50, 48, + 14, 28, 47, 22, 19, 11, 4, 13, 9, 40, 39, 32, 49, 23, 15, 36, 18, 39, 39, 39, 39, 39, 27, + 26, 12, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 
39, 39, 39, 39, 39, 39, 39, 39, 39, 39, + 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, + 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 25, 39, 30, 24, 20, 16, 39, 39, 39, 39, 39, 39, + 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, + 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 33, 39, 17, 38, 31, 39, 39, 39, 7, 41, 46, 3, 10, 1, + 6, 51, 8, 5, 42, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 50, 39, 39, 39, 39, 39, 39, 39, 39, + 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 44, 39, 35, 39, 39, + 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, + 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 29, 39, 39, 39, 39, 39, 39, 39, 39, 39, + 34, 48, 39, 39, 39, 0, 39, 39, 21, 43, 39, 39, 45, 39, 39, 39, 39, 37, ]; static BITSET_INDEX_CHUNKS: [[u8; 8]; 52] = [ - [2, 69, 56, 174, 174, 174, 174, 174], [4, 33, 106, 18, 174, 174, 111, 182], - [16, 174, 174, 174, 174, 174, 174, 174], [27, 109, 122, 128, 90, 63, 51, 61], - [29, 58, 174, 53, 121, 165, 5, 94], [45, 174, 174, 174, 168, 174, 174, 174], - [49, 174, 174, 105, 163, 167, 96, 44], [54, 159, 174, 127, 89, 60, 86, 99], - [55, 104, 30, 59, 174, 174, 174, 174], [57, 23, 174, 144, 98, 177, 146, 84], - [72, 88, 117, 71, 174, 47, 174, 50], [75, 15, 174, 79, 123, 76, 126, 0], - [80, 176, 77, 174, 174, 174, 174, 174], [82, 181, 92, 21, 78, 169, 93, 132], - [83, 174, 180, 3, 174, 174, 174, 174], [87, 70, 87, 118, 108, 40, 130, 20], - [100, 174, 174, 46, 91, 67, 174, 174], [102, 66, 31, 142, 174, 174, 174, 174], - [103, 28, 26, 174, 174, 174, 174, 174], [107, 174, 174, 147, 174, 174, 174, 174], - [110, 128, 74, 174, 48, 174, 174, 174], [113, 174, 73, 174, 112, 19, 25, 174], - [116, 41, 108, 39, 91, 38, 129, 24], [119, 164, 95, 134, 68, 141, 13, 22], - [125, 9, 174, 174, 9, 9, 9, 175], [133, 114, 154, 149, 37, 140, 158, 151], - [136, 174, 174, 174, 174, 174, 174, 174], [139, 174, 
174, 174, 174, 174, 174, 174], - [153, 131, 17, 174, 85, 174, 174, 174], [174, 1, 174, 161, 174, 12, 174, 124], - [174, 157, 174, 174, 174, 174, 174, 174], [174, 174, 10, 9, 9, 81, 179, 174], - [174, 174, 42, 174, 174, 174, 174, 174], [174, 174, 148, 174, 174, 166, 174, 174], - [174, 174, 172, 174, 174, 34, 115, 64], [174, 174, 174, 15, 174, 174, 174, 174], - [174, 174, 174, 138, 174, 171, 174, 174], [174, 174, 174, 150, 174, 174, 174, 174], - [174, 174, 174, 156, 174, 174, 174, 174], [174, 174, 174, 170, 8, 152, 174, 174], - [174, 174, 174, 173, 174, 162, 174, 174], [174, 174, 174, 174, 65, 62, 97, 32], - [174, 174, 174, 174, 137, 174, 6, 145], [174, 174, 174, 174, 155, 174, 174, 174], - [174, 174, 174, 174, 174, 120, 52, 174], [174, 174, 174, 174, 174, 135, 35, 43], - [174, 174, 174, 174, 174, 160, 174, 174], [174, 174, 174, 174, 174, 174, 11, 101], - [174, 174, 174, 174, 174, 174, 174, 7], [174, 174, 174, 174, 174, 174, 174, 143], - [174, 174, 174, 174, 174, 174, 174, 174], [178, 174, 174, 174, 14, 131, 174, 36], + [3, 75, 88, 142, 142, 142, 142, 142], [5, 110, 38, 181, 142, 142, 12, 182], + [21, 4, 142, 142, 4, 4, 4, 160], [28, 163, 50, 131, 76, 138, 6, 120], + [31, 103, 14, 105, 54, 106, 124, 119], [33, 142, 17, 142, 34, 175, 118, 142], + [35, 123, 71, 142, 96, 142, 142, 142], [37, 142, 142, 144, 142, 142, 142, 142], + [41, 115, 117, 142, 142, 142, 142, 142], [42, 78, 112, 139, 142, 142, 142, 142], + [45, 142, 142, 98, 54, 77, 142, 142], [58, 74, 58, 29, 14, 104, 126, 122], + [62, 142, 180, 2, 142, 142, 142, 142], [63, 164, 53, 121, 67, 168, 52, 129], + [65, 177, 68, 142, 142, 142, 142, 142], [70, 16, 142, 66, 23, 69, 20, 0], + [72, 57, 30, 73, 142, 97, 142, 94], [87, 178, 142, 141, 47, 179, 143, 61], + [89, 40, 113, 85, 142, 142, 142, 142], [90, 151, 142, 19, 56, 84, 59, 46], + [95, 142, 142, 39, 162, 174, 49, 100], [99, 142, 142, 142, 167, 142, 142, 142], + [114, 86, 142, 91, 25, 158, 10, 51], [116, 36, 24, 123, 55, 81, 93, 83], + [130, 32, 155, 146, 159, 
137, 150, 148], [133, 142, 142, 142, 142, 142, 142, 142], + [136, 142, 142, 142, 142, 142, 142, 142], [142, 1, 142, 153, 142, 15, 142, 22], + [142, 142, 26, 4, 4, 64, 176, 142], [142, 142, 102, 142, 142, 142, 142, 142], + [142, 142, 142, 16, 142, 142, 142, 142], [142, 142, 142, 135, 142, 170, 142, 142], + [142, 142, 142, 142, 79, 82, 48, 111], [142, 142, 142, 142, 134, 142, 7, 125], + [142, 142, 142, 142, 142, 27, 92, 142], [142, 142, 142, 142, 142, 132, 108, 101], + [142, 142, 142, 142, 142, 142, 13, 43], [142, 142, 142, 142, 142, 142, 142, 8], + [142, 142, 142, 142, 142, 142, 142, 140], [142, 142, 142, 142, 142, 142, 142, 142], + [142, 142, 142, 142, 142, 152, 142, 142], [142, 142, 142, 142, 156, 142, 142, 142], + [142, 142, 142, 147, 142, 142, 142, 142], [142, 142, 142, 157, 142, 142, 142, 142], + [142, 142, 142, 169, 9, 128, 142, 142], [142, 142, 142, 172, 142, 161, 142, 142], + [142, 142, 145, 142, 142, 173, 142, 142], [142, 142, 171, 142, 142, 109, 11, 80], + [142, 149, 142, 142, 142, 142, 142, 142], [154, 127, 18, 142, 60, 142, 142, 142], + [165, 142, 142, 142, 142, 142, 142, 142], [166, 142, 142, 142, 44, 127, 142, 107], ]; - static BITSET_CANONICAL: [u64; 128] = [ + static BITSET_CANONICAL: [u64; 123] = [ 0b1111101111111111111111111111111111111111111111111111111111111111, 0b0011000000000000000000000000000000000000000000000000000000000000, - 0b1111100001111111111111111111111111111111111111111111111111111111, 0b0111000000000000000000000000000000000000000000000000000000000000, + 0b1111100001111111111111111111111111111111111111111111111111111111, + 0b1111111111111111111111111111111111111111111111111111111111111111, 0b1111111100000000000000000000000000000000000000000000000000000000, - 0b0000000000000001111111111100000000000000000000000000000000000000, + 0b0111111111000000000000000000000000000000000000000000000000000011, 0b1111111111111100000000000000000000000000000000000000000000000000, 0b1111100000000000000000000000000000000000000000000000000000000000, 
0b0000000001111111000000000000000000000000000000000000000000000000, - 0b1111111111111111111111111111111111111111111111111111111111111111, - 0b1111111111111111000000000000000000000000000000000000000000000000, + 0b0000000000000001111111111100000000000000000000000000000000000000, + 0b1011111111111111111111111111111111111111111111100000000000000000, + 0b1011000000111100000000000000000000000000000000000000000000000000, 0b1010000000000000000000000000000000000000000000000000000000000000, + 0b1001000000000000000000000000000000000000000000000000000000000010, 0b1000000000000000100000000000000000000000000000000000000000000000, - 0b0111111111000000000000000000000000000000000000000000000000000011, - 0b0101100000000000000000000000000000000000000000000000000000000000, 0b0011111100000000000000000000000000000000000000000000000000000000, - 0b0000000111111111000000000000000000000000000000000000000000000000, - 0b0000000000000000000000100000000000000000000000000000000001100000, - 0b0000000000000000000000000000000000000000000000000000000000001101, - 0b0000000000000000000000000000000000000000000000000000000010111111, - 0b0000000000000000000000000000000000000000000000000010000000000001, - 0b0000000000000000000000000000000000000000000000000011111101000000, - 0b0000000000000000000000000000000000000000000000001001111000000000, - 0b0000000000000000000000000000000000000000001001000000000000000000, - 0b0000000000000000000000000000000000000000010111000000010000000000, - 0b0000000000000000000000000000000000000000101000110000000000000000, - 0b0000000000000000000000000000000000000011011111111111110000000000, - 0b0000000000000000000000000000000000001001100000000000000000000000, - 0b0000000000000000000000000000000000001110011111100000000010000000, - 0b0000000000000000000000000000000000010111111111110000000000111111, - 0b0000000000000000000000000000000000011111111111110000000000000000, - 0b0000000000000000000000000000000000100000000000000010000001100100, - 
0b0000000000000000000000000000000000100000100011111111111001000000, - 0b0000000000000000000000000000000001000000000000000000000001011100, - 0b0000000000000000000000000000000010000010000000000000000000000000, - 0b0000000000000000000000000000000011111111111111111000000000000000, - 0b0000000000000000000000000000000100001100111100000000000000000000, - 0b0000000000000000000000000000001111111111111111111111111111111111, - 0b0000000000000000000000000000110000000000000000000010000000011110, - 0b0000000000000000000000000000110000000000000000000011000001000000, - 0b0000000000000000000000000000110000000000011000000010000000011110, - 0b0000000000000000000000000000110000000000011000000011110111000001, - 0b0000000000000000000000000000111101100000000000000000000000000000, - 0b0000000000000000000000000001101100000000000000000000000000000000, - 0b0000000000000000000000000110000000000000000000001000000000000000, - 0b0000000000000000000001111101101111111001111111111111111101111111, - 0b0000000000000000000001111111100010000000000000000000000000000000, - 0b0000000000000000000011100000000011111000000000000000000000000000, - 0b0000000000000000000011111011110011100000000000000000000000000000, - 0b0000000000000000000100000110000000000000000000000000100001000100, - 0b0000000000000000001000010010000000000000000000000000000000000000, - 0b0000000000000000001110110011110000000000000000000000000000000011, - 0b0000000000000000001111000000000000000000000000000000111111100111, - 0b0000000000000000001111011111111110111111110000000000000000000000, - 0b0000000000000000001111111111111111111111110000000000000000000000, - 0b0000000000000000011111001001000000000011000000001111100000000000, - 0b0000000000000000111111111111111011111000000000000000000000010000, - 0b0000000000000000111111111111111100000000000010001111111111111111, - 0b0000000000000001000000000000000011111111111111111111100000000001, - 0b0000000000000001111111111111111111111111111111110000000000000000, - 
0b0000000000000010000000000000110000000000111111100010000111111110, - 0b0000000000000011101000110100000000000000000000000000000000000000, - 0b0000000000001100000000000000000000000000000011000000000000000000, - 0b0000000000001111111110000000000000000000000000000000000000000100, - 0b0000000000010000000000000000000000000000000000000000000010110110, - 0b0000000000011100000000000000000000000000000111000000000000000000, - 0b0000000000011110000000000000000111000011000000000000000000000000, - 0b0000000000011111000111111100000000000000000000000000000000000001, - 0b0000000000011111111011111000000000000000000000000000000000000111, - 0b0000000000100000000111111111111111111111111111111111111111111111, - 0b0000000000100011000000000000000000000000000000100011100110000110, - 0b0000000001011000001100000000000000100000000000000000000000000010, - 0b0000000001100110011111100000000000000000000000000000000000000000, 0b0000000001101101111111001111111111111111111111000000000000000000, - 0b0000000010111111001010000000000000000000000000000000000000000000, - 0b0000000011001111111100000000000000000000000000000000000000000000, - 0b0000000100000000000001111111111111111111111111111111111111111111, - 0b0000000110010000101000010000000000000000000000000000000000000000, - 0b0000001010100000000000000000000000000011000000000000000000000000, - 0b0000001100010000001000011111110111111111111101110000000000000000, - 0b0000010000000000010000001000000000000000000000000000000000000000, - 0b0000010000110000111111111111111111111111111111111111111111111111, - 0b0000011111110010000000000000000000000000000000000000000000000000, - 0b0000100000111110001111000000000000000000000000000000000000100000, - 0b0000111000000000000000000000100000000000000000000000000000000000, - 0b0000111000000100000000011000011100000000000000000000000000000000, - 0b0001000000000000000000000000000000000000000000000000000000000010, - 0b0001000000000000000000000000000000000000000000000000000000000110, - 
0b0001000000000001000000000000000000000000000000000001000000001000, - 0b0001010000000000000000000000000000000000000000000000000000000111, - 0b0001011111010000000000000000000000000000000000000000000000001111, - 0b0001100000000000000000000000000000000000000000000000000000000011, - 0b0001111111110010000000000000000000000000000000000000000000000000, - 0b0001111111111111111111111111111111111110111111111110000011011111, - 0b0010010000111111111110000000000000000000000000000000000000000000, - 0b0010011001111000000000000000000000000000000000000000000000000011, - 0b0011001111001000000000000000000000000000000000000000000000000111, - 0b0011111110110000000000000000000000000000000000000000000000000000, - 0b0100000000000000000000000000000000000100000000000100000010000000, - 0b0100000000000000000000000000110000000000000000000010000000011110, - 0b0100000011010011100000000000000000000000000000000000000000000000, - 0b0110000000000000111000000000000011100000000000001110000000000011, - 0b0110011011111101111000000000000000000000000000000000000000000000, - 0b0111100111111000000000000000000000000000000000000000011111111110, - 0b1000000000000010111111111101111100000000000000000000000000000000, - 0b1000000000000011111111111111111100000000000000000000000000110000, - 0b1000010111111000000000000000000000000000000000000000000000000000, - 0b1000011100000000000000000000000000000000000000001111000001101110, - 0b1001000000000000000000000000000000000000000000000000000000000010, - 0b1001111111111000000111111110010101111111010000000000000000000000, - 0b1010011111111000000000000000000000000000000000000000000000000000, - 0b1011000000111100000000000000000000000000000000000000000000000000, - 0b1011010001111110000000000000000000000000000000000000000000000000, - 0b1011111101111111000000000000000000000000000000000000000000000000, - 0b1011111111110111100000000000000000000000000000000000000000000000, - 0b1011111111111111111111111111111111111111111111100000000000000000, - 
0b1100000000000000000000000000000000000000000000000000000000010001, - 0b1100000110011101000000000000000000000000000000000000000000000000, - 0b1111110000000000000000000000110000000000000000000010000110111110, - 0b1111111100000000000000000000000000000000000000000000000000000010, - 0b1111111111111000000000111000000000000000000000000000000000000000, - 0b1111111111111111000000000000000000000000000000101000000000000000, - 0b1111111111111111000000001000000000000000000000000000000000000000, - 0b1111111111111111111100000000000000000000000000000000000000000000, - 0b1111111111111111111111111111111100000000000000000000000000000000, - 0b1111111111111111111111111111111100000000000000000000000000000010, - 0b1111111111111111111111111111111111111000000000000000000000000000, + 0b0000000000000000000000100000000000000000000000000000000001100000, 0b1111111111111111111111111111111111111111111110000000000000000000, + 0b1111111111111111111111111111111111111000000000000000000000000000, + 0b1111111111111111111111111111111100000000000000000000000000000010, + 0b1111111111111111111111111111111100000000000000000000000000000000, + 0b1111111111111111111100000000000000000000000000000000000000000000, + 0b1111111111111111000000001000000000000000000000000000000000000000, + 0b1111111111111111000000000000000000000000000000101000000000000000, + 0b1111111111111111000000000000000000000000000000000000000000000000, + 0b1111111111111000000000111000000000000000000000000000000000000000, + 0b1111111100000000000000000000000000000000000000000000000000000010, + 0b1111110000000000000000000000110000000000000000000010000110111110, + 0b1100000110011101000000000000000000000000000000000000000000000000, + 0b1100000000000000000000000000000000000000000000000000000000010001, + 0b1011111111110111100000000000000000000000000000000000000000000000, + 0b1011111101111111000000000000000000000000000000000000000000000000, + 0b1011010001111110000000000000000000000000000000000000000000000000, + 
0b1010011111111000000000000000000000000000000000000000000000000000, + 0b1001111111111000000111111110010101111111010000000000000000000000, + 0b1000011100000000000000000000000000000000000000001111000001101110, + 0b1000010111111000000000000000000000000000000000000000000000000000, + 0b1000000000000011111111111111111100000000000000000000000000110000, + 0b1000000000000010111111111101111100000000000000000000000000000000, + 0b0111100111111000000000000000000000000000000000000000011111111110, + 0b0110011011111101111000000000000000000000000000000000000000000000, + 0b0110000000000000111000000000000011100000000000001110000000000011, + 0b0101100000000000000000000000000000000000000000000000000000000000, + 0b0100000011010011100000000000000000000000000000000000000000000000, + 0b0100000000000000000000000000110000000000000000000010000000011110, + 0b0100000000000000000000000000000000000100000000000100000010000000, + 0b0011111110110000000000000000000000000000000000000000000000000000, + 0b0011001111001000000000000000000000000000000000000000000000000111, + 0b0010011001111000000000000000000000000000000000000000000000000011, + 0b0010010000111111111110000000000000000000000000000000000000000000, + 0b0001111111111111111111111111111111111110111111111110000011011111, + 0b0001111111110010000000000000000000000000000000000000000000000000, + 0b0001100000000000000000000000000000000000000000000000000000000011, + 0b0001011111010000000000000000000000000000000000000000000000001111, + 0b0001010000000000000000000000000000000000000000000000000000000111, + 0b0001000000000001000000000000000000000000000000000001000000001000, + 0b0001000000000000000000000000000000000000000000000000000000000110, + 0b0001000000000000000000000000000000000000000000000000000000000010, + 0b0000111000000100000000011000011100000000000000000000000000000000, + 0b0000111000000000000000000000100000000000000000000000000000000000, + 0b0000100000111110001111000000000000000000000000000000000000100000, + 
0b0000011111110010000000000000000000000000000000000000000000000000, + 0b0000010000110000111111111111111111111111111111111111111111111111, + 0b0000010000000000010000001000000000000000000000000000000000000000, + 0b0000001100010000001000011111110111111111111101110000000000000000, + 0b0000001010100000000000000000000000000011000000000000000000000000, + 0b0000000110010000101000010000000000000000000000000000000000000000, + 0b0000000100000000000001111111111111111111111111111111111111111111, + 0b0000000011001111111100000000000000000000000000000000000000000000, + 0b0000000010111111001010000000000000000000000000000000000000000000, + 0b0000000001100110011111100000000000000000000000000000000000000000, + 0b0000000001011000001100000000000000100000000000000000000000000010, + 0b0000000000100011000000000000000000000000000000100011100110000110, + 0b0000000000100000000111111111111111111111111111111111111111111111, + 0b0000000000011111111011111000000000000000000000000000000000000111, + 0b0000000000011111000111111100000000000000000000000000000000000001, + 0b0000000000011110000000000000000111000011000000000000000000000000, + 0b0000000000011100000000000000000000000000000111000000000000000000, + 0b0000000000010000000000000000000000000000000000000000000010110110, + 0b0000000000001111111110000000000000000000000000000000000000000100, + 0b0000000000001100000000000000000000000000000011000000000000000000, + 0b0000000000000011101000110100000000000000000000000000000000000000, + 0b0000000000000010000000000000110000000000111111100010000111111110, + 0b0000000000000001111111111111111111111111111111110000000000000000, + 0b0000000000000001000000000000000011111111111111111111100000000001, + 0b0000000000000000111111111111111100000000000010001111111111111111, + 0b0000000000000000111111111111111011111000000000000000000000010000, + 0b0000000000000000011111001001000000000011000000001111100000000000, + 0b0000000000000000001111111111111111111111110000000000000000000000, + 
0b0000000000000000001111011111111110111111110000000000000000000000, + 0b0000000000000000001111000000000000000000000000000000111111100111, + 0b0000000000000000001110110011110000000000000000000000000000000011, + 0b0000000000000000001000010010000000000000000000000000000000000000, + 0b0000000000000000000100000110000000000000000000000000100001000100, + 0b0000000000000000000011111011110011100000000000000000000000000000, + 0b0000000000000000000011100000000011111000000000000000000000000000, + 0b0000000000000000000001111111100010000000000000000000000000000000, + 0b0000000000000000000001111101101111111001111111111111111101111111, + 0b0000000000000000000000000110000000000000000000001000000000000000, + 0b0000000000000000000000000001101100000000000000000000000000000000, + 0b0000000000000000000000000000111101100000000000000000000000000000, + 0b0000000000000000000000000000110000000000011000000011110111000001, + 0b0000000000000000000000000000110000000000011000000010000000011110, + 0b0000000000000000000000000000110000000000000000000011000001000000, + 0b0000000000000000000000000000110000000000000000000010000000011110, + 0b0000000000000000000000000000000100001100111100000000000000000000, + 0b0000000000000000000000000000000011111111111111111000000000000000, + 0b0000000000000000000000000000000010000010000000000000000000000000, + 0b0000000000000000000000000000000001000000000000000000000001011100, + 0b0000000000000000000000000000000000100000100011111111111001000000, + 0b0000000000000000000000000000000000100000000000000010000001100100, + 0b0000000000000000000000000000000000011111111111110000000000000000, + 0b0000000000000000000000000000000000010111111111110000000000111111, + 0b0000000000000000000000000000000000001110011111100000000010000000, + 0b0000000000000000000000000000000000001001100000000000000000000000, + 0b0000000000000000000000000000000000000011011111111111110000000000, + 0b0000000000000000000000000000000000000000101000110000000000000000, + 
0b0000000000000000000000000000000000000000010111000000010000000000, + 0b0000000000000000000000000000000000000000000000001001111000000000, + 0b0000000000000000000000000000000000000000000000000011111101000000, + 0b0000000000000000000000000000000000000000000000000010000000000001, ]; - static BITSET_MAPPING: [(u8, u8); 55] = [ - (0, 134), (0, 135), (0, 136), (0, 137), (0, 140), (0, 146), (0, 147), (0, 149), (0, 155), - (0, 164), (0, 166), (0, 181), (0, 182), (0, 185), (0, 130), (0, 131), (0, 133), (1, 4), - (1, 34), (1, 41), (1, 47), (1, 52), (1, 55), (1, 60), (2, 137), (2, 148), (2, 165), - (2, 173), (2, 181), (3, 6), (3, 12), (3, 29), (3, 33), (3, 51), (4, 12), (4, 46), (4, 7), - (5, 26), (5, 32), (5, 33), (6, 62), (6, 63), (7, 53), (7, 59), (8, 19), (8, 32), (9, 128), - (10, 128), (11, 33), (12, 1), (13, 57), (14, 9), (15, 33), (16, 22), (17, 23), + static BITSET_MAPPING: [(u8, u8); 60] = [ + (0, 70), (0, 71), (0, 190), (0, 72), (0, 73), (0, 188), (0, 76), (0, 82), (0, 83), (0, 85), + (0, 91), (0, 100), (0, 102), (0, 117), (0, 118), (0, 121), (0, 66), (0, 67), (0, 69), + (1, 190), (1, 34), (1, 41), (1, 47), (1, 52), (1, 55), (1, 60), (2, 6), (2, 12), (2, 29), + (2, 33), (2, 51), (3, 84), (3, 101), (3, 109), (3, 117), (4, 181), (4, 158), (4, 144), + (5, 12), (5, 46), (5, 7), (6, 176), (6, 134), (6, 57), (7, 62), (7, 63), (8, 53), (8, 59), + (9, 19), (9, 32), (10, 32), (10, 33), (11, 184), (12, 184), (13, 33), (14, 170), (15, 1), + (16, 33), (17, 179), (18, 23), ]; pub fn lookup(c: char) -> bool { @@ -471,84 +454,81 @@ pub mod case_ignorable { #[rustfmt::skip] pub mod cased { - static BITSET_LAST_CHUNK_MAP: (u16, u8) = (124, 11); + static BITSET_LAST_CHUNK_MAP: (u16, u8) = (124, 12); static BITSET_CHUNKS_MAP: [u8; 123] = [ - 18, 0, 17, 17, 5, 17, 17, 9, 4, 7, 17, 3, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, - 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 13, 14, 17, 17, 17, 17, - 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 
16, 15, 17, 2, 17, 8, 17, 17, 6, - 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 12, 17, 17, - 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, - 1, 17, 17, 17, 17, 10, + 4, 0, 18, 18, 6, 18, 18, 9, 5, 8, 18, 3, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 14, 15, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 17, 16, 18, 1, 18, 10, 18, 18, + 7, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 13, 18, + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, + 18, 2, 18, 18, 18, 18, 11, ]; static BITSET_INDEX_CHUNKS: [[u8; 16]; 19] = [ - [2, 2, 41, 2, 44, 5, 54, 51, 51, 51, 51, 51, 51, 51, 51, 51], - [2, 47, 33, 0, 28, 39, 2, 2, 2, 2, 8, 35, 49, 50, 1, 14], - [2, 59, 10, 24, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51], - [45, 46, 2, 20, 18, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51], - [51, 29, 62, 51, 34, 38, 57, 51, 51, 51, 51, 51, 51, 51, 51, 51], - [51, 51, 6, 32, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 6, 53], - [51, 51, 6, 55, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51], - [51, 51, 11, 17, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51], - [51, 51, 13, 13, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51], - [51, 51, 31, 51, 2, 2, 2, 51, 2, 2, 2, 2, 4, 26, 27, 25], - [51, 51, 51, 51, 2, 52, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51], - [51, 51, 51, 51, 10, 9, 60, 51, 51, 51, 51, 51, 51, 51, 51, 51], - [51, 51, 51, 51, 51, 51, 51, 51, 51, 2, 51, 51, 51, 51, 51, 51], - [51, 51, 51, 51, 51, 51, 51, 51, 51, 19, 56, 51, 7, 2, 40, 23], - [51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 10, 36, 2, 51], - [51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 12, 61, 51, 51], - [51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 15, 51, 51, 51], - [51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51], - [51, 58, 22, 
48, 2, 2, 42, 3, 2, 2, 21, 16, 51, 30, 37, 43], + [5, 5, 7, 5, 50, 10, 40, 58, 58, 58, 58, 58, 58, 58, 58, 58], + [5, 42, 16, 24, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58], + [5, 53, 38, 0, 20, 9, 5, 5, 5, 5, 4, 18, 55, 56, 57, 54], + [51, 52, 5, 29, 30, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58], + [58, 2, 27, 1, 5, 5, 48, 6, 5, 5, 28, 31, 58, 35, 14, 49], + [58, 34, 32, 58, 19, 11, 62, 58, 58, 58, 58, 58, 58, 58, 58, 58], + [58, 58, 12, 37, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 12, 61], + [58, 58, 12, 44, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58], + [58, 58, 17, 45, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58], + [58, 58, 36, 58, 5, 5, 5, 58, 5, 5, 5, 5, 3, 22, 21, 23], + [58, 58, 47, 47, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58], + [58, 58, 58, 58, 5, 39, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58], + [58, 58, 58, 58, 16, 60, 41, 58, 58, 58, 58, 58, 58, 58, 58, 58], + [58, 58, 58, 58, 58, 58, 58, 58, 58, 5, 58, 58, 58, 58, 58, 58], + [58, 58, 58, 58, 58, 58, 58, 58, 58, 46, 43, 58, 13, 5, 8, 26], + [58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 16, 15, 5, 58], + [58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 25, 59, 58, 58], + [58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 33, 58, 58, 58], + [58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58], ]; - static BITSET_CANONICAL: [u64; 42] = [ + static BITSET_CANONICAL: [u64; 39] = [ 0b1111111111111111111111111111111111111111111111111111111111101111, - 0b1111111111111111111111011111111111111111111111111111110111111111, + 0b1111111101111111111111111111111111111111011111111111111111111111, + 0b0000011111111111111111111111111000000111111111111111111111111110, + 0b1111111111111111111111111111111100111111001111111111111111111111, + 0b1111111111111111111111110011111111111111111111111111111111111111, 0b1111111111111111111111111111111111111111111111111111111111111111, 0b1111111111111111111111111111111111111111111111111111111111110000, - 
0b1111111111111111111111111111111100111111001111111111111111111111, + 0b1111111111111111111111111111111111111111111111111111110000000011, + 0b1111111111111111111111111111111111111111111111110111100011111111, + 0b1111111111111111111111111111111111111111111111011111110001011111, 0b1111111111111111111111111111111100000000011111111111111111111111, + 0b1111111111111111111111111111111100000000000000000100001111100000, 0b1111111111111111111111111111111100000000000000000000000000000000, 0b1111111111111111111111111111110000000000000000000000000000000000, - 0b1111111111111111111111110011111111111111111111111111111111111111, - 0b1111111111111111000000111111111111111111111111110000001111111111, + 0b1111111111111111111111111111101111111111111111111101011101000000, + 0b1111111111111111000000011111111111110111111111111111111111111111, 0b1111111111111111000000000000000000000000000000000000000000000000, 0b1111111111000000000000000000000000000000000000000000000000000000, + 0b1111011111111111111111111111111111110111111111111111111111111101, + 0b1111001000011111101111010101000000111110001011111111110010000100, + 0b0111101111111111111111111111111111011111110111111110011110111111, + 0b0101111111011111111111111111111111111111111111111111111111111111, + 0b0011111111111111111111111111111110101010111111110011111100111111, + 0b0001111111011100000111111111111100001111110011110001111111011100, + 0b0000111111111111111111111111111111111111000011111111111111111111, 0b0000011111111111111111111111111000000000000000000000000000000000, - 0b0000000000000111111111111111111111111111111111111111111111111111, - 0b0000000000000000000000000000000000000000000000000000111111110111, - 0b0000000000000000000000000000000000000000111110000000000001111111, - 0b0000000000000000000000000001111100000000000000000000000000000011, - 0b0000000000000000000000111111111111111111111111111111111111111111, - 0b0000000000000000001000001011111111111111111111111111111111111111, - 
0b0000000000000000001111111111111111111111111111111111111111111111, - 0b0000000000001100011110000001111111111111111111111111111111111111, - 0b0000000111111111111111111111111111111111111011111111111111111111, - 0b0000010000100000000001000000000000000000000000000000000000000000, 0b0000011101100000000000000000000000000000000000000000011111111100, - 0b0000111111111111111111111111111111111111000011111111111111111111, - 0b0001111111011100000111111111111100001111110011110001111111011100, - 0b0011111111111111111111111111111110101010111111110011111100111111, - 0b0101111111011111111111111111111111111111111111111111111111111111, - 0b0111101111111111111111111111111111011111110111111110011110111111, + 0b0000010000100000000001000000000000000000000000000000000000000000, + 0b0000000111111111111111111111111111111111111011111111111111111111, + 0b0000000000001100011110000001111111111111111111111111111111111111, + 0b0000000000000000001000001011111111111111111111111111111111111111, + 0b0000000000000000000000000001111100000000000000000000000000000011, + 0b0000000000000000000000000000000000011111111111110000000000000000, + 0b0000000000000000000000000000000000000000111110000000000001111111, 0b1000000000000010000000000000000000000000000000000000000000000000, 0b1011110011001111000000000000000000000000000000000000000000100000, 0b1110011111111111111111111111111111111111111111110000000111111111, 0b1110011111111111111111111111111111111111111111110010000010111111, 0b1110101111111111110111100110010011011111111111111111111111111111, - 0b1111001000011111101111010101000000111110001011111111110010000100, - 0b1111011111111111111111111111111111110111111111111111111111111101, - 0b1111111111111111000000011111111111110111111111111111111111111111, - 0b1111111111111111111111111111101111111111111111111101011101000000, - 0b1111111111111111111111111111111100000000000000000100001111100000, - 0b1111111111111111111111111111111111111111111111011111110001011111, - 
0b1111111111111111111111111111111111111111111111110111100011111111, - 0b1111111111111111111111111111111111111111111111111111110000000011, ]; - static BITSET_MAPPING: [(u8, u8); 21] = [ - (0, 55), (0, 50), (0, 44), (0, 43), (0, 27), (0, 17), (1, 14), (1, 12), (1, 6), (2, 128), - (3, 128), (4, 32), (5, 169), (6, 32), (7, 30), (8, 157), (9, 17), (10, 16), (11, 10), - (12, 32), (13, 157), + static BITSET_MAPPING: [(u8, u8); 24] = [ + (0, 188), (0, 183), (0, 182), (0, 176), (0, 162), (0, 160), (0, 150), (0, 146), (0, 141), + (0, 55), (0, 50), (0, 44), (0, 43), (0, 27), (0, 17), (1, 180), (1, 30), (1, 24), (1, 18), + (2, 187), (2, 160), (2, 15), (3, 32), (4, 93), ]; pub fn lookup(c: char) -> bool { @@ -565,19 +545,19 @@ pub mod cased { #[rustfmt::skip] pub mod cc { - static BITSET_LAST_CHUNK_MAP: (u16, u8) = (2, 1); + static BITSET_LAST_CHUNK_MAP: (u16, u8) = (2, 0); static BITSET_CHUNKS_MAP: [u8; 2] = [ - 1, 2, + 0, 1, ]; static BITSET_INDEX_CHUNKS: [[u8; 1]; 3] = [ [0], [1], [2], ]; - static BITSET_CANONICAL: [u64; 3] = [ - 0b0000000000000000000000000000000000000000000000000000000000000000, + static BITSET_CANONICAL: [u64; 2] = [ 0b0000000000000000000000000000000011111111111111111111111111111111, 0b1000000000000000000000000000000000000000000000000000000000000000, ]; - static BITSET_MAPPING: [(u8, u8); 0] = [ + static BITSET_MAPPING: [(u8, u8); 1] = [ + (0, 160), ]; pub fn lookup(c: char) -> bool { @@ -594,155 +574,152 @@ pub mod cc { #[rustfmt::skip] pub mod grapheme_extend { - static BITSET_LAST_CHUNK_MAP: (u16, u8) = (1792, 20); + static BITSET_LAST_CHUNK_MAP: (u16, u8) = (1792, 3); static BITSET_CHUNKS_MAP: [u8; 245] = [ - 42, 34, 28, 23, 6, 11, 18, 10, 13, 40, 42, 35, 22, 3, 9, 42, 21, 42, 42, 42, 42, 42, 30, - 42, 2, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, - 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, - 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 24, 5, 8, 42, 
42, 42, 42, 42, 42, - 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, - 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 36, 42, 7, 41, 31, 42, 42, 42, 15, 37, 27, 19, 12, - 0, 16, 44, 14, 17, 29, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, - 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 42, 39, - 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, - 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 26, 42, 42, 42, 42, 42, 42, 42, - 42, 42, 38, 25, 42, 42, 42, 1, 42, 42, 4, 32, 42, 42, 33, + 34, 30, 41, 44, 17, 11, 0, 12, 9, 36, 34, 29, 43, 20, 13, 34, 21, 34, 34, 34, 34, 34, 26, + 34, 16, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 23, 18, 14, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 31, 34, 15, 35, 27, 34, 34, 34, 7, 37, 25, 4, 10, + 22, 6, 2, 8, 5, 28, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 40, 34, 33, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 24, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 32, 42, 34, 34, 34, 1, 34, 34, 19, 38, 34, 34, 39, ]; static BITSET_INDEX_CHUNKS: [[u8; 8]; 45] = [ - [4, 24, 87, 13, 138, 138, 89, 143], [5, 52, 41, 138, 138, 138, 138, 138], - [11, 138, 115, 138, 138, 138, 138, 138], [20, 86, 8, 102, 67, 47, 37, 45], - [32, 138, 138, 138, 6, 138, 138, 138], [35, 138, 138, 83, 130, 127, 71, 111], - [40, 123, 138, 100, 66, 31, 77, 75], [42, 138, 138, 138, 138, 138, 117, 138], - 
[55, 65, 94, 54, 138, 138, 138, 36], [58, 138, 138, 60, 138, 138, 138, 0], - [62, 129, 68, 142, 59, 82, 69, 105], [64, 53, 64, 97, 95, 30, 74, 17], - [76, 138, 138, 33, 78, 50, 138, 138], [80, 49, 22, 138, 138, 138, 138, 138], - [81, 21, 19, 138, 138, 138, 138, 138], [84, 138, 138, 118, 138, 138, 138, 138], - [88, 102, 57, 138, 34, 138, 138, 138], [91, 138, 56, 138, 90, 15, 18, 138], - [93, 28, 85, 27, 78, 29, 103, 25], [98, 131, 61, 138, 51, 112, 9, 16], - [101, 7, 138, 138, 7, 7, 7, 139], [106, 138, 138, 44, 138, 138, 138, 138], - [122, 138, 12, 138, 63, 138, 138, 138], [128, 43, 138, 39, 99, 125, 3, 70], - [138, 73, 117, 120, 138, 138, 138, 138], [138, 121, 138, 138, 138, 138, 138, 138], - [138, 138, 116, 138, 138, 138, 138, 138], [138, 138, 119, 138, 138, 126, 138, 138], - [138, 138, 135, 138, 138, 138, 92, 14], [138, 138, 138, 1, 138, 138, 138, 138], - [138, 138, 138, 2, 138, 114, 138, 101], [138, 138, 138, 109, 138, 10, 138, 138], - [138, 138, 138, 134, 138, 138, 138, 138], [138, 138, 138, 137, 138, 136, 138, 138], - [138, 138, 138, 138, 7, 139, 138, 138], [138, 138, 138, 138, 48, 46, 72, 23], - [138, 138, 138, 138, 108, 138, 138, 138], [138, 138, 138, 138, 133, 138, 138, 138], - [138, 138, 138, 138, 138, 96, 38, 138], [138, 138, 138, 138, 138, 107, 132, 110], - [138, 138, 138, 138, 138, 124, 138, 138], [138, 138, 138, 138, 138, 138, 138, 113], - [138, 138, 138, 138, 138, 138, 138, 138], [138, 138, 138, 141, 6, 138, 138, 138], - [140, 138, 138, 138, 79, 104, 138, 26], + [1, 85, 27, 86, 34, 84, 100, 88], [4, 60, 71, 120, 120, 120, 120, 120], + [7, 120, 120, 120, 33, 101, 120, 87], [13, 8, 120, 120, 8, 8, 8, 139], + [17, 0, 51, 120, 61, 111, 138, 97], [21, 120, 56, 120, 22, 141, 95, 120], + [24, 99, 55, 120, 79, 120, 120, 120], [28, 120, 120, 116, 120, 120, 120, 120], + [31, 92, 94, 120, 120, 120, 120, 120], [32, 63, 91, 120, 120, 120, 120, 120], + [36, 120, 120, 80, 34, 62, 120, 120], [48, 59, 48, 9, 19, 83, 38, 96], + [50, 104, 44, 140, 53, 30, 43, 102], [54, 
120, 120, 52, 120, 120, 120, 6], + [57, 47, 20, 58, 120, 120, 120, 77], [70, 120, 120, 120, 120, 120, 115, 120], + [72, 120, 113, 120, 120, 120, 120, 120], [73, 123, 120, 14, 46, 82, 35, 37], + [78, 120, 120, 29, 110, 127, 41, 109], [81, 120, 120, 120, 5, 120, 120, 120], + [93, 26, 16, 99, 45, 64, 76, 66], [103, 120, 120, 68, 120, 120, 120, 120], + [112, 89, 25, 137, 120, 120, 23, 143], [120, 39, 115, 118, 120, 120, 120, 120], + [120, 120, 114, 120, 120, 120, 120, 120], [120, 120, 117, 120, 120, 126, 120, 120], + [120, 120, 120, 67, 120, 136, 120, 13], [120, 120, 120, 107, 120, 11, 120, 120], + [120, 120, 120, 119, 120, 120, 120, 120], [120, 120, 120, 120, 2, 65, 40, 90], + [120, 120, 120, 120, 8, 139, 120, 120], [120, 120, 120, 120, 106, 120, 120, 120], + [120, 120, 120, 120, 120, 18, 75, 120], [120, 120, 120, 120, 120, 105, 129, 108], + [120, 120, 120, 120, 120, 120, 120, 120], [120, 120, 120, 120, 120, 120, 120, 135], + [120, 120, 120, 120, 120, 124, 120, 120], [120, 120, 120, 120, 130, 120, 120, 120], + [120, 120, 120, 131, 120, 120, 120, 120], [120, 120, 120, 134, 120, 133, 120, 120], + [120, 120, 120, 142, 5, 120, 120, 120], [120, 120, 132, 120, 120, 120, 10, 98], + [120, 121, 120, 120, 120, 120, 120, 120], [122, 120, 12, 120, 49, 120, 120, 120], + [128, 69, 120, 74, 15, 125, 3, 42], ]; - static BITSET_CANONICAL: [u64; 102] = [ - 0b1111101111111111111111111111111111111111111111111111111111111111, - 0b0000000000011000000000000000000000000000000000000000000000000000, - 0b0000000000000011100000000000000000000000000000000000000000000000, + static BITSET_CANONICAL: [u64; 99] = [ + 0b1000000000000000000000000000000000000000000000000000000001111111, + 0b1100000000000000000000000000000000000000000000000000000000010001, + 0b0000000000011100000000000000000000000000000111000000000000000000, 0b0000000000000001111111111100000000000000000000000000000000000000, - 0b1111111100000000000000000000000000000000000000000000000000000000, 
0b1111100001111111111111111111111111111111111111111111111111111111, 0b0000000001111111000000000000000000000000000000000000000000000000, + 0b1111101111111111111111111111111111111111111111111111111111111111, + 0b0000011011111111100000000000000000000000000000000000000000000000, 0b1111111111111111111111111111111111111111111111111111111111111111, - 0b1111111111111111000000000000000000000000000000000000000000000000, - 0b0111111111000000000000000000000000000000000000000000000000000011, + 0b1111110000000000000000000000110000000000000000000010000110111110, + 0b1011111111111111111111111111111111111111111111100000000000000000, 0b0000011111000000000000000000000000000000000000000000000000000000, - 0b0000000000000000111111000000000000000000000000000000000000000000, 0b0000000000000000000000100000000000000000000000000000000001100000, - 0b0000000000000000000000000000000000000000000000000000000000001101, - 0b0000000000000000000000000000000000000000000000000000000010110110, - 0b0000000000000000000000000000000000000000000000000000000010111111, - 0b0000000000000000000000000000000000000000000000001001111000000000, - 0b0000000000000000000000000000000000000000100000000010000000000001, - 0b0000000000000000000000000000000000000000101000110000000000000000, - 0b0000000000000000000000000000000000000011011111111111110000000000, - 0b0000000000000000000000000000000000001001100000000000000000000000, - 0b0000000000000000000000000000000000001110011111100000000010000000, - 0b0000000000000000000000000000000000100000000000000010000001100100, - 0b0000000000000000000000000000000000100000000011111111111001000000, - 0b0000000000000000000000000000000001000000000000000000000001011100, - 0b0000000000000000000000000000000010000000010111001000010000000000, - 0b0000000000000000000000000000000100001100111100000000000000000000, - 0b0000000000000000000000000000110000000000011000000011000001000100, - 0b0000000000000000000000000000110000000000011000000011110111000001, - 
0b0000000000000000000000000000110000000000100000000010000000011110, - 0b0000000000000000000000000000110000000000111000000010000000011110, - 0b0000000000000000000000000000110000000000111111100010000111111110, - 0b0000000000000000000001111101101111111001111111111111111101111111, - 0b0000000000000000000001111111100010000000000000000000000000000000, - 0b0000000000000000000011111011110011100000000000000000000000000000, - 0b0000000000000000000100000110000000000000000000000000100001000100, - 0b0000000000000000001000010010000000000000000000000000000000000000, - 0b0000000000000000001110110011110000000000000000000000000000000011, - 0b0000000000000000001111000000000000000000000000000000111111100111, - 0b0000000000000000001111011001111110011111110000000000000000000000, - 0b0000000000000000001111101110111111111011110000000000000000000000, - 0b0000000000000000111111111111111011111000000000000000000000010000, - 0b0000000000000000111111111111111100000000000000001111111111111111, - 0b0000000000000001000000000000000011111111111111111111100000000000, - 0b0000000000000001111111111111111111111111111111110000000000000000, - 0b0000000000000011101000110100000000000000000000000000000000000000, - 0b0000000000001100000000000000000000000000000011000000000000000000, - 0b0000000000001111111110000000000000000000000000000000000000000100, - 0b0000000000011100000000000000000000000000000111000000000000000000, - 0b0000000000011110000000000000000111000011000000000000000000000000, - 0b0000000000011111000111111100000000000000100000000000000000000001, - 0b0000000000011111111011111000000000000000000000000000000000000111, - 0b0000000000100000000111111111111111111111111111111111111111111111, - 0b0000000000100011000000000000000000000000000000100011100110000110, - 0b0000000001000000001100000000000000000000000000000000000000000010, - 0b0000000001100110011111100000000000000000000000000000000000000000, - 0b0000000001101101111111001111111111111111111111000000000000000000, - 
0b0000000010111111001010000000000000000000000000000000000000000000, - 0b0000000011001111111100000000000000000000000000000000000000000000, - 0b0000001010100000000000000000000000000011000000000000000000000000, - 0b0000001100010000001000011111110111111111111101110000000000000000, - 0b0000011001111000000000000000000000000000000000000000000000000011, - 0b0000011111110010000000000000000000000000000000000000000000000000, - 0b0000111000000100000000011000011100000000000000000000000000000000, - 0b0001000000000000000000000000000000000000000000000000000000000110, - 0b0001000000000000000000000000000000000000000000000001000000001000, - 0b0001010000000000000000000000000000000000000000000000000000000111, - 0b0001011111110000000000000000000000000000000000000000000000001111, - 0b0001111111110010000000000000000000000000000000000000000000000000, - 0b0001111111111111111111111111111111111110111111111110000011011111, - 0b0010000000001111111110000000000000000000000000000000000000000000, - 0b0011001111001000000000000000000000000000000000000000000000000111, - 0b0011111110110000000000000000000000000000000000000000000000000000, - 0b0011111111110111100000000000000000000000000000000000000000000000, - 0b0100000000000000000000000000000000000000000000000000000000000100, - 0b0100000000000000000000000000110000000000100000000010000000011110, - 0b0100000011010011100000000000000000000000000000000000000000000000, - 0b0101000000000000000000000000000000000000000000000000000000000010, - 0b0101100000000000000000000000000000000000000000000000000000000011, - 0b0101100000000001000000000000000000000000000000000000000000000000, - 0b0110011011111101111000000000000000000000000000000000000000000000, - 0b0111100111111000000000000000000000000000000000000000011111111110, - 0b0111111111111110000000000000000000000000000000000000000000000000, - 0b1000000000000011111111111111111100000000000000000000000000110000, - 0b1000011100000000000000000000000000000000000000001111000001101110, - 
0b1001000000000000000000000000000000000000000000000000000000000010, - 0b1001111111111000000111111110010101111111010000000000000000000000, - 0b1010010111111001000000000000000000000000000000000000000000000000, - 0b1010011111111000000000000000000000000000000000000000000000000000, - 0b1011000000111100100000000000000000000000000000000000000000000000, - 0b1011010001111110000000000000000000000000000000000000000000000000, - 0b1011111101111111000000000000000000000000000000000000000000000000, - 0b1011111111111111111111111111111111111111111111100000000000000000, - 0b1100000000000000000000000000000000000000000000000000000000010001, - 0b1100000110011101000000000000000000000000000000000000000000000000, - 0b1101000000000000000000000000000000000000000000000000000000000010, - 0b1111100000000111110000111010000000000000000000000000000000000000, - 0b1111110000000000000000000000110000000000000000000010000110111110, - 0b1111111100000000000000000000000000000000000000000000000000000010, - 0b1111111111111111000000000000000000000000000000100000000000000000, - 0b1111111111111111111111111111101111111111111110000000000000000000, 0b1111111111111111111111111111111100000000000000000000000000000000, + 0b1111111111111111111111111111101111111111111110000000000000000000, + 0b1111111111111111000000000000000000000000000000100000000000000000, + 0b1111111111111111000000000000000000000000000000000000000000000000, + 0b1111111100000000000000000000000000000000000000000000000000000010, + 0b1111100000000111110000111010000000000000000000000000000000000000, + 0b1101000000000000000000000000000000000000000000000000000000000010, + 0b1100000110011101000000000000000000000000000000000000000000000000, + 0b1011111101111111000000000000000000000000000000000000000000000000, + 0b1011010001111110000000000000000000000000000000000000000000000000, + 0b1011000000111100100000000000000000000000000000000000000000000000, + 0b1010011111111000000000000000000000000000000000000000000000000000, + 
0b1010010111111001000000000000000000000000000000000000000000000000, + 0b1001111111111000000111111110010101111111010000000000000000000000, + 0b1001000000000000000000000000000000000000000000000000000000000010, + 0b1000011100000000000000000000000000000000000000001111000001101110, + 0b1000000000000011111111111111111100000000000000000000000000110000, + 0b0111111111111110000000000000000000000000000000000000000000000000, + 0b0111100111111000000000000000000000000000000000000000011111111110, + 0b0110011011111101111000000000000000000000000000000000000000000000, + 0b0101100000000001000000000000000000000000000000000000000000000000, + 0b0101100000000000000000000000000000000000000000000000000000000011, + 0b0101000000000000000000000000000000000000000000000000000000000010, + 0b0100000011010011100000000000000000000000000000000000000000000000, + 0b0100000000000000000000000000110000000000100000000010000000011110, + 0b0100000000000000000000000000000000000000000000000000000000000100, + 0b0011111111110111100000000000000000000000000000000000000000000000, + 0b0011111110110000000000000000000000000000000000000000000000000000, + 0b0011001111001000000000000000000000000000000000000000000000000111, + 0b0010000000001111111110000000000000000000000000000000000000000000, + 0b0001111111111111111111111111111111111110111111111110000011011111, + 0b0001111111110010000000000000000000000000000000000000000000000000, + 0b0001011111110000000000000000000000000000000000000000000000001111, + 0b0001010000000000000000000000000000000000000000000000000000000111, + 0b0001000000000000000000000000000000000000000000000001000000001000, + 0b0001000000000000000000000000000000000000000000000000000000000110, + 0b0000111000000100000000011000011100000000000000000000000000000000, + 0b0000011111110010000000000000000000000000000000000000000000000000, + 0b0000011001111000000000000000000000000000000000000000000000000011, + 0b0000001100010000001000011111110111111111111101110000000000000000, + 
0b0000001010100000000000000000000000000011000000000000000000000000, + 0b0000000011001111111100000000000000000000000000000000000000000000, + 0b0000000010111111001010000000000000000000000000000000000000000000, + 0b0000000001101101111111001111111111111111111111000000000000000000, + 0b0000000001100110011111100000000000000000000000000000000000000000, + 0b0000000001000000001100000000000000000000000000000000000000000010, + 0b0000000000100011000000000000000000000000000000100011100110000110, + 0b0000000000100000000111111111111111111111111111111111111111111111, + 0b0000000000011111111011111000000000000000000000000000000000000111, + 0b0000000000011111000111111100000000000000100000000000000000000001, + 0b0000000000011110000000000000000111000011000000000000000000000000, + 0b0000000000001111111110000000000000000000000000000000000000000100, + 0b0000000000001100000000000000000000000000000011000000000000000000, + 0b0000000000000011101000110100000000000000000000000000000000000000, + 0b0000000000000011100000000000000000000000000000000000000000000000, + 0b0000000000000001111111111111111111111111111111110000000000000000, + 0b0000000000000001000000000000000011111111111111111111100000000000, + 0b0000000000000000111111111111111100000000000000001111111111111111, + 0b0000000000000000111111111111111011111000000000000000000000010000, + 0b0000000000000000111111000000000000000000000000000000000000000000, + 0b0000000000000000001111101110111111111011110000000000000000000000, + 0b0000000000000000001111011001111110011111110000000000000000000000, + 0b0000000000000000001111000000000000000000000000000000111111100111, + 0b0000000000000000001110110011110000000000000000000000000000000011, + 0b0000000000000000001000010010000000000000000000000000000000000000, + 0b0000000000000000000100000110000000000000000000000000100001000100, + 0b0000000000000000000011111011110011100000000000000000000000000000, + 0b0000000000000000000001111111100010000000000000000000000000000000, + 
0b0000000000000000000001111101101111111001111111111111111101111111, + 0b0000000000000000000000000000110000000000111111100010000111111110, + 0b0000000000000000000000000000110000000000111000000010000000011110, + 0b0000000000000000000000000000110000000000100000000010000000011110, + 0b0000000000000000000000000000110000000000011000000011110111000001, + 0b0000000000000000000000000000110000000000011000000011000001000100, + 0b0000000000000000000000000000000100001100111100000000000000000000, + 0b0000000000000000000000000000000010000000010111001000010000000000, + 0b0000000000000000000000000000000001000000000000000000000001011100, + 0b0000000000000000000000000000000000100000000011111111111001000000, + 0b0000000000000000000000000000000000100000000000000010000001100100, + 0b0000000000000000000000000000000000001110011111100000000010000000, + 0b0000000000000000000000000000000000001001100000000000000000000000, + 0b0000000000000000000000000000000000000011011111111111110000000000, + 0b0000000000000000000000000000000000000000101000110000000000000000, + 0b0000000000000000000000000000000000000000100000000010000000000001, + 0b0000000000000000000000000000000000000000000000001001111000000000, + 0b0000000000000000000000000000000000000000000000000000000010110110, ]; - static BITSET_MAPPING: [(u8, u8); 42] = [ - (0, 134), (0, 135), (0, 137), (0, 140), (0, 146), (0, 149), (0, 164), (0, 166), (0, 170), - (0, 171), (0, 185), (0, 131), (0, 133), (1, 38), (1, 42), (1, 43), (1, 50), (1, 56), - (1, 61), (2, 19), (2, 28), (2, 42), (2, 46), (3, 26), (3, 32), (3, 33), (3, 42), (4, 15), - (4, 46), (4, 7), (5, 152), (5, 173), (5, 181), (6, 19), (6, 20), (6, 32), (7, 128), - (8, 128), (9, 57), (10, 58), (11, 30), (12, 23), + static BITSET_MAPPING: [(u8, u8); 45] = [ + (0, 191), (0, 190), (0, 188), (0, 185), (0, 179), (0, 8), (0, 176), (0, 161), (0, 159), + (0, 155), (0, 154), (0, 39), (0, 140), (0, 57), (1, 165), (1, 161), (1, 160), (1, 153), + (1, 147), (1, 142), (1, 139), (2, 181), (2, 176), (2, 167), 
(2, 153), (2, 149), (3, 26), + (3, 32), (3, 33), (3, 42), (4, 88), (4, 109), (4, 117), (5, 19), (5, 20), (5, 32), (6, 67), + (6, 69), (7, 183), (7, 7), (8, 144), (9, 178), (10, 184), (11, 58), (12, 23), ]; pub fn lookup(c: char) -> bool { @@ -759,98 +736,91 @@ pub mod grapheme_extend { #[rustfmt::skip] pub mod lowercase { - static BITSET_LAST_CHUNK_MAP: (u16, u8) = (122, 10); + static BITSET_LAST_CHUNK_MAP: (u16, u8) = (122, 5); static BITSET_CHUNKS_MAP: [u8; 118] = [ - 5, 1, 16, 16, 8, 16, 16, 6, 4, 9, 16, 0, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 12, 13, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 14, 15, 16, 2, 16, 7, 16, 16, - 17, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 11, 16, - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, - 16, 3, + 16, 2, 9, 9, 4, 9, 9, 15, 3, 12, 9, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 11, 7, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 8, 10, 9, 0, 9, 14, 9, 9, 13, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 6, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 17, ]; static BITSET_INDEX_CHUNKS: [[u8; 16]; 18] = [ - [4, 31, 40, 19, 16, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66], - [4, 42, 69, 41, 18, 3, 10, 66, 66, 66, 66, 66, 66, 66, 66, 66], - [55, 68, 66, 6, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66], - [62, 15, 47, 54, 22, 60, 49, 0, 26, 61, 70, 48, 64, 65, 1, 11], - [66, 35, 71, 66, 28, 51, 9, 66, 66, 66, 66, 66, 66, 66, 66, 66], - [66, 63, 24, 50, 34, 44, 45, 38, 36, 57, 23, 14, 66, 29, 53, 27], - [66, 66, 10, 66, 2, 2, 2, 66, 40, 40, 5, 40, 21, 32, 33, 20], - [66, 66, 66, 7, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66], - [66, 66, 66, 46, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 30], - [66, 66, 66, 59, 66, 66, 66, 
66, 66, 66, 66, 66, 66, 66, 66, 66], - [66, 66, 66, 66, 56, 8, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66], - [66, 66, 66, 66, 66, 66, 66, 66, 66, 3, 66, 66, 66, 66, 66, 66], - [66, 66, 66, 66, 66, 66, 66, 66, 66, 17, 13, 66, 43, 37, 39, 25], - [66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 4, 52, 2, 66], - [66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 12, 66, 66, 66], - [66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 58, 66, 66], - [66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66], - [66, 66, 66, 67, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66], + [10, 55, 52, 6, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52], + [15, 24, 20, 34, 35, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52], + [15, 46, 1, 19, 63, 8, 54, 52, 52, 52, 52, 52, 52, 52, 52, 52], + [52, 5, 39, 52, 27, 14, 70, 52, 52, 52, 52, 52, 52, 52, 52, 52], + [52, 52, 52, 50, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 25], + [52, 52, 52, 52, 9, 53, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52], + [52, 52, 52, 52, 52, 52, 52, 52, 52, 8, 52, 52, 52, 52, 52, 52], + [52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 15, 13, 2, 52], + [52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 40, 52, 52, 52], + [52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52], + [52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 56, 52, 52], + [52, 52, 52, 52, 52, 52, 52, 52, 52, 62, 38, 52, 47, 43, 45, 29], + [52, 52, 52, 57, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52], + [52, 52, 52, 65, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52], + [52, 52, 52, 66, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52], + [52, 52, 54, 52, 2, 2, 2, 52, 20, 20, 64, 20, 32, 23, 22, 33], + [52, 69, 30, 16, 21, 48, 49, 44, 42, 7, 31, 37, 52, 26, 12, 28], + [60, 36, 51, 11, 61, 58, 17, 4, 0, 59, 71, 18, 67, 68, 3, 41], ]; - static BITSET_CANONICAL: [u64; 58] = [ - 0b1111111111111111110000000000000000000000000011111111111111111111, - 0b1111111111111111111111000000000000000000000000001111110111111111, + static BITSET_CANONICAL: 
[u64; 52] = [ + 0b0000111111111111111111111111110000000000000000000000000011111111, + 0b1010101010101010101010101010101010101010101010101010100000000010, 0b1111111111111111111111111111111111111111111111111111111111111111, + 0b1111111111111111111111000000000000000000000000001111110111111111, + 0b1111111111111111110000000000000000000000000011111111111111111111, + 0b1000000000000010000000000000000000000000000000000000000000000000, + 0b0000111111111111111111111111111111111111000000000000000000000000, + 0b1111111111111111111111111111111111111111111111111010101010000101, 0b1111111111111111111111111111111100000000000000000000000000000000, + 0b1111111111111111111111111111110000000000000000000000000000000000, + 0b1111111111111111111111110000000000000000000000000000000000000000, + 0b1111111111111111111111000000000000000000000000001111111111101111, + 0b1111111111111111111100000000000000000000000000010000000000000000, + 0b1111111111111111000000011111111111110111111111111111111111111111, + 0b1111111111111111000000000000000000000000000000000100001111000000, 0b1111111111111111000000000000000000000000000000000000000000000000, - 0b1010101010101010101010101010101010111111111010101010101010101010, - 0b0000111111111111111111111111111111111111000000000000000000000000, - 0b0000000000000111111111111111111111111111111111111111111111111111, - 0b0000000000000000000000000000000000000000000000000000000000001111, - 0b0000000000000000000000000000000000000000000000000000000000010000, - 0b0000000000000000000000000000000000000000000000000000000111111111, - 0b0000000000000000000000000000000000000000000000000000101111110111, - 0b0000000000000000000000000000000000000000111110000000000001111111, - 0b0000000000000000000000000000000000111010101010101010101010101010, - 0b0000000000000000000000000001111100000000000000000000000000000011, - 0b0000000000000000000000001111111111111111110111111100000000000000, - 0b0000000000000000001000001011111111111111111111111111111111111111, - 
0b0000000000000000001010101010101010101010101010101010101010101010, - 0b0000000000000000101010101010101010101010101010101010101010101010, - 0b0000000000001000010100000001101010101010101010101010101010101010, - 0b0000000011011100000000001111111100000000110011110000000011011100, - 0b0000000011111111000000001111111100000000001111110000000011111111, - 0b0000000011111111111111111111111111000000000000000000000000001111, - 0b0000000111111111111111111111111111111111111011111111111111111111, - 0b0000010000100000000001000000000000000000000000000000000000000000, - 0b0000011101000000000000000000000000000000000000000000010100001000, - 0b0000111111111111111111111111110000000000000000000000000011111111, - 0b0001100100101111101010101010101010101010111000110111111111111111, - 0b0011001000010000100000000000000000000000000010001100010000000000, - 0b0011110010001010000000000000000000000000000000000000000000100000, - 0b0011111100000000000000000000000000000000000000000000000000000000, - 0b0011111111011010000101010110001001111111111111111111111111111111, - 0b0011111111111111000000001111111100000000111111110000000000111111, - 0b0100000011011111000000001111111100000000111111110000000011111111, + 0b1111111101111111111111111111111110000000000000000000000000000000, + 0b1111110000000000000000000000000011111111111111111111111111000000, + 0b1111000000000000000000000000001111110111111111111111111111111100, + 0b1010101010101010101010101010101010101010101010101101010101010100, + 0b1010101010101010101010101010101010101010101010101010101010101010, 0b0101010110101010101010101010101010101010101010101010101010101010, - 0b1000000000000010000000000000000000000000000000000000000000000000, + 0b0100000011011111000000001111111100000000111111110000000011111111, + 0b0011111111111111000000001111111100000000111111110000000000111111, + 0b0011111111011010000101010110001001111111111111111111111111111111, + 0b0011111100000000000000000000000000000000000000000000000000000000, + 
0b0011110010001010000000000000000000000000000000000000000000100000, + 0b0011001000010000100000000000000000000000000010001100010000000000, + 0b0001100100101111101010101010101010101010111000110111111111111111, + 0b0000011101000000000000000000000000000000000000000000010100001000, + 0b0000010000100000000001000000000000000000000000000000000000000000, + 0b0000000111111111111111111111111111111111111011111111111111111111, + 0b0000000011111111000000001111111100000000001111110000000011111111, + 0b0000000011011100000000001111111100000000110011110000000011011100, + 0b0000000000001000010100000001101010101010101010101010101010101010, + 0b0000000000000000001000001011111111111111111111111111111111111111, + 0b0000000000000000000000001111111111111111110111111100000000000000, + 0b0000000000000000000000000001111100000000000000000000000000000011, + 0b0000000000000000000000000000000000111010101010101010101010101010, + 0b0000000000000000000000000000000000011111111111110000000000000000, + 0b0000000000000000000000000000000000000000111110000000000001111111, + 0b0000000000000000000000000000000000000000000000000000101111110111, 0b1001001111111010101010101010101010101010101010101010101010101010, 0b1001010111111111101010101010101010101010101010101010101010101010, 0b1010101000101001101010101010101010110101010101010101001001000000, 0b1010101010100000100000101010101010101010101110100101000010101010, - 0b1010101010101010101010101010101010101010101010101010101010101010, - 0b1010101010101010101010101010101010101010101010101101010101010100, 0b1010101010101010101010101010101011111111111111111111111111111111, 0b1010101010101011101010101010100000000000000000000000000000000000, 0b1101010010101010101010101010101010101010101010101010101101010101, 0b1110011001010001001011010010101001001110001001000011000100101001, 0b1110011111111111111111111111111111111111111111110000000000000000, 0b1110101111000000000000000000000000001111111111111111111111111100, - 
0b1111000000000000000000000000001111110111111111111111111111111100, - 0b1111110000000000000000000000000011111111111111111111111111000000, - 0b1111111101111111111111111111111110000000000000000000000000000000, - 0b1111111111111111000000000000000000000000000000000100001111000000, - 0b1111111111111111000000011111111111110111111111111111111111111111, - 0b1111111111111111111100000000000000000000000000010000000000000000, - 0b1111111111111111111111000000000000000000000000001111111111101111, - 0b1111111111111111111111110000000000000000000000000000000000000000, - 0b1111111111111111111111111111110000000000000000000000000000000000, - 0b1111111111111111111111111111111111111111111111111010101010000101, ]; - static BITSET_MAPPING: [(u8, u8); 14] = [ - (0, 173), (0, 188), (0, 190), (0, 130), (0, 134), (0, 141), (1, 12), (1, 6), (2, 128), - (3, 32), (4, 16), (5, 173), (6, 142), (7, 157), + static BITSET_MAPPING: [(u8, u8); 20] = [ + (0, 188), (0, 184), (0, 179), (0, 172), (0, 161), (0, 146), (0, 144), (0, 140), (0, 136), + (0, 132), (1, 146), (1, 144), (1, 83), (2, 160), (2, 141), (3, 12), (3, 6), (4, 77), + (5, 187), (6, 78), ]; pub fn lookup(c: char) -> bool { @@ -867,100 +837,97 @@ pub mod lowercase { #[rustfmt::skip] pub mod n { - static BITSET_LAST_CHUNK_MAP: (u16, u8) = (253, 40); + static BITSET_LAST_CHUNK_MAP: (u16, u8) = (253, 21); static BITSET_CHUNKS_MAP: [u8; 249] = [ - 5, 41, 41, 21, 37, 23, 11, 18, 16, 35, 41, 27, 2, 45, 46, 41, 9, 41, 15, 34, 41, 41, 29, - 41, 1, 3, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, - 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, - 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 4, 6, 20, 41, 41, 41, 41, 41, 41, - 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, - 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 33, 32, 28, 25, 41, 13, 10, 26, 12, 8, 30, - 19, 17, 43, 41, 7, 38, 41, 41, 0, 41, 41, 41, 41, 41, 
41, 41, 41, 41, 41, 41, 41, 41, 41, - 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 22, 41, 24, - 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, - 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, - 41, 41, 41, 31, 41, 39, 41, 41, 41, 41, 36, 30, 41, 41, 44, 41, 14, 41, 42, + 45, 19, 19, 39, 23, 40, 6, 37, 33, 17, 19, 12, 42, 32, 41, 19, 8, 19, 2, 16, 19, 19, 13, + 19, 1, 43, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 44, 46, 34, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 25, 15, 14, 31, 19, 4, 7, 11, 5, 9, 26, 36, + 35, 28, 19, 10, 20, 19, 19, 0, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 38, 19, 30, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 27, 19, 18, 19, 19, 19, 19, 22, 26, 19, 19, 29, 19, 3, 19, 24, ]; static BITSET_INDEX_CHUNKS: [[u8; 8]; 47] = [ - [7, 23, 67, 67, 67, 67, 67, 67], [32, 67, 67, 67, 67, 67, 66, 67], - [50, 67, 67, 67, 67, 49, 67, 18], [52, 20, 8, 67, 67, 67, 67, 67], - [52, 67, 67, 53, 67, 67, 67, 67], [54, 67, 38, 67, 67, 67, 67, 67], - [58, 67, 67, 50, 48, 67, 67, 31], [67, 22, 67, 67, 67, 50, 52, 67], - [67, 24, 67, 54, 0, 67, 67, 26], [67, 30, 48, 67, 67, 46, 14, 67], - [67, 37, 72, 60, 67, 42, 64, 67], [67, 39, 67, 53, 67, 28, 67, 53], - [67, 40, 67, 67, 51, 65, 67, 63], [67, 41, 13, 3, 57, 67, 56, 1], - [67, 43, 25, 67, 36, 67, 67, 67], [67, 45, 19, 44, 67, 67, 
67, 67], - [67, 48, 50, 67, 67, 67, 67, 67], [67, 50, 67, 48, 33, 67, 67, 67], - [67, 50, 67, 50, 62, 67, 67, 67], [67, 50, 67, 50, 67, 67, 67, 67], - [67, 50, 67, 67, 67, 67, 67, 54], [67, 52, 67, 54, 67, 67, 67, 48], - [67, 52, 67, 67, 67, 21, 67, 67], [67, 53, 67, 53, 67, 27, 67, 11], - [67, 67, 17, 67, 67, 67, 67, 67], [67, 67, 52, 67, 67, 67, 67, 67], - [67, 67, 67, 2, 54, 67, 67, 67], [67, 67, 67, 12, 67, 67, 67, 9], - [67, 67, 67, 34, 6, 15, 67, 59], [67, 67, 67, 35, 67, 67, 67, 67], - [67, 67, 67, 54, 67, 67, 67, 67], [67, 67, 67, 62, 67, 68, 67, 67], - [67, 67, 67, 67, 10, 5, 55, 67], [67, 67, 67, 67, 50, 67, 67, 67], - [67, 67, 67, 67, 67, 0, 61, 67], [67, 67, 67, 67, 67, 4, 67, 67], - [67, 67, 67, 67, 67, 48, 67, 67], [67, 67, 67, 67, 67, 53, 67, 29], - [67, 67, 67, 67, 67, 67, 67, 16], [67, 67, 67, 67, 67, 67, 67, 47], - [67, 67, 67, 67, 67, 67, 67, 54], [67, 67, 67, 67, 67, 67, 67, 67], - [67, 67, 67, 67, 71, 67, 67, 67], [67, 67, 67, 70, 67, 50, 67, 67], - [67, 67, 67, 73, 67, 50, 67, 67], [67, 67, 69, 67, 67, 50, 54, 67], - [67, 69, 67, 67, 67, 67, 67, 67], + [12, 52, 44, 44, 44, 44, 44, 44], [27, 44, 44, 44, 44, 44, 67, 44], + [44, 15, 51, 16, 44, 44, 44, 44], [44, 17, 34, 44, 23, 44, 44, 44], + [44, 18, 11, 4, 62, 44, 61, 2], [44, 19, 44, 44, 56, 66, 44, 46], + [44, 20, 44, 58, 44, 31, 44, 58], [44, 22, 72, 65, 44, 43, 53, 44], + [44, 29, 45, 44, 44, 14, 42, 44], [44, 36, 44, 59, 1, 44, 44, 33], + [44, 37, 44, 44, 44, 55, 57, 44], [44, 44, 44, 3, 59, 44, 44, 44], + [44, 44, 44, 10, 44, 44, 44, 8], [44, 44, 44, 24, 44, 44, 44, 44], + [44, 44, 44, 25, 5, 41, 44, 64], [44, 44, 44, 44, 9, 0, 60, 44], + [44, 44, 44, 44, 44, 1, 48, 44], [44, 44, 44, 44, 44, 7, 44, 44], + [44, 44, 44, 44, 44, 44, 44, 13], [44, 44, 44, 44, 44, 44, 44, 44], + [44, 44, 44, 44, 44, 44, 44, 49], [44, 44, 44, 44, 44, 44, 44, 59], + [44, 44, 44, 44, 44, 45, 44, 44], [44, 44, 44, 44, 44, 58, 44, 30], + [44, 44, 44, 44, 47, 44, 44, 44], [44, 44, 44, 44, 55, 44, 44, 44], + [44, 44, 
44, 59, 44, 44, 44, 44], [44, 44, 44, 69, 44, 68, 44, 44], + [44, 44, 44, 71, 44, 55, 44, 44], [44, 44, 44, 73, 44, 55, 44, 44], + [44, 44, 50, 44, 44, 44, 44, 44], [44, 44, 57, 44, 44, 44, 44, 44], + [44, 44, 70, 44, 44, 55, 59, 44], [44, 45, 55, 44, 44, 44, 44, 44], + [44, 55, 44, 44, 44, 44, 44, 59], [44, 55, 44, 45, 26, 44, 44, 44], + [44, 55, 44, 55, 44, 44, 44, 44], [44, 55, 44, 55, 69, 44, 44, 44], + [44, 57, 44, 44, 44, 38, 44, 44], [44, 57, 44, 59, 44, 44, 44, 45], + [44, 58, 44, 58, 44, 32, 44, 35], [44, 70, 44, 44, 44, 44, 44, 44], + [55, 44, 44, 44, 44, 54, 44, 40], [57, 39, 6, 44, 44, 44, 44, 44], + [57, 44, 44, 58, 44, 44, 44, 44], [59, 44, 21, 44, 44, 44, 44, 44], + [63, 44, 44, 55, 45, 44, 44, 28], ]; - static BITSET_CANONICAL: [u64; 48] = [ + static BITSET_CANONICAL: [u64; 44] = [ + 0b0000000111111111111111111111111111111111111111111111111111111111, 0b1111111111000000000000000000000000000000000000000000000000000000, 0b1111111111111111111111111111111111111111111111001111111111111111, 0b1111110000000000000000000000000000000000000000000000000000000000, 0b1111100000000000000000000000000000000000000000000000000000000000, - 0b0001111111111111111111100000000000000000000000000000000000000000, - 0b0000000111111111111111111111111111111111111111111111111111111111, 0b0000000000000000000000000000111100000000000000000000000000000000, - 0b1111111111111111111111111111111111111111111111111111111111111111, 0b1111111111111110000000000000000000000000000000000000001111111111, + 0b0001111111111111111111100000000000000000000000000000000000000000, 0b0000001111111111000000111111111100000000000000000000000000000000, 0b0000000000001111111111111111111111111111111111111111111110000000, - 0b0000000000000111111111111100000000000000000000000000000000000000, 0b0000000000000001110000000000000000000000000000000000000000000000, 0b0000000000000000111111111000000000000000000000000000000000000000, - 0b0000000000000000000000000000000000000000000000000000001111100111, - 
0b0000000000000000000000000000000000000000000000000000010000000010, - 0b0000000000000000000000000000000000000000000111111111111111111111, - 0b0000000000000000000000000000000000000000011111111111111111111111, - 0b0000000000000000000000000000000000000111111111110000000000000000, - 0b0000000000000000000000000000000000001111111111111111111111111111, - 0b0000000000000000000000000000000011111111111111101111111100000000, - 0b0000000000000000000000000000001111111011111111110000000000000000, - 0b0000000000000000000111111111111111111111111111110000000000000000, - 0b0000000000000000011111111111111111111111111111111111111111111111, - 0b0000000000000000111111111111111111111111111111000000000000000000, - 0b0000000000011110111011111111111111111111111111111111111111111111, - 0b0000000000011111111111111111111000000011111111110000000000000000, - 0b0000000011111100111111111100000000000000000000000000000000000000, - 0b0000000111111111111111111100000001111111000000000000000000000000, - 0b0000001111110000111111111100000000000000000000000000000000000000, - 0b0000001111110001000000000000000000000000000000000000000000000000, - 0b0000001111111111000000000000000000000011111111110000000000000000, - 0b0000011100000000000000111111111000000000000000000000000010000000, - 0b0000111111111111000000000000000000000000000000000000000000000000, - 0b0000111111111111111111111111111000000000000000000000000000000000, - 0b0010000000000000000000000000000000000000000000000000000000000000, - 0b0011111111111111101111111111111111111111111111111111111111111110, - 0b0110000000000000000000000000000000000000000000000000000111111111, - 0b0111001000001100000000000000000000000000000000000000000000000000, - 0b0111111100000000111111111100000000000000000000000000000000000000, - 0b0111111111111111111111111111111100000000000000000000000000000000, + 0b1111111111111111111111111111111111111111111111111111111111111111, + 0b1111111111111111111111111111111111111111111111111100000000000000, + 
0b1111111111111111111111111111111111111111111111110000000000000000, + 0b1111111111111111111111111111111100000000000000000000000000000000, + 0b1111111111111111111111000000000000000000000000000000000000000000, + 0b1111111111111110000000000000000000000000000000000000000000000000, 0b1111111000000000000000000000000011111111000000000000000000000000, + 0b0111111111111111111111111111111100000000000000000000000000000000, + 0b0111111100000000111111111100000000000000000000000000000000000000, + 0b0111001000001100000000000000000000000000000000000000000000000000, + 0b0110000000000000000000000000000000000000000000000000000111111111, + 0b0011111111111111101111111111111111111111111111111111111111111110, + 0b0010000000000000000000000000000000000000000000000000000000000000, + 0b0000111111111111111111111111111000000000000000000000000000000000, + 0b0000111111111111000000000000000000000000000000000000000000000000, + 0b0000011100000000000000111111111000000000000000000000000010000000, + 0b0000001111111111000000000000000000000011111111110000000000000000, + 0b0000001111110001000000000000000000000000000000000000000000000000, + 0b0000001111110000111111111100000000000000000000000000000000000000, + 0b0000000111111111111111111100000001111111000000000000000000000000, + 0b0000000011111100111111111100000000000000000000000000000000000000, + 0b0000000000011111111111111111111000000011111111110000000000000000, + 0b0000000000011110111011111111111111111111111111111111111111111111, + 0b0000000000000111111111111100000000000000000000000000000000000000, + 0b0000000000000000111111111111111111111111111111000000000000000000, + 0b0000000000000000000111111111111111111111111111110000000000000000, + 0b0000000000000000000000000000001111111011111111110000000000000000, + 0b0000000000000000000000000000000011111111111111101111111100000000, + 0b0000000000000000000000000000000000000111111111110000000000000000, + 0b0000000000000000000000000000000000000000000000000000010000000010, + 
0b0000000000000000000000000000000000000000000000000000001111100111, 0b1111111100000000000000000000000011111111000000000000000000000000, - 0b1111111111111110000000000000000000000000000000000000000000000000, - 0b1111111111111111111111000000000000000000000000000000000000000000, - 0b1111111111111111111111111111111100000000000000000000000000000000, - 0b1111111111111111111111111111111111111111111111110000000000000000, - 0b1111111111111111111111111111111111111111111111111100000000000000, ]; - static BITSET_MAPPING: [(u8, u8); 26] = [ - (0, 10), (0, 16), (0, 26), (0, 39), (0, 42), (0, 48), (0, 58), (1, 186), (1, 172), (2, 28), - (2, 54), (3, 22), (3, 48), (4, 23), (4, 55), (5, 140), (5, 176), (6, 49), (6, 50), (7, 128), - (8, 47), (9, 32), (10, 172), (11, 26), (12, 47), (13, 32), + static BITSET_MAPPING: [(u8, u8); 30] = [ + (0, 185), (0, 175), (0, 76), (0, 172), (0, 165), (0, 164), (0, 162), (0, 157), (0, 138), + (0, 112), (1, 16), (1, 26), (1, 39), (1, 42), (1, 48), (1, 58), (2, 122), (2, 108), (3, 28), + (3, 54), (4, 22), (4, 48), (5, 49), (5, 50), (6, 47), (7, 55), (8, 32), (9, 108), (10, 47), + (11, 32), ]; pub fn lookup(c: char) -> bool { @@ -979,73 +946,66 @@ pub mod n { pub mod uppercase { static BITSET_LAST_CHUNK_MAP: (u16, u8) = (124, 3); static BITSET_CHUNKS_MAP: [u8; 123] = [ - 12, 16, 4, 4, 2, 4, 4, 11, 8, 0, 4, 14, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 6, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 5, 4, 13, 4, 10, 4, 4, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 7, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 15, 4, 4, - 4, 4, 9, + 12, 15, 5, 5, 0, 5, 5, 2, 4, 11, 5, 14, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 8, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 6, 5, 13, 5, 10, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 7, 5, 5, 5, 5, 5, 5, 5, 
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 16, 5, 5, + 5, 5, 9, ]; static BITSET_INDEX_CHUNKS: [[u8; 16]; 17] = [ - [8, 8, 2, 57, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8], - [8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8], - [8, 8, 4, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 60], - [8, 8, 8, 8, 1, 49, 59, 8, 8, 8, 8, 8, 8, 8, 8, 8], - [8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8], - [8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 54, 8, 8, 8], - [8, 8, 8, 8, 8, 8, 8, 8, 8, 17, 13, 8, 31, 37, 35, 23], - [8, 8, 8, 8, 8, 8, 8, 8, 8, 63, 8, 8, 8, 8, 8, 8], - [8, 8, 8, 8, 42, 20, 66, 8, 8, 8, 8, 8, 8, 8, 8, 8], - [8, 8, 8, 8, 64, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8], - [8, 8, 22, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8], - [8, 8, 45, 8, 8, 8, 8, 8, 34, 34, 65, 34, 47, 19, 27, 28], - [8, 51, 8, 14, 41, 30, 29, 36, 38, 10, 8, 8, 8, 40, 16, 44], - [15, 8, 1, 11, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8], - [18, 43, 34, 21, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8], - [55, 0, 24, 52, 39, 50, 25, 53, 46, 56, 5, 26, 3, 62, 61, 7], - [58, 32, 6, 33, 48, 12, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8], + [41, 41, 5, 33, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 5, 0], + [41, 41, 5, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41], + [41, 41, 38, 41, 41, 41, 41, 41, 17, 17, 61, 17, 40, 29, 24, 23], + [41, 41, 41, 41, 9, 8, 42, 41, 41, 41, 41, 41, 41, 41, 41, 41], + [41, 41, 41, 41, 35, 28, 65, 41, 41, 41, 41, 41, 41, 41, 41, 41], + [41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41], + [41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 56, 41, 41, 41], + [41, 41, 41, 41, 41, 41, 41, 41, 41, 46, 41, 41, 41, 41, 41, 41], + [41, 41, 41, 41, 41, 41, 41, 41, 41, 60, 59, 41, 20, 14, 16, 4], + [41, 41, 41, 41, 47, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41], + [41, 41, 51, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41], + [41, 41, 52, 43, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41], + [41, 53, 41, 31, 34, 21, 22, 15, 13, 32, 41, 41, 41, 11, 30, 37], + [48, 41, 9, 44, 41, 41, 41, 41, 41, 41, 41, 41, 41, 
41, 41, 41], + [49, 36, 17, 27, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41], + [50, 19, 2, 18, 10, 45, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41], + [57, 1, 26, 54, 12, 7, 25, 55, 39, 58, 6, 3, 64, 63, 62, 66], ]; - static BITSET_CANONICAL: [u64; 51] = [ + static BITSET_CANONICAL: [u64; 41] = [ + 0b0000000000111111111111111111111111111111111111111111111111111111, 0b1111111111111111111111110000000000000000000000000011111111111111, - 0b1111111111111111000000000000000000000000000000000000000000000000, - 0b1111111111000000000000000000000000000000000000000000000000000000, - 0b0000000000011111111111111111111111110000000000000000000000000000, - 0b1111111111111111111111111111111100000000000000000000000000000000, - 0b1111111111111111111111110000000000000000000000000000001111111111, 0b0101010101010101010101010101010101010101010101010101010000000001, - 0b0000000000000000000000000000000000000000000000000000010000000000, - 0b0000000000000000000000000000000000000000000000000000000000000000, - 0b0000000000000000000000000000000000000000000000000010000010111111, - 0b0000000000000000000000000000000000000000000000000101010101111010, - 0b0000000000000000000000000000000000000000000011111111111111111111, - 0b0000000000000000000000000000000000000000011111111111111111111111, - 0b0000000000000000000000000000000000000101010101010101010101010101, - 0b0000000000000000000000000000000001111111011111111111111111111111, - 0b0000000000000000000000001111111111111111111111111111111111111111, - 0b0000000000000000000011111111101111111111111111101101011101000000, - 0b0000000000000000000101010101010101010101010101010101010101010101, - 0b0000000000000000011111111111111111111111111111111111111111111111, - 0b0000000000000000111111110000000010101010000000000011111100000000, - 0b0000000000000000111111111111111100000000000000000000000000100000, - 0b0000000000000100001010000000010101010101010101010101010101010101, - 0b0000000000000111111111111111111111111111111111111111111111111111, - 
0b0000000000100000000000000000000000000000000000000000001011110100, - 0b0000000000111111110111100110010011010000000000000000000000000011, - 0b0000001111111111111111111111111100000000000000000000000000111111, 0b0000011111111111111111111111110000000000000000000000000000000001, - 0b0000111100000000000000000000000000000000000000000000000000000000, - 0b0000111100000000000111110000000000001111000000000000111100000000, - 0b0001000110101110110100101101010110110001110110111100111011010110, - 0b0010101101010101010101010101010101010101010101010101010010101010, - 0b0101010101010100010101010101010000000000000000000000000000000000, - 0b0101010101010101010101010101010100000000000000000000000000000000, - 0b0101010101010101010101010101010101010101010101010010101010101011, - 0b0101010101010101010101010101010101010101010101010101010101010101, - 0b0101010101011111011111010101010101010101010001010010100001010101, - 0b0101010111010010010101010101010101001010101010101010010010010000, - 0b0110101000000000010101010101010101010101010101010101010101010101, - 0b0110110000000101010101010101010101010101010101010101010101010101, - 0b0111101100000000000000000000000000011111110111111110011110110000, + 0b0000000000100000000000000000000000000000000000000000001011110100, + 0b1111111111111111111111111111111100000000000000000000000000000000, + 0b1111111111111111111111110000000000000000000000000000001111111111, + 0b1111111111111111111100000000000000000000000000011111110001011111, + 0b1111111111111111000000111111111111111111111111110000001111111111, + 0b1111111111111111000000000000000000000000000000000000000000000000, + 0b1111111111111110010101010101010101010101010101010101010101010101, 0b1000000001000101000000000000000000000000000000000000000000000000, + 0b0111101100000000000000000000000000011111110111111110011110110000, + 0b0110110000000101010101010101010101010101010101010101010101010101, + 0b0110101000000000010101010101010101010101010101010101010101010101, + 
0b0101010111010010010101010101010101001010101010101010010010010000, + 0b0101010101011111011111010101010101010101010001010010100001010101, + 0b0101010101010101010101010101010101010101010101010101010101010101, + 0b0101010101010101010101010101010101010101010101010010101010101011, + 0b0101010101010101010101010101010100000000000000000000000000000000, + 0b0101010101010100010101010101010000000000000000000000000000000000, + 0b0010101101010101010101010101010101010101010101010101010010101010, + 0b0001000110101110110100101101010110110001110110111100111011010110, + 0b0000111100000000000111110000000000001111000000000000111100000000, + 0b0000111100000000000000000000000000000000000000000000000000000000, + 0b0000001111111111111111111111111100000000000000000000000000111111, + 0b0000000000111111110111100110010011010000000000000000000000000011, + 0b0000000000000100001010000000010101010101010101010101010101010101, + 0b0000000000000000111111111111111100000000000000000000000000100000, + 0b0000000000000000111111110000000010101010000000000011111100000000, + 0b0000000000000000000011111111101111111111111111101101011101000000, + 0b0000000000000000000000000000000001111111011111111111111111111111, + 0b0000000000000000000000000000000000000000000000000101010101111010, + 0b0000000000000000000000000000000000000000000000000010000010111111, 0b1010101001010101010101010101010101010101010101010101010101010101, 0b1100000000001111001111010101000000111110001001110011100010000100, 0b1100000000100101111010101001110100000000000000000000000000000000, @@ -1053,13 +1013,11 @@ pub mod uppercase { 0b1110011111111111111111111111111111111111111111110000000000000000, 0b1111000000000000000000000000001111111111111111111111111100000000, 0b1111111100000000111111110000000000111111000000001111111100000000, - 0b1111111111111110010101010101010101010101010101010101010101010101, - 0b1111111111111111000000111111111111111111111111110000001111111111, - 0b1111111111111111111100000000000000000000000000011111110001011111, ]; - 
static BITSET_MAPPING: [(u8, u8); 16] = [ - (0, 179), (0, 130), (0, 134), (0, 147), (0, 12), (0, 8), (1, 16), (1, 128), (2, 10), - (2, 128), (3, 52), (3, 58), (4, 32), (5, 24), (6, 20), (7, 57), + static BITSET_MAPPING: [(u8, u8); 26] = [ + (0, 182), (0, 74), (0, 166), (0, 162), (0, 159), (0, 150), (0, 148), (0, 142), (0, 135), + (0, 134), (0, 131), (0, 64), (1, 115), (1, 66), (1, 70), (1, 83), (1, 12), (1, 8), (2, 164), + (2, 146), (2, 20), (3, 146), (3, 140), (3, 134), (4, 178), (4, 171), ]; pub fn lookup(c: char) -> bool { @@ -1078,20 +1036,19 @@ pub mod uppercase { pub mod white_space { static BITSET_LAST_CHUNK_MAP: (u16, u8) = (32, 3); static BITSET_CHUNKS_MAP: [u8; 22] = [ - 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 1, + 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 1, ]; static BITSET_INDEX_CHUNKS: [[u8; 6]; 4] = [ - [1, 1, 1, 1, 1, 1], [1, 1, 4, 0, 1, 1], [3, 1, 2, 1, 1, 1], [5, 1, 1, 1, 1, 1], + [1, 4, 2, 4, 4, 4], [4, 4, 0, 3, 4, 4], [4, 4, 4, 4, 4, 4], [5, 4, 4, 4, 4, 4], ]; - static BITSET_CANONICAL: [u64; 5] = [ - 0b0000000000000000000000000000000010000000000000000000000000000000, - 0b0000000000000000000000000000000000000000000000000000000000000000, - 0b0000000000000000000000000000000100000000000000000000000000100000, - 0b0000000000000000000000000000000100000000000000000011111000000000, + static BITSET_CANONICAL: [u64; 4] = [ 0b0000000000000000100000110000000000000000000000000000011111111111, + 0b0000000000000000000000000000000100000000000000000011111000000000, + 0b0000000000000000000000000000000100000000000000000000000000100000, + 0b0000000000000000000000000000000010000000000000000000000000000000, ]; - static BITSET_MAPPING: [(u8, u8); 1] = [ - (0, 33), + static BITSET_MAPPING: [(u8, u8); 2] = [ + (0, 176), (0, 175), ]; pub fn lookup(c: char) -> bool { diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index 5e8865fc9e3b5..65ece05043a8a 100644 --- 
a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -286,13 +286,18 @@ fn range_search< } else { let (real_idx, mapping) = bitset_canonicalized[idx - CANONICAL]; let mut word = bitset_canonical[real_idx as usize]; - let should_invert = mapping & (1 << 7) != 0; + let should_invert = mapping & (1 << 6) != 0; if should_invert { word = !word; } - // Unset the inversion bit - let rotate_by = mapping & !(1 << 7); - word = word.rotate_left(rotate_by as u32); + // Lower 6 bits + let quantity = mapping & ((1 << 6) - 1); + if mapping & (1 << 7) != 0 { + // shift + word >>= quantity as u64; + } else { + word = word.rotate_left(quantity as u32); + } word }; (word & (1 << (needle % 64) as u64)) != 0 diff --git a/src/tools/unicode-table-generator/src/raw_emitter.rs b/src/tools/unicode-table-generator/src/raw_emitter.rs index 38b36c3404228..a0814fd0d3663 100644 --- a/src/tools/unicode-table-generator/src/raw_emitter.rs +++ b/src/tools/unicode-table-generator/src/raw_emitter.rs @@ -233,6 +233,7 @@ impl Canonicalized { Rotate(u32), Invert, RotateAndInvert(u32), + ShiftRight(u32), } // key is the word being mapped to @@ -270,6 +271,18 @@ impl Canonicalized { continue 'b; } } + + // All possible shifts + for shift_by in 1..64 { + if a == (b >> shift_by) { + mappings + .entry(b) + .or_default() + .push((a, Mapping::ShiftRight(shift_by as u32))); + // We're not interested in further mappings between a and b + continue 'b; + } + } } } // These are the bitset words which will be represented "raw" (as a u64) @@ -384,6 +397,8 @@ impl Canonicalized { assert!(distinct_indices.insert(idx)); } + const LOWER_6: u32 = (1 << 6) - 1; + let canonicalized_words = canonicalized_words .into_iter() .map(|v| { @@ -391,14 +406,18 @@ impl Canonicalized { u8::try_from(v.0).unwrap(), match v.1 { Mapping::RotateAndInvert(amount) => { - assert!(amount < (1 << 7)); - 1 << 7 | (amount as u8) + assert_eq!(amount, amount & LOWER_6); + 1 << 6 | (amount as u8) } 
Mapping::Rotate(amount) => { - assert!(amount < (1 << 7)); + assert_eq!(amount, amount & LOWER_6); amount as u8 } - Mapping::Invert => 1 << 7, + Mapping::Invert => 1 << 6, + Mapping::ShiftRight(shift_by) => { + assert_eq!(shift_by, shift_by & LOWER_6); + 1 << 7 | (shift_by as u8) + } }, ) }) From 5f71d98f90354f9ee67c2b77c8607fbc9169d63e Mon Sep 17 00:00:00 2001 From: Mark Rousskov Date: Sat, 21 Mar 2020 12:20:18 -0400 Subject: [PATCH 07/14] Deduplicate test and primary range_search definitions This ensures that what we test is what we get for final results as well. --- src/libcore/unicode/mod.rs | 45 -------------- src/libcore/unicode/unicode_data.rs | 51 +++++++++++++++- src/tools/unicode-table-generator/src/main.rs | 59 ++----------------- .../src/range_search.rs | 49 +++++++++++++++ 4 files changed, 103 insertions(+), 101 deletions(-) create mode 100644 src/tools/unicode-table-generator/src/range_search.rs diff --git a/src/libcore/unicode/mod.rs b/src/libcore/unicode/mod.rs index 39532166a0b66..94a2507e26ccb 100644 --- a/src/libcore/unicode/mod.rs +++ b/src/libcore/unicode/mod.rs @@ -32,48 +32,3 @@ pub use unicode_data::lowercase::lookup as Lowercase; pub use unicode_data::n::lookup as N; pub use unicode_data::uppercase::lookup as Uppercase; pub use unicode_data::white_space::lookup as White_Space; - -#[inline(always)] -fn range_search< - const N: usize, - const CHUNK_SIZE: usize, - const N1: usize, - const CANONICAL: usize, - const CANONICALIZED: usize, ->( - needle: u32, - chunk_idx_map: &[u8; N], - (last_chunk_idx, last_chunk_mapping): (u16, u8), - bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1], - bitset_canonical: &[u64; CANONICAL], - bitset_canonicalized: &[(u8, u8); CANONICALIZED], -) -> bool { - let bucket_idx = (needle / 64) as usize; - let chunk_map_idx = bucket_idx / CHUNK_SIZE; - let chunk_piece = bucket_idx % CHUNK_SIZE; - let chunk_idx = if chunk_map_idx >= N { - if chunk_map_idx == last_chunk_idx as usize { - last_chunk_mapping - } else { - return 
false; - } - } else { - chunk_idx_map[chunk_map_idx] - }; - let idx = bitset_chunk_idx[(chunk_idx as usize)][chunk_piece] as usize; - let word = if idx < CANONICAL { - bitset_canonical[idx] - } else { - let (real_idx, mapping) = bitset_canonicalized[idx - CANONICAL]; - let mut word = bitset_canonical[real_idx as usize]; - let should_invert = mapping & (1 << 6) != 0; - if should_invert { - word = !word; - } - // Unset the inversion bit - let rotate_by = mapping & !(1 << 6); - word = word.rotate_left(rotate_by as u32); - word - }; - (word & (1 << (needle % 64) as u64)) != 0 -} diff --git a/src/libcore/unicode/unicode_data.rs b/src/libcore/unicode/unicode_data.rs index bae6d8ea95365..5b1efbaa28fe7 100644 --- a/src/libcore/unicode/unicode_data.rs +++ b/src/libcore/unicode/unicode_data.rs @@ -1,5 +1,54 @@ ///! This file is generated by src/tools/unicode-table-generator; do not edit manually! -use super::range_search; + +#[inline(always)] +fn range_search< + const N: usize, + const CHUNK_SIZE: usize, + const N1: usize, + const CANONICAL: usize, + const CANONICALIZED: usize, +>( + needle: u32, + chunk_idx_map: &[u8; N], + (last_chunk_idx, last_chunk_mapping): (u16, u8), + bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1], + bitset_canonical: &[u64; CANONICAL], + bitset_canonicalized: &[(u8, u8); CANONICALIZED], +) -> bool { + let bucket_idx = (needle / 64) as usize; + let chunk_map_idx = bucket_idx / CHUNK_SIZE; + let chunk_piece = bucket_idx % CHUNK_SIZE; + let chunk_idx = if chunk_map_idx >= N { + if chunk_map_idx == last_chunk_idx as usize { + last_chunk_mapping + } else { + return false; + } + } else { + chunk_idx_map[chunk_map_idx] + }; + let idx = bitset_chunk_idx[(chunk_idx as usize)][chunk_piece] as usize; + let word = if idx < CANONICAL { + bitset_canonical[idx] + } else { + let (real_idx, mapping) = bitset_canonicalized[idx - CANONICAL]; + let mut word = bitset_canonical[real_idx as usize]; + let should_invert = mapping & (1 << 6) != 0; + if should_invert { + word = 
!word; + } + // Lower 6 bits + let quantity = mapping & ((1 << 6) - 1); + if mapping & (1 << 7) != 0 { + // shift + word >>= quantity as u64; + } else { + word = word.rotate_left(quantity as u32); + } + word + }; + (word & (1 << (needle % 64) as u64)) != 0 +} pub const UNICODE_VERSION: (u32, u32, u32) = (13, 0, 0); diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index 65ece05043a8a..af23c166871e1 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -181,7 +181,10 @@ fn main() { "///! This file is generated by src/tools/unicode-table-generator; do not edit manually!\n", ); - table_file.push_str("use super::range_search;\n\n"); + // Include the range search function + table_file.push('\n'); + table_file.push_str(include_str!("range_search.rs")); + table_file.push('\n'); table_file.push_str(&version()); @@ -251,60 +254,6 @@ fn generate_tests(data_path: &str, ranges: &[(&str, Vec>)]) -> String s.push_str(&format!("#[path = \"{}\"]\n", data_path)); s.push_str("mod unicode_data;\n\n"); - s.push_str( - " -#[inline(always)] -fn range_search< - const N: usize, - const CHUNK_SIZE: usize, - const N1: usize, - const CANONICAL: usize, - const CANONICALIZED: usize, ->( - needle: u32, - chunk_idx_map: &[u8; N], - (last_chunk_idx, last_chunk_mapping): (u16, u8), - bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1], - bitset_canonical: &[u64; CANONICAL], - bitset_canonicalized: &[(u8, u8); CANONICALIZED], -) -> bool { - let bucket_idx = (needle / 64) as usize; - let chunk_map_idx = bucket_idx / CHUNK_SIZE; - let chunk_piece = bucket_idx % CHUNK_SIZE; - let chunk_idx = if chunk_map_idx >= N { - if chunk_map_idx == last_chunk_idx as usize { - last_chunk_mapping - } else { - return false; - } - } else { - chunk_idx_map[chunk_map_idx] - }; - let idx = bitset_chunk_idx[(chunk_idx as usize)][chunk_piece] as usize; - let word = if idx < CANONICAL { - bitset_canonical[idx] - } 
else { - let (real_idx, mapping) = bitset_canonicalized[idx - CANONICAL]; - let mut word = bitset_canonical[real_idx as usize]; - let should_invert = mapping & (1 << 6) != 0; - if should_invert { - word = !word; - } - // Lower 6 bits - let quantity = mapping & ((1 << 6) - 1); - if mapping & (1 << 7) != 0 { - // shift - word >>= quantity as u64; - } else { - word = word.rotate_left(quantity as u32); - } - word - }; - (word & (1 << (needle % 64) as u64)) != 0 -} - ", - ); - s.push_str("\nfn main() {\n"); for (property, ranges) in ranges { diff --git a/src/tools/unicode-table-generator/src/range_search.rs b/src/tools/unicode-table-generator/src/range_search.rs new file mode 100644 index 0000000000000..a0bc1e6aec53a --- /dev/null +++ b/src/tools/unicode-table-generator/src/range_search.rs @@ -0,0 +1,49 @@ +#[inline(always)] +fn range_search< + const N: usize, + const CHUNK_SIZE: usize, + const N1: usize, + const CANONICAL: usize, + const CANONICALIZED: usize, +>( + needle: u32, + chunk_idx_map: &[u8; N], + (last_chunk_idx, last_chunk_mapping): (u16, u8), + bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1], + bitset_canonical: &[u64; CANONICAL], + bitset_canonicalized: &[(u8, u8); CANONICALIZED], +) -> bool { + let bucket_idx = (needle / 64) as usize; + let chunk_map_idx = bucket_idx / CHUNK_SIZE; + let chunk_piece = bucket_idx % CHUNK_SIZE; + let chunk_idx = if chunk_map_idx >= N { + if chunk_map_idx == last_chunk_idx as usize { + last_chunk_mapping + } else { + return false; + } + } else { + chunk_idx_map[chunk_map_idx] + }; + let idx = bitset_chunk_idx[(chunk_idx as usize)][chunk_piece] as usize; + let word = if idx < CANONICAL { + bitset_canonical[idx] + } else { + let (real_idx, mapping) = bitset_canonicalized[idx - CANONICAL]; + let mut word = bitset_canonical[real_idx as usize]; + let should_invert = mapping & (1 << 6) != 0; + if should_invert { + word = !word; + } + // Lower 6 bits + let quantity = mapping & ((1 << 6) - 1); + if mapping & (1 << 7) != 0 { + // shift + 
word >>= quantity as u64; + } else { + word = word.rotate_left(quantity as u32); + } + word + }; + (word & (1 << (needle % 64) as u64)) != 0 +} From 233ab2f168d29d5e77abc1bc6c3923edf3575e08 Mon Sep 17 00:00:00 2001 From: Mark Rousskov Date: Sat, 21 Mar 2020 15:22:41 -0400 Subject: [PATCH 08/14] Push the byte of LAST_CHUNK_MAP into the array This optimizes slightly better. Alphabetic : 2536 bytes Case_Ignorable : 1771 bytes Cased : 788 bytes Cc : 24 bytes Grapheme_Extend: 1488 bytes Lowercase : 863 bytes N : 1038 bytes Uppercase : 776 bytes White_Space : 83 bytes Total table sizes: 9367 bytes (-18 bytes; 2 bytes per set) --- src/libcore/unicode/unicode_data.rs | 70 +++++++++---------- .../src/range_search.rs | 16 ++--- .../src/raw_emitter.rs | 9 ++- 3 files changed, 49 insertions(+), 46 deletions(-) diff --git a/src/libcore/unicode/unicode_data.rs b/src/libcore/unicode/unicode_data.rs index 5b1efbaa28fe7..1899b927592d9 100644 --- a/src/libcore/unicode/unicode_data.rs +++ b/src/libcore/unicode/unicode_data.rs @@ -10,7 +10,7 @@ fn range_search< >( needle: u32, chunk_idx_map: &[u8; N], - (last_chunk_idx, last_chunk_mapping): (u16, u8), + last_chunk_idx: u16, bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1], bitset_canonical: &[u64; CANONICAL], bitset_canonicalized: &[(u8, u8); CANONICALIZED], @@ -18,14 +18,14 @@ fn range_search< let bucket_idx = (needle / 64) as usize; let chunk_map_idx = bucket_idx / CHUNK_SIZE; let chunk_piece = bucket_idx % CHUNK_SIZE; - let chunk_idx = if chunk_map_idx >= N { - if chunk_map_idx == last_chunk_idx as usize { - last_chunk_mapping - } else { - return false; - } - } else { + // The last entry of `chunk_idx_map` actually should be at `last_chunk_idx`, + // so we need to remap it + let chunk_idx = if chunk_map_idx < (chunk_idx_map.len() - 1) { chunk_idx_map[chunk_map_idx] + } else if chunk_map_idx == last_chunk_idx as usize { + chunk_idx_map[chunk_idx_map.len() - 1] + } else { + return false; }; let idx = bitset_chunk_idx[(chunk_idx as 
usize)][chunk_piece] as usize; let word = if idx < CANONICAL { @@ -54,8 +54,8 @@ pub const UNICODE_VERSION: (u32, u32, u32) = (13, 0, 0); #[rustfmt::skip] pub mod alphabetic { - static BITSET_LAST_CHUNK_MAP: (u16, u8) = (393, 13); - static BITSET_CHUNKS_MAP: [u8; 393] = [ + const BITSET_LAST_CHUNK_MAP: u16 = 393; + static BITSET_CHUNKS_MAP: [u8; 394] = [ 61, 18, 2, 35, 46, 39, 38, 74, 37, 25, 70, 34, 36, 73, 66, 5, 52, 58, 54, 58, 58, 58, 69, 64, 43, 58, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 10, 6, 6, 23, @@ -70,7 +70,7 @@ pub mod alphabetic { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 21, 6, 6, 6, 6, 6, 6, 6, 15, 72, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, - 58, 58, 58, 58, 58, 58, 6, 62, 58, 58, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 58, 58, 58, 58, 58, 58, 6, 62, 58, 58, 6, 6, 6, 6, 6, 6, 6, 6, 6, 13, ]; static BITSET_INDEX_CHUNKS: [[u8; 8]; 75] = [ [0, 252, 121, 172, 14, 172, 172, 172], [13, 51, 125, 172, 79, 35, 166, 172], @@ -312,8 +312,8 @@ pub mod alphabetic { #[rustfmt::skip] pub mod case_ignorable { - static BITSET_LAST_CHUNK_MAP: (u16, u8) = (1792, 2); - static BITSET_CHUNKS_MAP: [u8; 250] = [ + const BITSET_LAST_CHUNK_MAP: u16 = 1792; + static BITSET_CHUNKS_MAP: [u8; 251] = [ 14, 28, 47, 22, 19, 11, 4, 13, 9, 40, 39, 32, 49, 23, 15, 36, 18, 39, 39, 39, 39, 39, 27, 26, 12, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, @@ -324,7 +324,7 @@ pub mod case_ignorable { 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 44, 39, 35, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 
39, 39, 39, 39, 39, 39, 39, 39, 39, 29, 39, 39, 39, 39, 39, 39, 39, 39, 39, - 34, 48, 39, 39, 39, 0, 39, 39, 21, 43, 39, 39, 45, 39, 39, 39, 39, 37, + 34, 48, 39, 39, 39, 0, 39, 39, 21, 43, 39, 39, 45, 39, 39, 39, 39, 37, 2, ]; static BITSET_INDEX_CHUNKS: [[u8; 8]; 52] = [ [3, 75, 88, 142, 142, 142, 142, 142], [5, 110, 38, 181, 142, 142, 12, 182], @@ -503,14 +503,14 @@ pub mod case_ignorable { #[rustfmt::skip] pub mod cased { - static BITSET_LAST_CHUNK_MAP: (u16, u8) = (124, 12); - static BITSET_CHUNKS_MAP: [u8; 123] = [ + const BITSET_LAST_CHUNK_MAP: u16 = 124; + static BITSET_CHUNKS_MAP: [u8; 124] = [ 4, 0, 18, 18, 6, 18, 18, 9, 5, 8, 18, 3, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 14, 15, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 17, 16, 18, 1, 18, 10, 18, 18, 7, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 13, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, - 18, 2, 18, 18, 18, 18, 11, + 18, 2, 18, 18, 18, 18, 11, 12, ]; static BITSET_INDEX_CHUNKS: [[u8; 16]; 19] = [ [5, 5, 7, 5, 50, 10, 40, 58, 58, 58, 58, 58, 58, 58, 58, 58], @@ -594,9 +594,9 @@ pub mod cased { #[rustfmt::skip] pub mod cc { - static BITSET_LAST_CHUNK_MAP: (u16, u8) = (2, 0); - static BITSET_CHUNKS_MAP: [u8; 2] = [ - 0, 1, + const BITSET_LAST_CHUNK_MAP: u16 = 2; + static BITSET_CHUNKS_MAP: [u8; 3] = [ + 0, 1, 0, ]; static BITSET_INDEX_CHUNKS: [[u8; 1]; 3] = [ [0], [1], [2], @@ -623,8 +623,8 @@ pub mod cc { #[rustfmt::skip] pub mod grapheme_extend { - static BITSET_LAST_CHUNK_MAP: (u16, u8) = (1792, 3); - static BITSET_CHUNKS_MAP: [u8; 245] = [ + const BITSET_LAST_CHUNK_MAP: u16 = 1792; + static BITSET_CHUNKS_MAP: [u8; 246] = [ 34, 30, 41, 44, 17, 11, 0, 12, 9, 36, 34, 29, 43, 20, 13, 34, 21, 34, 34, 34, 34, 34, 26, 34, 16, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 
34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, @@ -635,7 +635,7 @@ pub mod grapheme_extend { 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 40, 34, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 24, 34, 34, 34, 34, 34, 34, 34, 34, - 34, 32, 42, 34, 34, 34, 1, 34, 34, 19, 38, 34, 34, 39, + 34, 32, 42, 34, 34, 34, 1, 34, 34, 19, 38, 34, 34, 39, 3, ]; static BITSET_INDEX_CHUNKS: [[u8; 8]; 45] = [ [1, 85, 27, 86, 34, 84, 100, 88], [4, 60, 71, 120, 120, 120, 120, 120], @@ -785,12 +785,12 @@ pub mod grapheme_extend { #[rustfmt::skip] pub mod lowercase { - static BITSET_LAST_CHUNK_MAP: (u16, u8) = (122, 5); - static BITSET_CHUNKS_MAP: [u8; 118] = [ + const BITSET_LAST_CHUNK_MAP: u16 = 122; + static BITSET_CHUNKS_MAP: [u8; 119] = [ 16, 2, 9, 9, 4, 9, 9, 15, 3, 12, 9, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 11, 7, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 8, 10, 9, 0, 9, 14, 9, 9, 13, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 6, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 17, + 9, 9, 6, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 17, 5, ]; static BITSET_INDEX_CHUNKS: [[u8; 16]; 18] = [ [10, 55, 52, 6, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52], @@ -886,8 +886,8 @@ pub mod lowercase { #[rustfmt::skip] pub mod n { - static BITSET_LAST_CHUNK_MAP: (u16, u8) = (253, 21); - static BITSET_CHUNKS_MAP: [u8; 249] = [ + const BITSET_LAST_CHUNK_MAP: u16 = 253; + static BITSET_CHUNKS_MAP: [u8; 250] = [ 45, 19, 19, 39, 23, 40, 6, 37, 33, 17, 19, 12, 42, 32, 41, 19, 8, 19, 2, 16, 19, 19, 13, 19, 1, 43, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 
19, 19, 19, @@ -898,7 +898,7 @@ pub mod n { 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 38, 19, 30, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 27, 19, 18, 19, 19, 19, 19, 22, 26, 19, 19, 29, 19, 3, 19, 24, + 19, 19, 27, 19, 18, 19, 19, 19, 19, 22, 26, 19, 19, 29, 19, 3, 19, 24, 21, ]; static BITSET_INDEX_CHUNKS: [[u8; 8]; 47] = [ [12, 52, 44, 44, 44, 44, 44, 44], [27, 44, 44, 44, 44, 44, 67, 44], @@ -993,13 +993,13 @@ pub mod n { #[rustfmt::skip] pub mod uppercase { - static BITSET_LAST_CHUNK_MAP: (u16, u8) = (124, 3); - static BITSET_CHUNKS_MAP: [u8; 123] = [ + const BITSET_LAST_CHUNK_MAP: u16 = 124; + static BITSET_CHUNKS_MAP: [u8; 124] = [ 12, 15, 5, 5, 0, 5, 5, 2, 4, 11, 5, 14, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 8, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 5, 13, 5, 10, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 7, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 16, 5, 5, - 5, 5, 9, + 5, 5, 9, 3, ]; static BITSET_INDEX_CHUNKS: [[u8; 16]; 17] = [ [41, 41, 5, 33, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 5, 0], @@ -1083,9 +1083,9 @@ pub mod uppercase { #[rustfmt::skip] pub mod white_space { - static BITSET_LAST_CHUNK_MAP: (u16, u8) = (32, 3); - static BITSET_CHUNKS_MAP: [u8; 22] = [ - 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 1, + const BITSET_LAST_CHUNK_MAP: u16 = 32; + static BITSET_CHUNKS_MAP: [u8; 23] = [ + 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 1, 3, ]; static BITSET_INDEX_CHUNKS: [[u8; 6]; 4] = [ [1, 4, 2, 4, 4, 4], [4, 4, 0, 3, 4, 4], [4, 4, 4, 4, 4, 4], [5, 4, 4, 4, 4, 4], diff --git a/src/tools/unicode-table-generator/src/range_search.rs b/src/tools/unicode-table-generator/src/range_search.rs index 
a0bc1e6aec53a..12efa5a9f83bf 100644 --- a/src/tools/unicode-table-generator/src/range_search.rs +++ b/src/tools/unicode-table-generator/src/range_search.rs @@ -8,7 +8,7 @@ fn range_search< >( needle: u32, chunk_idx_map: &[u8; N], - (last_chunk_idx, last_chunk_mapping): (u16, u8), + last_chunk_idx: u16, bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1], bitset_canonical: &[u64; CANONICAL], bitset_canonicalized: &[(u8, u8); CANONICALIZED], @@ -16,14 +16,14 @@ fn range_search< let bucket_idx = (needle / 64) as usize; let chunk_map_idx = bucket_idx / CHUNK_SIZE; let chunk_piece = bucket_idx % CHUNK_SIZE; - let chunk_idx = if chunk_map_idx >= N { - if chunk_map_idx == last_chunk_idx as usize { - last_chunk_mapping - } else { - return false; - } - } else { + // The last entry of `chunk_idx_map` actually should be at `last_chunk_idx`, + // so we need to remap it + let chunk_idx = if chunk_map_idx < (chunk_idx_map.len() - 1) { chunk_idx_map[chunk_map_idx] + } else if chunk_map_idx == last_chunk_idx as usize { + chunk_idx_map[chunk_idx_map.len() - 1] + } else { + return false; }; let idx = bitset_chunk_idx[(chunk_idx as usize)][chunk_piece] as usize; let word = if idx < CANONICAL { diff --git a/src/tools/unicode-table-generator/src/raw_emitter.rs b/src/tools/unicode-table-generator/src/raw_emitter.rs index a0814fd0d3663..4898df3c80018 100644 --- a/src/tools/unicode-table-generator/src/raw_emitter.rs +++ b/src/tools/unicode-table-generator/src/raw_emitter.rs @@ -150,19 +150,22 @@ impl RawEmitter { while zero_chunk_idx.is_some() && chunk_indices.last().cloned() == zero_chunk_idx { chunk_indices.pop(); } + // We do not count the LAST_CHUNK_MAP as adding bytes because it's a + // small constant whose values are inlined directly into the instruction + // stream. 
writeln!( &mut self.file, - "static BITSET_LAST_CHUNK_MAP: (u16, u8) = ({}, {});", + "const BITSET_LAST_CHUNK_MAP: u16 = {};", chunk_indices.len() - 1, - chunk_indices.pop().unwrap(), ) .unwrap(); - self.bytes_used += 3; + let nonzero = chunk_indices.pop().unwrap(); // Try to pop again, now that we've recorded a non-zero pointing index // into the LAST_CHUNK_MAP. while zero_chunk_idx.is_some() && chunk_indices.last().cloned() == zero_chunk_idx { chunk_indices.pop(); } + chunk_indices.push(nonzero); writeln!( &mut self.file, "static BITSET_CHUNKS_MAP: [u8; {}] = [{}];", From a7ec6f8fe0fb10fa91ac40f68beccd2675cba50c Mon Sep 17 00:00:00 2001 From: Mark Rousskov Date: Sat, 21 Mar 2020 17:20:57 -0400 Subject: [PATCH 09/14] Arrange for zero to be canonical We find that it is common for large ranges of chars to be false -- and that means that it is plausibly common for us to ask about a word that is entirely empty. Therefore, we should make sure that we do not need to rotate bits or otherwise perform some operation to map to the zero word; canonicalize it first if possible. 
--- src/libcore/unicode/unicode_data.rs | 479 +++++++++--------- .../src/raw_emitter.rs | 16 +- 2 files changed, 242 insertions(+), 253 deletions(-) diff --git a/src/libcore/unicode/unicode_data.rs b/src/libcore/unicode/unicode_data.rs index 1899b927592d9..aea923e90757e 100644 --- a/src/libcore/unicode/unicode_data.rs +++ b/src/libcore/unicode/unicode_data.rs @@ -56,64 +56,65 @@ pub const UNICODE_VERSION: (u32, u32, u32) = (13, 0, 0); pub mod alphabetic { const BITSET_LAST_CHUNK_MAP: u16 = 393; static BITSET_CHUNKS_MAP: [u8; 394] = [ - 61, 18, 2, 35, 46, 39, 38, 74, 37, 25, 70, 34, 36, 73, 66, 5, 52, 58, 54, 58, 58, 58, 69, - 64, 43, 58, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 10, 6, 6, 23, - 47, 49, 65, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 11, 58, 58, 58, - 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 57, 33, 17, 51, 40, 53, 4, 16, 41, 45, - 30, 55, 28, 42, 27, 0, 67, 71, 1, 56, 6, 12, 31, 58, 58, 58, 58, 58, 6, 6, 63, 58, 58, 58, - 58, 58, 58, 58, 6, 29, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 6, - 68, 58, 50, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 9, 6, 6, 20, 58, 58, 58, 58, 58, 58, 58, 58, - 58, 58, 58, 58, 58, 58, 58, 58, 58, 14, 22, 58, 58, 58, 58, 26, 58, 58, 58, 58, 58, 58, 58, - 58, 58, 58, 58, 32, 24, 58, 58, 58, 58, 48, 60, 58, 58, 19, 58, 58, 44, 59, 58, 58, 58, 58, - 58, 58, 58, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 21, 6, 6, 6, 6, - 6, 6, 6, 15, 72, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, - 58, 58, 58, 58, 58, 58, 6, 62, 58, 58, 6, 6, 6, 6, 6, 6, 6, 6, 6, 13, + 11, 35, 32, 14, 25, 18, 17, 74, 16, 29, 12, 61, 15, 73, 66, 36, 9, 0, 6, 0, 
0, 0, 70, 64, + 22, 0, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 38, 39, 39, 39, 39, 39, 39, 39, 39, + 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, + 39, 39, 39, 39, 39, 39, 39, 39, 39, 42, 39, 39, 53, 26, 28, 65, 39, 39, 39, 39, 39, 39, 39, + 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 37, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 1, 60, 48, 8, 19, 5, 34, 47, 20, 24, 57, 7, 55, 21, 31, 69, 67, 71, 13, 3, + 39, 43, 58, 0, 0, 0, 0, 0, 39, 39, 63, 0, 0, 0, 0, 0, 0, 0, 39, 56, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 39, 68, 0, 10, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 41, 39, + 39, 50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 45, 52, 0, 0, 0, 0, 30, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 59, 54, 0, 0, 0, 0, 27, 4, 0, 0, 49, 0, 0, 23, 2, 0, 0, 0, 0, 0, 0, + 0, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, + 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, + 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, + 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 51, 39, 39, 39, 39, 39, 39, 39, + 46, 72, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 33, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, + 39, 39, 39, 40, 0, 0, 0, 0, 0, 0, 39, 62, 0, 0, 39, 39, 39, 39, 39, 39, 39, 39, 39, 44, ]; static BITSET_INDEX_CHUNKS: [[u8; 8]; 75] = [ - [0, 252, 121, 172, 14, 172, 172, 172], [13, 51, 125, 172, 79, 35, 166, 172], - [15, 15, 7, 15, 221, 27, 76, 138], [15, 15, 10, 15, 15, 15, 15, 15], - [15, 15, 11, 108, 247, 194, 172, 172], [15, 15, 15, 15, 8, 96, 91, 104], - [15, 15, 15, 15, 15, 15, 15, 15], [15, 15, 15, 15, 15, 15, 15, 172], - [15, 15, 15, 15, 15, 15, 15, 193], [15, 15, 15, 15, 15, 15, 15, 210], - [15, 15, 15, 15, 15, 15, 15, 214], [15, 15, 15, 15, 15, 15, 47, 238], - [15, 15, 15, 15, 15, 15, 188, 172], [15, 15, 15, 15, 15, 181, 172, 172], - [15, 15, 15, 15, 
192, 45, 15, 15], [15, 15, 15, 15, 207, 15, 15, 15], - [15, 15, 15, 15, 209, 153, 172, 172], [15, 15, 15, 15, 215, 5, 232, 110], - [15, 15, 15, 145, 172, 77, 33, 218], [15, 15, 15, 176, 15, 170, 172, 172], - [15, 15, 15, 187, 179, 172, 172, 172], [15, 15, 15, 191, 15, 15, 15, 15], - [15, 15, 15, 213, 172, 172, 172, 172], [15, 15, 182, 251, 15, 15, 15, 15], - [15, 15, 230, 61, 235, 236, 237, 234], [15, 22, 88, 19, 20, 189, 244, 248], - [15, 103, 161, 172, 172, 172, 172, 172], [15, 158, 15, 171, 172, 172, 87, 245], - [15, 177, 118, 151, 205, 126, 15, 164], [15, 178, 172, 172, 172, 172, 172, 172], - [15, 179, 205, 205, 195, 172, 172, 172], [15, 200, 15, 15, 15, 175, 172, 172], - [15, 224, 63, 225, 90, 17, 15, 15], [15, 228, 15, 188, 92, 16, 204, 18], - [15, 229, 25, 119, 133, 134, 1, 165], [26, 37, 15, 80, 5, 4, 204, 115], - [30, 211, 40, 208, 120, 132, 239, 180], [59, 5, 23, 60, 15, 15, 15, 15], + [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 172, 172, 172, 172], [0, 0, 0, 0, 243, 10, 180, 0], + [0, 0, 0, 124, 0, 0, 203, 0], [0, 0, 0, 199, 0, 0, 0, 0], + [0, 0, 24, 185, 242, 112, 231, 168], [0, 0, 55, 197, 0, 0, 0, 0], + [0, 0, 141, 0, 46, 177, 243, 123], [0, 54, 172, 214, 113, 34, 216, 163], + [0, 83, 241, 0, 62, 29, 179, 0], [0, 172, 0, 0, 172, 4, 159, 142], + [0, 249, 116, 3, 172, 172, 172, 172], [1, 172, 172, 172, 172, 172, 172, 172], + [14, 51, 125, 0, 79, 35, 166, 0], [26, 37, 172, 80, 6, 5, 204, 115], + [30, 211, 40, 208, 120, 132, 239, 180], [59, 6, 23, 60, 172, 172, 172, 172], [67, 157, 68, 139, 66, 58, 99, 136], [75, 128, 69, 106, 71, 143, 74, 167], - [78, 254, 15, 212, 172, 207, 172, 172], [82, 122, 192, 130, 117, 172, 6, 172], - [94, 172, 44, 196, 70, 156, 172, 172], [105, 226, 31, 217, 48, 15, 28, 243], - [111, 93, 109, 172, 172, 172, 172, 172], [127, 102, 190, 154, 208, 137, 186, 172], - [147, 149, 53, 43, 216, 50, 72, 107], [148, 12, 15, 202, 32, 15, 233, 52], - [150, 172, 172, 172, 97, 183, 172, 172], [152, 206, 15, 64, 41, 101, 220, 89], - [172, 15, 172, 
172, 15, 3, 159, 142], [172, 54, 15, 214, 113, 34, 0, 163], - [172, 83, 241, 172, 62, 29, 179, 172], [172, 172, 24, 185, 242, 112, 231, 168], - [172, 172, 55, 197, 172, 172, 172, 172], [172, 172, 141, 172, 46, 177, 243, 123], - [172, 172, 172, 124, 172, 172, 203, 172], [172, 172, 172, 172, 15, 15, 15, 15], - [172, 172, 172, 172, 172, 172, 172, 172], [172, 172, 172, 172, 243, 9, 180, 172], - [172, 172, 172, 199, 172, 172, 172, 172], [172, 249, 116, 2, 15, 15, 15, 15], - [191, 172, 172, 172, 172, 172, 172, 172], [200, 172, 172, 172, 172, 172, 172, 172], - [201, 172, 172, 172, 172, 172, 172, 172], [209, 56, 0, 129, 38, 42, 15, 198], - [209, 95, 65, 114, 15, 15, 15, 250], [211, 172, 30, 85, 81, 174, 36, 155], - [211, 192, 172, 146, 202, 73, 184, 172], [222, 223, 15, 135, 39, 144, 86, 21], - [226, 15, 15, 15, 15, 15, 15, 15], [227, 5, 162, 211, 172, 172, 172, 172], - [231, 15, 15, 15, 15, 15, 15, 15], [240, 131, 84, 173, 219, 253, 57, 140], - [246, 169, 98, 160, 173, 49, 100, 172], + [78, 254, 172, 212, 0, 207, 0, 0], [82, 122, 192, 130, 117, 0, 7, 0], + [94, 0, 44, 196, 70, 156, 0, 0], [105, 1, 31, 218, 48, 172, 28, 243], + [111, 93, 109, 0, 0, 0, 0, 0], [127, 102, 190, 154, 208, 137, 186, 0], + [147, 149, 53, 43, 217, 50, 72, 107], [148, 13, 172, 202, 32, 172, 233, 52], + [150, 0, 0, 0, 97, 183, 0, 0], [152, 206, 172, 64, 41, 101, 221, 89], + [172, 22, 88, 19, 20, 189, 244, 248], [172, 103, 161, 0, 0, 0, 0, 0], + [172, 158, 172, 171, 0, 0, 87, 245], [172, 172, 8, 172, 222, 27, 76, 138], + [172, 172, 11, 172, 172, 172, 172, 172], [172, 172, 12, 108, 247, 194, 0, 0], + [172, 172, 172, 145, 0, 77, 33, 219], [172, 172, 172, 172, 9, 96, 91, 104], + [172, 172, 172, 172, 172, 172, 47, 238], [172, 172, 172, 172, 172, 172, 172, 0], + [172, 172, 172, 172, 172, 172, 172, 172], [172, 172, 172, 172, 172, 172, 172, 193], + [172, 172, 172, 172, 172, 172, 172, 210], [172, 172, 172, 172, 172, 172, 172, 214], + [172, 172, 172, 172, 172, 172, 188, 0], [172, 172, 172, 172, 172, 181, 0, 
0], + [172, 172, 172, 172, 192, 45, 172, 172], [172, 172, 172, 172, 207, 172, 172, 172], + [172, 172, 172, 172, 209, 153, 0, 0], [172, 172, 172, 172, 215, 6, 232, 110], + [172, 172, 172, 176, 172, 170, 0, 0], [172, 172, 172, 187, 179, 0, 0, 0], + [172, 172, 172, 191, 172, 172, 172, 172], [172, 172, 172, 213, 0, 0, 0, 0], + [172, 172, 182, 251, 172, 172, 172, 172], [172, 172, 230, 61, 235, 236, 237, 234], + [172, 177, 118, 151, 205, 126, 172, 164], [172, 178, 0, 0, 0, 0, 0, 0], + [172, 179, 205, 205, 195, 0, 0, 0], [172, 200, 172, 172, 172, 175, 0, 0], + [172, 225, 63, 226, 90, 17, 172, 172], [172, 228, 172, 188, 92, 16, 204, 18], + [172, 229, 25, 119, 133, 134, 2, 165], [191, 0, 0, 0, 0, 0, 0, 0], + [200, 0, 0, 0, 0, 0, 0, 0], [201, 0, 0, 0, 0, 0, 0, 0], + [209, 56, 216, 129, 38, 42, 172, 198], [209, 95, 65, 114, 172, 172, 172, 250], + [211, 0, 30, 85, 81, 174, 36, 155], [211, 192, 0, 146, 202, 73, 184, 0], + [216, 252, 121, 0, 15, 0, 0, 0], [223, 224, 172, 135, 39, 144, 86, 21], + [227, 6, 162, 211, 0, 0, 0, 0], [231, 172, 172, 172, 172, 172, 172, 172], + [240, 131, 84, 173, 220, 253, 57, 140], [246, 169, 98, 160, 173, 49, 100, 0], ]; static BITSET_CANONICAL: [u64; 172] = [ - 0b0111111111111111111111111111111111111111111111111111111111111111, + 0b0000000000000000000000000000000000000000000000000000000000000000, + 0b1111111111111111111111111111111111111111111111111111111111111110, 0b1111111111001111111111111111111111111111111111111111111111111111, 0b1111111101111111111111111111111111111111011111111111111111111111, 0b1111111111111111111111111111111111111111111111111000011111111111, @@ -128,7 +129,6 @@ pub mod alphabetic { 0b1000111111110000011111111111111111111111111111111111111111111111, 0b0111111101111111111111111111111111111111111111111111110111111111, 0b0000000000000000000001111111111111100111111111111111111111111111, - 0b1111111111111111111111111111111111111111111111111111111111111111, 0b1111111111111111111111111111111111111111111111111111111111011011, 
0b1111111111111111111111111111111111111111111111011111110001011111, 0b1111111111111111111111111111111111111111111110000000000000000000, @@ -287,15 +287,15 @@ pub mod alphabetic { 0b0000000000000000000000000000000000000000000000000000000010110011, ]; static BITSET_MAPPING: [(u8, u8); 83] = [ - (0, 191), (0, 65), (0, 188), (0, 187), (0, 186), (0, 185), (0, 184), (0, 182), (0, 181), - (0, 180), (0, 178), (0, 79), (0, 175), (0, 174), (0, 173), (0, 169), (0, 165), (0, 164), - (0, 162), (0, 161), (0, 160), (0, 158), (0, 155), (0, 151), (0, 150), (0, 149), (0, 148), - (0, 147), (0, 144), (0, 112), (0, 143), (0, 113), (0, 141), (0, 140), (0, 139), (0, 138), - (0, 137), (0, 136), (0, 135), (0, 134), (0, 132), (0, 131), (0, 130), (0, 129), (0, 61), - (0, 60), (0, 55), (0, 53), (0, 52), (0, 49), (0, 48), (0, 32), (0, 22), (0, 5), (0, 1), - (1, 129), (1, 58), (1, 57), (1, 50), (1, 42), (1, 28), (1, 21), (2, 180), (2, 30), (2, 24), - (2, 18), (3, 132), (3, 33), (3, 17), (4, 80), (4, 32), (5, 112), (5, 16), (6, 96), (6, 3), - (7, 38), (8, 32), (9, 17), (10, 69), (11, 32), (12, 187), (13, 179), (14, 141), + (0, 64), (1, 64), (1, 189), (1, 188), (1, 187), (1, 186), (1, 185), (1, 183), (1, 182), + (1, 181), (1, 179), (1, 78), (1, 176), (1, 175), (1, 174), (1, 170), (1, 166), (1, 165), + (1, 163), (1, 162), (1, 161), (1, 159), (1, 156), (1, 152), (1, 151), (1, 150), (1, 149), + (1, 148), (1, 145), (1, 111), (1, 144), (1, 112), (1, 142), (1, 141), (1, 140), (1, 139), + (1, 138), (1, 137), (1, 136), (1, 135), (1, 133), (1, 132), (1, 131), (1, 130), (1, 63), + (1, 60), (1, 59), (1, 54), (1, 52), (1, 51), (1, 48), (1, 47), (1, 31), (1, 21), (1, 4), + (2, 129), (2, 58), (2, 57), (2, 50), (2, 42), (2, 28), (2, 21), (3, 180), (3, 30), (3, 24), + (3, 18), (4, 132), (4, 33), (4, 17), (5, 80), (5, 32), (6, 112), (6, 16), (7, 96), (7, 3), + (8, 38), (9, 32), (10, 17), (11, 69), (12, 32), (13, 187), (14, 179), (15, 141), ]; pub fn lookup(c: char) -> bool { @@ -314,58 +314,53 @@ pub mod 
alphabetic { pub mod case_ignorable { const BITSET_LAST_CHUNK_MAP: u16 = 1792; static BITSET_CHUNKS_MAP: [u8; 251] = [ - 14, 28, 47, 22, 19, 11, 4, 13, 9, 40, 39, 32, 49, 23, 15, 36, 18, 39, 39, 39, 39, 39, 27, - 26, 12, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, - 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, - 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 25, 39, 30, 24, 20, 16, 39, 39, 39, 39, 39, 39, - 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, - 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 33, 39, 17, 38, 31, 39, 39, 39, 7, 41, 46, 3, 10, 1, - 6, 51, 8, 5, 42, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 50, 39, 39, 39, 39, 39, 39, 39, 39, - 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 44, 39, 35, 39, 39, - 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, - 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 29, 39, 39, 39, 39, 39, 39, 39, 39, 39, - 34, 48, 39, 39, 39, 0, 39, 39, 21, 43, 39, 39, 45, 39, 39, 39, 39, 37, 2, + 36, 19, 18, 44, 41, 33, 22, 35, 31, 6, 0, 7, 49, 45, 37, 3, 40, 0, 0, 0, 0, 0, 20, 48, 34, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 47, 0, 10, 46, 42, + 38, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 39, 2, 11, 0, 0, 0, 29, 9, 17, 26, 32, 24, 28, 51, 30, + 27, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 4, 21, 0, 0, 0, 23, 0, 0, 43, 13, 0, 0, 15, 0, 0, 0, 0, 1, 25, ]; static BITSET_INDEX_CHUNKS: [[u8; 8]; 52] = [ - 
[3, 75, 88, 142, 142, 142, 142, 142], [5, 110, 38, 181, 142, 142, 12, 182], - [21, 4, 142, 142, 4, 4, 4, 160], [28, 163, 50, 131, 76, 138, 6, 120], - [31, 103, 14, 105, 54, 106, 124, 119], [33, 142, 17, 142, 34, 175, 118, 142], - [35, 123, 71, 142, 96, 142, 142, 142], [37, 142, 142, 144, 142, 142, 142, 142], - [41, 115, 117, 142, 142, 142, 142, 142], [42, 78, 112, 139, 142, 142, 142, 142], - [45, 142, 142, 98, 54, 77, 142, 142], [58, 74, 58, 29, 14, 104, 126, 122], - [62, 142, 180, 2, 142, 142, 142, 142], [63, 164, 53, 121, 67, 168, 52, 129], - [65, 177, 68, 142, 142, 142, 142, 142], [70, 16, 142, 66, 23, 69, 20, 0], - [72, 57, 30, 73, 142, 97, 142, 94], [87, 178, 142, 141, 47, 179, 143, 61], - [89, 40, 113, 85, 142, 142, 142, 142], [90, 151, 142, 19, 56, 84, 59, 46], - [95, 142, 142, 39, 162, 174, 49, 100], [99, 142, 142, 142, 167, 142, 142, 142], - [114, 86, 142, 91, 25, 158, 10, 51], [116, 36, 24, 123, 55, 81, 93, 83], - [130, 32, 155, 146, 159, 137, 150, 148], [133, 142, 142, 142, 142, 142, 142, 142], - [136, 142, 142, 142, 142, 142, 142, 142], [142, 1, 142, 153, 142, 15, 142, 22], - [142, 142, 26, 4, 4, 64, 176, 142], [142, 142, 102, 142, 142, 142, 142, 142], - [142, 142, 142, 16, 142, 142, 142, 142], [142, 142, 142, 135, 142, 170, 142, 142], - [142, 142, 142, 142, 79, 82, 48, 111], [142, 142, 142, 142, 134, 142, 7, 125], - [142, 142, 142, 142, 142, 27, 92, 142], [142, 142, 142, 142, 142, 132, 108, 101], - [142, 142, 142, 142, 142, 142, 13, 43], [142, 142, 142, 142, 142, 142, 142, 8], - [142, 142, 142, 142, 142, 142, 142, 140], [142, 142, 142, 142, 142, 142, 142, 142], - [142, 142, 142, 142, 142, 152, 142, 142], [142, 142, 142, 142, 156, 142, 142, 142], - [142, 142, 142, 147, 142, 142, 142, 142], [142, 142, 142, 157, 142, 142, 142, 142], - [142, 142, 142, 169, 9, 128, 142, 142], [142, 142, 142, 172, 142, 161, 142, 142], - [142, 142, 145, 142, 142, 173, 142, 142], [142, 142, 171, 142, 142, 109, 11, 80], - [142, 149, 142, 142, 142, 142, 142, 142], [154, 127, 18, 
142, 60, 142, 142, 142], - [165, 142, 142, 142, 142, 142, 142, 142], [166, 142, 142, 142, 44, 127, 142, 107], + [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 8], [0, 0, 0, 0, 0, 0, 0, 141], + [0, 0, 0, 0, 0, 0, 14, 42], [0, 0, 0, 0, 0, 27, 92, 0], [0, 0, 0, 0, 0, 133, 108, 101], + [0, 0, 0, 0, 0, 152, 0, 0], [0, 0, 0, 0, 79, 82, 47, 111], [0, 0, 0, 0, 135, 0, 5, 126], + [0, 0, 0, 0, 156, 0, 0, 0], [0, 0, 0, 17, 0, 0, 0, 0], [0, 0, 0, 136, 0, 168, 0, 0], + [0, 0, 0, 147, 0, 0, 0, 0], [0, 0, 0, 157, 0, 0, 0, 0], [0, 0, 0, 167, 9, 129, 0, 0], + [0, 0, 0, 170, 0, 161, 0, 0], [0, 0, 102, 0, 0, 0, 0, 0], [0, 0, 145, 0, 0, 171, 0, 0], + [0, 0, 169, 0, 0, 109, 12, 80], [0, 0, 174, 123, 123, 64, 176, 0], + [0, 49, 0, 153, 0, 16, 0, 23], [0, 149, 0, 0, 0, 0, 0, 0], + [2, 103, 15, 105, 54, 106, 125, 119], [4, 75, 88, 0, 0, 0, 0, 0], + [6, 110, 37, 181, 0, 0, 13, 182], [22, 123, 0, 0, 123, 123, 123, 11], + [28, 163, 50, 132, 76, 139, 7, 120], [32, 0, 18, 0, 33, 175, 118, 0], + [34, 124, 71, 0, 96, 0, 0, 0], [36, 0, 0, 144, 0, 0, 0, 0], [40, 115, 117, 0, 0, 0, 0, 0], + [41, 78, 112, 140, 0, 0, 0, 0], [44, 0, 0, 98, 54, 77, 0, 0], + [58, 74, 58, 29, 15, 104, 127, 122], [62, 0, 180, 3, 0, 0, 0, 0], + [63, 164, 53, 121, 67, 160, 52, 130], [65, 177, 68, 0, 0, 0, 0, 0], + [70, 17, 0, 66, 24, 69, 21, 1], [72, 57, 30, 73, 0, 97, 0, 94], + [87, 178, 0, 142, 46, 179, 143, 61], [89, 39, 113, 85, 0, 0, 0, 0], + [90, 151, 0, 20, 56, 84, 59, 45], [95, 0, 0, 38, 162, 172, 48, 100], + [99, 0, 0, 0, 159, 0, 0, 0], [114, 86, 0, 91, 26, 158, 10, 51], + [116, 35, 25, 124, 55, 81, 93, 83], [131, 31, 155, 146, 173, 138, 150, 148], + [134, 0, 0, 0, 0, 0, 0, 0], [137, 0, 0, 0, 0, 0, 0, 0], [154, 128, 19, 0, 60, 0, 0, 0], + [165, 0, 0, 0, 0, 0, 0, 0], [166, 0, 0, 0, 43, 128, 0, 107], ]; static BITSET_CANONICAL: [u64; 123] = [ + 0b0000000000000000000000000000000000000000000000000000000000000000, 0b1111101111111111111111111111111111111111111111111111111111111111, - 
0b0011000000000000000000000000000000000000000000000000000000000000, + 0b1100000000000000000000000000000000000000000000000000000000010001, 0b0111000000000000000000000000000000000000000000000000000000000000, 0b1111100001111111111111111111111111111111111111111111111111111111, - 0b1111111111111111111111111111111111111111111111111111111111111111, + 0b1111111111111100000000000000000000000000000000000000000000000000, 0b1111111100000000000000000000000000000000000000000000000000000000, 0b0111111111000000000000000000000000000000000000000000000000000011, - 0b1111111111111100000000000000000000000000000000000000000000000000, 0b1111100000000000000000000000000000000000000000000000000000000000, 0b0000000001111111000000000000000000000000000000000000000000000000, 0b0000000000000001111111111100000000000000000000000000000000000000, + 0b0000000000000000111111111111111111111111111111111111111111111111, 0b1011111111111111111111111111111111111111111111100000000000000000, 0b1011000000111100000000000000000000000000000000000000000000000000, 0b1010000000000000000000000000000000000000000000000000000000000000, @@ -381,12 +376,10 @@ pub mod case_ignorable { 0b1111111111111111111100000000000000000000000000000000000000000000, 0b1111111111111111000000001000000000000000000000000000000000000000, 0b1111111111111111000000000000000000000000000000101000000000000000, - 0b1111111111111111000000000000000000000000000000000000000000000000, 0b1111111111111000000000111000000000000000000000000000000000000000, 0b1111111100000000000000000000000000000000000000000000000000000010, 0b1111110000000000000000000000110000000000000000000010000110111110, 0b1100000110011101000000000000000000000000000000000000000000000000, - 0b1100000000000000000000000000000000000000000000000000000000010001, 0b1011111111110111100000000000000000000000000000000000000000000000, 0b1011111101111111000000000000000000000000000000000000000000000000, 0b1011010001111110000000000000000000000000000000000000000000000000, @@ -405,6 +398,7 @@ pub mod 
case_ignorable { 0b0100000000000000000000000000000000000100000000000100000010000000, 0b0011111110110000000000000000000000000000000000000000000000000000, 0b0011001111001000000000000000000000000000000000000000000000000111, + 0b0011000000000000000000000000000000000000000000000000000000000000, 0b0010011001111000000000000000000000000000000000000000000000000011, 0b0010010000111111111110000000000000000000000000000000000000000000, 0b0001111111111111111111111111111111111110111111111110000011011111, @@ -480,13 +474,13 @@ pub mod case_ignorable { 0b0000000000000000000000000000000000000000000000000010000000000001, ]; static BITSET_MAPPING: [(u8, u8); 60] = [ - (0, 70), (0, 71), (0, 190), (0, 72), (0, 73), (0, 188), (0, 76), (0, 82), (0, 83), (0, 85), - (0, 91), (0, 100), (0, 102), (0, 117), (0, 118), (0, 121), (0, 66), (0, 67), (0, 69), - (1, 190), (1, 34), (1, 41), (1, 47), (1, 52), (1, 55), (1, 60), (2, 6), (2, 12), (2, 29), - (2, 33), (2, 51), (3, 84), (3, 101), (3, 109), (3, 117), (4, 181), (4, 158), (4, 144), - (5, 12), (5, 46), (5, 7), (6, 176), (6, 134), (6, 57), (7, 62), (7, 63), (8, 53), (8, 59), - (9, 19), (9, 32), (10, 32), (10, 33), (11, 184), (12, 184), (13, 33), (14, 170), (15, 1), - (16, 33), (17, 179), (18, 23), + (0, 64), (1, 70), (1, 71), (1, 190), (1, 72), (1, 73), (1, 188), (1, 76), (1, 82), (1, 83), + (1, 85), (1, 91), (1, 100), (1, 102), (1, 117), (1, 118), (1, 121), (1, 66), (1, 67), + (1, 69), (2, 160), (2, 153), (2, 147), (2, 142), (2, 139), (2, 134), (3, 6), (3, 12), + (3, 29), (3, 33), (3, 51), (4, 84), (4, 101), (4, 109), (4, 117), (5, 181), (5, 62), + (5, 63), (6, 12), (6, 46), (6, 7), (7, 176), (7, 134), (7, 57), (8, 53), (8, 59), (9, 19), + (9, 32), (10, 32), (10, 33), (11, 142), (11, 64), (12, 184), (13, 184), (14, 33), (15, 170), + (16, 1), (17, 33), (18, 179), (19, 23), ]; pub fn lookup(c: char) -> bool { @@ -505,41 +499,40 @@ pub mod case_ignorable { pub mod cased { const BITSET_LAST_CHUNK_MAP: u16 = 124; static BITSET_CHUNKS_MAP: [u8; 124] = 
[ - 4, 0, 18, 18, 6, 18, 18, 9, 5, 8, 18, 3, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, - 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 14, 15, 18, 18, 18, 18, - 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 17, 16, 18, 1, 18, 10, 18, 18, - 7, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 13, 18, - 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, - 18, 2, 18, 18, 18, 18, 11, 12, + 13, 15, 0, 0, 8, 0, 0, 11, 14, 10, 0, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 3, 2, 0, 16, 0, 12, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0, + 0, 0, 0, 7, 6, ]; static BITSET_INDEX_CHUNKS: [[u8; 16]; 19] = [ - [5, 5, 7, 5, 50, 10, 40, 58, 58, 58, 58, 58, 58, 58, 58, 58], - [5, 42, 16, 24, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58], - [5, 53, 38, 0, 20, 9, 5, 5, 5, 5, 4, 18, 55, 56, 57, 54], - [51, 52, 5, 29, 30, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58], - [58, 2, 27, 1, 5, 5, 48, 6, 5, 5, 28, 31, 58, 35, 14, 49], - [58, 34, 32, 58, 19, 11, 62, 58, 58, 58, 58, 58, 58, 58, 58, 58], - [58, 58, 12, 37, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 12, 61], - [58, 58, 12, 44, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58], - [58, 58, 17, 45, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58], - [58, 58, 36, 58, 5, 5, 5, 58, 5, 5, 5, 5, 3, 22, 21, 23], - [58, 58, 47, 47, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58], - [58, 58, 58, 58, 5, 39, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58], - [58, 58, 58, 58, 16, 60, 41, 58, 58, 58, 58, 58, 58, 58, 58, 58], - [58, 58, 58, 58, 58, 58, 58, 58, 58, 5, 58, 58, 58, 58, 58, 58], - [58, 58, 58, 58, 58, 58, 58, 58, 58, 46, 43, 58, 13, 5, 8, 26], - [58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 16, 15, 5, 58], - 
[58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 25, 59, 58, 58], - [58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 33, 58, 58, 58], - [58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 15, 39, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 59, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 33, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 39, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 47, 44, 0, 13, 39, 8, 26], + [0, 0, 0, 0, 16, 60, 42, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 39, 40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 12, 37, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 61], + [0, 0, 12, 45, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 17, 46, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 36, 0, 39, 39, 39, 0, 39, 39, 39, 39, 4, 22, 21, 23], + [0, 0, 48, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 3, 27, 2, 39, 39, 49, 6, 39, 39, 28, 31, 0, 35, 14, 50], + [0, 34, 32, 0, 19, 11, 62, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [39, 39, 7, 39, 51, 10, 41, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [39, 43, 16, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [39, 54, 38, 1, 20, 9, 39, 39, 39, 39, 5, 18, 56, 57, 58, 55], + [52, 53, 39, 29, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], ]; static BITSET_CANONICAL: [u64; 39] = [ + 0b0000000000000000000000000000000000000000000000000000000000000000, 0b1111111111111111111111111111111111111111111111111111111111101111, 0b1111111101111111111111111111111111111111011111111111111111111111, 0b0000011111111111111111111111111000000111111111111111111111111110, 0b1111111111111111111111111111111100111111001111111111111111111111, 0b1111111111111111111111110011111111111111111111111111111111111111, - 0b1111111111111111111111111111111111111111111111111111111111111111, 0b1111111111111111111111111111111111111111111111111111111111110000, 0b1111111111111111111111111111111111111111111111111111110000000011, 0b1111111111111111111111111111111111111111111111110111100011111111, 
@@ -575,9 +568,9 @@ pub mod cased { 0b1110101111111111110111100110010011011111111111111111111111111111, ]; static BITSET_MAPPING: [(u8, u8); 24] = [ - (0, 188), (0, 183), (0, 182), (0, 176), (0, 162), (0, 160), (0, 150), (0, 146), (0, 141), - (0, 55), (0, 50), (0, 44), (0, 43), (0, 27), (0, 17), (1, 180), (1, 30), (1, 24), (1, 18), - (2, 187), (2, 160), (2, 15), (3, 32), (4, 93), + (0, 64), (1, 188), (1, 183), (1, 182), (1, 176), (1, 162), (1, 160), (1, 150), (1, 146), + (1, 141), (1, 55), (1, 50), (1, 44), (1, 43), (1, 27), (1, 17), (2, 180), (2, 30), (2, 24), + (2, 18), (3, 160), (3, 15), (4, 32), (5, 93), ]; pub fn lookup(c: char) -> bool { @@ -625,44 +618,37 @@ pub mod cc { pub mod grapheme_extend { const BITSET_LAST_CHUNK_MAP: u16 = 1792; static BITSET_CHUNKS_MAP: [u8; 246] = [ - 34, 30, 41, 44, 17, 11, 0, 12, 9, 36, 34, 29, 43, 20, 13, 34, 21, 34, 34, 34, 34, 34, 26, - 34, 16, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, - 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, - 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 23, 18, 14, 34, 34, 34, 34, 34, 34, - 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, - 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 31, 34, 15, 35, 27, 34, 34, 34, 7, 37, 25, 4, 10, - 22, 6, 2, 8, 5, 28, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, - 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 40, 34, 33, 34, - 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, - 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 24, 34, 34, 34, 34, 34, 34, 34, 34, - 34, 32, 42, 34, 34, 34, 1, 34, 34, 19, 38, 34, 34, 39, 3, + 0, 6, 17, 44, 37, 31, 20, 32, 29, 4, 0, 5, 43, 40, 33, 0, 41, 0, 0, 0, 0, 0, 9, 0, 36, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 18, 38, 34, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 7, 0, 35, 1, 10, 0, 0, 0, 27, 8, 16, 24, 30, 42, 26, 22, 28, 25, 11, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 14, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 19, 0, 0, + 0, 21, 0, 0, 39, 12, 0, 0, 13, 23, ]; static BITSET_INDEX_CHUNKS: [[u8; 8]; 45] = [ - [1, 85, 27, 86, 34, 84, 100, 88], [4, 60, 71, 120, 120, 120, 120, 120], - [7, 120, 120, 120, 33, 101, 120, 87], [13, 8, 120, 120, 8, 8, 8, 139], - [17, 0, 51, 120, 61, 111, 138, 97], [21, 120, 56, 120, 22, 141, 95, 120], - [24, 99, 55, 120, 79, 120, 120, 120], [28, 120, 120, 116, 120, 120, 120, 120], - [31, 92, 94, 120, 120, 120, 120, 120], [32, 63, 91, 120, 120, 120, 120, 120], - [36, 120, 120, 80, 34, 62, 120, 120], [48, 59, 48, 9, 19, 83, 38, 96], - [50, 104, 44, 140, 53, 30, 43, 102], [54, 120, 120, 52, 120, 120, 120, 6], - [57, 47, 20, 58, 120, 120, 120, 77], [70, 120, 120, 120, 120, 120, 115, 120], - [72, 120, 113, 120, 120, 120, 120, 120], [73, 123, 120, 14, 46, 82, 35, 37], - [78, 120, 120, 29, 110, 127, 41, 109], [81, 120, 120, 120, 5, 120, 120, 120], - [93, 26, 16, 99, 45, 64, 76, 66], [103, 120, 120, 68, 120, 120, 120, 120], - [112, 89, 25, 137, 120, 120, 23, 143], [120, 39, 115, 118, 120, 120, 120, 120], - [120, 120, 114, 120, 120, 120, 120, 120], [120, 120, 117, 120, 120, 126, 120, 120], - [120, 120, 120, 67, 120, 136, 120, 13], [120, 120, 120, 107, 120, 11, 120, 120], - [120, 120, 120, 119, 120, 120, 120, 120], [120, 120, 120, 120, 2, 65, 40, 90], - [120, 120, 120, 120, 8, 139, 120, 120], [120, 120, 120, 120, 106, 120, 120, 120], - [120, 120, 120, 120, 120, 18, 75, 120], [120, 120, 120, 120, 120, 105, 129, 108], - [120, 120, 120, 
120, 120, 120, 120, 120], [120, 120, 120, 120, 120, 120, 120, 135], - [120, 120, 120, 120, 120, 124, 120, 120], [120, 120, 120, 120, 130, 120, 120, 120], - [120, 120, 120, 131, 120, 120, 120, 120], [120, 120, 120, 134, 120, 133, 120, 120], - [120, 120, 120, 142, 5, 120, 120, 120], [120, 120, 132, 120, 120, 120, 10, 98], - [120, 121, 120, 120, 120, 120, 120, 120], [122, 120, 12, 120, 49, 120, 120, 120], - [128, 69, 120, 74, 15, 125, 3, 42], + [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 135], [0, 0, 0, 0, 0, 18, 75, 0], + [0, 0, 0, 0, 0, 106, 129, 109], [0, 0, 0, 0, 0, 124, 0, 0], [0, 0, 0, 0, 3, 65, 40, 90], + [0, 0, 0, 0, 99, 139, 0, 0], [0, 0, 0, 0, 107, 0, 0, 0], [0, 0, 0, 0, 130, 0, 0, 0], + [0, 0, 0, 67, 0, 136, 0, 14], [0, 0, 0, 108, 0, 12, 0, 0], [0, 0, 0, 120, 0, 0, 0, 0], + [0, 0, 0, 131, 0, 0, 0, 0], [0, 0, 0, 134, 0, 133, 0, 0], [0, 0, 0, 142, 6, 0, 0, 0], + [0, 0, 115, 0, 0, 0, 0, 0], [0, 0, 118, 0, 0, 126, 0, 0], [0, 0, 132, 0, 0, 0, 11, 98], + [0, 39, 116, 119, 0, 0, 0, 0], [0, 121, 0, 0, 0, 0, 0, 0], [2, 85, 27, 86, 34, 84, 101, 88], + [5, 60, 71, 0, 0, 0, 0, 0], [8, 0, 0, 0, 33, 102, 0, 87], [14, 99, 0, 0, 99, 99, 99, 139], + [17, 1, 51, 0, 61, 112, 138, 97], [21, 0, 56, 0, 22, 141, 95, 0], + [24, 100, 55, 0, 79, 0, 0, 0], [28, 0, 0, 117, 0, 0, 0, 0], [31, 92, 94, 0, 0, 0, 0, 0], + [32, 63, 91, 0, 0, 0, 0, 0], [36, 0, 0, 80, 34, 62, 0, 0], [48, 59, 48, 10, 19, 83, 38, 96], + [50, 105, 44, 140, 53, 30, 43, 103], [54, 0, 0, 52, 0, 0, 0, 7], + [57, 47, 20, 58, 0, 0, 0, 77], [70, 0, 0, 0, 0, 0, 116, 0], [72, 0, 114, 0, 0, 0, 0, 0], + [73, 123, 0, 15, 46, 82, 35, 37], [78, 0, 0, 29, 111, 127, 41, 110], + [81, 0, 0, 0, 6, 0, 0, 0], [93, 26, 9, 100, 45, 64, 76, 66], [104, 0, 0, 68, 0, 0, 0, 0], + [113, 89, 25, 137, 0, 0, 23, 143], [122, 0, 13, 0, 49, 0, 0, 0], + [128, 69, 0, 74, 16, 125, 4, 42], ]; static BITSET_CANONICAL: [u64; 99] = [ + 0b0000000000000000000000000000000000000000000000000000000000000000, 
0b1000000000000000000000000000000000000000000000000000000001111111, 0b1100000000000000000000000000000000000000000000000000000000010001, 0b0000000000011100000000000000000000000000000111000000000000000000, @@ -671,7 +657,7 @@ pub mod grapheme_extend { 0b0000000001111111000000000000000000000000000000000000000000000000, 0b1111101111111111111111111111111111111111111111111111111111111111, 0b0000011011111111100000000000000000000000000000000000000000000000, - 0b1111111111111111111111111111111111111111111111111111111111111111, + 0b1111111111111111000000000000000000000000000000000000000000000000, 0b1111110000000000000000000000110000000000000000000010000110111110, 0b1011111111111111111111111111111111111111111111100000000000000000, 0b0000011111000000000000000000000000000000000000000000000000000000, @@ -679,7 +665,6 @@ pub mod grapheme_extend { 0b1111111111111111111111111111111100000000000000000000000000000000, 0b1111111111111111111111111111101111111111111110000000000000000000, 0b1111111111111111000000000000000000000000000000100000000000000000, - 0b1111111111111111000000000000000000000000000000000000000000000000, 0b1111111100000000000000000000000000000000000000000000000000000010, 0b1111100000000111110000111010000000000000000000000000000000000000, 0b1101000000000000000000000000000000000000000000000000000000000010, @@ -764,11 +749,11 @@ pub mod grapheme_extend { 0b0000000000000000000000000000000000000000000000000000000010110110, ]; static BITSET_MAPPING: [(u8, u8); 45] = [ - (0, 191), (0, 190), (0, 188), (0, 185), (0, 179), (0, 8), (0, 176), (0, 161), (0, 159), - (0, 155), (0, 154), (0, 39), (0, 140), (0, 57), (1, 165), (1, 161), (1, 160), (1, 153), - (1, 147), (1, 142), (1, 139), (2, 181), (2, 176), (2, 167), (2, 153), (2, 149), (3, 26), - (3, 32), (3, 33), (3, 42), (4, 88), (4, 109), (4, 117), (5, 19), (5, 20), (5, 32), (6, 67), - (6, 69), (7, 183), (7, 7), (8, 144), (9, 178), (10, 184), (11, 58), (12, 23), + (0, 64), (1, 191), (1, 190), (1, 188), (1, 185), (1, 179), (1, 8), 
(1, 176), (1, 161), + (1, 159), (1, 155), (1, 154), (1, 39), (1, 140), (1, 57), (2, 165), (2, 161), (2, 160), + (2, 153), (2, 147), (2, 142), (2, 139), (3, 176), (3, 167), (3, 153), (3, 149), (4, 26), + (4, 32), (4, 33), (4, 42), (5, 88), (5, 109), (5, 117), (6, 19), (6, 20), (6, 32), (7, 67), + (7, 69), (8, 183), (8, 7), (9, 64), (10, 178), (11, 184), (12, 58), (13, 23), ]; pub fn lookup(c: char) -> bool { @@ -787,39 +772,40 @@ pub mod grapheme_extend { pub mod lowercase { const BITSET_LAST_CHUNK_MAP: u16 = 122; static BITSET_CHUNKS_MAP: [u8; 119] = [ - 16, 2, 9, 9, 4, 9, 9, 15, 3, 12, 9, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 11, 7, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 8, 10, 9, 0, 9, 14, 9, 9, 13, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 6, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 17, 5, + 13, 16, 0, 0, 8, 0, 0, 11, 12, 9, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 3, 1, 0, 14, 0, 7, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 6, ]; static BITSET_INDEX_CHUNKS: [[u8; 16]; 18] = [ - [10, 55, 52, 6, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52], - [15, 24, 20, 34, 35, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52], - [15, 46, 1, 19, 63, 8, 54, 52, 52, 52, 52, 52, 52, 52, 52, 52], - [52, 5, 39, 52, 27, 14, 70, 52, 52, 52, 52, 52, 52, 52, 52, 52], - [52, 52, 52, 50, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 25], - [52, 52, 52, 52, 9, 53, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52], - [52, 52, 52, 52, 52, 52, 52, 52, 52, 8, 52, 52, 52, 52, 52, 52], - [52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 15, 13, 2, 52], - [52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 40, 52, 52, 52], - [52, 52, 52, 52, 52, 52, 52, 52, 52, 
52, 52, 52, 52, 52, 52, 52], - [52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 56, 52, 52], - [52, 52, 52, 52, 52, 52, 52, 52, 52, 62, 38, 52, 47, 43, 45, 29], - [52, 52, 52, 57, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52], - [52, 52, 52, 65, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52], - [52, 52, 52, 66, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52], - [52, 52, 54, 52, 2, 2, 2, 52, 20, 20, 64, 20, 32, 23, 22, 33], - [52, 69, 30, 16, 21, 48, 49, 44, 42, 7, 31, 37, 52, 26, 12, 28], - [60, 36, 51, 11, 61, 58, 17, 4, 0, 59, 71, 18, 67, 68, 3, 41], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 56, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 14, 52, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 40, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 62, 39, 0, 47, 43, 45, 30], + [0, 0, 0, 0, 10, 53, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 26], + [0, 0, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 67, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 54, 0, 52, 52, 52, 0, 21, 21, 64, 21, 33, 24, 23, 34], + [0, 5, 71, 0, 28, 15, 69, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 61, 31, 17, 22, 48, 49, 44, 42, 8, 32, 38, 0, 27, 13, 29], + [11, 55, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [16, 25, 21, 35, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [16, 46, 2, 20, 63, 9, 54, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [60, 37, 51, 12, 70, 58, 18, 1, 6, 59, 68, 19, 65, 66, 3, 41], ]; static BITSET_CANONICAL: [u64; 52] = [ - 0b0000111111111111111111111111110000000000000000000000000011111111, + 0b0000000000000000000000000000000000000000000000000000000000000000, + 0b1111111111111111110000000000000000000000000011111111111111111111, 0b1010101010101010101010101010101010101010101010101010100000000010, - 0b1111111111111111111111111111111111111111111111111111111111111111, 
0b1111111111111111111111000000000000000000000000001111110111111111, - 0b1111111111111111110000000000000000000000000011111111111111111111, - 0b1000000000000010000000000000000000000000000000000000000000000000, 0b0000111111111111111111111111111111111111000000000000000000000000, + 0b1000000000000010000000000000000000000000000000000000000000000000, + 0b0000111111111111111111111111110000000000000000000000000011111111, + 0b0000000000000111111111111111111111111111111111111111111111111111, 0b1111111111111111111111111111111111111111111111111010101010000101, 0b1111111111111111111111111111111100000000000000000000000000000000, 0b1111111111111111111111111111110000000000000000000000000000000000, @@ -852,7 +838,6 @@ pub mod lowercase { 0b0000000000000000000000001111111111111111110111111100000000000000, 0b0000000000000000000000000001111100000000000000000000000000000011, 0b0000000000000000000000000000000000111010101010101010101010101010, - 0b0000000000000000000000000000000000011111111111110000000000000000, 0b0000000000000000000000000000000000000000111110000000000001111111, 0b0000000000000000000000000000000000000000000000000000101111110111, 0b1001001111111010101010101010101010101010101010101010101010101010, @@ -867,9 +852,9 @@ pub mod lowercase { 0b1110101111000000000000000000000000001111111111111111111111111100, ]; static BITSET_MAPPING: [(u8, u8); 20] = [ - (0, 188), (0, 184), (0, 179), (0, 172), (0, 161), (0, 146), (0, 144), (0, 140), (0, 136), - (0, 132), (1, 146), (1, 144), (1, 83), (2, 160), (2, 141), (3, 12), (3, 6), (4, 77), - (5, 187), (6, 78), + (0, 64), (1, 188), (1, 183), (1, 176), (1, 109), (1, 124), (1, 126), (1, 66), (1, 70), + (1, 77), (2, 146), (2, 144), (2, 83), (3, 12), (3, 6), (4, 156), (4, 78), (5, 187), + (6, 132), (7, 93), ]; pub fn lookup(c: char) -> bool { @@ -888,45 +873,36 @@ pub mod lowercase { pub mod n { const BITSET_LAST_CHUNK_MAP: u16 = 253; static BITSET_CHUNKS_MAP: [u8; 250] = [ - 45, 19, 19, 39, 23, 40, 6, 37, 33, 17, 19, 12, 42, 32, 41, 19, 8, 19, 
2, 16, 19, 19, 13, - 19, 1, 43, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 44, 46, 34, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 25, 15, 14, 31, 19, 4, 7, 11, 5, 9, 26, 36, - 35, 28, 19, 10, 20, 19, 19, 0, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 38, 19, 30, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 27, 19, 18, 19, 19, 19, 19, 22, 26, 19, 19, 29, 19, 3, 19, 24, 21, + 45, 0, 0, 37, 7, 38, 26, 35, 31, 5, 0, 12, 42, 21, 39, 0, 28, 0, 22, 4, 0, 0, 13, 0, 40, + 44, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 43, 46, + 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 8, 14, 20, 0, 24, 27, 11, 25, 29, 15, 34, 33, 17, 0, + 30, 2, 0, 0, 41, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 36, 0, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 16, 0, 1, 0, 0, 0, 0, 6, 15, 0, 0, 18, 0, 23, 0, 9, 3, ]; static BITSET_INDEX_CHUNKS: [[u8; 8]; 47] = [ - [12, 52, 44, 44, 44, 44, 44, 44], [27, 44, 44, 44, 44, 44, 67, 44], - [44, 15, 51, 16, 44, 44, 44, 44], [44, 17, 34, 44, 23, 44, 44, 44], - [44, 18, 11, 4, 62, 44, 61, 2], [44, 19, 44, 44, 56, 66, 44, 
46], - [44, 20, 44, 58, 44, 31, 44, 58], [44, 22, 72, 65, 44, 43, 53, 44], - [44, 29, 45, 44, 44, 14, 42, 44], [44, 36, 44, 59, 1, 44, 44, 33], - [44, 37, 44, 44, 44, 55, 57, 44], [44, 44, 44, 3, 59, 44, 44, 44], - [44, 44, 44, 10, 44, 44, 44, 8], [44, 44, 44, 24, 44, 44, 44, 44], - [44, 44, 44, 25, 5, 41, 44, 64], [44, 44, 44, 44, 9, 0, 60, 44], - [44, 44, 44, 44, 44, 1, 48, 44], [44, 44, 44, 44, 44, 7, 44, 44], - [44, 44, 44, 44, 44, 44, 44, 13], [44, 44, 44, 44, 44, 44, 44, 44], - [44, 44, 44, 44, 44, 44, 44, 49], [44, 44, 44, 44, 44, 44, 44, 59], - [44, 44, 44, 44, 44, 45, 44, 44], [44, 44, 44, 44, 44, 58, 44, 30], - [44, 44, 44, 44, 47, 44, 44, 44], [44, 44, 44, 44, 55, 44, 44, 44], - [44, 44, 44, 59, 44, 44, 44, 44], [44, 44, 44, 69, 44, 68, 44, 44], - [44, 44, 44, 71, 44, 55, 44, 44], [44, 44, 44, 73, 44, 55, 44, 44], - [44, 44, 50, 44, 44, 44, 44, 44], [44, 44, 57, 44, 44, 44, 44, 44], - [44, 44, 70, 44, 44, 55, 59, 44], [44, 45, 55, 44, 44, 44, 44, 44], - [44, 55, 44, 44, 44, 44, 44, 59], [44, 55, 44, 45, 26, 44, 44, 44], - [44, 55, 44, 55, 44, 44, 44, 44], [44, 55, 44, 55, 69, 44, 44, 44], - [44, 57, 44, 44, 44, 38, 44, 44], [44, 57, 44, 59, 44, 44, 44, 45], - [44, 58, 44, 58, 44, 32, 44, 35], [44, 70, 44, 44, 44, 44, 44, 44], - [55, 44, 44, 44, 44, 54, 44, 40], [57, 39, 6, 44, 44, 44, 44, 44], - [57, 44, 44, 58, 44, 44, 44, 44], [59, 44, 21, 44, 44, 44, 44, 44], - [63, 44, 44, 55, 45, 44, 44, 28], + [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 13], [0, 0, 0, 0, 0, 0, 0, 49], + [0, 0, 0, 0, 0, 0, 0, 59], [0, 0, 0, 0, 0, 2, 48, 0], [0, 0, 0, 0, 0, 8, 0, 0], + [0, 0, 0, 0, 0, 45, 0, 0], [0, 0, 0, 0, 0, 58, 0, 30], [0, 0, 0, 0, 10, 1, 60, 0], + [0, 0, 0, 0, 47, 0, 0, 0], [0, 0, 0, 0, 55, 0, 0, 0], [0, 0, 0, 4, 59, 0, 0, 0], + [0, 0, 0, 11, 0, 0, 0, 9], [0, 0, 0, 24, 0, 0, 0, 0], [0, 0, 0, 25, 6, 41, 0, 64], + [0, 0, 0, 59, 0, 0, 0, 0], [0, 0, 0, 69, 0, 68, 0, 0], [0, 0, 0, 71, 0, 55, 0, 0], + [0, 0, 0, 73, 0, 55, 0, 0], [0, 0, 50, 0, 0, 0, 0, 0], [0, 0, 
57, 0, 0, 0, 0, 0], + [0, 0, 70, 0, 0, 55, 59, 0], [0, 15, 51, 16, 0, 0, 0, 0], [0, 17, 34, 0, 23, 0, 0, 0], + [0, 18, 12, 5, 62, 0, 61, 3], [0, 19, 0, 0, 56, 66, 0, 46], [0, 20, 0, 58, 0, 31, 0, 58], + [0, 22, 72, 65, 0, 43, 53, 0], [0, 29, 45, 0, 0, 14, 42, 0], [0, 36, 0, 59, 2, 0, 0, 33], + [0, 37, 0, 0, 0, 55, 57, 0], [0, 45, 55, 0, 0, 0, 0, 0], [0, 55, 0, 0, 0, 0, 0, 59], + [0, 55, 0, 45, 26, 0, 0, 0], [0, 55, 0, 55, 0, 0, 0, 0], [0, 55, 0, 55, 69, 0, 0, 0], + [0, 57, 0, 0, 0, 38, 0, 0], [0, 57, 0, 59, 0, 0, 0, 45], [0, 58, 0, 58, 0, 32, 0, 35], + [0, 70, 0, 0, 0, 0, 0, 0], [27, 0, 0, 0, 0, 0, 67, 0], [44, 52, 0, 0, 0, 0, 0, 0], + [55, 0, 0, 0, 0, 54, 0, 40], [57, 0, 0, 58, 0, 0, 0, 0], [57, 39, 7, 0, 0, 0, 0, 0], + [59, 0, 21, 0, 0, 0, 0, 0], [63, 0, 0, 55, 45, 0, 0, 28], ]; static BITSET_CANONICAL: [u64; 44] = [ + 0b0000000000000000000000000000000000000000000000000000000000000000, 0b0000000111111111111111111111111111111111111111111111111111111111, 0b1111111111000000000000000000000000000000000000000000000000000000, 0b1111111111111111111111111111111111111111111111001111111111111111, @@ -939,7 +915,6 @@ pub mod n { 0b0000000000001111111111111111111111111111111111111111111110000000, 0b0000000000000001110000000000000000000000000000000000000000000000, 0b0000000000000000111111111000000000000000000000000000000000000000, - 0b1111111111111111111111111111111111111111111111111111111111111111, 0b1111111111111111111111111111111111111111111111111100000000000000, 0b1111111111111111111111111111111111111111111111110000000000000000, 0b1111111111111111111111111111111100000000000000000000000000000000, @@ -973,10 +948,10 @@ pub mod n { 0b1111111100000000000000000000000011111111000000000000000000000000, ]; static BITSET_MAPPING: [(u8, u8); 30] = [ - (0, 185), (0, 175), (0, 76), (0, 172), (0, 165), (0, 164), (0, 162), (0, 157), (0, 138), - (0, 112), (1, 16), (1, 26), (1, 39), (1, 42), (1, 48), (1, 58), (2, 122), (2, 108), (3, 28), - (3, 54), (4, 22), (4, 48), (5, 49), (5, 50), (6, 
47), (7, 55), (8, 32), (9, 108), (10, 47), - (11, 32), + (0, 64), (1, 175), (1, 76), (1, 172), (1, 165), (1, 164), (1, 162), (1, 157), (1, 138), + (1, 112), (2, 16), (2, 26), (2, 39), (2, 42), (2, 48), (2, 58), (3, 122), (3, 108), (4, 28), + (4, 54), (5, 22), (5, 48), (6, 49), (6, 50), (7, 47), (8, 55), (9, 32), (10, 108), (11, 47), + (12, 32), ]; pub fn lookup(c: char) -> bool { diff --git a/src/tools/unicode-table-generator/src/raw_emitter.rs b/src/tools/unicode-table-generator/src/raw_emitter.rs index 4898df3c80018..e5b15224795f5 100644 --- a/src/tools/unicode-table-generator/src/raw_emitter.rs +++ b/src/tools/unicode-table-generator/src/raw_emitter.rs @@ -301,7 +301,21 @@ impl Canonicalized { Canonicalized(usize), } - while let Some((&to, _)) = mappings.iter().max_by_key(|m| m.1.len()) { + // Map 0 first, so that it is the first canonical word. + // This is realistically not inefficient because 0 is not mapped to by + // anything else (a shift pattern could do it, but would be wasteful). + // + // However, 0s are quite common in the overall dataset, and it is quite + // wasteful to have to go through a mapping function to determine that + // we have a zero. + // + // FIXME: Experiment with choosing most common words in overall data set + // for canonical when possible. + while let Some((&to, _)) = mappings + .iter() + .find(|(&to, _)| to == 0) + .or_else(|| mappings.iter().max_by_key(|m| m.1.len())) + { // Get the mapping with the most entries. Currently, no mapping can // only exist transitively (i.e., there is no A, B, C such that A // does not map to C and but A maps to B maps to C), so this is From af243d4d91400e071a5b8fe5041f55f07fd8a928 Mon Sep 17 00:00:00 2001 From: Mark Rousskov Date: Sat, 21 Mar 2020 18:01:50 -0400 Subject: [PATCH 10/14] Avoid relying on const parameters to function LLVM seems to at least sometimes optimize better when the length comes directly from the `len()` of the array vs. an equivalent integer. 
Also, this allows easier copy/pasting of the function into compiler explorer for experimentation. --- src/libcore/unicode/unicode_data.rs | 8 ++++---- src/tools/unicode-table-generator/src/range_search.rs | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/libcore/unicode/unicode_data.rs b/src/libcore/unicode/unicode_data.rs index aea923e90757e..660b91b6025d1 100644 --- a/src/libcore/unicode/unicode_data.rs +++ b/src/libcore/unicode/unicode_data.rs @@ -27,11 +27,11 @@ fn range_search< } else { return false; }; - let idx = bitset_chunk_idx[(chunk_idx as usize)][chunk_piece] as usize; - let word = if idx < CANONICAL { - bitset_canonical[idx] + let idx = bitset_chunk_idx[chunk_idx as usize][chunk_piece] as usize; + let word = if let Some(word) = bitset_canonical.get(idx) { + *word } else { - let (real_idx, mapping) = bitset_canonicalized[idx - CANONICAL]; + let (real_idx, mapping) = bitset_canonicalized[idx - bitset_canonical.len()]; let mut word = bitset_canonical[real_idx as usize]; let should_invert = mapping & (1 << 6) != 0; if should_invert { diff --git a/src/tools/unicode-table-generator/src/range_search.rs b/src/tools/unicode-table-generator/src/range_search.rs index 12efa5a9f83bf..b57fd2c1d8623 100644 --- a/src/tools/unicode-table-generator/src/range_search.rs +++ b/src/tools/unicode-table-generator/src/range_search.rs @@ -25,11 +25,11 @@ fn range_search< } else { return false; }; - let idx = bitset_chunk_idx[(chunk_idx as usize)][chunk_piece] as usize; - let word = if idx < CANONICAL { - bitset_canonical[idx] + let idx = bitset_chunk_idx[chunk_idx as usize][chunk_piece] as usize; + let word = if let Some(word) = bitset_canonical.get(idx) { + *word } else { - let (real_idx, mapping) = bitset_canonicalized[idx - CANONICAL]; + let (real_idx, mapping) = bitset_canonicalized[idx - bitset_canonical.len()]; let mut word = bitset_canonical[real_idx as usize]; let should_invert = mapping & (1 << 6) != 0; if should_invert { From 
33b9e6f5cff73e0a5ef9c0405e06275f954c3b4b Mon Sep 17 00:00:00 2001 From: Mark Rousskov Date: Tue, 24 Mar 2020 16:24:47 -0400 Subject: [PATCH 11/14] Add richer printing --- src/tools/unicode-table-generator/src/main.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index af23c166871e1..04c72116e5f8b 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -171,7 +171,15 @@ fn main() { emit_codepoints(&mut emitter, &ranges); modules.push((property.to_lowercase().to_string(), emitter.file)); - println!("{:15}: {} bytes, {} codepoints", property, emitter.bytes_used, datapoints,); + println!( + "{:15}: {} bytes, {} codepoints in {} ranges ({} - {})", + property, + emitter.bytes_used, + datapoints, + ranges.len(), + ranges.first().unwrap().start, + ranges.last().unwrap().end + ); total_bytes += emitter.bytes_used; } From 9c1ceece20e14e2a746c382b639f9288746e493c Mon Sep 17 00:00:00 2001 From: Mark Rousskov Date: Wed, 25 Mar 2020 21:00:01 -0400 Subject: [PATCH 12/14] Add skip list based implementation for smaller encoding This arranges for the sparser sets (everything except lower and uppercase) to be encoded in a significantly smaller context. However, it is also a performance trade-off (roughly 3x slower than the bitset encoding). The 40% size reduction is deemed to be sufficiently important to merit this performance loss, particularly as it is unlikely that this code is hot anywhere (and if it is, paying the memory cost for a bitset that directly represents the data seems worthwhile). 
Alphabetic : 1599 bytes (- 937 bytes) Case_Ignorable : 949 bytes (- 822 bytes) Cased : 359 bytes (- 429 bytes) Cc : 9 bytes (- 15 bytes) Grapheme_Extend: 813 bytes (- 675 bytes) Lowercase : 863 bytes N : 419 bytes (- 619 bytes) Uppercase : 776 bytes White_Space : 37 bytes (- 46 bytes) Total table sizes: 5824 bytes (-3543 bytes) --- src/libcore/unicode/unicode_data.rs | 1021 ++++------------- src/tools/unicode-table-generator/src/main.rs | 42 +- .../src/range_search.rs | 51 +- .../src/raw_emitter.rs | 73 +- .../unicode-table-generator/src/skiplist.rs | 98 ++ 5 files changed, 466 insertions(+), 819 deletions(-) create mode 100644 src/tools/unicode-table-generator/src/skiplist.rs diff --git a/src/libcore/unicode/unicode_data.rs b/src/libcore/unicode/unicode_data.rs index 660b91b6025d1..72ea8ce038184 100644 --- a/src/libcore/unicode/unicode_data.rs +++ b/src/libcore/unicode/unicode_data.rs @@ -1,7 +1,7 @@ ///! This file is generated by src/tools/unicode-table-generator; do not edit manually! #[inline(always)] -fn range_search< +fn bitset_search< const N: usize, const CHUNK_SIZE: usize, const N1: usize, @@ -50,720 +50,267 @@ fn range_search< (word & (1 << (needle % 64) as u64)) != 0 } +fn decode_prefix_sum(short_offset_run_header: u32) -> u32 { + short_offset_run_header & ((1 << 21) - 1) +} + +fn decode_length(short_offset_run_header: u32) -> usize { + (short_offset_run_header >> 21) as usize +} + +#[inline(always)] +fn skip_search( + needle: u32, + short_offset_runs: &[u32; SOR], + offsets: &[u8; OFFSETS], +) -> bool { + // Note that this *cannot* be past the end of the array, as the last + // element is greater than std::char::MAX (the largest possible needle). + // + // So, we cannot have found it (i.e. Ok(idx) + 1 != length) and the correct + // location cannot be past it, so Err(idx) != length either. + // + // This means that we can avoid bounds checking for the accesses below, too. 
+ let last_idx = + match short_offset_runs.binary_search_by_key(&(needle << 11), |header| header << 11) { + Ok(idx) => idx + 1, + Err(idx) => idx, + }; + + let mut offset_idx = decode_length(short_offset_runs[last_idx]); + let length = if let Some(next) = short_offset_runs.get(last_idx + 1) { + decode_length(*next) - offset_idx + } else { + offsets.len() - offset_idx + }; + let prev = + last_idx.checked_sub(1).map(|prev| decode_prefix_sum(short_offset_runs[prev])).unwrap_or(0); + + let total = needle - prev; + let mut prefix_sum = 0; + for _ in 0..(length - 1) { + let offset = offsets[offset_idx]; + prefix_sum += offset as u32; + if prefix_sum > total { + break; + } + offset_idx += 1; + } + offset_idx % 2 == 1 +} + pub const UNICODE_VERSION: (u32, u32, u32) = (13, 0, 0); #[rustfmt::skip] pub mod alphabetic { - const BITSET_LAST_CHUNK_MAP: u16 = 393; - static BITSET_CHUNKS_MAP: [u8; 394] = [ - 11, 35, 32, 14, 25, 18, 17, 74, 16, 29, 12, 61, 15, 73, 66, 36, 9, 0, 6, 0, 0, 0, 70, 64, - 22, 0, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 38, 39, 39, 39, 39, 39, 39, 39, 39, - 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, - 39, 39, 39, 39, 39, 39, 39, 39, 39, 42, 39, 39, 53, 26, 28, 65, 39, 39, 39, 39, 39, 39, 39, - 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 37, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 1, 60, 48, 8, 19, 5, 34, 47, 20, 24, 57, 7, 55, 21, 31, 69, 67, 71, 13, 3, - 39, 43, 58, 0, 0, 0, 0, 0, 39, 39, 63, 0, 0, 0, 0, 0, 0, 0, 39, 56, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 39, 68, 0, 10, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 41, 39, - 39, 50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 45, 52, 0, 0, 0, 0, 30, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 59, 54, 0, 0, 0, 0, 27, 4, 0, 0, 49, 0, 0, 23, 2, 0, 0, 0, 0, 0, 0, - 0, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, - 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 
39, 39, 39, 39, 39, 39, 39, - 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, - 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 51, 39, 39, 39, 39, 39, 39, 39, - 46, 72, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 33, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, - 39, 39, 39, 40, 0, 0, 0, 0, 0, 0, 39, 62, 0, 0, 39, 39, 39, 39, 39, 39, 39, 39, 39, 44, - ]; - static BITSET_INDEX_CHUNKS: [[u8; 8]; 75] = [ - [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 172, 172, 172, 172], [0, 0, 0, 0, 243, 10, 180, 0], - [0, 0, 0, 124, 0, 0, 203, 0], [0, 0, 0, 199, 0, 0, 0, 0], - [0, 0, 24, 185, 242, 112, 231, 168], [0, 0, 55, 197, 0, 0, 0, 0], - [0, 0, 141, 0, 46, 177, 243, 123], [0, 54, 172, 214, 113, 34, 216, 163], - [0, 83, 241, 0, 62, 29, 179, 0], [0, 172, 0, 0, 172, 4, 159, 142], - [0, 249, 116, 3, 172, 172, 172, 172], [1, 172, 172, 172, 172, 172, 172, 172], - [14, 51, 125, 0, 79, 35, 166, 0], [26, 37, 172, 80, 6, 5, 204, 115], - [30, 211, 40, 208, 120, 132, 239, 180], [59, 6, 23, 60, 172, 172, 172, 172], - [67, 157, 68, 139, 66, 58, 99, 136], [75, 128, 69, 106, 71, 143, 74, 167], - [78, 254, 172, 212, 0, 207, 0, 0], [82, 122, 192, 130, 117, 0, 7, 0], - [94, 0, 44, 196, 70, 156, 0, 0], [105, 1, 31, 218, 48, 172, 28, 243], - [111, 93, 109, 0, 0, 0, 0, 0], [127, 102, 190, 154, 208, 137, 186, 0], - [147, 149, 53, 43, 217, 50, 72, 107], [148, 13, 172, 202, 32, 172, 233, 52], - [150, 0, 0, 0, 97, 183, 0, 0], [152, 206, 172, 64, 41, 101, 221, 89], - [172, 22, 88, 19, 20, 189, 244, 248], [172, 103, 161, 0, 0, 0, 0, 0], - [172, 158, 172, 171, 0, 0, 87, 245], [172, 172, 8, 172, 222, 27, 76, 138], - [172, 172, 11, 172, 172, 172, 172, 172], [172, 172, 12, 108, 247, 194, 0, 0], - [172, 172, 172, 145, 0, 77, 33, 219], [172, 172, 172, 172, 9, 96, 91, 104], - [172, 172, 172, 172, 172, 172, 47, 238], [172, 172, 172, 172, 172, 172, 172, 0], - [172, 172, 172, 172, 172, 172, 172, 172], [172, 172, 172, 172, 172, 172, 172, 193], - [172, 172, 172, 172, 172, 
172, 172, 210], [172, 172, 172, 172, 172, 172, 172, 214], - [172, 172, 172, 172, 172, 172, 188, 0], [172, 172, 172, 172, 172, 181, 0, 0], - [172, 172, 172, 172, 192, 45, 172, 172], [172, 172, 172, 172, 207, 172, 172, 172], - [172, 172, 172, 172, 209, 153, 0, 0], [172, 172, 172, 172, 215, 6, 232, 110], - [172, 172, 172, 176, 172, 170, 0, 0], [172, 172, 172, 187, 179, 0, 0, 0], - [172, 172, 172, 191, 172, 172, 172, 172], [172, 172, 172, 213, 0, 0, 0, 0], - [172, 172, 182, 251, 172, 172, 172, 172], [172, 172, 230, 61, 235, 236, 237, 234], - [172, 177, 118, 151, 205, 126, 172, 164], [172, 178, 0, 0, 0, 0, 0, 0], - [172, 179, 205, 205, 195, 0, 0, 0], [172, 200, 172, 172, 172, 175, 0, 0], - [172, 225, 63, 226, 90, 17, 172, 172], [172, 228, 172, 188, 92, 16, 204, 18], - [172, 229, 25, 119, 133, 134, 2, 165], [191, 0, 0, 0, 0, 0, 0, 0], - [200, 0, 0, 0, 0, 0, 0, 0], [201, 0, 0, 0, 0, 0, 0, 0], - [209, 56, 216, 129, 38, 42, 172, 198], [209, 95, 65, 114, 172, 172, 172, 250], - [211, 0, 30, 85, 81, 174, 36, 155], [211, 192, 0, 146, 202, 73, 184, 0], - [216, 252, 121, 0, 15, 0, 0, 0], [223, 224, 172, 135, 39, 144, 86, 21], - [227, 6, 162, 211, 0, 0, 0, 0], [231, 172, 172, 172, 172, 172, 172, 172], - [240, 131, 84, 173, 220, 253, 57, 140], [246, 169, 98, 160, 173, 49, 100, 0], - ]; - static BITSET_CANONICAL: [u64; 172] = [ - 0b0000000000000000000000000000000000000000000000000000000000000000, - 0b1111111111111111111111111111111111111111111111111111111111111110, - 0b1111111111001111111111111111111111111111111111111111111111111111, - 0b1111111101111111111111111111111111111111011111111111111111111111, - 0b1111111111111111111111111111111111111111111111111000011111111111, - 0b1111111111111111111111111111111111111111111111111110000000000000, - 0b1111111111111111111111111111111111111111111111110000000000000000, - 0b1100000011111111111111111111111111111111111111111111111111111111, - 0b1111111111111111111111111111111111111111111111111111110000000011, - 
0b1111111111111111111111111111111100111111001111111111111111111111, - 0b1111111111111111000000111111111111111111111111110000001111111111, - 0b1111111111111111000000000000001111111111111111111111111111111111, - 0b1111111111111111000000000000000000111111111111111111111111111111, - 0b1000111111110000011111111111111111111111111111111111111111111111, - 0b0111111101111111111111111111111111111111111111111111110111111111, - 0b0000000000000000000001111111111111100111111111111111111111111111, - 0b1111111111111111111111111111111111111111111111111111111111011011, - 0b1111111111111111111111111111111111111111111111011111110001011111, - 0b1111111111111111111111111111111111111111111110000000000000000000, - 0b1111111111111111111111111111111111111111011111111111111100111101, - 0b1111111111111111111111111111111111111111001111011111111111111111, - 0b1111111111111111111111111111111101111111011111110111111101111111, - 0b1111111111111111111111111111111100111101011111110011110111111111, - 0b1111111111111111111111111111111100111100000000001111111111111111, - 0b1111111111111111111111111111111100011111111111111111111111111111, - 0b1111111111111111111111111111111100000111111111111111111111111110, - 0b1111111111111111111111111111111100000111111111110000000000000000, - 0b1111111111111111111111111111111100000010011111111111111111111111, - 0b1111111111111111111111111111111100000000000000000111111111111111, - 0b1111111111111111111111111111111100000000000000000100001111100000, - 0b1111111111111111111111111111111100000000000000000000000000000000, - 0b1111111111111111111111111111111011100000011111111111111111111111, - 0b1111111111111111111111111111110011111111100000000000000000000000, - 0b1111111111111111111111111111101111111111111111111101011101000000, - 0b1111111111111111111111111100000000000111111111111111111111111110, - 0b1111111111111111111111011011111100000000000000000000000011001011, - 0b1111111111111111111111001111111100000000000000000000000000000000, - 
0b1111111111111111110000000000000011111110111111111111111111111111, - 0b1111111111111111011111110111111100000000011111100111111001111110, - 0b1111111111111111001000001011111111111111111111111111111111111111, - 0b1111111111111111000001111111111111111111111111111111111111111111, - 0b1111111111111111000001111111111111111111111111111111110000000000, - 0b1111111111111111000000111111111111110111111111111111111111111111, - 0b1111111111111111000000111111100011111111111100000000000011111111, - 0b1111111111111111000000011111111110111111111111111011110101111111, - 0b1111111111111111000000001111000000000000000001110000000000000000, - 0b1111111111111111000000001000000000011111111111111111111111111111, - 0b1111111111111111000000000000111111111111111111111111111111111111, - 0b1111111111111110111111111111111111111111111111111111111111100000, - 0b1111111111111110000111111111111111111111111111111111111011111111, - 0b1111111111111110000000000000111111111111111000011101111111111111, - 0b1111111111111100000000000000000000000000000000000000000000000001, - 0b1111111111100000000000000000000000000000000000000000011111111100, - 0b1111111111011111111111111111111100000000000000000000000000000000, - 0b1111111111011111000000000000000000000000000000000000000000000000, - 0b1111111111000000000000000000000000000000000000000000000000000000, - 0b1111110001111111111111111111111100000000000000000011111111111111, - 0b1111110000000000111100111111111111111111111111111111111111111111, - 0b1111110000000000000000000000111110000000111100000101110111011111, - 0b1111100101111111111111111111111111111111111111111111111111111111, - 0b1111011111111111111111111111111111111111111111110010000010111111, - 0b1111011111111111111111111111111111110111111111111111111111111101, - 0b1111001111111111101111010101000000111110001011111111110010000100, - 0b1110101111111111110111100110010011011111111111111111111111111111, - 0b1110100011111100000000000000000000000000000000000000000000101111, - 
0b1110011111111111111111111111111111111111111111110000000111111111, - 0b1110011111111111111111111111111111111111111111011101111111111111, - 0b1110001111111111111111011111111111111111111111011101111111101111, - 0b1110001111101111111111011111111111111111111111011101111111101111, - 0b1110001111101101111111011111111111111111111110111011111111101110, - 0b1110001111101101111111011111111111111111111110011001111111101111, - 0b1110001111101101111111011111111111111111111110011001111111101110, - 0b1110001111000101111111011111111111111111111110011001111111101111, - 0b1110000011111111111111111111100000000000000000000000000000001111, - 0b1100001111111111110001110001100011010110001111011100011111101100, - 0b1100001101101101111111011111111111111111111110011000011111101110, - 0b1011111111111111000000000000000000000000000000000000000111111111, - 0b1011110011011111000000000000000000000000000000000000000000100000, - 0b1011011111111111111111110111111111111111111111111110111111111111, - 0b1011010001111111111111111111111111111111111111111111101101111111, - 0b1001110000000000111000011111111000011111111011111111111111111111, - 0b1001100110111111111111111111111111111111011011111111001001111111, - 0b1001000110111111111111111111111111111111111111111111110100111111, - 0b1000000000000010000000000000000000000000000000000000000000000000, - 0b1000000000000000000000001000000000000000000000000000000000000000, - 0b1000000000000000000000000000000011111111111111111111111111111111, - 0b0111111101111111011111110111111100000000011111111111111111111111, - 0b0111111100111111111111111111111111111111111111111111111111111111, - 0b0111111100111101111111111111111111111111111111110011110111111111, - 0b0111110000000000111111111111111100000000000000001000000000000000, - 0b0111101111111111111111111111111111011111110111111110011110111111, - 0b0101111111011111111111111111111111111111111111111111111111111111, - 0b0101111101111111111111011111111111100000111110000000000001111111, - 
0b0101111011110111111101111001011010101010100101101110101010000100, - 0b0100000010011111111111111111111111111111111110111111111111111111, - 0b0011111111111111111111111111111111111100000000001110000000000000, - 0b0011111111111111111111111111111110101010111111110011111100111111, - 0b0011111110000000000111111111111111111111111111111111111111111111, - 0b0011101111111111111111111010111111111111111111111111011111010110, - 0b0010111111111011111111111111111111111100011111111111111111101110, - 0b0001111111111111111111111111111111111110111111111111111100000011, - 0b0001111111111111111111111111111100000000000001111111111111111111, - 0b0001111111111111111111111111111100000000000000000000000000000000, - 0b0001111111111111000001111111111111111111111111111111111111111111, - 0b0001111111011100000111111111111100001111110011110001111111011100, - 0b0001111100111110000000111111111000000000000000000000000011100000, - 0b0001111000000000000000000000111100000000000000010001101110111111, - 0b0001000000000011000000000000111110110000100000000101100110011111, - 0b0000111111111111111111111111111111111111000011111111111111111111, - 0b0000111111111111111110111110111000001111111111111111101111111111, - 0b0000111111111111000000000000000000000000000000000000000011111111, - 0b0000101011110111111111101001011011111111111111111111111111101111, - 0b0000011111111111111111111111111111111111111111110000011111111111, - 0b0000011111111111111111111111111000000000000000000000000000000000, - 0b0000010001101111110111100000000000000000000000000000000000000000, - 0b0000010000110000000001111111111111111111111111111111110000000000, - 0b0000010000100000000001000000000000000000000000000000000000000000, - 0b0000001111111111111111111111111100000000001111111111111111111111, - 0b0000000111111111111111111111111111111111111111111111111111111100, - 0b0000000111111111110001111111111111111111111111111111111111111111, - 0b0000000111111111000011111111111101111111111111111111111111111111, - 
0b0000000100111111111111111111111111111111111111111111111111111111, - 0b0000000001111111111111111111111100000000001111111111111111111111, - 0b0000000001111111111111111111111100000000000000000000000000011111, - 0b0000000001111111111111111111111100000000000000000000000000000000, - 0b0000000001111111111111101111111111111111111111001111111111111111, - 0b0000000001000111111111111111111111111111111111110000000011110000, - 0b0000000000111111111111111111111111111110111011111111000001101111, - 0b0000000000111111000000000000000001011110000000100001100110000111, - 0b0000000000111100111111111111111100111000000000000000000000000101, - 0b0000000000110111111111111111111100000000000000000000000000000000, - 0b0000000000011111111111111111111001111111111111111111111111111111, - 0b0000000000011111001111111111111111111111111111110000000000000000, - 0b0000000000001111111111111111111100000000000011111101111111111111, - 0b0000000000001101110111111111111100000000000011111111111111111111, - 0b0000000000001100011110000001111111111111111111111111111111111111, - 0b0000000000001100000000000000000011111111010111111000000001111111, - 0b0000000000000111111111111111111100000000001111111111111111111111, - 0b0000000000000111100001111111111111111111111111110000000010110110, - 0b0000000000000110000000000000111101000000011000000001110111011111, - 0b0000000000000011111111111011111111111111111111111111111111111111, - 0b0000000000000011000110111111111111111111111111111111111111111111, - 0b0000000000000011000000000000101100000000000000000000000000000000, - 0b0000000000000010000000000000111110110000110000000001100110011111, - 0b0000000000000000100000001111111111111111111111111111111111111111, - 0b0000000000000000010100000001111100000000000000111111111111000011, - 0b0000000000000000001111111111111111111111111111110000000000000000, - 0b0000000000000000000111111111111111111100111111111111111111111111, - 0b0000000000000000000011000000000011111111111111110001111111111111, - 
0b0000000000000000000001111111111100000001111111111111111111111111, - 0b0000000000000000000001111101101111111001111111111111111101111111, - 0b0000000000000000000000011111111111111111111111110000000000000000, - 0b0000000000000000000000001111111111111111111111111111111110111111, - 0b0000000000000000000000001111111100000000001111111111111111111111, - 0b0000000000000000000000000001111111111111111111111111111011111111, - 0b0000000000000000000000000001101011111100111111111111111111111111, - 0b0000000000000000000000000000111111100000100000010001100110011111, - 0b0000000000000000000000000000111100000111011000000001110111011111, - 0b0000000000000000000000000000001110000000000000000000011110111011, - 0b0000000000000000000000000000000011111111111111111000000011111111, - 0b0000000000000000000000000000000011110000000000000010000001011111, - 0b0000000000000000000000000000000001000011111111110000000111111111, - 0b0000000000000000000000000000000000100000111111111111111111111111, - 0b0000000000000000000000000000000000011100111111001111110011111100, - 0b0000000000000000000000000000000000010100000000001100000000011110, - 0b0000000000000000000000000000000000010000100000000000000111111111, - 0b0000000000000000000000000000000000000001011110110111111111111111, - 0b0000000000000000000000000000000000000000100000010001110111000111, - 0b0000000000000000000000000000000000000000001111101111111100001111, - 0b0000000000000000000000000000000000000000000000000010000001111111, - 0b0000000000000000000000000000000000000000000000000000100010001111, - 0b0000000000000000000000000000000000000000000000000000000010110011, + static SHORT_OFFSET_RUNS: [u32; 52] = [ + 706, 33559113, 868226669, 947920662, 1157637302, 1306536960, 1310732293, 1398813696, + 1449151936, 1451270141, 1455465613, 1459660301, 1468061604, 1648425216, 1658911342, + 1661009214, 1707147904, 1793132343, 1853951616, 1994464256, 2330009312, 2418090906, + 2428579840, 2439066671, 2441167872, 2443265607, 2445371392, 2447469113, 2449567296, + 
2476836856, 2508295382, 2512498688, 2518790431, 2520888060, 2533473280, 2535576576, + 2556548774, 2634145792, 2682380992, 2715936768, 2720132608, 2736910640, 2875326464, + 2887952094, 2890053429, 2894253730, 2902649825, 2906847232, 2908944926, 2911043584, + 2913145675, 2916356939, ]; - static BITSET_MAPPING: [(u8, u8); 83] = [ - (0, 64), (1, 64), (1, 189), (1, 188), (1, 187), (1, 186), (1, 185), (1, 183), (1, 182), - (1, 181), (1, 179), (1, 78), (1, 176), (1, 175), (1, 174), (1, 170), (1, 166), (1, 165), - (1, 163), (1, 162), (1, 161), (1, 159), (1, 156), (1, 152), (1, 151), (1, 150), (1, 149), - (1, 148), (1, 145), (1, 111), (1, 144), (1, 112), (1, 142), (1, 141), (1, 140), (1, 139), - (1, 138), (1, 137), (1, 136), (1, 135), (1, 133), (1, 132), (1, 131), (1, 130), (1, 63), - (1, 60), (1, 59), (1, 54), (1, 52), (1, 51), (1, 48), (1, 47), (1, 31), (1, 21), (1, 4), - (2, 129), (2, 58), (2, 57), (2, 50), (2, 42), (2, 28), (2, 21), (3, 180), (3, 30), (3, 24), - (3, 18), (4, 132), (4, 33), (4, 17), (5, 80), (5, 32), (6, 112), (6, 16), (7, 96), (7, 3), - (8, 38), (9, 32), (10, 17), (11, 69), (12, 32), (13, 187), (14, 179), (15, 141), + static OFFSETS: [u8; 1391] = [ + 65, 26, 6, 26, 47, 1, 10, 1, 4, 1, 5, 23, 1, 31, 1, 0, 4, 12, 14, 5, 7, 1, 1, 1, 86, 1, 42, + 5, 1, 2, 2, 4, 1, 1, 6, 1, 1, 3, 1, 1, 1, 20, 1, 83, 1, 139, 8, 166, 1, 38, 2, 1, 6, 41, 39, + 14, 1, 1, 1, 2, 1, 2, 1, 1, 8, 27, 4, 4, 29, 11, 5, 56, 1, 7, 14, 102, 1, 8, 4, 8, 4, 3, 10, + 3, 2, 1, 16, 48, 13, 101, 24, 33, 9, 2, 4, 1, 5, 24, 2, 19, 19, 25, 7, 11, 53, 21, 1, 18, + 12, 12, 3, 7, 6, 76, 1, 16, 1, 3, 4, 15, 13, 19, 1, 8, 2, 2, 2, 22, 1, 7, 1, 1, 3, 4, 3, 8, + 2, 2, 2, 2, 1, 1, 8, 1, 4, 2, 1, 5, 12, 2, 10, 1, 4, 3, 1, 6, 4, 2, 2, 22, 1, 7, 1, 2, 1, 2, + 1, 2, 4, 5, 4, 2, 2, 2, 4, 1, 7, 4, 1, 1, 17, 6, 11, 3, 1, 9, 1, 3, 1, 22, 1, 7, 1, 2, 1, 5, + 3, 9, 1, 3, 1, 2, 3, 1, 15, 4, 21, 4, 4, 3, 1, 8, 2, 2, 2, 22, 1, 7, 1, 2, 1, 5, 3, 8, 2, 2, + 2, 2, 9, 2, 4, 2, 1, 5, 13, 1, 16, 2, 1, 6, 3, 3, 1, 4, 3, 2, 
1, 1, 1, 2, 3, 2, 3, 3, 3, 12, + 4, 5, 3, 3, 1, 3, 3, 1, 6, 1, 40, 4, 1, 8, 1, 3, 1, 23, 1, 16, 3, 8, 1, 3, 1, 3, 8, 2, 1, 3, + 5, 4, 28, 4, 1, 8, 1, 3, 1, 23, 1, 10, 1, 5, 3, 8, 1, 3, 1, 3, 8, 2, 7, 1, 1, 4, 13, 2, 13, + 13, 1, 3, 1, 41, 2, 8, 1, 3, 1, 3, 1, 1, 5, 4, 7, 5, 22, 6, 1, 3, 1, 18, 3, 24, 1, 9, 1, 1, + 2, 7, 8, 6, 1, 1, 1, 8, 18, 2, 13, 58, 5, 7, 6, 1, 51, 2, 1, 1, 1, 5, 1, 24, 1, 1, 1, 19, 1, + 3, 2, 5, 1, 1, 6, 1, 14, 4, 32, 1, 63, 8, 1, 36, 4, 17, 6, 16, 1, 36, 67, 55, 1, 1, 2, 5, + 16, 64, 10, 4, 2, 38, 1, 1, 5, 1, 2, 43, 1, 0, 1, 4, 2, 7, 1, 1, 1, 4, 2, 41, 1, 4, 2, 33, + 1, 4, 2, 7, 1, 1, 1, 4, 2, 15, 1, 57, 1, 4, 2, 67, 37, 16, 16, 86, 2, 6, 3, 0, 2, 17, 1, 26, + 5, 75, 3, 11, 7, 13, 1, 6, 12, 20, 12, 20, 12, 13, 1, 3, 1, 2, 12, 52, 2, 19, 14, 1, 4, 1, + 67, 89, 7, 43, 5, 70, 10, 31, 1, 12, 4, 9, 23, 30, 2, 5, 11, 44, 4, 26, 54, 28, 4, 63, 2, + 20, 50, 1, 23, 2, 63, 52, 1, 15, 1, 7, 52, 42, 2, 4, 10, 44, 1, 11, 14, 55, 22, 3, 10, 36, + 2, 9, 7, 43, 2, 3, 41, 4, 1, 6, 1, 2, 3, 1, 5, 192, 39, 14, 11, 0, 2, 6, 2, 38, 2, 6, 2, 8, + 1, 1, 1, 1, 1, 1, 1, 31, 2, 53, 1, 7, 1, 1, 3, 3, 1, 7, 3, 4, 2, 6, 4, 13, 5, 3, 1, 7, 116, + 1, 13, 1, 16, 13, 101, 1, 4, 1, 2, 10, 1, 1, 3, 5, 6, 1, 1, 1, 1, 1, 1, 4, 1, 11, 2, 4, 5, + 5, 4, 1, 17, 41, 0, 52, 0, 47, 1, 47, 1, 133, 6, 4, 3, 2, 12, 38, 1, 1, 5, 1, 2, 56, 7, 1, + 16, 23, 9, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 32, 47, 1, 0, 3, 25, 9, 7, 5, 2, + 5, 4, 86, 6, 3, 1, 90, 1, 4, 5, 43, 1, 94, 17, 32, 48, 16, 0, 0, 64, 0, 3, 0, 67, 46, 2, 0, + 3, 16, 10, 2, 20, 47, 5, 8, 3, 113, 39, 9, 2, 103, 2, 53, 2, 9, 42, 17, 1, 33, 24, 52, 12, + 68, 1, 1, 44, 6, 3, 1, 1, 3, 10, 33, 5, 35, 13, 29, 3, 51, 1, 12, 15, 1, 16, 16, 10, 5, 1, + 55, 9, 14, 18, 23, 3, 69, 1, 1, 1, 1, 24, 3, 2, 16, 2, 4, 11, 6, 2, 6, 2, 6, 9, 7, 1, 7, 1, + 43, 1, 14, 6, 123, 21, 0, 12, 23, 4, 49, 0, 0, 2, 106, 38, 7, 12, 5, 5, 12, 1, 13, 1, 5, 1, + 1, 1, 2, 1, 2, 1, 108, 33, 0, 18, 64, 2, 54, 40, 12, 116, 5, 1, 135, 36, 26, 6, 26, 11, 89, + 
3, 6, 2, 6, 2, 6, 2, 3, 35, 12, 1, 26, 1, 19, 1, 2, 1, 15, 2, 14, 34, 123, 69, 53, 0, 29, 3, + 49, 47, 32, 13, 30, 5, 43, 5, 30, 2, 36, 4, 8, 1, 5, 42, 158, 18, 36, 4, 36, 4, 40, 8, 52, + 156, 0, 9, 22, 10, 8, 152, 6, 2, 1, 1, 44, 1, 2, 3, 1, 2, 23, 10, 23, 9, 31, 65, 19, 1, 2, + 10, 22, 10, 26, 70, 56, 6, 2, 64, 4, 1, 2, 5, 8, 1, 3, 1, 29, 42, 29, 3, 29, 35, 8, 1, 28, + 27, 54, 10, 22, 10, 19, 13, 18, 110, 73, 55, 51, 13, 51, 13, 40, 0, 42, 1, 2, 3, 2, 78, 29, + 10, 1, 8, 22, 106, 21, 27, 23, 9, 70, 60, 55, 23, 25, 23, 51, 17, 4, 8, 35, 3, 1, 9, 64, 1, + 4, 9, 2, 10, 1, 1, 1, 35, 18, 1, 34, 2, 1, 6, 1, 65, 7, 1, 1, 1, 4, 1, 15, 1, 10, 7, 57, 23, + 4, 1, 8, 2, 2, 2, 22, 1, 7, 1, 2, 1, 5, 3, 8, 2, 2, 2, 2, 3, 1, 6, 1, 5, 7, 156, 66, 1, 3, + 1, 4, 20, 3, 30, 66, 2, 2, 1, 1, 184, 54, 2, 7, 25, 6, 34, 63, 1, 1, 3, 1, 59, 54, 2, 1, 71, + 27, 2, 14, 213, 57, 103, 64, 31, 8, 2, 1, 2, 8, 1, 2, 1, 30, 1, 2, 2, 2, 2, 4, 93, 8, 2, 46, + 2, 6, 1, 1, 1, 2, 27, 51, 2, 10, 17, 72, 5, 1, 34, 57, 0, 9, 1, 45, 1, 7, 1, 1, 49, 30, 2, + 22, 1, 14, 73, 7, 1, 2, 1, 44, 3, 1, 1, 2, 1, 3, 1, 1, 2, 2, 24, 6, 1, 2, 1, 37, 1, 2, 1, 4, + 1, 1, 0, 23, 185, 1, 79, 0, 102, 111, 17, 196, 0, 0, 0, 0, 0, 0, 7, 31, 113, 30, 18, 48, 16, + 4, 31, 21, 5, 19, 0, 64, 128, 75, 4, 57, 7, 17, 64, 2, 1, 1, 12, 2, 14, 0, 8, 0, 42, 9, 0, + 0, 49, 3, 17, 4, 8, 0, 0, 107, 5, 13, 3, 9, 7, 10, 4, 1, 0, 85, 1, 71, 1, 2, 2, 1, 2, 2, 2, + 4, 1, 12, 1, 1, 1, 7, 1, 65, 1, 4, 2, 8, 1, 7, 1, 28, 1, 4, 1, 5, 1, 1, 3, 7, 1, 0, 2, 25, + 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 8, 0, 7, 1, 17, 2, 7, 1, + 2, 1, 5, 213, 45, 10, 7, 16, 1, 0, 44, 0, 197, 59, 68, 3, 1, 3, 1, 0, 4, 1, 27, 1, 2, 1, 1, + 2, 1, 1, 10, 1, 4, 1, 1, 1, 1, 6, 1, 4, 1, 1, 1, 1, 1, 1, 3, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 2, 1, 1, 2, 4, 1, 7, 1, 4, 1, 4, 1, 1, 1, 10, 1, 17, 5, 3, 1, 5, 1, 17, 0, + 26, 6, 26, 6, 26, 0, 0, 34, 0, 11, 222, 2, 0, 14, 0, 0, 0, 0, 0, 0, ]; - pub fn lookup(c: char) -> bool { - 
super::range_search( + super::skip_search( c as u32, - &BITSET_CHUNKS_MAP, - BITSET_LAST_CHUNK_MAP, - &BITSET_INDEX_CHUNKS, - &BITSET_CANONICAL, - &BITSET_MAPPING, + &SHORT_OFFSET_RUNS, + &OFFSETS, ) } } #[rustfmt::skip] pub mod case_ignorable { - const BITSET_LAST_CHUNK_MAP: u16 = 1792; - static BITSET_CHUNKS_MAP: [u8; 251] = [ - 36, 19, 18, 44, 41, 33, 22, 35, 31, 6, 0, 7, 49, 45, 37, 3, 40, 0, 0, 0, 0, 0, 20, 48, 34, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 47, 0, 10, 46, 42, - 38, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 39, 2, 11, 0, 0, 0, 29, 9, 17, 26, 32, 24, 28, 51, 30, - 27, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 4, 21, 0, 0, 0, 23, 0, 0, 43, 13, 0, 0, 15, 0, 0, 0, 0, 1, 25, - ]; - static BITSET_INDEX_CHUNKS: [[u8; 8]; 52] = [ - [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 8], [0, 0, 0, 0, 0, 0, 0, 141], - [0, 0, 0, 0, 0, 0, 14, 42], [0, 0, 0, 0, 0, 27, 92, 0], [0, 0, 0, 0, 0, 133, 108, 101], - [0, 0, 0, 0, 0, 152, 0, 0], [0, 0, 0, 0, 79, 82, 47, 111], [0, 0, 0, 0, 135, 0, 5, 126], - [0, 0, 0, 0, 156, 0, 0, 0], [0, 0, 0, 17, 0, 0, 0, 0], [0, 0, 0, 136, 0, 168, 0, 0], - [0, 0, 0, 147, 0, 0, 0, 0], [0, 0, 0, 157, 0, 0, 0, 0], [0, 0, 0, 167, 9, 129, 0, 0], - [0, 0, 0, 170, 0, 161, 0, 0], [0, 0, 102, 0, 0, 0, 0, 0], [0, 0, 145, 0, 0, 171, 0, 0], - [0, 0, 169, 0, 0, 109, 12, 80], [0, 0, 174, 123, 123, 64, 176, 0], - [0, 49, 0, 153, 0, 16, 0, 23], [0, 149, 0, 0, 0, 0, 0, 0], - [2, 103, 15, 105, 54, 106, 125, 119], [4, 75, 88, 0, 0, 0, 0, 0], - [6, 110, 37, 181, 0, 0, 13, 182], [22, 123, 0, 0, 123, 123, 123, 11], - 
[28, 163, 50, 132, 76, 139, 7, 120], [32, 0, 18, 0, 33, 175, 118, 0], - [34, 124, 71, 0, 96, 0, 0, 0], [36, 0, 0, 144, 0, 0, 0, 0], [40, 115, 117, 0, 0, 0, 0, 0], - [41, 78, 112, 140, 0, 0, 0, 0], [44, 0, 0, 98, 54, 77, 0, 0], - [58, 74, 58, 29, 15, 104, 127, 122], [62, 0, 180, 3, 0, 0, 0, 0], - [63, 164, 53, 121, 67, 160, 52, 130], [65, 177, 68, 0, 0, 0, 0, 0], - [70, 17, 0, 66, 24, 69, 21, 1], [72, 57, 30, 73, 0, 97, 0, 94], - [87, 178, 0, 142, 46, 179, 143, 61], [89, 39, 113, 85, 0, 0, 0, 0], - [90, 151, 0, 20, 56, 84, 59, 45], [95, 0, 0, 38, 162, 172, 48, 100], - [99, 0, 0, 0, 159, 0, 0, 0], [114, 86, 0, 91, 26, 158, 10, 51], - [116, 35, 25, 124, 55, 81, 93, 83], [131, 31, 155, 146, 173, 138, 150, 148], - [134, 0, 0, 0, 0, 0, 0, 0], [137, 0, 0, 0, 0, 0, 0, 0], [154, 128, 19, 0, 60, 0, 0, 0], - [165, 0, 0, 0, 0, 0, 0, 0], [166, 0, 0, 0, 43, 128, 0, 107], - ]; - static BITSET_CANONICAL: [u64; 123] = [ - 0b0000000000000000000000000000000000000000000000000000000000000000, - 0b1111101111111111111111111111111111111111111111111111111111111111, - 0b1100000000000000000000000000000000000000000000000000000000010001, - 0b0111000000000000000000000000000000000000000000000000000000000000, - 0b1111100001111111111111111111111111111111111111111111111111111111, - 0b1111111111111100000000000000000000000000000000000000000000000000, - 0b1111111100000000000000000000000000000000000000000000000000000000, - 0b0111111111000000000000000000000000000000000000000000000000000011, - 0b1111100000000000000000000000000000000000000000000000000000000000, - 0b0000000001111111000000000000000000000000000000000000000000000000, - 0b0000000000000001111111111100000000000000000000000000000000000000, - 0b0000000000000000111111111111111111111111111111111111111111111111, - 0b1011111111111111111111111111111111111111111111100000000000000000, - 0b1011000000111100000000000000000000000000000000000000000000000000, - 0b1010000000000000000000000000000000000000000000000000000000000000, - 
0b1001000000000000000000000000000000000000000000000000000000000010, - 0b1000000000000000100000000000000000000000000000000000000000000000, - 0b0011111100000000000000000000000000000000000000000000000000000000, - 0b0000000001101101111111001111111111111111111111000000000000000000, - 0b0000000000000000000000100000000000000000000000000000000001100000, - 0b1111111111111111111111111111111111111111111110000000000000000000, - 0b1111111111111111111111111111111111111000000000000000000000000000, - 0b1111111111111111111111111111111100000000000000000000000000000010, - 0b1111111111111111111111111111111100000000000000000000000000000000, - 0b1111111111111111111100000000000000000000000000000000000000000000, - 0b1111111111111111000000001000000000000000000000000000000000000000, - 0b1111111111111111000000000000000000000000000000101000000000000000, - 0b1111111111111000000000111000000000000000000000000000000000000000, - 0b1111111100000000000000000000000000000000000000000000000000000010, - 0b1111110000000000000000000000110000000000000000000010000110111110, - 0b1100000110011101000000000000000000000000000000000000000000000000, - 0b1011111111110111100000000000000000000000000000000000000000000000, - 0b1011111101111111000000000000000000000000000000000000000000000000, - 0b1011010001111110000000000000000000000000000000000000000000000000, - 0b1010011111111000000000000000000000000000000000000000000000000000, - 0b1001111111111000000111111110010101111111010000000000000000000000, - 0b1000011100000000000000000000000000000000000000001111000001101110, - 0b1000010111111000000000000000000000000000000000000000000000000000, - 0b1000000000000011111111111111111100000000000000000000000000110000, - 0b1000000000000010111111111101111100000000000000000000000000000000, - 0b0111100111111000000000000000000000000000000000000000011111111110, - 0b0110011011111101111000000000000000000000000000000000000000000000, - 0b0110000000000000111000000000000011100000000000001110000000000011, - 
0b0101100000000000000000000000000000000000000000000000000000000000, - 0b0100000011010011100000000000000000000000000000000000000000000000, - 0b0100000000000000000000000000110000000000000000000010000000011110, - 0b0100000000000000000000000000000000000100000000000100000010000000, - 0b0011111110110000000000000000000000000000000000000000000000000000, - 0b0011001111001000000000000000000000000000000000000000000000000111, - 0b0011000000000000000000000000000000000000000000000000000000000000, - 0b0010011001111000000000000000000000000000000000000000000000000011, - 0b0010010000111111111110000000000000000000000000000000000000000000, - 0b0001111111111111111111111111111111111110111111111110000011011111, - 0b0001111111110010000000000000000000000000000000000000000000000000, - 0b0001100000000000000000000000000000000000000000000000000000000011, - 0b0001011111010000000000000000000000000000000000000000000000001111, - 0b0001010000000000000000000000000000000000000000000000000000000111, - 0b0001000000000001000000000000000000000000000000000001000000001000, - 0b0001000000000000000000000000000000000000000000000000000000000110, - 0b0001000000000000000000000000000000000000000000000000000000000010, - 0b0000111000000100000000011000011100000000000000000000000000000000, - 0b0000111000000000000000000000100000000000000000000000000000000000, - 0b0000100000111110001111000000000000000000000000000000000000100000, - 0b0000011111110010000000000000000000000000000000000000000000000000, - 0b0000010000110000111111111111111111111111111111111111111111111111, - 0b0000010000000000010000001000000000000000000000000000000000000000, - 0b0000001100010000001000011111110111111111111101110000000000000000, - 0b0000001010100000000000000000000000000011000000000000000000000000, - 0b0000000110010000101000010000000000000000000000000000000000000000, - 0b0000000100000000000001111111111111111111111111111111111111111111, - 0b0000000011001111111100000000000000000000000000000000000000000000, - 
0b0000000010111111001010000000000000000000000000000000000000000000, - 0b0000000001100110011111100000000000000000000000000000000000000000, - 0b0000000001011000001100000000000000100000000000000000000000000010, - 0b0000000000100011000000000000000000000000000000100011100110000110, - 0b0000000000100000000111111111111111111111111111111111111111111111, - 0b0000000000011111111011111000000000000000000000000000000000000111, - 0b0000000000011111000111111100000000000000000000000000000000000001, - 0b0000000000011110000000000000000111000011000000000000000000000000, - 0b0000000000011100000000000000000000000000000111000000000000000000, - 0b0000000000010000000000000000000000000000000000000000000010110110, - 0b0000000000001111111110000000000000000000000000000000000000000100, - 0b0000000000001100000000000000000000000000000011000000000000000000, - 0b0000000000000011101000110100000000000000000000000000000000000000, - 0b0000000000000010000000000000110000000000111111100010000111111110, - 0b0000000000000001111111111111111111111111111111110000000000000000, - 0b0000000000000001000000000000000011111111111111111111100000000001, - 0b0000000000000000111111111111111100000000000010001111111111111111, - 0b0000000000000000111111111111111011111000000000000000000000010000, - 0b0000000000000000011111001001000000000011000000001111100000000000, - 0b0000000000000000001111111111111111111111110000000000000000000000, - 0b0000000000000000001111011111111110111111110000000000000000000000, - 0b0000000000000000001111000000000000000000000000000000111111100111, - 0b0000000000000000001110110011110000000000000000000000000000000011, - 0b0000000000000000001000010010000000000000000000000000000000000000, - 0b0000000000000000000100000110000000000000000000000000100001000100, - 0b0000000000000000000011111011110011100000000000000000000000000000, - 0b0000000000000000000011100000000011111000000000000000000000000000, - 0b0000000000000000000001111111100010000000000000000000000000000000, - 
0b0000000000000000000001111101101111111001111111111111111101111111, - 0b0000000000000000000000000110000000000000000000001000000000000000, - 0b0000000000000000000000000001101100000000000000000000000000000000, - 0b0000000000000000000000000000111101100000000000000000000000000000, - 0b0000000000000000000000000000110000000000011000000011110111000001, - 0b0000000000000000000000000000110000000000011000000010000000011110, - 0b0000000000000000000000000000110000000000000000000011000001000000, - 0b0000000000000000000000000000110000000000000000000010000000011110, - 0b0000000000000000000000000000000100001100111100000000000000000000, - 0b0000000000000000000000000000000011111111111111111000000000000000, - 0b0000000000000000000000000000000010000010000000000000000000000000, - 0b0000000000000000000000000000000001000000000000000000000001011100, - 0b0000000000000000000000000000000000100000100011111111111001000000, - 0b0000000000000000000000000000000000100000000000000010000001100100, - 0b0000000000000000000000000000000000011111111111110000000000000000, - 0b0000000000000000000000000000000000010111111111110000000000111111, - 0b0000000000000000000000000000000000001110011111100000000010000000, - 0b0000000000000000000000000000000000001001100000000000000000000000, - 0b0000000000000000000000000000000000000011011111111111110000000000, - 0b0000000000000000000000000000000000000000101000110000000000000000, - 0b0000000000000000000000000000000000000000010111000000010000000000, - 0b0000000000000000000000000000000000000000000000001001111000000000, - 0b0000000000000000000000000000000000000000000000000011111101000000, - 0b0000000000000000000000000000000000000000000000000010000000000001, + static SHORT_OFFSET_RUNS: [u32; 32] = [ + 688, 44045149, 555751186, 559947709, 794831996, 866136069, 891330581, 916497656, 920692236, + 924908318, 1122041344, 1130430973, 1193347585, 1205931300, 1231097515, 1235294255, + 1445009723, 1453399088, 1512120051, 1575040048, 1579248368, 1583443791, 1596046493, + 1612829031, 
1621219840, 1642192896, 1667359024, 1688330988, 1692526800, 1696723963, + 1705902081, 1711210992, ]; - static BITSET_MAPPING: [(u8, u8); 60] = [ - (0, 64), (1, 70), (1, 71), (1, 190), (1, 72), (1, 73), (1, 188), (1, 76), (1, 82), (1, 83), - (1, 85), (1, 91), (1, 100), (1, 102), (1, 117), (1, 118), (1, 121), (1, 66), (1, 67), - (1, 69), (2, 160), (2, 153), (2, 147), (2, 142), (2, 139), (2, 134), (3, 6), (3, 12), - (3, 29), (3, 33), (3, 51), (4, 84), (4, 101), (4, 109), (4, 117), (5, 181), (5, 62), - (5, 63), (6, 12), (6, 46), (6, 7), (7, 176), (7, 134), (7, 57), (8, 53), (8, 59), (9, 19), - (9, 32), (10, 32), (10, 33), (11, 142), (11, 64), (12, 184), (13, 184), (14, 33), (15, 170), - (16, 1), (17, 33), (18, 179), (19, 23), + static OFFSETS: [u8; 821] = [ + 39, 1, 6, 1, 11, 1, 35, 1, 1, 1, 71, 1, 4, 1, 1, 1, 4, 1, 2, 2, 0, 192, 4, 2, 4, 1, 9, 2, + 1, 1, 251, 7, 207, 1, 5, 1, 49, 45, 1, 1, 1, 2, 1, 2, 1, 1, 44, 1, 11, 6, 10, 11, 1, 1, 35, + 1, 10, 21, 16, 1, 101, 8, 1, 10, 1, 4, 33, 1, 1, 1, 30, 27, 91, 11, 58, 11, 4, 1, 2, 1, 24, + 24, 43, 3, 119, 48, 55, 1, 1, 1, 4, 8, 4, 1, 3, 7, 10, 2, 13, 1, 15, 1, 58, 1, 4, 4, 8, 1, + 20, 2, 26, 1, 2, 2, 57, 1, 4, 2, 4, 2, 2, 3, 3, 1, 30, 2, 3, 1, 11, 2, 57, 1, 4, 5, 1, 2, 4, + 1, 20, 2, 22, 6, 1, 1, 58, 1, 2, 1, 1, 4, 8, 1, 7, 2, 11, 2, 30, 1, 61, 1, 12, 1, 50, 1, 3, + 1, 57, 3, 5, 3, 1, 4, 7, 2, 11, 2, 29, 1, 58, 1, 2, 1, 6, 1, 5, 2, 20, 2, 28, 2, 57, 2, 4, + 4, 8, 1, 20, 2, 29, 1, 72, 1, 7, 3, 1, 1, 90, 1, 2, 7, 11, 9, 98, 1, 2, 9, 9, 1, 1, 6, 74, + 2, 27, 1, 1, 1, 1, 1, 55, 14, 1, 5, 1, 2, 5, 11, 1, 36, 9, 1, 102, 4, 1, 6, 1, 2, 2, 2, 25, + 2, 4, 3, 16, 4, 13, 1, 2, 2, 6, 1, 15, 1, 94, 1, 0, 3, 0, 3, 29, 3, 29, 2, 30, 2, 64, 2, 1, + 7, 8, 1, 2, 11, 3, 1, 5, 1, 45, 4, 52, 1, 65, 2, 34, 1, 118, 3, 4, 2, 9, 1, 6, 3, 219, 2, 2, + 1, 58, 1, 1, 7, 1, 1, 1, 1, 2, 8, 6, 10, 2, 1, 39, 1, 8, 17, 63, 4, 48, 1, 1, 5, 1, 1, 5, 1, + 40, 9, 12, 2, 32, 4, 2, 2, 1, 3, 56, 1, 1, 2, 3, 1, 1, 3, 58, 8, 2, 2, 64, 6, 82, 3, 1, 13, + 1, 7, 4, 1, 
6, 1, 3, 2, 50, 63, 13, 1, 34, 95, 1, 5, 0, 1, 1, 3, 11, 3, 13, 3, 13, 3, 13, 2, + 12, 5, 8, 2, 10, 1, 2, 1, 2, 5, 49, 5, 1, 10, 1, 1, 13, 1, 16, 13, 51, 33, 0, 2, 113, 3, + 125, 1, 15, 1, 96, 32, 47, 1, 0, 1, 36, 4, 3, 5, 5, 1, 93, 6, 93, 3, 0, 1, 0, 6, 0, 1, 98, + 4, 1, 10, 1, 1, 28, 4, 80, 2, 14, 34, 78, 1, 23, 3, 109, 2, 8, 1, 3, 1, 4, 1, 25, 2, 5, 1, + 151, 2, 26, 18, 13, 1, 38, 8, 25, 11, 46, 3, 48, 1, 2, 4, 2, 2, 17, 1, 21, 2, 66, 6, 2, 2, + 2, 2, 12, 1, 8, 1, 35, 1, 11, 1, 51, 1, 1, 3, 2, 2, 5, 2, 1, 1, 27, 1, 14, 2, 5, 2, 1, 1, + 100, 5, 9, 3, 121, 1, 2, 1, 4, 1, 0, 1, 147, 16, 0, 16, 3, 1, 12, 16, 34, 1, 2, 1, 169, 1, + 7, 1, 6, 1, 11, 1, 35, 1, 1, 1, 47, 1, 45, 2, 67, 1, 21, 3, 0, 1, 226, 1, 149, 5, 0, 3, 1, + 2, 5, 4, 40, 3, 4, 1, 165, 2, 0, 4, 0, 2, 153, 11, 176, 1, 54, 15, 56, 3, 49, 4, 2, 2, 2, 1, + 15, 1, 50, 3, 36, 5, 1, 8, 62, 1, 12, 2, 52, 9, 10, 4, 2, 1, 95, 3, 2, 1, 1, 2, 6, 1, 160, + 1, 3, 8, 21, 2, 57, 2, 3, 1, 37, 7, 3, 5, 195, 8, 2, 3, 1, 1, 23, 1, 84, 6, 1, 1, 4, 2, 1, + 2, 238, 4, 6, 2, 1, 2, 27, 2, 85, 8, 2, 1, 1, 2, 106, 1, 1, 1, 2, 6, 1, 1, 101, 3, 2, 4, 1, + 5, 0, 9, 1, 2, 0, 2, 1, 1, 4, 1, 144, 4, 2, 2, 4, 1, 32, 10, 40, 6, 2, 4, 8, 1, 9, 6, 2, 3, + 46, 13, 1, 2, 0, 7, 1, 6, 1, 1, 82, 22, 2, 7, 1, 2, 1, 2, 122, 6, 3, 1, 1, 2, 1, 7, 1, 1, + 72, 2, 3, 1, 1, 1, 0, 2, 0, 9, 0, 5, 59, 7, 9, 4, 0, 1, 63, 17, 64, 2, 1, 2, 0, 2, 1, 4, 0, + 3, 9, 16, 2, 7, 30, 4, 148, 3, 0, 55, 4, 50, 8, 1, 14, 1, 22, 5, 1, 15, 0, 7, 1, 17, 2, 7, + 1, 2, 1, 5, 0, 14, 0, 4, 0, 7, 109, 8, 0, 5, 0, 1, 30, 96, 128, 240, 0, ]; - pub fn lookup(c: char) -> bool { - super::range_search( + super::skip_search( c as u32, - &BITSET_CHUNKS_MAP, - BITSET_LAST_CHUNK_MAP, - &BITSET_INDEX_CHUNKS, - &BITSET_CANONICAL, - &BITSET_MAPPING, + &SHORT_OFFSET_RUNS, + &OFFSETS, ) } } #[rustfmt::skip] pub mod cased { - const BITSET_LAST_CHUNK_MAP: u16 = 124; - static BITSET_CHUNKS_MAP: [u8; 124] = [ - 13, 15, 0, 0, 8, 0, 0, 11, 14, 10, 0, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 3, 2, 0, 16, 0, 12, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0, - 0, 0, 0, 7, 6, - ]; - static BITSET_INDEX_CHUNKS: [[u8; 16]; 19] = [ - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 15, 39, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 59, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 33, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 39, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 47, 44, 0, 13, 39, 8, 26], - [0, 0, 0, 0, 16, 60, 42, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 39, 40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 12, 37, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 61], - [0, 0, 12, 45, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 17, 46, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 36, 0, 39, 39, 39, 0, 39, 39, 39, 39, 4, 22, 21, 23], - [0, 0, 48, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 3, 27, 2, 39, 39, 49, 6, 39, 39, 28, 31, 0, 35, 14, 50], - [0, 34, 32, 0, 19, 11, 62, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [39, 39, 7, 39, 51, 10, 41, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [39, 43, 16, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [39, 54, 38, 1, 20, 9, 39, 39, 39, 39, 5, 18, 56, 57, 58, 55], - [52, 53, 39, 29, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - ]; - static BITSET_CANONICAL: [u64; 39] = [ - 0b0000000000000000000000000000000000000000000000000000000000000000, - 0b1111111111111111111111111111111111111111111111111111111111101111, - 0b1111111101111111111111111111111111111111011111111111111111111111, - 0b0000011111111111111111111111111000000111111111111111111111111110, - 0b1111111111111111111111111111111100111111001111111111111111111111, - 0b1111111111111111111111110011111111111111111111111111111111111111, - 0b1111111111111111111111111111111111111111111111111111111111110000, - 
0b1111111111111111111111111111111111111111111111111111110000000011, - 0b1111111111111111111111111111111111111111111111110111100011111111, - 0b1111111111111111111111111111111111111111111111011111110001011111, - 0b1111111111111111111111111111111100000000011111111111111111111111, - 0b1111111111111111111111111111111100000000000000000100001111100000, - 0b1111111111111111111111111111111100000000000000000000000000000000, - 0b1111111111111111111111111111110000000000000000000000000000000000, - 0b1111111111111111111111111111101111111111111111111101011101000000, - 0b1111111111111111000000011111111111110111111111111111111111111111, - 0b1111111111111111000000000000000000000000000000000000000000000000, - 0b1111111111000000000000000000000000000000000000000000000000000000, - 0b1111011111111111111111111111111111110111111111111111111111111101, - 0b1111001000011111101111010101000000111110001011111111110010000100, - 0b0111101111111111111111111111111111011111110111111110011110111111, - 0b0101111111011111111111111111111111111111111111111111111111111111, - 0b0011111111111111111111111111111110101010111111110011111100111111, - 0b0001111111011100000111111111111100001111110011110001111111011100, - 0b0000111111111111111111111111111111111111000011111111111111111111, - 0b0000011111111111111111111111111000000000000000000000000000000000, - 0b0000011101100000000000000000000000000000000000000000011111111100, - 0b0000010000100000000001000000000000000000000000000000000000000000, - 0b0000000111111111111111111111111111111111111011111111111111111111, - 0b0000000000001100011110000001111111111111111111111111111111111111, - 0b0000000000000000001000001011111111111111111111111111111111111111, - 0b0000000000000000000000000001111100000000000000000000000000000011, - 0b0000000000000000000000000000000000011111111111110000000000000000, - 0b0000000000000000000000000000000000000000111110000000000001111111, - 0b1000000000000010000000000000000000000000000000000000000000000000, - 
0b1011110011001111000000000000000000000000000000000000000000100000, - 0b1110011111111111111111111111111111111111111111110000000111111111, - 0b1110011111111111111111111111111111111111111111110010000010111111, - 0b1110101111111111110111100110010011011111111111111111111111111111, + static SHORT_OFFSET_RUNS: [u32; 19] = [ + 4256, 115348384, 136322176, 144711446, 163587254, 320875520, 325101120, 358656816, + 392231680, 404815649, 413205504, 421596288, 434182304, 442592832, 446813184, 451008166, + 528607488, 576844080, 582152586, ]; - static BITSET_MAPPING: [(u8, u8); 24] = [ - (0, 64), (1, 188), (1, 183), (1, 182), (1, 176), (1, 162), (1, 160), (1, 150), (1, 146), - (1, 141), (1, 55), (1, 50), (1, 44), (1, 43), (1, 27), (1, 17), (2, 180), (2, 30), (2, 24), - (2, 18), (3, 160), (3, 15), (4, 32), (5, 93), + static OFFSETS: [u8; 283] = [ + 65, 26, 6, 26, 47, 1, 10, 1, 4, 1, 5, 23, 1, 31, 1, 195, 1, 4, 4, 208, 1, 36, 7, 2, 30, 5, + 96, 1, 42, 4, 2, 2, 2, 4, 1, 1, 6, 1, 1, 3, 1, 1, 1, 20, 1, 83, 1, 139, 8, 166, 1, 38, 9, + 41, 0, 38, 1, 1, 5, 1, 2, 43, 2, 3, 0, 86, 2, 6, 0, 9, 7, 43, 2, 3, 64, 192, 64, 0, 2, 6, 2, + 38, 2, 6, 2, 8, 1, 1, 1, 1, 1, 1, 1, 31, 2, 53, 1, 7, 1, 1, 3, 3, 1, 7, 3, 4, 2, 6, 4, 13, + 5, 3, 1, 7, 116, 1, 13, 1, 16, 13, 101, 1, 4, 1, 2, 10, 1, 1, 3, 5, 6, 1, 1, 1, 1, 1, 1, 4, + 1, 6, 4, 1, 2, 4, 5, 5, 4, 1, 17, 32, 3, 2, 0, 52, 0, 47, 1, 47, 1, 133, 6, 4, 3, 2, 12, 38, + 1, 1, 5, 1, 0, 46, 18, 30, 132, 102, 3, 4, 1, 48, 2, 9, 42, 2, 1, 3, 0, 43, 1, 13, 7, 80, 0, + 7, 12, 5, 0, 26, 6, 26, 0, 80, 96, 36, 4, 36, 0, 51, 13, 51, 0, 64, 0, 64, 0, 85, 1, 71, 1, + 2, 2, 1, 2, 2, 2, 4, 1, 12, 1, 1, 1, 7, 1, 65, 1, 4, 2, 8, 1, 7, 1, 28, 1, 4, 1, 5, 1, 1, 3, + 7, 1, 0, 2, 25, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 8, 0, 68, + 0, 26, 6, 26, 6, 26, 0, ]; - pub fn lookup(c: char) -> bool { - super::range_search( + super::skip_search( c as u32, - &BITSET_CHUNKS_MAP, - BITSET_LAST_CHUNK_MAP, - &BITSET_INDEX_CHUNKS, - &BITSET_CANONICAL, - 
&BITSET_MAPPING, + &SHORT_OFFSET_RUNS, + &OFFSETS, ) } } #[rustfmt::skip] pub mod cc { - const BITSET_LAST_CHUNK_MAP: u16 = 2; - static BITSET_CHUNKS_MAP: [u8; 3] = [ - 0, 1, 0, - ]; - static BITSET_INDEX_CHUNKS: [[u8; 1]; 3] = [ - [0], [1], [2], + static SHORT_OFFSET_RUNS: [u32; 1] = [ + 1114272, ]; - static BITSET_CANONICAL: [u64; 2] = [ - 0b0000000000000000000000000000000011111111111111111111111111111111, - 0b1000000000000000000000000000000000000000000000000000000000000000, + static OFFSETS: [u8; 5] = [ + 0, 32, 95, 33, 0, ]; - static BITSET_MAPPING: [(u8, u8); 1] = [ - (0, 160), - ]; - pub fn lookup(c: char) -> bool { - super::range_search( + super::skip_search( c as u32, - &BITSET_CHUNKS_MAP, - BITSET_LAST_CHUNK_MAP, - &BITSET_INDEX_CHUNKS, - &BITSET_CANONICAL, - &BITSET_MAPPING, + &SHORT_OFFSET_RUNS, + &OFFSETS, ) } } #[rustfmt::skip] pub mod grapheme_extend { - const BITSET_LAST_CHUNK_MAP: u16 = 1792; - static BITSET_CHUNKS_MAP: [u8; 246] = [ - 0, 6, 17, 44, 37, 31, 20, 32, 29, 4, 0, 5, 43, 40, 33, 0, 41, 0, 0, 0, 0, 0, 9, 0, 36, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 18, 38, 34, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 7, 0, 35, 1, 10, 0, 0, 0, 27, 8, 16, 24, 30, 42, 26, 22, 28, 25, 11, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 14, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 19, 0, 0, - 0, 21, 0, 0, 39, 12, 0, 0, 13, 23, - ]; - static BITSET_INDEX_CHUNKS: [[u8; 8]; 45] = [ - [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 135], [0, 0, 0, 0, 0, 18, 75, 0], - [0, 0, 0, 0, 0, 106, 129, 109], [0, 0, 0, 0, 0, 124, 0, 0], [0, 0, 0, 0, 3, 65, 40, 90], - [0, 0, 0, 0, 
99, 139, 0, 0], [0, 0, 0, 0, 107, 0, 0, 0], [0, 0, 0, 0, 130, 0, 0, 0], - [0, 0, 0, 67, 0, 136, 0, 14], [0, 0, 0, 108, 0, 12, 0, 0], [0, 0, 0, 120, 0, 0, 0, 0], - [0, 0, 0, 131, 0, 0, 0, 0], [0, 0, 0, 134, 0, 133, 0, 0], [0, 0, 0, 142, 6, 0, 0, 0], - [0, 0, 115, 0, 0, 0, 0, 0], [0, 0, 118, 0, 0, 126, 0, 0], [0, 0, 132, 0, 0, 0, 11, 98], - [0, 39, 116, 119, 0, 0, 0, 0], [0, 121, 0, 0, 0, 0, 0, 0], [2, 85, 27, 86, 34, 84, 101, 88], - [5, 60, 71, 0, 0, 0, 0, 0], [8, 0, 0, 0, 33, 102, 0, 87], [14, 99, 0, 0, 99, 99, 99, 139], - [17, 1, 51, 0, 61, 112, 138, 97], [21, 0, 56, 0, 22, 141, 95, 0], - [24, 100, 55, 0, 79, 0, 0, 0], [28, 0, 0, 117, 0, 0, 0, 0], [31, 92, 94, 0, 0, 0, 0, 0], - [32, 63, 91, 0, 0, 0, 0, 0], [36, 0, 0, 80, 34, 62, 0, 0], [48, 59, 48, 10, 19, 83, 38, 96], - [50, 105, 44, 140, 53, 30, 43, 103], [54, 0, 0, 52, 0, 0, 0, 7], - [57, 47, 20, 58, 0, 0, 0, 77], [70, 0, 0, 0, 0, 0, 116, 0], [72, 0, 114, 0, 0, 0, 0, 0], - [73, 123, 0, 15, 46, 82, 35, 37], [78, 0, 0, 29, 111, 127, 41, 110], - [81, 0, 0, 0, 6, 0, 0, 0], [93, 26, 9, 100, 45, 64, 76, 66], [104, 0, 0, 68, 0, 0, 0, 0], - [113, 89, 25, 137, 0, 0, 23, 143], [122, 0, 13, 0, 49, 0, 0, 0], - [128, 69, 0, 74, 16, 125, 4, 42], - ]; - static BITSET_CANONICAL: [u64; 99] = [ - 0b0000000000000000000000000000000000000000000000000000000000000000, - 0b1000000000000000000000000000000000000000000000000000000001111111, - 0b1100000000000000000000000000000000000000000000000000000000010001, - 0b0000000000011100000000000000000000000000000111000000000000000000, - 0b0000000000000001111111111100000000000000000000000000000000000000, - 0b1111100001111111111111111111111111111111111111111111111111111111, - 0b0000000001111111000000000000000000000000000000000000000000000000, - 0b1111101111111111111111111111111111111111111111111111111111111111, - 0b0000011011111111100000000000000000000000000000000000000000000000, - 0b1111111111111111000000000000000000000000000000000000000000000000, - 
0b1111110000000000000000000000110000000000000000000010000110111110, - 0b1011111111111111111111111111111111111111111111100000000000000000, - 0b0000011111000000000000000000000000000000000000000000000000000000, - 0b0000000000000000000000100000000000000000000000000000000001100000, - 0b1111111111111111111111111111111100000000000000000000000000000000, - 0b1111111111111111111111111111101111111111111110000000000000000000, - 0b1111111111111111000000000000000000000000000000100000000000000000, - 0b1111111100000000000000000000000000000000000000000000000000000010, - 0b1111100000000111110000111010000000000000000000000000000000000000, - 0b1101000000000000000000000000000000000000000000000000000000000010, - 0b1100000110011101000000000000000000000000000000000000000000000000, - 0b1011111101111111000000000000000000000000000000000000000000000000, - 0b1011010001111110000000000000000000000000000000000000000000000000, - 0b1011000000111100100000000000000000000000000000000000000000000000, - 0b1010011111111000000000000000000000000000000000000000000000000000, - 0b1010010111111001000000000000000000000000000000000000000000000000, - 0b1001111111111000000111111110010101111111010000000000000000000000, - 0b1001000000000000000000000000000000000000000000000000000000000010, - 0b1000011100000000000000000000000000000000000000001111000001101110, - 0b1000000000000011111111111111111100000000000000000000000000110000, - 0b0111111111111110000000000000000000000000000000000000000000000000, - 0b0111100111111000000000000000000000000000000000000000011111111110, - 0b0110011011111101111000000000000000000000000000000000000000000000, - 0b0101100000000001000000000000000000000000000000000000000000000000, - 0b0101100000000000000000000000000000000000000000000000000000000011, - 0b0101000000000000000000000000000000000000000000000000000000000010, - 0b0100000011010011100000000000000000000000000000000000000000000000, - 0b0100000000000000000000000000110000000000100000000010000000011110, - 
0b0100000000000000000000000000000000000000000000000000000000000100, - 0b0011111111110111100000000000000000000000000000000000000000000000, - 0b0011111110110000000000000000000000000000000000000000000000000000, - 0b0011001111001000000000000000000000000000000000000000000000000111, - 0b0010000000001111111110000000000000000000000000000000000000000000, - 0b0001111111111111111111111111111111111110111111111110000011011111, - 0b0001111111110010000000000000000000000000000000000000000000000000, - 0b0001011111110000000000000000000000000000000000000000000000001111, - 0b0001010000000000000000000000000000000000000000000000000000000111, - 0b0001000000000000000000000000000000000000000000000001000000001000, - 0b0001000000000000000000000000000000000000000000000000000000000110, - 0b0000111000000100000000011000011100000000000000000000000000000000, - 0b0000011111110010000000000000000000000000000000000000000000000000, - 0b0000011001111000000000000000000000000000000000000000000000000011, - 0b0000001100010000001000011111110111111111111101110000000000000000, - 0b0000001010100000000000000000000000000011000000000000000000000000, - 0b0000000011001111111100000000000000000000000000000000000000000000, - 0b0000000010111111001010000000000000000000000000000000000000000000, - 0b0000000001101101111111001111111111111111111111000000000000000000, - 0b0000000001100110011111100000000000000000000000000000000000000000, - 0b0000000001000000001100000000000000000000000000000000000000000010, - 0b0000000000100011000000000000000000000000000000100011100110000110, - 0b0000000000100000000111111111111111111111111111111111111111111111, - 0b0000000000011111111011111000000000000000000000000000000000000111, - 0b0000000000011111000111111100000000000000100000000000000000000001, - 0b0000000000011110000000000000000111000011000000000000000000000000, - 0b0000000000001111111110000000000000000000000000000000000000000100, - 0b0000000000001100000000000000000000000000000011000000000000000000, - 
0b0000000000000011101000110100000000000000000000000000000000000000, - 0b0000000000000011100000000000000000000000000000000000000000000000, - 0b0000000000000001111111111111111111111111111111110000000000000000, - 0b0000000000000001000000000000000011111111111111111111100000000000, - 0b0000000000000000111111111111111100000000000000001111111111111111, - 0b0000000000000000111111111111111011111000000000000000000000010000, - 0b0000000000000000111111000000000000000000000000000000000000000000, - 0b0000000000000000001111101110111111111011110000000000000000000000, - 0b0000000000000000001111011001111110011111110000000000000000000000, - 0b0000000000000000001111000000000000000000000000000000111111100111, - 0b0000000000000000001110110011110000000000000000000000000000000011, - 0b0000000000000000001000010010000000000000000000000000000000000000, - 0b0000000000000000000100000110000000000000000000000000100001000100, - 0b0000000000000000000011111011110011100000000000000000000000000000, - 0b0000000000000000000001111111100010000000000000000000000000000000, - 0b0000000000000000000001111101101111111001111111111111111101111111, - 0b0000000000000000000000000000110000000000111111100010000111111110, - 0b0000000000000000000000000000110000000000111000000010000000011110, - 0b0000000000000000000000000000110000000000100000000010000000011110, - 0b0000000000000000000000000000110000000000011000000011110111000001, - 0b0000000000000000000000000000110000000000011000000011000001000100, - 0b0000000000000000000000000000000100001100111100000000000000000000, - 0b0000000000000000000000000000000010000000010111001000010000000000, - 0b0000000000000000000000000000000001000000000000000000000001011100, - 0b0000000000000000000000000000000000100000000011111111111001000000, - 0b0000000000000000000000000000000000100000000000000010000001100100, - 0b0000000000000000000000000000000000001110011111100000000010000000, - 0b0000000000000000000000000000000000001001100000000000000000000000, - 
0b0000000000000000000000000000000000000011011111111111110000000000, - 0b0000000000000000000000000000000000000000101000110000000000000000, - 0b0000000000000000000000000000000000000000100000000010000000000001, - 0b0000000000000000000000000000000000000000000000001001111000000000, - 0b0000000000000000000000000000000000000000000000000000000010110110, + static SHORT_OFFSET_RUNS: [u32; 31] = [ + 768, 2098307, 6292881, 10490717, 513808146, 518004748, 723528943, 731918378, 744531567, + 752920578, 769719070, 899743232, 903937950, 912327165, 916523521, 929107236, 954273451, + 958470191, 1180769328, 1252073203, 1315007216, 1319202639, 1327611037, 1340199269, + 1344395776, 1373757440, 1398923568, 1419895532, 1424091344, 1429078048, 1438581232, ]; - static BITSET_MAPPING: [(u8, u8); 45] = [ - (0, 64), (1, 191), (1, 190), (1, 188), (1, 185), (1, 179), (1, 8), (1, 176), (1, 161), - (1, 159), (1, 155), (1, 154), (1, 39), (1, 140), (1, 57), (2, 165), (2, 161), (2, 160), - (2, 153), (2, 147), (2, 142), (2, 139), (3, 176), (3, 167), (3, 153), (3, 149), (4, 26), - (4, 32), (4, 33), (4, 42), (5, 88), (5, 109), (5, 117), (6, 19), (6, 20), (6, 32), (7, 67), - (7, 69), (8, 183), (8, 7), (9, 64), (10, 178), (11, 184), (12, 58), (13, 23), + static OFFSETS: [u8; 689] = [ + 0, 112, 0, 7, 0, 45, 1, 1, 1, 2, 1, 2, 1, 1, 72, 11, 48, 21, 16, 1, 101, 7, 2, 6, 2, 2, 1, + 4, 35, 1, 30, 27, 91, 11, 58, 9, 9, 1, 24, 4, 1, 9, 1, 3, 1, 5, 43, 3, 119, 15, 1, 32, 55, + 1, 1, 1, 4, 8, 4, 1, 3, 7, 10, 2, 29, 1, 58, 1, 1, 1, 2, 4, 8, 1, 9, 1, 10, 2, 26, 1, 2, 2, + 57, 1, 4, 2, 4, 2, 2, 3, 3, 1, 30, 2, 3, 1, 11, 2, 57, 1, 4, 5, 1, 2, 4, 1, 20, 2, 22, 6, 1, + 1, 58, 1, 1, 2, 1, 4, 8, 1, 7, 3, 10, 2, 30, 1, 59, 1, 1, 1, 12, 1, 9, 1, 40, 1, 3, 1, 57, + 3, 5, 3, 1, 4, 7, 2, 11, 2, 29, 1, 58, 1, 2, 1, 2, 1, 3, 1, 5, 2, 7, 2, 11, 2, 28, 2, 57, 2, + 1, 1, 2, 4, 8, 1, 9, 1, 10, 2, 29, 1, 72, 1, 4, 1, 2, 3, 1, 1, 8, 1, 81, 1, 2, 7, 12, 8, 98, + 1, 2, 9, 11, 6, 74, 2, 27, 1, 1, 1, 1, 1, 55, 14, 1, 5, 1, 2, 5, 11, 1, 36, 
9, 1, 102, 4, 1, + 6, 1, 2, 2, 2, 25, 2, 4, 3, 16, 4, 13, 1, 2, 2, 6, 1, 15, 1, 0, 3, 0, 3, 29, 3, 29, 2, 30, + 2, 64, 2, 1, 7, 8, 1, 2, 11, 9, 1, 45, 3, 119, 2, 34, 1, 118, 3, 4, 2, 9, 1, 6, 3, 219, 2, + 2, 1, 58, 1, 1, 7, 1, 1, 1, 1, 2, 8, 6, 10, 2, 1, 48, 17, 63, 4, 48, 7, 1, 1, 5, 1, 40, 9, + 12, 2, 32, 4, 2, 2, 1, 3, 56, 1, 1, 2, 3, 1, 1, 3, 58, 8, 2, 2, 152, 3, 1, 13, 1, 7, 4, 1, + 6, 1, 3, 2, 198, 58, 1, 5, 0, 1, 195, 33, 0, 3, 141, 1, 96, 32, 0, 6, 105, 2, 0, 4, 1, 10, + 32, 2, 80, 2, 0, 1, 3, 1, 4, 1, 25, 2, 5, 1, 151, 2, 26, 18, 13, 1, 38, 8, 25, 11, 46, 3, + 48, 1, 2, 4, 2, 2, 39, 1, 67, 6, 2, 2, 2, 2, 12, 1, 8, 1, 47, 1, 51, 1, 1, 3, 2, 2, 5, 2, 1, + 1, 42, 2, 8, 1, 238, 1, 2, 1, 4, 1, 0, 1, 0, 16, 16, 16, 0, 2, 0, 1, 226, 1, 149, 5, 0, 3, + 1, 2, 5, 4, 40, 3, 4, 1, 165, 2, 0, 4, 0, 2, 153, 11, 176, 1, 54, 15, 56, 3, 49, 4, 2, 2, + 69, 3, 36, 5, 1, 8, 62, 1, 12, 2, 52, 9, 10, 4, 2, 1, 95, 3, 2, 1, 1, 2, 6, 1, 160, 1, 3, 8, + 21, 2, 57, 2, 1, 1, 1, 1, 22, 1, 14, 7, 3, 5, 195, 8, 2, 3, 1, 1, 23, 1, 81, 1, 2, 6, 1, 1, + 2, 1, 1, 2, 1, 2, 235, 1, 2, 4, 6, 2, 1, 2, 27, 2, 85, 8, 2, 1, 1, 2, 106, 1, 1, 1, 2, 6, 1, + 1, 101, 3, 2, 4, 1, 5, 0, 9, 1, 2, 245, 1, 10, 2, 1, 1, 4, 1, 144, 4, 2, 2, 4, 1, 32, 10, + 40, 6, 2, 4, 8, 1, 9, 6, 2, 3, 46, 13, 1, 2, 0, 7, 1, 6, 1, 1, 82, 22, 2, 7, 1, 2, 1, 2, + 122, 6, 3, 1, 1, 2, 1, 7, 1, 1, 72, 2, 3, 1, 1, 1, 0, 2, 0, 5, 59, 7, 0, 1, 63, 4, 81, 1, 0, + 2, 0, 1, 1, 3, 4, 5, 8, 8, 2, 7, 30, 4, 148, 3, 0, 55, 4, 50, 8, 1, 14, 1, 22, 5, 1, 15, 0, + 7, 1, 17, 2, 7, 1, 2, 1, 5, 0, 7, 0, 4, 0, 7, 109, 7, 0, 96, 128, 240, 0, ]; - pub fn lookup(c: char) -> bool { - super::range_search( + super::skip_search( c as u32, - &BITSET_CHUNKS_MAP, - BITSET_LAST_CHUNK_MAP, - &BITSET_INDEX_CHUNKS, - &BITSET_CANONICAL, - &BITSET_MAPPING, + &SHORT_OFFSET_RUNS, + &OFFSETS, ) } } @@ -858,7 +405,7 @@ pub mod lowercase { ]; pub fn lookup(c: char) -> bool { - super::range_search( + super::bitset_search( c as u32, &BITSET_CHUNKS_MAP, 
BITSET_LAST_CHUNK_MAP, @@ -871,97 +418,31 @@ pub mod lowercase { #[rustfmt::skip] pub mod n { - const BITSET_LAST_CHUNK_MAP: u16 = 253; - static BITSET_CHUNKS_MAP: [u8; 250] = [ - 45, 0, 0, 37, 7, 38, 26, 35, 31, 5, 0, 12, 42, 21, 39, 0, 28, 0, 22, 4, 0, 0, 13, 0, 40, - 44, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 43, 46, - 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 8, 14, 20, 0, 24, 27, 11, 25, 29, 15, 34, 33, 17, 0, - 30, 2, 0, 0, 41, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 36, 0, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 16, 0, 1, 0, 0, 0, 0, 6, 15, 0, 0, 18, 0, 23, 0, 9, 3, + static SHORT_OFFSET_RUNS: [u32; 38] = [ + 1632, 18876774, 31461440, 102765417, 111154926, 115349830, 132128880, 165684320, 186656630, + 195046653, 199241735, 203436434, 216049184, 241215536, 249605104, 274792208, 278987015, + 283181793, 295766104, 320933114, 383848032, 392238160, 434181712, 442570976, 455154768, + 463544256, 476128256, 480340576, 484535936, 497144544, 501340110, 509731136, 513925872, + 518121671, 522316913, 530706688, 551681008, 556989434, ]; - static BITSET_INDEX_CHUNKS: [[u8; 8]; 47] = [ - [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 13], [0, 0, 0, 0, 0, 0, 0, 49], - [0, 0, 0, 0, 0, 0, 0, 59], [0, 0, 0, 0, 0, 2, 48, 0], [0, 0, 0, 0, 0, 8, 0, 0], - [0, 0, 0, 0, 0, 45, 0, 0], [0, 0, 0, 0, 0, 58, 0, 30], [0, 0, 0, 0, 10, 1, 60, 0], - [0, 0, 0, 0, 47, 0, 0, 0], [0, 0, 0, 0, 55, 0, 0, 0], [0, 0, 0, 4, 59, 0, 0, 0], - [0, 0, 0, 11, 0, 0, 0, 9], [0, 0, 0, 24, 0, 0, 0, 0], [0, 0, 0, 25, 6, 41, 0, 64], - [0, 0, 0, 59, 0, 0, 0, 0], [0, 0, 0, 69, 0, 68, 0, 0], [0, 0, 0, 71, 0, 55, 0, 0], 
- [0, 0, 0, 73, 0, 55, 0, 0], [0, 0, 50, 0, 0, 0, 0, 0], [0, 0, 57, 0, 0, 0, 0, 0], - [0, 0, 70, 0, 0, 55, 59, 0], [0, 15, 51, 16, 0, 0, 0, 0], [0, 17, 34, 0, 23, 0, 0, 0], - [0, 18, 12, 5, 62, 0, 61, 3], [0, 19, 0, 0, 56, 66, 0, 46], [0, 20, 0, 58, 0, 31, 0, 58], - [0, 22, 72, 65, 0, 43, 53, 0], [0, 29, 45, 0, 0, 14, 42, 0], [0, 36, 0, 59, 2, 0, 0, 33], - [0, 37, 0, 0, 0, 55, 57, 0], [0, 45, 55, 0, 0, 0, 0, 0], [0, 55, 0, 0, 0, 0, 0, 59], - [0, 55, 0, 45, 26, 0, 0, 0], [0, 55, 0, 55, 0, 0, 0, 0], [0, 55, 0, 55, 69, 0, 0, 0], - [0, 57, 0, 0, 0, 38, 0, 0], [0, 57, 0, 59, 0, 0, 0, 45], [0, 58, 0, 58, 0, 32, 0, 35], - [0, 70, 0, 0, 0, 0, 0, 0], [27, 0, 0, 0, 0, 0, 67, 0], [44, 52, 0, 0, 0, 0, 0, 0], - [55, 0, 0, 0, 0, 54, 0, 40], [57, 0, 0, 58, 0, 0, 0, 0], [57, 39, 7, 0, 0, 0, 0, 0], - [59, 0, 21, 0, 0, 0, 0, 0], [63, 0, 0, 55, 45, 0, 0, 28], + static OFFSETS: [u8; 267] = [ + 48, 10, 120, 2, 5, 1, 2, 3, 0, 10, 134, 10, 198, 10, 0, 10, 118, 10, 4, 6, 108, 10, 118, + 10, 118, 10, 2, 6, 110, 13, 115, 10, 8, 7, 103, 10, 104, 7, 7, 19, 109, 10, 96, 10, 118, 10, + 70, 20, 0, 10, 70, 10, 0, 20, 0, 3, 239, 10, 6, 10, 22, 10, 0, 10, 128, 11, 165, 10, 6, 10, + 182, 10, 86, 10, 134, 10, 6, 10, 0, 1, 3, 6, 6, 10, 198, 51, 2, 5, 0, 60, 78, 22, 0, 30, 0, + 1, 0, 1, 25, 9, 14, 3, 0, 4, 138, 10, 30, 8, 1, 15, 32, 10, 39, 15, 0, 10, 188, 10, 0, 6, + 154, 10, 38, 10, 198, 10, 22, 10, 86, 10, 0, 10, 0, 10, 0, 45, 12, 57, 17, 2, 0, 27, 36, 4, + 29, 1, 8, 1, 134, 5, 202, 10, 0, 8, 25, 7, 39, 9, 75, 5, 22, 6, 160, 2, 2, 16, 2, 46, 64, 9, + 52, 2, 30, 3, 75, 5, 104, 8, 24, 8, 41, 7, 0, 6, 48, 10, 0, 31, 158, 10, 42, 4, 112, 7, 134, + 30, 128, 10, 60, 10, 144, 10, 7, 20, 251, 10, 0, 10, 118, 10, 0, 10, 102, 10, 102, 12, 0, + 19, 93, 10, 0, 29, 227, 10, 70, 10, 0, 21, 0, 111, 0, 10, 230, 10, 1, 7, 0, 23, 0, 20, 108, + 25, 0, 50, 0, 10, 0, 10, 0, 9, 128, 10, 0, 59, 1, 3, 1, 4, 76, 45, 1, 15, 0, 13, 0, 10, 0, ]; - static BITSET_CANONICAL: [u64; 44] = [ - 
0b0000000000000000000000000000000000000000000000000000000000000000, - 0b0000000111111111111111111111111111111111111111111111111111111111, - 0b1111111111000000000000000000000000000000000000000000000000000000, - 0b1111111111111111111111111111111111111111111111001111111111111111, - 0b1111110000000000000000000000000000000000000000000000000000000000, - 0b1111100000000000000000000000000000000000000000000000000000000000, - 0b0000000000000000000000000000111100000000000000000000000000000000, - 0b1111111111111110000000000000000000000000000000000000001111111111, - 0b0001111111111111111111100000000000000000000000000000000000000000, - 0b0000001111111111000000111111111100000000000000000000000000000000, - 0b0000000000001111111111111111111111111111111111111111111110000000, - 0b0000000000000001110000000000000000000000000000000000000000000000, - 0b0000000000000000111111111000000000000000000000000000000000000000, - 0b1111111111111111111111111111111111111111111111111100000000000000, - 0b1111111111111111111111111111111111111111111111110000000000000000, - 0b1111111111111111111111111111111100000000000000000000000000000000, - 0b1111111111111111111111000000000000000000000000000000000000000000, - 0b1111111111111110000000000000000000000000000000000000000000000000, - 0b1111111000000000000000000000000011111111000000000000000000000000, - 0b0111111111111111111111111111111100000000000000000000000000000000, - 0b0111111100000000111111111100000000000000000000000000000000000000, - 0b0111001000001100000000000000000000000000000000000000000000000000, - 0b0110000000000000000000000000000000000000000000000000000111111111, - 0b0011111111111111101111111111111111111111111111111111111111111110, - 0b0010000000000000000000000000000000000000000000000000000000000000, - 0b0000111111111111111111111111111000000000000000000000000000000000, - 0b0000111111111111000000000000000000000000000000000000000000000000, - 0b0000011100000000000000111111111000000000000000000000000010000000, - 
0b0000001111111111000000000000000000000011111111110000000000000000, - 0b0000001111110001000000000000000000000000000000000000000000000000, - 0b0000001111110000111111111100000000000000000000000000000000000000, - 0b0000000111111111111111111100000001111111000000000000000000000000, - 0b0000000011111100111111111100000000000000000000000000000000000000, - 0b0000000000011111111111111111111000000011111111110000000000000000, - 0b0000000000011110111011111111111111111111111111111111111111111111, - 0b0000000000000111111111111100000000000000000000000000000000000000, - 0b0000000000000000111111111111111111111111111111000000000000000000, - 0b0000000000000000000111111111111111111111111111110000000000000000, - 0b0000000000000000000000000000001111111011111111110000000000000000, - 0b0000000000000000000000000000000011111111111111101111111100000000, - 0b0000000000000000000000000000000000000111111111110000000000000000, - 0b0000000000000000000000000000000000000000000000000000010000000010, - 0b0000000000000000000000000000000000000000000000000000001111100111, - 0b1111111100000000000000000000000011111111000000000000000000000000, - ]; - static BITSET_MAPPING: [(u8, u8); 30] = [ - (0, 64), (1, 175), (1, 76), (1, 172), (1, 165), (1, 164), (1, 162), (1, 157), (1, 138), - (1, 112), (2, 16), (2, 26), (2, 39), (2, 42), (2, 48), (2, 58), (3, 122), (3, 108), (4, 28), - (4, 54), (5, 22), (5, 48), (6, 49), (6, 50), (7, 47), (8, 55), (9, 32), (10, 108), (11, 47), - (12, 32), - ]; - pub fn lookup(c: char) -> bool { - super::range_search( + super::skip_search( c as u32, - &BITSET_CHUNKS_MAP, - BITSET_LAST_CHUNK_MAP, - &BITSET_INDEX_CHUNKS, - &BITSET_CANONICAL, - &BITSET_MAPPING, + &SHORT_OFFSET_RUNS, + &OFFSETS, ) } } @@ -1045,7 +526,7 @@ pub mod uppercase { ]; pub fn lookup(c: char) -> bool { - super::range_search( + super::bitset_search( c as u32, &BITSET_CHUNKS_MAP, BITSET_LAST_CHUNK_MAP, @@ -1058,31 +539,17 @@ pub mod uppercase { #[rustfmt::skip] pub mod white_space { - const BITSET_LAST_CHUNK_MAP: u16 
= 32; - static BITSET_CHUNKS_MAP: [u8; 23] = [ - 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 1, 3, - ]; - static BITSET_INDEX_CHUNKS: [[u8; 6]; 4] = [ - [1, 4, 2, 4, 4, 4], [4, 4, 0, 3, 4, 4], [4, 4, 4, 4, 4, 4], [5, 4, 4, 4, 4, 4], - ]; - static BITSET_CANONICAL: [u64; 4] = [ - 0b0000000000000000100000110000000000000000000000000000011111111111, - 0b0000000000000000000000000000000100000000000000000011111000000000, - 0b0000000000000000000000000000000100000000000000000000000000100000, - 0b0000000000000000000000000000000010000000000000000000000000000000, + static SHORT_OFFSET_RUNS: [u32; 4] = [ + 5760, 18882560, 23080960, 40972289, ]; - static BITSET_MAPPING: [(u8, u8); 2] = [ - (0, 176), (0, 175), + static OFFSETS: [u8; 21] = [ + 9, 5, 18, 1, 100, 1, 26, 1, 0, 1, 0, 11, 29, 2, 5, 1, 47, 1, 0, 1, 0, ]; - pub fn lookup(c: char) -> bool { - super::range_search( + super::skip_search( c as u32, - &BITSET_CHUNKS_MAP, - BITSET_LAST_CHUNK_MAP, - &BITSET_INDEX_CHUNKS, - &BITSET_CANONICAL, - &BITSET_MAPPING, + &SHORT_OFFSET_RUNS, + &OFFSETS, ) } } diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index 04c72116e5f8b..053ed825018a9 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -4,6 +4,7 @@ use ucd_parse::Codepoints; mod case_mapping; mod raw_emitter; +mod skiplist; mod unicode_download; use raw_emitter::{emit_codepoints, RawEmitter}; @@ -172,13 +173,14 @@ fn main() { modules.push((property.to_lowercase().to_string(), emitter.file)); println!( - "{:15}: {} bytes, {} codepoints in {} ranges ({} - {})", + "{:15}: {} bytes, {} codepoints in {} ranges ({} - {}) using {}", property, emitter.bytes_used, datapoints, ranges.len(), ranges.first().unwrap().start, - ranges.last().unwrap().end + ranges.last().unwrap().end, + emitter.desc, ); total_bytes += emitter.bytes_used; } @@ -259,6 +261,7 @@ fn generate_tests(data_path: &str, ranges: &[(&str, 
Vec>)]) -> String let mut s = String::new(); s.push_str("#![allow(incomplete_features, unused)]\n"); s.push_str("#![feature(const_generics)]\n\n"); + s.push_str("\n#[allow(unused)]\nuse std::hint;\n"); s.push_str(&format!("#[path = \"{}\"]\n", data_path)); s.push_str("mod unicode_data;\n\n"); @@ -267,7 +270,8 @@ fn generate_tests(data_path: &str, ranges: &[(&str, Vec>)]) -> String for (property, ranges) in ranges { s.push_str(&format!(r#" println!("Testing {}");"#, property)); s.push('\n'); - s.push_str(&format!(" {}();\n", property.to_lowercase())); + s.push_str(&format!(" {}_true();\n", property.to_lowercase())); + s.push_str(&format!(" {}_false();\n", property.to_lowercase())); let mut is_true = Vec::new(); let mut is_false = Vec::new(); for ch_num in 0..(std::char::MAX as u32) { @@ -281,8 +285,10 @@ fn generate_tests(data_path: &str, ranges: &[(&str, Vec>)]) -> String } } - s.push_str(&format!(" fn {}() {{\n", property.to_lowercase())); + s.push_str(&format!(" fn {}_true() {{\n", property.to_lowercase())); generate_asserts(&mut s, property, &is_true, true); + s.push_str(" }\n\n"); + s.push_str(&format!(" fn {}_false() {{\n", property.to_lowercase())); generate_asserts(&mut s, property, &is_false, false); s.push_str(" }\n\n"); } @@ -295,19 +301,19 @@ fn generate_asserts(s: &mut String, property: &str, points: &[u32], truthy: bool for range in ranges_from_set(points) { if range.end == range.start + 1 { s.push_str(&format!( - " assert!({}unicode_data::{}::lookup(std::char::from_u32({}).unwrap()), \"{}\");\n", + " assert!({}unicode_data::{}::lookup({:?}), \"{}\");\n", if truthy { "" } else { "!" }, property.to_lowercase(), - range.start, std::char::from_u32(range.start).unwrap(), - )); + range.start, + )); } else { s.push_str(&format!(" for chn in {:?}u32 {{\n", range)); s.push_str(&format!( " assert!({}unicode_data::{}::lookup(std::char::from_u32(chn).unwrap()), \"{{:?}}\", chn);\n", if truthy { "" } else { "!" 
}, property.to_lowercase(), - )); + )); s.push_str(" }\n"); } } @@ -323,17 +329,25 @@ fn merge_ranges(ranges: &mut Vec>) { loop { let mut new_ranges = Vec::new(); let mut idx_iter = 0..(ranges.len() - 1); + let mut should_insert_last = true; while let Some(idx) = idx_iter.next() { let cur = ranges[idx].clone(); let next = ranges[idx + 1].clone(); if cur.end == next.start { - let _ = idx_iter.next(); // skip next as we're merging it in + if idx_iter.next().is_none() { + // We're merging the last element + should_insert_last = false; + } new_ranges.push(cur.start..next.end); } else { + // We're *not* merging the last element + should_insert_last = true; new_ranges.push(cur); } } - new_ranges.push(ranges.last().unwrap().clone()); + if should_insert_last { + new_ranges.push(ranges.last().unwrap().clone()); + } if new_ranges.len() == ranges.len() { *ranges = new_ranges; break; @@ -341,4 +355,12 @@ fn merge_ranges(ranges: &mut Vec>) { *ranges = new_ranges; } } + + let mut last_end = None; + for range in ranges { + if let Some(last) = last_end { + assert!(range.start > last, "{:?}", range); + } + last_end = Some(range.end); + } } diff --git a/src/tools/unicode-table-generator/src/range_search.rs b/src/tools/unicode-table-generator/src/range_search.rs index b57fd2c1d8623..49e65521c9846 100644 --- a/src/tools/unicode-table-generator/src/range_search.rs +++ b/src/tools/unicode-table-generator/src/range_search.rs @@ -1,5 +1,5 @@ #[inline(always)] -fn range_search< +fn bitset_search< const N: usize, const CHUNK_SIZE: usize, const N1: usize, @@ -47,3 +47,52 @@ fn range_search< }; (word & (1 << (needle % 64) as u64)) != 0 } + +fn decode_prefix_sum(short_offset_run_header: u32) -> u32 { + short_offset_run_header & ((1 << 21) - 1) +} + +fn decode_length(short_offset_run_header: u32) -> usize { + (short_offset_run_header >> 21) as usize +} + +#[inline(always)] +fn skip_search( + needle: u32, + short_offset_runs: &[u32; SOR], + offsets: &[u8; OFFSETS], +) -> bool { + // Note that 
this *cannot* be past the end of the array, as the last + // element is greater than std::char::MAX (the largest possible needle). + // + // So, we cannot have found it (i.e. Ok(idx) + 1 != length) and the correct + // location cannot be past it, so Err(idx) != length either. + // + // This means that we can avoid bounds checking for the accesses below, too. + let last_idx = + match short_offset_runs.binary_search_by_key(&(needle << 11), |header| header << 11) { + Ok(idx) => idx + 1, + Err(idx) => idx, + }; + + let mut offset_idx = decode_length(short_offset_runs[last_idx]); + let length = if let Some(next) = short_offset_runs.get(last_idx + 1) { + decode_length(*next) - offset_idx + } else { + offsets.len() - offset_idx + }; + let prev = + last_idx.checked_sub(1).map(|prev| decode_prefix_sum(short_offset_runs[prev])).unwrap_or(0); + + let total = needle - prev; + let mut prefix_sum = 0; + for _ in 0..(length - 1) { + let offset = offsets[offset_idx]; + prefix_sum += offset as u32; + if prefix_sum > total { + break; + } + offset_idx += 1; + } + offset_idx % 2 == 1 +} diff --git a/src/tools/unicode-table-generator/src/raw_emitter.rs b/src/tools/unicode-table-generator/src/raw_emitter.rs index e5b15224795f5..db9d04b3fa9d3 100644 --- a/src/tools/unicode-table-generator/src/raw_emitter.rs +++ b/src/tools/unicode-table-generator/src/raw_emitter.rs @@ -46,12 +46,13 @@ use std::ops::Range; #[derive(Clone)] pub struct RawEmitter { pub file: String, + pub desc: String, pub bytes_used: usize, } impl RawEmitter { pub fn new() -> RawEmitter { - RawEmitter { file: String::new(), bytes_used: 0 } + RawEmitter { file: String::new(), bytes_used: 0, desc: String::new() } } fn blank_line(&mut self) { @@ -61,8 +62,21 @@ impl RawEmitter { writeln!(&mut self.file, "").unwrap(); } - fn emit_bitset(&mut self, words: &[u64]) { - let mut words = words.to_vec(); + fn emit_bitset(&mut self, ranges: &[Range]) { + let last_code_point = ranges.last().unwrap().end; + // bitset for every bit in 
the codepoint range + // + // + 2 to ensure an all zero word to use for padding + let mut buckets = vec![0u64; (last_code_point as usize / 64) + 2]; + for range in ranges { + for codepoint in range.clone() { + let bucket = codepoint as usize / 64; + let bit = codepoint as u64 % 64; + buckets[bucket] |= 1 << bit; + } + } + + let mut words = buckets; // Ensure that there's a zero word in the dataset, used for padding and // such. words.push(0); @@ -118,6 +132,19 @@ impl RawEmitter { // We only need it for the words that we removed by applying a shift and // flip to them. self.bytes_used += 2 * canonicalized.canonicalized_words.len(); + + self.blank_line(); + + writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap(); + writeln!(&mut self.file, " super::bitset_search(",).unwrap(); + writeln!(&mut self.file, " c as u32,").unwrap(); + writeln!(&mut self.file, " &BITSET_CHUNKS_MAP,").unwrap(); + writeln!(&mut self.file, " BITSET_LAST_CHUNK_MAP,").unwrap(); + writeln!(&mut self.file, " &BITSET_INDEX_CHUNKS,").unwrap(); + writeln!(&mut self.file, " &BITSET_CANONICAL,").unwrap(); + writeln!(&mut self.file, " &BITSET_MAPPING,").unwrap(); + writeln!(&mut self.file, " )").unwrap(); + writeln!(&mut self.file, "}}").unwrap(); } fn emit_chunk_map(&mut self, zero_at: u8, compressed_words: &[u8], chunk_length: usize) { @@ -184,40 +211,24 @@ impl RawEmitter { .unwrap(); self.bytes_used += chunk_length * chunks.len(); } - - pub fn emit_lookup(&mut self) { - writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap(); - writeln!(&mut self.file, " super::range_search(",).unwrap(); - writeln!(&mut self.file, " c as u32,").unwrap(); - writeln!(&mut self.file, " &BITSET_CHUNKS_MAP,").unwrap(); - writeln!(&mut self.file, " BITSET_LAST_CHUNK_MAP,").unwrap(); - writeln!(&mut self.file, " &BITSET_INDEX_CHUNKS,").unwrap(); - writeln!(&mut self.file, " &BITSET_CANONICAL,").unwrap(); - writeln!(&mut self.file, " &BITSET_MAPPING,").unwrap(); - writeln!(&mut self.file, " 
)").unwrap(); - writeln!(&mut self.file, "}}").unwrap(); - } } pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range]) { emitter.blank_line(); - let last_code_point = ranges.last().unwrap().end; - // bitset for every bit in the codepoint range - // - // + 2 to ensure an all zero word to use for padding - let mut buckets = vec![0u64; (last_code_point as usize / 64) + 2]; - for range in ranges { - for codepoint in range.clone() { - let bucket = codepoint as usize / 64; - let bit = codepoint as u64 % 64; - buckets[bucket] |= 1 << bit; - } - } + let mut bitset = emitter.clone(); + bitset.emit_bitset(&ranges); - emitter.emit_bitset(&buckets); - emitter.blank_line(); - emitter.emit_lookup(); + let mut skiplist = emitter.clone(); + skiplist.emit_skiplist(&ranges); + + if bitset.bytes_used <= skiplist.bytes_used { + *emitter = bitset; + emitter.desc = format!("bitset"); + } else { + *emitter = skiplist; + emitter.desc = format!("skiplist"); + } } struct Canonicalized { diff --git a/src/tools/unicode-table-generator/src/skiplist.rs b/src/tools/unicode-table-generator/src/skiplist.rs new file mode 100644 index 0000000000000..6e439968c3bfd --- /dev/null +++ b/src/tools/unicode-table-generator/src/skiplist.rs @@ -0,0 +1,98 @@ +use crate::fmt_list; +use crate::raw_emitter::RawEmitter; +use std::convert::TryInto; +use std::fmt::Write as _; +use std::ops::Range; + +/// This will get packed into a single u32 before inserting into the data set. +#[derive(Debug, PartialEq)] +struct ShortOffsetRunHeader { + /// Note, we only allow for 21 bits here. + prefix_sum: u32, + + /// Note, we actually only allow for 11 bits here. This should be enough -- + /// our largest sets are around ~1400 offsets long. 
+ start_idx: u16, +} + +impl ShortOffsetRunHeader { + fn pack(&self) -> u32 { + assert!(self.start_idx < (1 << 11)); + assert!(self.prefix_sum < (1 << 21)); + + (self.start_idx as u32) << 21 | self.prefix_sum + } +} + +impl RawEmitter { + pub fn emit_skiplist(&mut self, ranges: &[Range]) { + let mut offsets = Vec::::new(); + let points = ranges.iter().flat_map(|r| vec![r.start, r.end]).collect::>(); + let mut offset = 0; + for pt in points { + let delta = pt - offset; + offsets.push(delta); + offset = pt; + } + // Guaranteed to terminate, as it's impossible to subtract a value this + // large from a valid char. + offsets.push(std::char::MAX as u32 + 1); + let mut coded_offsets: Vec = Vec::new(); + let mut short_offset_runs: Vec = vec![]; + let mut iter = offsets.iter().cloned(); + let mut prefix_sum = 0; + loop { + let mut any_elements = false; + let mut inserted = false; + let start = coded_offsets.len(); + for offset in iter.by_ref() { + any_elements = true; + prefix_sum += offset; + if let Ok(offset) = offset.try_into() { + coded_offsets.push(offset); + } else { + short_offset_runs.push(ShortOffsetRunHeader { + start_idx: start.try_into().unwrap(), + prefix_sum, + }); + // This is just needed to maintain indices even/odd + // correctly. + coded_offsets.push(0); + inserted = true; + break; + } + } + if !any_elements { + break; + } + // We always append the huge char::MAX offset to the end which + // should never be able to fit into the u8 offsets. 
+ assert!(inserted); + } + + writeln!( + &mut self.file, + "static SHORT_OFFSET_RUNS: [u32; {}] = [{}];", + short_offset_runs.len(), + fmt_list(short_offset_runs.iter().map(|v| v.pack())) + ) + .unwrap(); + self.bytes_used += 4 * short_offset_runs.len(); + writeln!( + &mut self.file, + "static OFFSETS: [u8; {}] = [{}];", + coded_offsets.len(), + fmt_list(&coded_offsets) + ) + .unwrap(); + self.bytes_used += coded_offsets.len(); + + writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap(); + writeln!(&mut self.file, " super::skip_search(",).unwrap(); + writeln!(&mut self.file, " c as u32,").unwrap(); + writeln!(&mut self.file, " &SHORT_OFFSET_RUNS,").unwrap(); + writeln!(&mut self.file, " &OFFSETS,").unwrap(); + writeln!(&mut self.file, " )").unwrap(); + writeln!(&mut self.file, "}}").unwrap(); + } +} From b6bc9060041bb5de18d9b31fe935d29193d9bad5 Mon Sep 17 00:00:00 2001 From: Mark Rousskov Date: Fri, 27 Mar 2020 18:01:14 -0400 Subject: [PATCH 13/14] Remove separate encoding for a single nonzero-mapping byte In practice, for the two data sets that still use the bitset encoding (uppercase and lowercase) this is not a significant win, so just drop it entirely. It costs us about 5 bytes, and the complexity is nontrivial. 
--- src/libcore/unicode/unicode_data.rs | 22 ++++++----------- .../src/range_search.rs | 9 ++----- .../src/raw_emitter.rs | 24 ------------------- 3 files changed, 9 insertions(+), 46 deletions(-) diff --git a/src/libcore/unicode/unicode_data.rs b/src/libcore/unicode/unicode_data.rs index 72ea8ce038184..48caa21fb0aa1 100644 --- a/src/libcore/unicode/unicode_data.rs +++ b/src/libcore/unicode/unicode_data.rs @@ -10,7 +10,6 @@ fn bitset_search< >( needle: u32, chunk_idx_map: &[u8; N], - last_chunk_idx: u16, bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1], bitset_canonical: &[u64; CANONICAL], bitset_canonicalized: &[(u8, u8); CANONICALIZED], @@ -18,12 +17,8 @@ fn bitset_search< let bucket_idx = (needle / 64) as usize; let chunk_map_idx = bucket_idx / CHUNK_SIZE; let chunk_piece = bucket_idx % CHUNK_SIZE; - // The last entry of `chunk_idx_map` actually should be at `last_chunk_idx`, - // so we need to remap it - let chunk_idx = if chunk_map_idx < (chunk_idx_map.len() - 1) { - chunk_idx_map[chunk_map_idx] - } else if chunk_map_idx == last_chunk_idx as usize { - chunk_idx_map[chunk_idx_map.len() - 1] + let chunk_idx = if let Some(&v) = chunk_idx_map.get(chunk_map_idx) { + v } else { return false; }; @@ -317,12 +312,12 @@ pub mod grapheme_extend { #[rustfmt::skip] pub mod lowercase { - const BITSET_LAST_CHUNK_MAP: u16 = 122; - static BITSET_CHUNKS_MAP: [u8; 119] = [ + static BITSET_CHUNKS_MAP: [u8; 123] = [ 13, 16, 0, 0, 8, 0, 0, 11, 12, 9, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 0, 14, 0, 7, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 6, + 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0, 0, + 0, 0, 6, ]; static BITSET_INDEX_CHUNKS: [[u8; 16]; 18] = [ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], @@ -408,7 
+403,6 @@ pub mod lowercase { super::bitset_search( c as u32, &BITSET_CHUNKS_MAP, - BITSET_LAST_CHUNK_MAP, &BITSET_INDEX_CHUNKS, &BITSET_CANONICAL, &BITSET_MAPPING, @@ -449,13 +443,12 @@ pub mod n { #[rustfmt::skip] pub mod uppercase { - const BITSET_LAST_CHUNK_MAP: u16 = 124; - static BITSET_CHUNKS_MAP: [u8; 124] = [ + static BITSET_CHUNKS_MAP: [u8; 125] = [ 12, 15, 5, 5, 0, 5, 5, 2, 4, 11, 5, 14, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 8, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 5, 13, 5, 10, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 7, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 16, 5, 5, - 5, 5, 9, 3, + 5, 5, 9, 5, 3, ]; static BITSET_INDEX_CHUNKS: [[u8; 16]; 17] = [ [41, 41, 5, 33, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 5, 0], @@ -529,7 +522,6 @@ pub mod uppercase { super::bitset_search( c as u32, &BITSET_CHUNKS_MAP, - BITSET_LAST_CHUNK_MAP, &BITSET_INDEX_CHUNKS, &BITSET_CANONICAL, &BITSET_MAPPING, diff --git a/src/tools/unicode-table-generator/src/range_search.rs b/src/tools/unicode-table-generator/src/range_search.rs index 49e65521c9846..39b47ce703f37 100644 --- a/src/tools/unicode-table-generator/src/range_search.rs +++ b/src/tools/unicode-table-generator/src/range_search.rs @@ -8,7 +8,6 @@ fn bitset_search< >( needle: u32, chunk_idx_map: &[u8; N], - last_chunk_idx: u16, bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1], bitset_canonical: &[u64; CANONICAL], bitset_canonicalized: &[(u8, u8); CANONICALIZED], @@ -16,12 +15,8 @@ fn bitset_search< let bucket_idx = (needle / 64) as usize; let chunk_map_idx = bucket_idx / CHUNK_SIZE; let chunk_piece = bucket_idx % CHUNK_SIZE; - // The last entry of `chunk_idx_map` actually should be at `last_chunk_idx`, - // so we need to remap it - let chunk_idx = if chunk_map_idx < (chunk_idx_map.len() - 1) { - chunk_idx_map[chunk_map_idx] - } else if chunk_map_idx == last_chunk_idx as usize { - 
chunk_idx_map[chunk_idx_map.len() - 1] + let chunk_idx = if let Some(&v) = chunk_idx_map.get(chunk_map_idx) { + v } else { return false; }; diff --git a/src/tools/unicode-table-generator/src/raw_emitter.rs b/src/tools/unicode-table-generator/src/raw_emitter.rs index db9d04b3fa9d3..dd0746cf695a6 100644 --- a/src/tools/unicode-table-generator/src/raw_emitter.rs +++ b/src/tools/unicode-table-generator/src/raw_emitter.rs @@ -139,7 +139,6 @@ impl RawEmitter { writeln!(&mut self.file, " super::bitset_search(",).unwrap(); writeln!(&mut self.file, " c as u32,").unwrap(); writeln!(&mut self.file, " &BITSET_CHUNKS_MAP,").unwrap(); - writeln!(&mut self.file, " BITSET_LAST_CHUNK_MAP,").unwrap(); writeln!(&mut self.file, " &BITSET_INDEX_CHUNKS,").unwrap(); writeln!(&mut self.file, " &BITSET_CANONICAL,").unwrap(); writeln!(&mut self.file, " &BITSET_MAPPING,").unwrap(); @@ -170,29 +169,6 @@ impl RawEmitter { chunk_indices.push(chunk_map[chunk]); } - // If one of the chunks has all of the entries point to the bitset - // word filled with zeros, then pop those off the end -- we know they - // are useless. - let zero_chunk_idx = chunks.iter().position(|chunk| chunk.iter().all(|e| *e == zero_at)); - while zero_chunk_idx.is_some() && chunk_indices.last().cloned() == zero_chunk_idx { - chunk_indices.pop(); - } - // We do not count the LAST_CHUNK_MAP as adding bytes because it's a - // small constant whose values are inlined directly into the instruction - // stream. - writeln!( - &mut self.file, - "const BITSET_LAST_CHUNK_MAP: u16 = {};", - chunk_indices.len() - 1, - ) - .unwrap(); - let nonzero = chunk_indices.pop().unwrap(); - // Try to pop again, now that we've recorded a non-zero pointing index - // into the LAST_CHUNK_MAP. 
- while zero_chunk_idx.is_some() && chunk_indices.last().cloned() == zero_chunk_idx { - chunk_indices.pop(); - } - chunk_indices.push(nonzero); writeln!( &mut self.file, "static BITSET_CHUNKS_MAP: [u8; {}] = [{}];", From ad679a7f433b60e9d5a7ce5029d50600aa919fd6 Mon Sep 17 00:00:00 2001 From: Mark Rousskov Date: Fri, 27 Mar 2020 18:13:22 -0400 Subject: [PATCH 14/14] Update the documentation comment --- src/tools/unicode-table-generator/src/main.rs | 73 +++++++++++++++++++ .../src/raw_emitter.rs | 39 ---------- 2 files changed, 73 insertions(+), 39 deletions(-) diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index 053ed825018a9..d5562ff91df4d 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -1,3 +1,76 @@ +//! This implements the core logic of the compression scheme used to compactly +//! encode Unicode properties. +//! +//! We have two primary goals with the encoding: we want to be compact, because +//! these tables often end up in ~every Rust program (especially the +//! grapheme_extend table, used for str debugging), including those for embedded +//! targets (where space is important). We also want to be relatively fast, +//! though this is more of a nice to have rather than a key design constraint. +//! It is expected that libraries/applications which are performance-sensitive +//! to Unicode property lookups are extremely rare, and those that care may find +//! the tradeoff of the raw bitsets worth it. For most applications, a +//! relatively fast but much smaller (and as such less cache-impacting, etc.) +//! data set is likely preferable. +//! +//! We have two separate encoding schemes: a skiplist-like approach, and a +//! compressed bitset. The datasets we consider mostly use the skiplist (it's +//! smaller) but the lowercase and uppercase sets are sufficiently sparse for +//! 
the bitset to be worthwhile -- for those sets the bitset is a 2x size win. +//! Since the bitset is also faster, this seems an obvious choice. (As a +//! historical note, the bitset was also the prior implementation, so its +//! relative complexity had already been paid). +//! +//! ## The bitset +//! +//! The primary idea is that we 'flatten' the Unicode ranges into an enormous +//! bitset. To represent any arbitrary codepoint in a raw bitset, we would need +//! over 17 kilobytes of data per character set -- way too much for our +//! purposes. +//! +//! First, the raw bitset (one bit for every valid `char`, from 0 to 0x10FFFF, +//! not skipping the small 'gap') is associated into words (u64) and +//! deduplicated. On random data, this would be useless; on our data, this is +//! incredibly beneficial -- our data sets have (far) less than 256 unique +//! words. +//! +//! This gives us an array that maps `u8 -> word`; the current algorithm does +//! not handle the case of more than 256 unique words, but we are relatively far +//! from coming that close. +//! +//! With that scheme, we now have a single byte for every 64 codepoints. +//! +//! We further chunk these by some constant N (between 1 and 64 per group, +//! dynamically chosen for smallest size), and again deduplicate and store in an +//! array (u8 -> [u8; N]). +//! +//! The bytes of this array map into the words from the bitset above, but we +//! apply another trick here: some of these words are similar enough that they +//! can be represented by some function of another word. The particular +//! functions chosen are rotation, inversion, and shifting (right). +//! +//! ## The skiplist +//! +//! The skip list arose out of the desire for an even smaller encoding than the +//! bitset -- and was the answer to the question "what is the smallest +//! representation we can imagine?". However, it is not necessarily the +//! smallest, and if you have a better proposal, please do suggest it! +//! +//! 
This is a relatively straightforward encoding. First, we break up all the +//! ranges in the input data into offsets from each other, essentially a gap +//! encoding. In practice, most gaps are small -- less than u8::MAX -- so we +//! store those directly. We make use of the larger gaps (which are nicely +//! interspersed already) throughout the dataset to index this data set. +//! +//! In particular, each run of small gaps (terminating in a large gap) is +//! indexed in a separate dataset. That data set stores an index into the +//! primary offset list and a prefix sum of that offset list. These are packed +//! into a single u32 (11 bits for the offset, 21 bits for the prefix sum). +//! +//! Lookup proceeds via a binary search in the index and then a straightforward +//! linear scan (adding up the offsets) until we reach the needle, and then the +//! index of that offset is utilized as the answer to whether we're in the set +//! or not. + use std::collections::{BTreeMap, HashMap}; use std::ops::Range; use ucd_parse::Codepoints; diff --git a/src/tools/unicode-table-generator/src/raw_emitter.rs b/src/tools/unicode-table-generator/src/raw_emitter.rs index dd0746cf695a6..95b63aca1549b 100644 --- a/src/tools/unicode-table-generator/src/raw_emitter.rs +++ b/src/tools/unicode-table-generator/src/raw_emitter.rs @@ -1,42 +1,3 @@ -//! This implements the core logic of the compression scheme used to compactly -//! encode the Unicode character classes. -//! -//! The primary idea is that we 'flatten' the Unicode ranges into an enormous -//! bitset. To represent any arbitrary codepoint in a raw bitset, we would need -//! over 17 kilobytes of data per character set -- way too much for our -//! purposes. -//! -//! We have two primary goals with the encoding: we want to be compact, because -//! these tables often end up in ~every Rust program (especially the -//! grapheme_extend table, used for str debugging), including those for embedded -//! targets (where space is important). 
We also want to be relatively fast, -//! though this is more of a nice to have rather than a key design constraint. -//! In practice, due to modern processor design these two are closely related. -//! -//! The encoding scheme here compresses the bitset by first deduplicating the -//! "words" (64 bits on all platforms). In practice very few words are present -//! in most data sets. -//! -//! This gives us an array that maps `u8 -> word` (if we ever went beyond 256 -//! words, we could go to u16 -> word or have some dual compression scheme -//! mapping into two separate sets; currently this is not dealt with). -//! -//! With that scheme, we now have a single byte for every 64 codepoints. We -//! further group these by some constant N (between 1 and 64 per group), and -//! again deduplicate and store in an array (u8 -> [u8; N]). The constant is -//! chosen to be optimal in bytes-in-memory for the given dataset. -//! -//! The indices into this array represent ranges of 64*16 = 1024 codepoints. -//! -//! This already reduces the top-level array to at most 1,086 bytes, but in -//! practice we usually can encode in far fewer (the first couple Unicode planes -//! are dense). -//! -//! The last byte of this top-level array is pulled out to a separate static -//! and trailing zeros are dropped; this is simply because grapheme_extend and -//! case_ignorable have a single entry in the 896th entry, so this shrinks them -//! down considerably. - use crate::fmt_list; use std::collections::{BTreeMap, BTreeSet, HashMap}; use std::convert::TryFrom;