+ //! This implements the core logic of the compression scheme used to compactly
+ //! encode Unicode properties.
+ //!
+ //! We have two primary goals with the encoding: we want to be compact, because
+ //! these tables often end up in ~every Rust program (especially the
+ //! grapheme_extend table, used for str debugging), including those for embedded
+ //! targets (where space is important). We also want to be relatively fast,
+ //! though this is more of a nice-to-have than a key design constraint. It is
+ //! expected that libraries/applications which are performance-sensitive to
+ //! Unicode property lookups are extremely rare, and those that care may find
+ //! the tradeoff of the raw bitsets worth it. For most applications, a
+ //! relatively fast but much smaller (and as such less cache-impacting, etc.)
+ //! data set is likely preferable.
+ //!
+ //! We have two separate encoding schemes: a skiplist-like approach, and a
+ //! compressed bitset. The datasets we consider mostly use the skiplist (it's
+ //! smaller) but the lowercase and uppercase sets are sufficiently sparse for
+ //! the bitset to be worthwhile -- for those sets the bitset is a 2x size win.
+ //! Since the bitset is also faster, this seems an obvious choice. (As a
+ //! historical note, the bitset was also the prior implementation, so its
+ //! relative complexity had already been paid.)
+ //!
+ //! ## The bitset
+ //!
+ //! The primary idea is that we 'flatten' the Unicode ranges into an enormous
+ //! bitset. To represent any arbitrary codepoint in a raw bitset, we would need
+ //! over 17 kilobytes of data per character set -- way too much for our
+ //! purposes.
+ //!
+ //! First, the raw bitset (one bit for every valid `char`, from 0 to 0x10FFFF,
+ //! not skipping the small 'gap') is grouped into words (u64) and deduplicated.
+ //! On random data, this would be useless; on our data, it is incredibly
+ //! beneficial -- our data sets have (far) fewer than 256 unique words.
+ //!
+ //! This gives us an array that maps `u8 -> word`; the current algorithm does
+ //! not handle the case of more than 256 unique words, but we are currently far
+ //! from hitting that limit.
+ //!
+ //! With that scheme, we now have a single byte for every 64 codepoints.
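+ //!
+ //! As a minimal sketch (illustrative names, not the generator's actual
+ //! code), the word-deduplication step looks roughly like this:
+ //!
+ //! ```ignore
+ //! /// Deduplicate the 64-bit words of the flattened bitset, returning the
+ //! /// unique words plus one byte index per 64 codepoints.
+ //! fn dedup_words(words: &[u64]) -> (Vec<u64>, Vec<u8>) {
+ //!     let mut unique: Vec<u64> = Vec::new();
+ //!     let mut indexes: Vec<u8> = Vec::with_capacity(words.len());
+ //!     for &word in words {
+ //!         let idx = match unique.iter().position(|&w| w == word) {
+ //!             Some(i) => i,
+ //!             None => {
+ //!                 unique.push(word);
+ //!                 unique.len() - 1
+ //!             }
+ //!         };
+ //!         // The scheme relies on there being fewer than 256 unique words.
+ //!         assert!(idx < 256);
+ //!         indexes.push(idx as u8);
+ //!     }
+ //!     (unique, indexes)
+ //! }
+ //! ```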
41
+ //!
42
+ //! We further chunk these by some constant N (between 1 and 64 per group,
43
+ //! dynamically chosen for smallest size), and again deduplicate and store in an
44
+ //! array (u8 -> [u8; N]).
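+ //!
+ //! A sketch of that chunking step, under the assumption that a short final
+ //! chunk is zero-padded to length N (illustrative, not the exact code):
+ //!
+ //! ```ignore
+ //! /// Group the per-64-codepoint byte indexes into chunks of `n`, dedup the
+ //! /// chunks, and index each chunk with a single byte.
+ //! fn dedup_chunks(bytes: &[u8], n: usize) -> (Vec<Vec<u8>>, Vec<u8>) {
+ //!     let mut unique: Vec<Vec<u8>> = Vec::new();
+ //!     let mut indexes: Vec<u8> = Vec::new();
+ //!     for chunk in bytes.chunks(n) {
+ //!         let mut chunk = chunk.to_vec();
+ //!         chunk.resize(n, 0); // zero-pad the trailing chunk (assumption)
+ //!         let idx = match unique.iter().position(|c| *c == chunk) {
+ //!             Some(i) => i,
+ //!             None => {
+ //!                 unique.push(chunk);
+ //!                 unique.len() - 1
+ //!             }
+ //!         };
+ //!         assert!(idx < 256);
+ //!         indexes.push(idx as u8);
+ //!     }
+ //!     (unique, indexes)
+ //! }
+ //! ```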
+ //!
+ //! The bytes of this array map into the words from the bitset above, but we
+ //! apply another trick here: some of these words are similar enough that they
+ //! can be represented by some function of another word. The particular
+ //! functions chosen are rotation, inversion, and shifting (right).
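+ //!
+ //! For example, checking whether one word can be derived from another under
+ //! these functions might look like the following sketch (the real generator
+ //! must also record *which* transformation was chosen):
+ //!
+ //! ```ignore
+ //! /// Can `word` be produced from `base` by rotation, inversion, or a
+ //! /// right shift?
+ //! fn derivable(base: u64, word: u64) -> bool {
+ //!     (1..64).any(|r| base.rotate_right(r) == word)
+ //!         || !base == word
+ //!         || (1..64).any(|s| base >> s == word)
+ //! }
+ //! ```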
+ //!
+ //! ## The skiplist
+ //!
+ //! The skip list arose out of the desire for an even smaller encoding than the
+ //! bitset -- and was the answer to the question "what is the smallest
+ //! representation we can imagine?". However, it is not necessarily the
+ //! smallest, and if you have a better proposal, please do suggest it!
+ //!
+ //! This is a relatively straightforward encoding. First, we break up all the
+ //! ranges in the input data into offsets from each other, essentially a gap
+ //! encoding. In practice, most gaps are small -- less than u8::MAX -- so we
+ //! store those directly. We make use of the larger gaps (which are already
+ //! nicely interspersed throughout the dataset) to split the offset list into
+ //! runs that can be indexed.
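+ //!
+ //! A sketch of the gap encoding, assuming sorted, non-overlapping input
+ //! ranges (the alternating gap/length layout here is illustrative):
+ //!
+ //! ```ignore
+ //! /// Flatten ranges into offsets between consecutive boundaries: the
+ //! /// output alternates "gap before a range" and "length of the range".
+ //! fn gap_encode(ranges: &[std::ops::Range<u32>]) -> Vec<u32> {
+ //!     let mut offsets = Vec::new();
+ //!     let mut prev = 0;
+ //!     for r in ranges {
+ //!         offsets.push(r.start - prev); // gap (possibly large)
+ //!         offsets.push(r.end - r.start); // range length
+ //!         prev = r.end;
+ //!     }
+ //!     offsets
+ //! }
+ //! ```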
+ //!
+ //! In particular, each run of small gaps (terminating in a large gap) is
+ //! indexed in a separate dataset. That data set stores an index into the
+ //! primary offset list and a prefix sum of that offset list. These are packed
+ //! into a single u32 (11 bits for the offset, 21 bits for the prefix sum).
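+ //!
+ //! Packing and unpacking such an index entry is simple bit manipulation; the
+ //! exact field order below is an assumption for illustration:
+ //!
+ //! ```ignore
+ //! fn pack(offset_idx: u32, prefix_sum: u32) -> u32 {
+ //!     debug_assert!(offset_idx < (1 << 11));
+ //!     debug_assert!(prefix_sum < (1 << 21));
+ //!     (offset_idx << 21) | prefix_sum
+ //! }
+ //!
+ //! fn unpack(entry: u32) -> (u32, u32) {
+ //!     (entry >> 21, entry & ((1 << 21) - 1))
+ //! }
+ //! ```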
+ //!
+ //! Lookup proceeds via a binary search in the index and then a straightforward
+ //! linear scan (adding up the offsets) until we reach the needle, and then the
+ //! index of the offset we stopped at -- specifically, its parity -- answers
+ //! whether we're in the set or not.
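+ //!
+ //! Ignoring the index acceleration, the core of the scan looks like this
+ //! sketch (it assumes the alternating gap/length encoding sketched above;
+ //! the real lookup first binary-searches the index to skip most of the
+ //! list):
+ //!
+ //! ```ignore
+ //! fn in_set(needle: u32, offsets: &[u32]) -> bool {
+ //!     let mut pos = 0u32;
+ //!     for (idx, &off) in offsets.iter().enumerate() {
+ //!         pos += off;
+ //!         if needle < pos {
+ //!             // Odd segments are ranges, even segments are gaps.
+ //!             return idx % 2 == 1;
+ //!         }
+ //!     }
+ //!     false
+ //! }
+ //! ```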
+
use std::collections::{BTreeMap, HashMap};
use std::ops::Range;
use ucd_parse::Codepoints;

mod case_mapping;
mod raw_emitter;
+ mod skiplist;
mod unicode_download;

use raw_emitter::{emit_codepoints, RawEmitter};
@@ -152,9 +226,17 @@ fn main() {
        std::process::exit(1);
    });

+     // Optional test path, which is a Rust source file testing that the unicode
+     // property lookups are correct.
+     let test_path = std::env::args().nth(2);
+
    let unicode_data = load_data();
    let ranges_by_property = &unicode_data.ranges;

+     if let Some(path) = test_path {
+         std::fs::write(&path, generate_tests(&write_location, &ranges_by_property)).unwrap();
+     }
+
    let mut total_bytes = 0;
    let mut modules = Vec::new();
    for (property, ranges) in ranges_by_property {
@@ -163,7 +245,16 @@ fn main() {
        emit_codepoints(&mut emitter, &ranges);

        modules.push((property.to_lowercase().to_string(), emitter.file));
-         println!("{:15}: {} bytes, {} codepoints", property, emitter.bytes_used, datapoints,);
+         println!(
+             "{:15}: {} bytes, {} codepoints in {} ranges ({} - {}) using {}",
+             property,
+             emitter.bytes_used,
+             datapoints,
+             ranges.len(),
+             ranges.first().unwrap().start,
+             ranges.last().unwrap().end,
+             emitter.desc,
+         );
        total_bytes += emitter.bytes_used;
    }

@@ -173,7 +264,10 @@ fn main() {
        "///! This file is generated by src/tools/unicode-table-generator; do not edit manually!\n",
    );

-     table_file.push_str("use super::range_search;\n\n");
+     // Include the range search function
+     table_file.push('\n');
+     table_file.push_str(include_str!("range_search.rs"));
+     table_file.push('\n');

    table_file.push_str(&version());

@@ -236,26 +330,110 @@ fn fmt_list<V: std::fmt::Debug>(values: impl IntoIterator<Item = V>) -> String {
    out
}

+ fn generate_tests(data_path: &str, ranges: &[(&str, Vec<Range<u32>>)]) -> String {
+     let mut s = String::new();
+     s.push_str("#![allow(incomplete_features, unused)]\n");
+     s.push_str("#![feature(const_generics)]\n\n");
+     s.push_str("\n#[allow(unused)]\nuse std::hint;\n");
+     s.push_str(&format!("#[path = \"{}\"]\n", data_path));
+     s.push_str("mod unicode_data;\n\n");
+
+     s.push_str("\nfn main() {\n");
+
+     for (property, ranges) in ranges {
+         s.push_str(&format!(r#"    println!("Testing {}");"#, property));
+         s.push('\n');
+         s.push_str(&format!("    {}_true();\n", property.to_lowercase()));
+         s.push_str(&format!("    {}_false();\n", property.to_lowercase()));
+         let mut is_true = Vec::new();
+         let mut is_false = Vec::new();
+         for ch_num in 0..(std::char::MAX as u32) {
+             if std::char::from_u32(ch_num).is_none() {
+                 continue;
+             }
+             if ranges.iter().any(|r| r.contains(&ch_num)) {
+                 is_true.push(ch_num);
+             } else {
+                 is_false.push(ch_num);
+             }
+         }
+
+         s.push_str(&format!("    fn {}_true() {{\n", property.to_lowercase()));
+         generate_asserts(&mut s, property, &is_true, true);
+         s.push_str("    }\n\n");
+         s.push_str(&format!("    fn {}_false() {{\n", property.to_lowercase()));
+         generate_asserts(&mut s, property, &is_false, false);
+         s.push_str("    }\n\n");
+     }
+
+     s.push_str("}");
+     s
+ }
+
+ fn generate_asserts(s: &mut String, property: &str, points: &[u32], truthy: bool) {
+     for range in ranges_from_set(points) {
+         if range.end == range.start + 1 {
+             s.push_str(&format!(
+                 "        assert!({}unicode_data::{}::lookup({:?}), \"{}\");\n",
+                 if truthy { "" } else { "!" },
+                 property.to_lowercase(),
+                 std::char::from_u32(range.start).unwrap(),
+                 range.start,
+             ));
+         } else {
+             s.push_str(&format!("        for chn in {:?}u32 {{\n", range));
+             s.push_str(&format!(
+                 "            assert!({}unicode_data::{}::lookup(std::char::from_u32(chn).unwrap()), \"{{:?}}\", chn);\n",
+                 if truthy { "" } else { "!" },
+                 property.to_lowercase(),
+             ));
+             s.push_str("        }\n");
+         }
+     }
+ }
+
+ fn ranges_from_set(set: &[u32]) -> Vec<Range<u32>> {
+     let mut ranges = set.iter().map(|e| (*e)..(*e + 1)).collect::<Vec<Range<u32>>>();
+     merge_ranges(&mut ranges);
+     ranges
+ }
+
fn merge_ranges(ranges: &mut Vec<Range<u32>>) {
    loop {
        let mut new_ranges = Vec::new();
        let mut idx_iter = 0..(ranges.len() - 1);
+         let mut should_insert_last = true;
        while let Some(idx) = idx_iter.next() {
            let cur = ranges[idx].clone();
            let next = ranges[idx + 1].clone();
            if cur.end == next.start {
-                 let _ = idx_iter.next(); // skip next as we're merging it in
+                 if idx_iter.next().is_none() {
+                     // We're merging the last element
+                     should_insert_last = false;
+                 }
                new_ranges.push(cur.start..next.end);
            } else {
+                 // We're *not* merging the last element
+                 should_insert_last = true;
                new_ranges.push(cur);
            }
        }
-         new_ranges.push(ranges.last().unwrap().clone());
+         if should_insert_last {
+             new_ranges.push(ranges.last().unwrap().clone());
+         }
        if new_ranges.len() == ranges.len() {
            *ranges = new_ranges;
            break;
        } else {
            *ranges = new_ranges;
        }
    }
+
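+     // Sanity check: the merged ranges must be sorted and must not touch or
+     // overlap (touching ranges would have been merged above).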
+     let mut last_end = None;
+     for range in ranges {
+         if let Some(last) = last_end {
+             assert!(range.start > last, "{:?}", range);
+         }
+         last_end = Some(range.end);
+     }
}