24
24
25
25
import enum
26
26
import math
27
+ import operator
27
28
import os
28
29
import re
29
30
import sys
30
31
import urllib .request
31
32
from collections import defaultdict
32
33
from itertools import batched
34
+ from typing import Callable
33
35
34
36
UNICODE_VERSION = "15.1.0"
35
37
"""The version of the Unicode data files to download."""
@@ -90,13 +92,32 @@ def fetch_open(filename: str, local_prefix: str = ""):
90
92
sys .exit (1 )
91
93
92
94
93
def load_unicode_version() -> tuple[int, int, int]:
    """Return the current Unicode version by fetching and parsing `ReadMe.txt`.

    The version is extracted from the phrase "for Version X.Y.Z of the Unicode"
    in the fetched file and returned as the integer triple (X, Y, Z).
    """
    version_pattern = re.compile(r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode")
    with fetch_open("ReadMe.txt") as readme:
        major, minor, micro = version_pattern.search(readme.read()).groups()
    return (int(major), int(minor), int(micro))
98
100
99
101
102
def load_property(filename: str, pattern: str, action: Callable[[int], None]):
    """Invoke `action` for every codepoint with a matching property in `filename`.

    Fetches the UCD data file `filename` (via `fetch_open`) and scans it for
    lines assigning a property matching the regex fragment `pattern` to either
    a single codepoint (`XXXX ; prop`) or an inclusive range
    (`XXXX..YYYY ; prop`). `action` is called once per codepoint in the
    matched range. Lines that do not match either form are ignored.
    """
    with fetch_open(filename) as properties:
        # A single-codepoint assignment, e.g. "1F336;Emoji_Presentation # ..."
        single = re.compile(rf"^([0-9A-F]+)\s*;\s*{pattern}\s+")
        # A codepoint-range assignment, e.g. "0600..0605;Prop # ..."
        # (`single` cannot match these: the ".." blocks its `\s*;`.)
        multiple = re.compile(rf"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*{pattern}\s+")

        # Iterate the file object directly instead of materializing all
        # lines with readlines().
        for line in properties:
            if match := single.match(line):
                low = high = int(match.group(1), 16)
            elif match := multiple.match(line):
                low = int(match.group(1), 16)
                high = int(match.group(2), 16)
            else:
                continue
            for cp in range(low, high + 1):
                action(cp)
119
+
120
+
100
121
class EffectiveWidth (enum .IntEnum ):
101
122
"""Represents the width of a Unicode character. All East Asian Width classes resolve into
102
123
either `EffectiveWidth.NARROW`, `EffectiveWidth.WIDE`, or `EffectiveWidth.AMBIGUOUS`.
@@ -112,15 +133,15 @@ class EffectiveWidth(enum.IntEnum):
112
133
""" Two columns wide in a CJK context. One column wide in all other contexts. """
113
134
114
135
115
- def load_east_asian_widths () -> " list[EffectiveWidth]" :
136
+ def load_east_asian_widths () -> list [EffectiveWidth ]:
116
137
"""Return a list of effective widths, indexed by codepoint.
117
138
Widths are determined by fetching and parsing `EastAsianWidth.txt`.
118
139
119
140
`Neutral`, `Narrow`, and `Halfwidth` characters are assigned `EffectiveWidth.NARROW`.
120
141
121
142
`Wide` and `Fullwidth` characters are assigned `EffectiveWidth.WIDE`.
122
143
123
- `Ambiguous` chracters are assigned `EffectiveWidth.AMBIGUOUS`."""
144
+ `Ambiguous` characters are assigned `EffectiveWidth.AMBIGUOUS`."""
124
145
with fetch_open ("EastAsianWidth.txt" ) as eaw :
125
146
# matches a width assignment for a single codepoint, i.e. "1F336;N # ..."
126
147
single = re .compile (r"^([0-9A-F]+)\s*;\s*(\w+) +# (\w+)" )
@@ -161,7 +182,7 @@ def load_east_asian_widths() -> "list[EffectiveWidth]":
161
182
return width_map
162
183
163
184
164
- def load_zero_widths () -> " list[bool]" :
185
+ def load_zero_widths () -> list [bool ]:
165
186
"""Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
166
187
character. `c` is considered a zero-width character if
167
188
@@ -180,26 +201,11 @@ def load_zero_widths() -> "list[bool]":
180
201
# `Grapheme_Extend` includes characters with general category `Mn` or `Me`,
181
202
# as well as a few `Mc` characters that need to be included so that
182
203
# canonically equivalent sequences have the same width.
183
- with fetch_open ("DerivedCoreProperties.txt" ) as properties :
184
- single = re .compile (
185
- r"^([0-9A-F]+)\s*;\s*(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
186
- )
187
- multiple = re .compile (
188
- r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
189
- )
190
-
191
- for line in properties .readlines ():
192
- raw_data = None # (low, high)
193
- if match := single .match (line ):
194
- raw_data = (match .group (1 ), match .group (1 ))
195
- elif match := multiple .match (line ):
196
- raw_data = (match .group (1 ), match .group (2 ))
197
- else :
198
- continue
199
- low = int (raw_data [0 ], 16 )
200
- high = int (raw_data [1 ], 16 )
201
- for cp in range (low , high + 1 ):
202
- zw_map [cp ] = True
204
+ load_property (
205
+ "DerivedCoreProperties.txt" ,
206
+ r"(?:Default_Ignorable_Code_Point|Grapheme_Extend)" ,
207
+ lambda cp : operator .setitem (zw_map , cp , True ),
208
+ )
203
209
204
210
# Unicode spec bug: these should be `Grapheme_Cluster_Break=Extend`,
205
211
# as they canonically decompose to two characters with this property,
@@ -217,29 +223,11 @@ def load_zero_widths() -> "list[bool]":
217
223
# and the resulting grapheme has width 2.
218
224
#
219
225
# (See the Unicode Standard sections 3.12 and 18.6 for more on Hangul)
220
- with fetch_open ("HangulSyllableType.txt" ) as categories :
221
- single = re .compile (r"^([0-9A-F]+)\s*;\s*(V|T)\s+" )
222
- multiple = re .compile (r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(V|T)\s+" )
223
-
224
- for line in categories .readlines ():
225
- raw_data = None # (low, high)
226
- if match := single .match (line ):
227
- raw_data = (match .group (1 ), match .group (1 ))
228
- elif match := multiple .match (line ):
229
- raw_data = (match .group (1 ), match .group (2 ))
230
- else :
231
- continue
232
- low = int (raw_data [0 ], 16 )
233
- high = int (raw_data [1 ], 16 )
234
- for cp in range (low , high + 1 ):
235
- zw_map [cp ] = True
236
-
237
- # Special case: U+115F HANGUL CHOSEONG FILLER.
238
- # U+115F is a `Default_Ignorable_Code_Point`, and therefore would normally have
239
- # zero width. However, the expected usage is to combine it with vowel or trailing jamo
240
- # (which are considered 0-width on their own) to form a composed Hangul syllable with
241
- # width 2. Therefore, we treat it as having width 2.
242
- zw_map [0x115F ] = False
226
+ load_property (
227
+ "HangulSyllableType.txt" ,
228
+ r"(?:V|T)" ,
229
+ lambda cp : operator .setitem (zw_map , cp , True ),
230
+ )
243
231
244
232
# Syriac abbreviation mark:
245
233
# Zero-width `Prepended_Concatenation_Mark`
@@ -252,7 +240,14 @@ def load_zero_widths() -> "list[bool]":
252
240
zw_map [0x0891 ] = True
253
241
zw_map [0x08E2 ] = True
254
242
255
- # U+A8FA DEVANAGARI CARET
243
+ # HANGUL CHOSEONG FILLER
244
+ # U+115F is a `Default_Ignorable_Code_Point`, and therefore would normally have
245
+ # zero width. However, the expected usage is to combine it with vowel or trailing jamo
246
+ # (which are considered 0-width on their own) to form a composed Hangul syllable with
247
+ # width 2. Therefore, we treat it as having width 2.
248
+ zw_map [0x115F ] = False
249
+
250
+ # DEVANAGARI CARET
256
251
# https://www.unicode.org/versions/Unicode15.0.0/ch12.pdf#G667447
257
252
zw_map [0xA8FA ] = True
258
253
@@ -287,13 +282,13 @@ def try_extend(self, attempt: "Bucket") -> bool:
287
282
self .widths = more
288
283
return True
289
284
290
- def entries (self ) -> " list[tuple[Codepoint, EffectiveWidth]]" :
285
+ def entries (self ) -> list [tuple [Codepoint , EffectiveWidth ]]:
291
286
"""Return a list of the codepoint/width pairs in this bucket, sorted by codepoint."""
292
287
result = list (self .entry_set )
293
288
result .sort ()
294
289
return result
295
290
296
- def width (self ) -> " EffectiveWidth | None" :
291
+ def width (self ) -> EffectiveWidth | None :
297
292
"""If all codepoints in this bucket have the same width, return that width; otherwise,
298
293
return `None`."""
299
294
if len (self .widths ) == 0 :
@@ -305,7 +300,7 @@ def width(self) -> "EffectiveWidth | None":
305
300
return potential_width
306
301
307
302
308
- def make_buckets (entries , low_bit : BitPos , cap_bit : BitPos ) -> " list[Bucket]" :
303
+ def make_buckets (entries , low_bit : BitPos , cap_bit : BitPos ) -> list [Bucket ]:
309
304
"""Partitions the `(Codepoint, EffectiveWidth)` tuples in `entries` into `Bucket`s. All
310
305
codepoints with identical bits from `low_bit` to `cap_bit` (exclusive) are placed in the
311
306
same bucket. Returns a list of the buckets in increasing order of those bits."""
@@ -373,7 +368,7 @@ def buckets(self):
373
368
"""Returns an iterator over this table's buckets."""
374
369
return self .indexed
375
370
376
- def to_bytes (self ) -> " list[int]" :
371
+ def to_bytes (self ) -> list [int ]:
377
372
"""Returns this table's entries as a list of bytes. The bytes are formatted according to
378
373
the `OffsetType` which the table was created with, converting any `EffectiveWidth` entries
379
374
to their enum variant's integer value. For example, with `OffsetType.U2`, each byte will
@@ -389,8 +384,8 @@ def to_bytes(self) -> "list[int]":
389
384
390
385
391
386
def make_tables (
392
- table_cfgs : " list[tuple[BitPos, BitPos, OffsetType]]" , entries
393
- ) -> " list[Table]" :
387
+ table_cfgs : list [tuple [BitPos , BitPos , OffsetType ]], entries
388
+ ) -> list [Table ]:
394
389
"""Creates a table for each configuration in `table_cfgs`, with the first config corresponding
395
390
to the top-level lookup table, the second config corresponding to the second-level lookup
396
391
table, and so forth. `entries` is an iterator over the `(Codepoint, EffectiveWidth)` pairs
@@ -404,7 +399,7 @@ def make_tables(
404
399
return tables
405
400
406
401
407
- def load_emoji_presentation_sequences () -> " list[int]" :
402
+ def load_emoji_presentation_sequences () -> list [int ]:
408
403
"""Outputs a list of character ranages, corresponding to all the valid characters for starting
409
404
an emoji presentation sequence."""
410
405
@@ -420,7 +415,7 @@ def load_emoji_presentation_sequences() -> "list[int]":
420
415
return codepoints
421
416
422
417
423
- def load_text_presentation_sequences () -> " list[int]" :
418
+ def load_text_presentation_sequences () -> list [int ]:
424
419
"""Outputs a list of character ranages, corresponding to all the valid characters
425
420
whose widths change with a text presentation sequence."""
426
421
@@ -435,24 +430,12 @@ def load_text_presentation_sequences() -> "list[int]":
435
430
text_presentation_seq_codepoints .add (cp )
436
431
437
432
default_emoji_codepoints = set ()
438
- with fetch_open ("emoji/emoji-data.txt" ) as emoji_data :
439
- single = re .compile (r"^([0-9A-F]+)\s*;\s*Emoji_Presentation\s+" )
440
- multiple = re .compile (
441
- r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*Emoji_Presentation\s+"
442
- )
443
433
444
- for line in emoji_data .readlines ():
445
- raw_data = None # (low, high)
446
- if match := single .match (line ):
447
- raw_data = (match .group (1 ), match .group (1 ))
448
- elif match := multiple .match (line ):
449
- raw_data = (match .group (1 ), match .group (2 ))
450
- else :
451
- continue
452
- low = int (raw_data [0 ], 16 )
453
- high = int (raw_data [1 ], 16 )
454
- for cp in range (low , high + 1 ):
455
- default_emoji_codepoints .add (cp )
434
+ load_property (
435
+ "emoji/emoji-data.txt" ,
436
+ "Emoji_Presentation" ,
437
+ lambda cp : default_emoji_codepoints .add (cp ),
438
+ )
456
439
457
440
codepoints = []
458
441
for cp in text_presentation_seq_codepoints .intersection (default_emoji_codepoints ):
@@ -466,11 +449,11 @@ def load_text_presentation_sequences() -> "list[int]":
466
449
467
450
468
451
def make_presentation_sequence_table (
469
- seqs : " list[int]" ,
470
- width_map : " list[EffectiveWidth]" ,
471
- spurious_false : " set[EffectiveWidth]" ,
472
- spurious_true : " set[EffectiveWidth]" ,
473
- ) -> " tuple[list[tuple[int, int]], list[list[int]]]" :
452
+ seqs : list [Codepoint ] ,
453
+ width_map : list [EffectiveWidth ],
454
+ spurious_false : set [EffectiveWidth ],
455
+ spurious_true : set [EffectiveWidth ],
456
+ ) -> tuple [list [tuple [int , int ]], list [list [int ]]]:
474
457
"""Generates 2-level lookup table for whether a codepoint might start an emoji variation sequence.
475
458
The first level is a match on all but the 10 LSB, the second level is a 1024-bit bitmap for those 10 LSB.
476
459
"""
@@ -488,13 +471,13 @@ def make_presentation_sequence_table(
488
471
):
489
472
del prefixes_dict [k ]
490
473
491
- msbs : " list[int]" = list (prefixes_dict .keys ())
474
+ msbs : list [int ] = list (prefixes_dict .keys ())
492
475
493
476
for cp , width in enumerate (width_map ):
494
477
if width in spurious_true and (cp >> 10 ) in msbs :
495
478
prefixes_dict [cp >> 10 ].add (cp & 0x3FF )
496
479
497
- leaves : " list[list[int]]" = []
480
+ leaves : list [list [int ]] = []
498
481
for cps in prefixes_dict .values ():
499
482
leaf = [0 ] * 128
500
483
for cp in cps :
@@ -524,10 +507,10 @@ def make_presentation_sequence_table(
524
507
525
508
def emit_module (
526
509
out_name : str ,
527
- unicode_version : " tuple[int, int, int]" ,
528
- tables : " list[Table]" ,
529
- emoji_presentation_table : " tuple[list[tuple[int, int]], list[list[int]]]" ,
530
- text_presentation_table : " tuple[list[tuple[int, int]], list[list[int]]]" ,
510
+ unicode_version : tuple [int , int , int ],
511
+ tables : list [Table ],
512
+ emoji_presentation_table : tuple [list [tuple [int , int ]], list [list [int ]]],
513
+ text_presentation_table : tuple [list [tuple [int , int ]], list [list [int ]]],
531
514
):
532
515
"""Outputs a Rust module to `out_name` using table data from `tables`.
533
516
If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`.
@@ -574,18 +557,18 @@ def emit_module(
574
557
pub fn lookup_width(c: char, is_cjk: bool) -> usize {
575
558
let cp = c as usize;
576
559
577
- let t1_offset = TABLES_0[cp >> 13 & 0xFF];
560
+ let t1_offset = TABLES_0.0 [cp >> 13 & 0xFF];
578
561
579
562
// Each sub-table in TABLES_1 is 7 bits, and each stored entry is a byte,
580
563
// so each sub-table is 128 bytes in size.
581
564
// (Sub-tables are selected using the computed offset from the previous table.)
582
- let t2_offset = TABLES_1[128 * usize::from(t1_offset) + (cp >> 6 & 0x7F)];
565
+ let t2_offset = TABLES_1.0 [128 * usize::from(t1_offset) + (cp >> 6 & 0x7F)];
583
566
584
567
// Each sub-table in TABLES_2 is 6 bits, but each stored entry is 2 bits.
585
568
// This is accomplished by packing four stored entries into one byte.
586
569
// So each sub-table is 2**(6-2) == 16 bytes in size.
587
570
// Since this is the last table, each entry represents an encoded width.
588
- let packed_widths = TABLES_2[16 * usize::from(t2_offset) + (cp >> 2 & 0xF)];
571
+ let packed_widths = TABLES_2.0 [16 * usize::from(t2_offset) + (cp >> 2 & 0xF)];
589
572
590
573
// Extract the packed width
591
574
let width = packed_widths >> (2 * (cp & 0b11)) & 0b11;
@@ -669,6 +652,12 @@ def emit_module(
669
652
// Use the 3 LSB of `cp` to index into `leaf_byte`.
670
653
((leaf_byte >> (cp & 7)) & 1) == 1
671
654
}
655
+
656
+ #[repr(align(128))]
657
+ struct Align128<T>(T);
658
+
659
+ #[repr(align(16))]
660
+ struct Align16<T>(T);
672
661
"""
673
662
)
674
663
@@ -677,26 +666,27 @@ def emit_module(
677
666
new_subtable_count = len (table .buckets ())
678
667
if i == len (tables ) - 1 :
679
668
table .indices_to_widths () # for the last table, indices == widths
669
+ align = 16
670
+ else :
671
+ align = 128
680
672
byte_array = table .to_bytes ()
681
673
module .write (
682
674
f"""
683
675
/// Autogenerated. { subtable_count } sub-table(s). Consult [`lookup_width`] for layout info.
684
- static TABLES_{ i } : [u8; { len (byte_array )} ] = ["""
676
+ static TABLES_{ i } : Align { align } < [u8; { len (byte_array )} ]> = Align { align } ( ["""
685
677
)
686
678
for j , byte in enumerate (byte_array ):
687
679
# Add line breaks for every 15th entry (chosen to match what rustfmt does)
688
680
if j % 15 == 0 :
689
681
module .write ("\n " )
690
682
module .write (f" 0x{ byte :02X} ," )
691
- module .write ("\n ];\n " )
683
+ module .write ("\n ]) ;\n " )
692
684
subtable_count = new_subtable_count
693
685
694
686
# emoji table
695
687
696
688
module .write (
697
689
f"""
698
- #[repr(align(128))]
699
- struct Align128<T>(T);
700
690
/// Array of 1024-bit bitmaps. Index into the correct bitmap with the 10 LSB of your codepoint
701
691
/// to get whether it can start an emoji presentation sequence.
702
692
static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128]; { len (emoji_presentation_leaves )} ]> = Align128([
0 commit comments