Skip to content

Commit a2db56b

Browse files
Refactor unicode.py
- Align tables - Use helper function to parse properties
1 parent da626ef commit a2db56b

File tree

2 files changed

+96
-102
lines changed

2 files changed

+96
-102
lines changed

scripts/unicode.py

+81-91
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,14 @@
2424

2525
import enum
2626
import math
27+
import operator
2728
import os
2829
import re
2930
import sys
3031
import urllib.request
3132
from collections import defaultdict
3233
from itertools import batched
34+
from typing import Callable
3335

3436
UNICODE_VERSION = "15.1.0"
3537
"""The version of the Unicode data files to download."""
@@ -90,13 +92,32 @@ def fetch_open(filename: str, local_prefix: str = ""):
9092
sys.exit(1)
9193

9294

93-
def load_unicode_version() -> "tuple[int, int, int]":
95+
def load_unicode_version() -> tuple[int, int, int]:
9496
"""Returns the current Unicode version by fetching and processing `ReadMe.txt`."""
9597
with fetch_open("ReadMe.txt") as readme:
9698
pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
9799
return tuple(map(int, re.search(pattern, readme.read()).groups()))
98100

99101

102+
def load_property(filename: str, pattern: str, action: Callable[[int], None]):
    """Invoke `action(cp)` for every codepoint in `filename` whose property matches `pattern`.

    `filename` is fetched with `fetch_open`. Each line is matched against the two
    standard UCD formats:
    - a single codepoint, e.g. ``1F336;Emoji_Presentation # ...``
    - an inclusive range, e.g. ``0600..0605;Default_Ignorable_Code_Point # ...``
    All other lines (comments, blanks, other properties) are skipped.

    NOTE: `pattern` is interpolated directly into a regex. It must not contain
    capturing groups (they would shift the codepoint group numbers), so wrap
    alternations in a non-capturing group, e.g. ``r"(?:V|T)"``."""
    with fetch_open(filename) as properties:
        # Matches a property assignment for a single codepoint.
        single = re.compile(rf"^([0-9A-F]+)\s*;\s*{pattern}\s+")
        # Matches a property assignment for an inclusive codepoint range.
        multiple = re.compile(rf"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*{pattern}\s+")

        # Iterate the file lazily rather than materializing it with readlines().
        for line in properties:
            if match := single.match(line):
                low = high = int(match.group(1), 16)
            elif match := multiple.match(line):
                low = int(match.group(1), 16)
                high = int(match.group(2), 16)
            else:
                continue
            for cp in range(low, high + 1):
                action(cp)
119+
120+
100121
class EffectiveWidth(enum.IntEnum):
101122
"""Represents the width of a Unicode character. All East Asian Width classes resolve into
102123
either `EffectiveWidth.NARROW`, `EffectiveWidth.WIDE`, or `EffectiveWidth.AMBIGUOUS`.
@@ -112,15 +133,15 @@ class EffectiveWidth(enum.IntEnum):
112133
""" Two columns wide in a CJK context. One column wide in all other contexts. """
113134

114135

115-
def load_east_asian_widths() -> "list[EffectiveWidth]":
136+
def load_east_asian_widths() -> list[EffectiveWidth]:
116137
"""Return a list of effective widths, indexed by codepoint.
117138
Widths are determined by fetching and parsing `EastAsianWidth.txt`.
118139
119140
`Neutral`, `Narrow`, and `Halfwidth` characters are assigned `EffectiveWidth.NARROW`.
120141
121142
`Wide` and `Fullwidth` characters are assigned `EffectiveWidth.WIDE`.
122143
123-
`Ambiguous` chracters are assigned `EffectiveWidth.AMBIGUOUS`."""
144+
`Ambiguous` characters are assigned `EffectiveWidth.AMBIGUOUS`."""
124145
with fetch_open("EastAsianWidth.txt") as eaw:
125146
# matches a width assignment for a single codepoint, i.e. "1F336;N # ..."
126147
single = re.compile(r"^([0-9A-F]+)\s*;\s*(\w+) +# (\w+)")
@@ -161,7 +182,7 @@ def load_east_asian_widths() -> "list[EffectiveWidth]":
161182
return width_map
162183

163184

164-
def load_zero_widths() -> "list[bool]":
185+
def load_zero_widths() -> list[bool]:
165186
"""Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
166187
character. `c` is considered a zero-width character if
167188
@@ -180,26 +201,11 @@ def load_zero_widths() -> "list[bool]":
180201
# `Grapheme_Extend` includes characters with general category `Mn` or `Me`,
181202
# as well as a few `Mc` characters that need to be included so that
182203
# canonically equivalent sequences have the same width.
183-
with fetch_open("DerivedCoreProperties.txt") as properties:
184-
single = re.compile(
185-
r"^([0-9A-F]+)\s*;\s*(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
186-
)
187-
multiple = re.compile(
188-
r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
189-
)
190-
191-
for line in properties.readlines():
192-
raw_data = None # (low, high)
193-
if match := single.match(line):
194-
raw_data = (match.group(1), match.group(1))
195-
elif match := multiple.match(line):
196-
raw_data = (match.group(1), match.group(2))
197-
else:
198-
continue
199-
low = int(raw_data[0], 16)
200-
high = int(raw_data[1], 16)
201-
for cp in range(low, high + 1):
202-
zw_map[cp] = True
204+
load_property(
205+
"DerivedCoreProperties.txt",
206+
r"(?:Default_Ignorable_Code_Point|Grapheme_Extend)",
207+
lambda cp: operator.setitem(zw_map, cp, True),
208+
)
203209

204210
# Unicode spec bug: these should be `Grapheme_Cluster_Break=Extend`,
205211
# as they canonically decompose to two characters with this property,
@@ -217,29 +223,11 @@ def load_zero_widths() -> "list[bool]":
217223
# and the resulting grapheme has width 2.
218224
#
219225
# (See the Unicode Standard sections 3.12 and 18.6 for more on Hangul)
220-
with fetch_open("HangulSyllableType.txt") as categories:
221-
single = re.compile(r"^([0-9A-F]+)\s*;\s*(V|T)\s+")
222-
multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(V|T)\s+")
223-
224-
for line in categories.readlines():
225-
raw_data = None # (low, high)
226-
if match := single.match(line):
227-
raw_data = (match.group(1), match.group(1))
228-
elif match := multiple.match(line):
229-
raw_data = (match.group(1), match.group(2))
230-
else:
231-
continue
232-
low = int(raw_data[0], 16)
233-
high = int(raw_data[1], 16)
234-
for cp in range(low, high + 1):
235-
zw_map[cp] = True
236-
237-
# Special case: U+115F HANGUL CHOSEONG FILLER.
238-
# U+115F is a `Default_Ignorable_Code_Point`, and therefore would normally have
239-
# zero width. However, the expected usage is to combine it with vowel or trailing jamo
240-
# (which are considered 0-width on their own) to form a composed Hangul syllable with
241-
# width 2. Therefore, we treat it as having width 2.
242-
zw_map[0x115F] = False
226+
load_property(
227+
"HangulSyllableType.txt",
228+
r"(?:V|T)",
229+
lambda cp: operator.setitem(zw_map, cp, True),
230+
)
243231

244232
# Syriac abbreviation mark:
245233
# Zero-width `Prepended_Concatenation_Mark`
@@ -252,7 +240,14 @@ def load_zero_widths() -> "list[bool]":
252240
zw_map[0x0891] = True
253241
zw_map[0x08E2] = True
254242

255-
# U+A8FA DEVANAGARI CARET
243+
# HANGUL CHOSEONG FILLER
244+
# U+115F is a `Default_Ignorable_Code_Point`, and therefore would normally have
245+
# zero width. However, the expected usage is to combine it with vowel or trailing jamo
246+
# (which are considered 0-width on their own) to form a composed Hangul syllable with
247+
# width 2. Therefore, we treat it as having width 2.
248+
zw_map[0x115F] = False
249+
250+
# DEVANAGARI CARET
256251
# https://www.unicode.org/versions/Unicode15.0.0/ch12.pdf#G667447
257252
zw_map[0xA8FA] = True
258253

@@ -287,13 +282,13 @@ def try_extend(self, attempt: "Bucket") -> bool:
287282
self.widths = more
288283
return True
289284

290-
def entries(self) -> "list[tuple[Codepoint, EffectiveWidth]]":
285+
def entries(self) -> list[tuple[Codepoint, EffectiveWidth]]:
291286
"""Return a list of the codepoint/width pairs in this bucket, sorted by codepoint."""
292287
result = list(self.entry_set)
293288
result.sort()
294289
return result
295290

296-
def width(self) -> "EffectiveWidth | None":
291+
def width(self) -> EffectiveWidth | None:
297292
"""If all codepoints in this bucket have the same width, return that width; otherwise,
298293
return `None`."""
299294
if len(self.widths) == 0:
@@ -305,7 +300,7 @@ def width(self) -> "EffectiveWidth | None":
305300
return potential_width
306301

307302

308-
def make_buckets(entries, low_bit: BitPos, cap_bit: BitPos) -> "list[Bucket]":
303+
def make_buckets(entries, low_bit: BitPos, cap_bit: BitPos) -> list[Bucket]:
309304
"""Partitions the `(Codepoint, EffectiveWidth)` tuples in `entries` into `Bucket`s. All
310305
codepoints with identical bits from `low_bit` to `cap_bit` (exclusive) are placed in the
311306
same bucket. Returns a list of the buckets in increasing order of those bits."""
@@ -373,7 +368,7 @@ def buckets(self):
373368
"""Returns an iterator over this table's buckets."""
374369
return self.indexed
375370

376-
def to_bytes(self) -> "list[int]":
371+
def to_bytes(self) -> list[int]:
377372
"""Returns this table's entries as a list of bytes. The bytes are formatted according to
378373
the `OffsetType` which the table was created with, converting any `EffectiveWidth` entries
379374
to their enum variant's integer value. For example, with `OffsetType.U2`, each byte will
@@ -389,8 +384,8 @@ def to_bytes(self) -> "list[int]":
389384

390385

391386
def make_tables(
392-
table_cfgs: "list[tuple[BitPos, BitPos, OffsetType]]", entries
393-
) -> "list[Table]":
387+
table_cfgs: list[tuple[BitPos, BitPos, OffsetType]], entries
388+
) -> list[Table]:
394389
"""Creates a table for each configuration in `table_cfgs`, with the first config corresponding
395390
to the top-level lookup table, the second config corresponding to the second-level lookup
396391
table, and so forth. `entries` is an iterator over the `(Codepoint, EffectiveWidth)` pairs
@@ -404,7 +399,7 @@ def make_tables(
404399
return tables
405400

406401

407-
def load_emoji_presentation_sequences() -> "list[int]":
402+
def load_emoji_presentation_sequences() -> list[int]:
408403
"""Outputs a list of character ranages, corresponding to all the valid characters for starting
409404
an emoji presentation sequence."""
410405

@@ -420,7 +415,7 @@ def load_emoji_presentation_sequences() -> "list[int]":
420415
return codepoints
421416

422417

423-
def load_text_presentation_sequences() -> "list[int]":
418+
def load_text_presentation_sequences() -> list[int]:
424419
"""Outputs a list of character ranages, corresponding to all the valid characters
425420
whose widths change with a text presentation sequence."""
426421

@@ -435,24 +430,12 @@ def load_text_presentation_sequences() -> "list[int]":
435430
text_presentation_seq_codepoints.add(cp)
436431

437432
default_emoji_codepoints = set()
438-
with fetch_open("emoji/emoji-data.txt") as emoji_data:
439-
single = re.compile(r"^([0-9A-F]+)\s*;\s*Emoji_Presentation\s+")
440-
multiple = re.compile(
441-
r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*Emoji_Presentation\s+"
442-
)
443433

444-
for line in emoji_data.readlines():
445-
raw_data = None # (low, high)
446-
if match := single.match(line):
447-
raw_data = (match.group(1), match.group(1))
448-
elif match := multiple.match(line):
449-
raw_data = (match.group(1), match.group(2))
450-
else:
451-
continue
452-
low = int(raw_data[0], 16)
453-
high = int(raw_data[1], 16)
454-
for cp in range(low, high + 1):
455-
default_emoji_codepoints.add(cp)
434+
load_property(
435+
"emoji/emoji-data.txt",
436+
"Emoji_Presentation",
437+
lambda cp: default_emoji_codepoints.add(cp),
438+
)
456439

457440
codepoints = []
458441
for cp in text_presentation_seq_codepoints.intersection(default_emoji_codepoints):
@@ -466,11 +449,11 @@ def load_text_presentation_sequences() -> "list[int]":
466449

467450

468451
def make_presentation_sequence_table(
469-
seqs: "list[int]",
470-
width_map: "list[EffectiveWidth]",
471-
spurious_false: "set[EffectiveWidth]",
472-
spurious_true: "set[EffectiveWidth]",
473-
) -> "tuple[list[tuple[int, int]], list[list[int]]]":
452+
seqs: list[Codepoint],
453+
width_map: list[EffectiveWidth],
454+
spurious_false: set[EffectiveWidth],
455+
spurious_true: set[EffectiveWidth],
456+
) -> tuple[list[tuple[int, int]], list[list[int]]]:
474457
"""Generates 2-level lookup table for whether a codepoint might start an emoji variation sequence.
475458
The first level is a match on all but the 10 LSB, the second level is a 1024-bit bitmap for those 10 LSB.
476459
"""
@@ -488,13 +471,13 @@ def make_presentation_sequence_table(
488471
):
489472
del prefixes_dict[k]
490473

491-
msbs: "list[int]" = list(prefixes_dict.keys())
474+
msbs: list[int] = list(prefixes_dict.keys())
492475

493476
for cp, width in enumerate(width_map):
494477
if width in spurious_true and (cp >> 10) in msbs:
495478
prefixes_dict[cp >> 10].add(cp & 0x3FF)
496479

497-
leaves: "list[list[int]]" = []
480+
leaves: list[list[int]] = []
498481
for cps in prefixes_dict.values():
499482
leaf = [0] * 128
500483
for cp in cps:
@@ -524,10 +507,10 @@ def make_presentation_sequence_table(
524507

525508
def emit_module(
526509
out_name: str,
527-
unicode_version: "tuple[int, int, int]",
528-
tables: "list[Table]",
529-
emoji_presentation_table: "tuple[list[tuple[int, int]], list[list[int]]]",
530-
text_presentation_table: "tuple[list[tuple[int, int]], list[list[int]]]",
510+
unicode_version: tuple[int, int, int],
511+
tables: list[Table],
512+
emoji_presentation_table: tuple[list[tuple[int, int]], list[list[int]]],
513+
text_presentation_table: tuple[list[tuple[int, int]], list[list[int]]],
531514
):
532515
"""Outputs a Rust module to `out_name` using table data from `tables`.
533516
If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`.
@@ -574,18 +557,18 @@ def emit_module(
574557
pub fn lookup_width(c: char, is_cjk: bool) -> usize {
575558
let cp = c as usize;
576559
577-
let t1_offset = TABLES_0[cp >> 13 & 0xFF];
560+
let t1_offset = TABLES_0.0[cp >> 13 & 0xFF];
578561
579562
// Each sub-table in TABLES_1 is 7 bits, and each stored entry is a byte,
580563
// so each sub-table is 128 bytes in size.
581564
// (Sub-tables are selected using the computed offset from the previous table.)
582-
let t2_offset = TABLES_1[128 * usize::from(t1_offset) + (cp >> 6 & 0x7F)];
565+
let t2_offset = TABLES_1.0[128 * usize::from(t1_offset) + (cp >> 6 & 0x7F)];
583566
584567
// Each sub-table in TABLES_2 is 6 bits, but each stored entry is 2 bits.
585568
// This is accomplished by packing four stored entries into one byte.
586569
// So each sub-table is 2**(6-2) == 16 bytes in size.
587570
// Since this is the last table, each entry represents an encoded width.
588-
let packed_widths = TABLES_2[16 * usize::from(t2_offset) + (cp >> 2 & 0xF)];
571+
let packed_widths = TABLES_2.0[16 * usize::from(t2_offset) + (cp >> 2 & 0xF)];
589572
590573
// Extract the packed width
591574
let width = packed_widths >> (2 * (cp & 0b11)) & 0b11;
@@ -669,6 +652,12 @@ def emit_module(
669652
// Use the 3 LSB of `cp` to index into `leaf_byte`.
670653
((leaf_byte >> (cp & 7)) & 1) == 1
671654
}
655+
656+
#[repr(align(128))]
657+
struct Align128<T>(T);
658+
659+
#[repr(align(16))]
660+
struct Align16<T>(T);
672661
"""
673662
)
674663

@@ -677,26 +666,27 @@ def emit_module(
677666
new_subtable_count = len(table.buckets())
678667
if i == len(tables) - 1:
679668
table.indices_to_widths() # for the last table, indices == widths
669+
align = 16
670+
else:
671+
align = 128
680672
byte_array = table.to_bytes()
681673
module.write(
682674
f"""
683675
/// Autogenerated. {subtable_count} sub-table(s). Consult [`lookup_width`] for layout info.
684-
static TABLES_{i}: [u8; {len(byte_array)}] = ["""
676+
static TABLES_{i}: Align{align}<[u8; {len(byte_array)}]> = Align{align}(["""
685677
)
686678
for j, byte in enumerate(byte_array):
687679
# Add line breaks for every 15th entry (chosen to match what rustfmt does)
688680
if j % 15 == 0:
689681
module.write("\n ")
690682
module.write(f" 0x{byte:02X},")
691-
module.write("\n ];\n")
683+
module.write("\n ]);\n")
692684
subtable_count = new_subtable_count
693685

694686
# emoji table
695687

696688
module.write(
697689
f"""
698-
#[repr(align(128))]
699-
struct Align128<T>(T);
700690
/// Array of 1024-bit bitmaps. Index into the correct bitmap with the 10 LSB of your codepoint
701691
/// to get whether it can start an emoji presentation sequence.
702692
static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128]; {len(emoji_presentation_leaves)}]> = Align128([

0 commit comments

Comments
 (0)