Skip to content

Commit 064f888

Browse files
Add unicode table generator
1 parent 8a87b94 commit 064f888

File tree

8 files changed

+564
-8
lines changed

8 files changed

+564
-8
lines changed

.gitignore

+1-8
Original file line numberDiff line numberDiff line change
@@ -34,14 +34,7 @@ __pycache__/
3434
# Created by default with `src/ci/docker/run.sh`:
3535
/obj/
3636
/rustllvm/
37-
/src/libcore/unicode/DerivedCoreProperties.txt
38-
/src/libcore/unicode/DerivedNormalizationProps.txt
39-
/src/libcore/unicode/PropList.txt
40-
/src/libcore/unicode/ReadMe.txt
41-
/src/libcore/unicode/Scripts.txt
42-
/src/libcore/unicode/SpecialCasing.txt
43-
/src/libcore/unicode/UnicodeData.txt
44-
/src/libcore/unicode/downloaded
37+
/unicode-downloads
4538
/target/
4639
# Generated by compiletest for incremental:
4740
/tmp/

Cargo.lock

+17
Original file line numberDiff line numberDiff line change
@@ -4930,6 +4930,16 @@ version = "1.10.0"
49304930
source = "registry+https://github.com/rust-lang/crates.io-index"
49314931
checksum = "612d636f949607bdf9b123b4a6f6d966dedf3ff669f7f045890d3a4a73948169"
49324932

4933+
[[package]]
4934+
name = "ucd-parse"
4935+
version = "0.1.4"
4936+
source = "registry+https://github.com/rust-lang/crates.io-index"
4937+
checksum = "ca6b52bf4da6512f0f07785a04769222e50d29639e7ecd016b7806fd2de306b4"
4938+
dependencies = [
4939+
"lazy_static 1.3.0",
4940+
"regex",
4941+
]
4942+
49334943
[[package]]
49344944
name = "ucd-trie"
49354945
version = "0.1.1"
@@ -4951,6 +4961,13 @@ dependencies = [
49514961
"version_check 0.1.5",
49524962
]
49534963

4964+
[[package]]
4965+
name = "unicode-bdd"
4966+
version = "0.1.0"
4967+
dependencies = [
4968+
"ucd-parse",
4969+
]
4970+
49544971
[[package]]
49554972
name = "unicode-bidi"
49564973
version = "0.3.4"

Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ members = [
2323
"src/tools/rustfmt",
2424
"src/tools/miri",
2525
"src/tools/rustdoc-themes",
26+
"src/tools/unicode-table-generator",
2627
]
2728
exclude = [
2829
"build",
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
[package]
2+
name = "unicode-bdd"
3+
version = "0.1.0"
4+
authors = ["Mark Rousskov <[email protected]>"]
5+
edition = "2018"
6+
7+
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
8+
9+
[dependencies]
10+
ucd-parse = "0.1.3"
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
use crate::{fmt_list, UnicodeData};
2+
use std::fmt;
3+
4+
pub(crate) fn generate_case_mapping(data: &UnicodeData) -> String {
5+
let mut file = String::new();
6+
7+
file.push_str(HEADER.trim_start());
8+
9+
let decl_type = "&[(char, [char; 3])]";
10+
11+
file.push_str(&format!(
12+
"static LOWERCASE_TABLE: {} = &[{}];",
13+
decl_type,
14+
fmt_list(data.to_lower.iter().map(to_mapping))
15+
));
16+
file.push_str("\n\n");
17+
file.push_str(&format!(
18+
"static UPPERCASE_TABLE: {} = &[{}];",
19+
decl_type,
20+
fmt_list(data.to_upper.iter().map(to_mapping))
21+
));
22+
file
23+
}
24+
25+
fn to_mapping((key, (a, b, c)): (&u32, &(u32, u32, u32))) -> (CharEscape, [CharEscape; 3]) {
26+
(
27+
CharEscape(std::char::from_u32(*key).unwrap()),
28+
[
29+
CharEscape(std::char::from_u32(*a).unwrap()),
30+
CharEscape(std::char::from_u32(*b).unwrap()),
31+
CharEscape(std::char::from_u32(*c).unwrap()),
32+
],
33+
)
34+
}
35+
36+
/// Wrapper around a `char` whose `Debug` output is a single-quoted literal
/// built from `char::escape_default` (for example `'a'` or `'\u{0}'`).
struct CharEscape(char);

impl fmt::Debug for CharEscape {
    /// Writes the wrapped character, escaped, between single quotes.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let escaped = self.0.escape_default();
        f.write_fmt(format_args!("'{}'", escaped))
    }
}
43+
44+
static HEADER: &str = "
45+
pub fn to_lower(c: char) -> [char; 3] {
46+
match bsearch_case_table(c, LOWERCASE_TABLE) {
47+
None => [c, '\\0', '\\0'],
48+
Some(index) => LOWERCASE_TABLE[index].1,
49+
}
50+
}
51+
52+
pub fn to_upper(c: char) -> [char; 3] {
53+
match bsearch_case_table(c, UPPERCASE_TABLE) {
54+
None => [c, '\\0', '\\0'],
55+
Some(index) => UPPERCASE_TABLE[index].1,
56+
}
57+
}
58+
59+
fn bsearch_case_table(c: char, table: &[(char, [char; 3])]) -> Option<usize> {
60+
table.binary_search_by(|&(key, _)| key.cmp(&c)).ok()
61+
}
62+
";
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,261 @@
1+
use std::collections::{BTreeMap, HashMap};
2+
use std::ops::Range;
3+
use ucd_parse::Codepoints;
4+
5+
mod case_mapping;
6+
mod raw_emitter;
7+
mod unicode_download;
8+
9+
use raw_emitter::{emit_codepoints, RawEmitter};
10+
11+
static PROPERTIES: &[&str] = &[
12+
"Alphabetic",
13+
"Lowercase",
14+
"Uppercase",
15+
"Cased",
16+
"Case_Ignorable",
17+
"Grapheme_Extend",
18+
"White_Space",
19+
"Cc",
20+
"N",
21+
];
22+
23+
/// Everything `load_data` extracts from the downloaded Unicode data files.
struct UnicodeData {
    /// `(property name, half-open code point ranges)` pairs, one per tracked
    /// property.
    ranges: Vec<(&'static str, Vec<Range<u32>>)>,
    /// Lowercase mappings: source code point to up to three code points, with
    /// unused trailing slots set to 0.
    to_lower: BTreeMap<u32, (u32, u32, u32)>,
    /// Uppercase mappings, in the same layout as `to_lower`.
    to_upper: BTreeMap<u32, (u32, u32, u32)>,
}
28+
29+
fn to_mapping(origin: u32, codepoints: Vec<ucd_parse::Codepoint>) -> Option<(u32, u32, u32)> {
30+
let mut a = None;
31+
let mut b = None;
32+
let mut c = None;
33+
34+
for codepoint in codepoints {
35+
if origin == codepoint.value() {
36+
return None;
37+
}
38+
39+
if a.is_none() {
40+
a = Some(codepoint.value());
41+
} else if b.is_none() {
42+
b = Some(codepoint.value());
43+
} else if c.is_none() {
44+
c = Some(codepoint.value());
45+
} else {
46+
panic!("more than 3 mapped codepoints")
47+
}
48+
}
49+
50+
Some((a.unwrap(), b.unwrap_or(0), c.unwrap_or(0)))
51+
}
52+
53+
static UNICODE_DIRECTORY: &str = "unicode-downloads";
54+
55+
fn load_data() -> UnicodeData {
56+
unicode_download::fetch_latest();
57+
58+
let mut properties = HashMap::new();
59+
for row in ucd_parse::parse::<_, ucd_parse::CoreProperty>(&UNICODE_DIRECTORY).unwrap() {
60+
if let Some(name) = PROPERTIES.iter().find(|prop| **prop == row.property.as_str()) {
61+
properties.entry(*name).or_insert_with(Vec::new).push(row.codepoints);
62+
}
63+
}
64+
for row in ucd_parse::parse::<_, ucd_parse::Property>(&UNICODE_DIRECTORY).unwrap() {
65+
if let Some(name) = PROPERTIES.iter().find(|prop| **prop == row.property.as_str()) {
66+
properties.entry(*name).or_insert_with(Vec::new).push(row.codepoints);
67+
}
68+
}
69+
70+
let mut to_lower = BTreeMap::new();
71+
let mut to_upper = BTreeMap::new();
72+
for row in ucd_parse::UnicodeDataExpander::new(
73+
ucd_parse::parse::<_, ucd_parse::UnicodeData>(&UNICODE_DIRECTORY).unwrap(),
74+
) {
75+
let general_category = if ["Nd", "Nl", "No"].contains(&row.general_category.as_str()) {
76+
"N"
77+
} else {
78+
row.general_category.as_str()
79+
};
80+
if let Some(name) = PROPERTIES.iter().find(|prop| **prop == general_category) {
81+
properties
82+
.entry(*name)
83+
.or_insert_with(Vec::new)
84+
.push(Codepoints::Single(row.codepoint));
85+
}
86+
87+
if let Some(mapped) = row.simple_lowercase_mapping {
88+
if mapped != row.codepoint {
89+
to_lower.insert(row.codepoint.value(), (mapped.value(), 0, 0));
90+
}
91+
}
92+
if let Some(mapped) = row.simple_uppercase_mapping {
93+
if mapped != row.codepoint {
94+
to_upper.insert(row.codepoint.value(), (mapped.value(), 0, 0));
95+
}
96+
}
97+
}
98+
99+
for row in ucd_parse::parse::<_, ucd_parse::SpecialCaseMapping>(&UNICODE_DIRECTORY).unwrap() {
100+
if !row.conditions.is_empty() {
101+
// Skip conditional case mappings
102+
continue;
103+
}
104+
105+
let key = row.codepoint.value();
106+
if let Some(lower) = to_mapping(key, row.lowercase) {
107+
to_lower.insert(key, lower);
108+
}
109+
if let Some(upper) = to_mapping(key, row.uppercase) {
110+
to_upper.insert(key, upper);
111+
}
112+
}
113+
114+
let mut properties: HashMap<&'static str, Vec<Range<u32>>> = properties
115+
.into_iter()
116+
.map(|(k, v)| {
117+
(
118+
k,
119+
v.into_iter()
120+
.flat_map(|codepoints| match codepoints {
121+
Codepoints::Single(c) => c
122+
.scalar()
123+
.map(|ch| (ch as u32..ch as u32 + 1))
124+
.into_iter()
125+
.collect::<Vec<_>>(),
126+
Codepoints::Range(c) => c
127+
.into_iter()
128+
.flat_map(|c| c.scalar().map(|ch| (ch as u32..ch as u32 + 1)))
129+
.collect::<Vec<_>>(),
130+
})
131+
.collect::<Vec<Range<u32>>>(),
132+
)
133+
})
134+
.collect();
135+
136+
for ranges in properties.values_mut() {
137+
merge_ranges(ranges);
138+
}
139+
140+
let mut properties = properties.into_iter().collect::<Vec<_>>();
141+
properties.sort_by_key(|p| p.0);
142+
UnicodeData { ranges: properties, to_lower, to_upper }
143+
}
144+
145+
fn main() {
146+
let write_location = std::env::args().nth(1).unwrap_or_else(|| {
147+
eprintln!("Must provide path to write unicode tables to");
148+
eprintln!(
149+
"e.g. {} src/libcore/unicode/unicode_data.rs",
150+
std::env::args().nth(0).unwrap_or_default()
151+
);
152+
std::process::exit(1);
153+
});
154+
155+
let unicode_data = load_data();
156+
let ranges_by_property = &unicode_data.ranges;
157+
158+
let mut total_bytes = 0;
159+
let mut modules = Vec::new();
160+
for (property, ranges) in ranges_by_property {
161+
let datapoints = ranges.iter().map(|r| r.end - r.start).sum::<u32>();
162+
let mut emitter = RawEmitter::new();
163+
emit_codepoints(&mut emitter, &ranges);
164+
165+
modules.push((property.to_lowercase().to_string(), emitter.file));
166+
println!("{:15}: {} bytes, {} codepoints", property, emitter.bytes_used, datapoints,);
167+
total_bytes += emitter.bytes_used;
168+
}
169+
170+
let mut table_file = String::new();
171+
172+
table_file.push_str(
173+
"///! This file is generated by src/tools/unicode-table-generator; do not edit manually!\n",
174+
);
175+
176+
table_file.push_str("use super::range_search;\n\n");
177+
178+
table_file.push_str(&version());
179+
180+
table_file.push('\n');
181+
182+
modules.push((String::from("conversions"), case_mapping::generate_case_mapping(&unicode_data)));
183+
184+
for (name, contents) in modules {
185+
table_file.push_str("#[rustfmt::skip]\n");
186+
table_file.push_str(&format!("pub mod {} {{\n", name));
187+
for line in contents.lines() {
188+
if !line.trim().is_empty() {
189+
table_file.push_str(" ");
190+
table_file.push_str(&line);
191+
}
192+
table_file.push('\n');
193+
}
194+
table_file.push_str("}\n\n");
195+
}
196+
197+
std::fs::write(&write_location, format!("{}\n", table_file.trim_end())).unwrap();
198+
199+
println!("Total table sizes: {} bytes", total_bytes);
200+
}
201+
202+
fn version() -> String {
203+
let mut out = String::new();
204+
out.push_str("pub const UNICODE_VERSION: (u32, u32, u32) = ");
205+
206+
let readme =
207+
std::fs::read_to_string(std::path::Path::new(UNICODE_DIRECTORY).join("ReadMe.txt"))
208+
.unwrap();
209+
210+
let prefix = "for Version ";
211+
let start = readme.find(prefix).unwrap() + prefix.len();
212+
let end = readme.find(" of the Unicode Standard.").unwrap();
213+
let version =
214+
readme[start..end].split('.').map(|v| v.parse::<u32>().expect(&v)).collect::<Vec<_>>();
215+
let [major, minor, micro] = [version[0], version[1], version[2]];
216+
217+
out.push_str(&format!("({}, {}, {});\n", major, minor, micro));
218+
out
219+
}
220+
221+
/// Formats `values` as a comma-separated, line-wrapped list using each item's
/// `Debug` output.
///
/// Every item is written as `"{:?}, "`. The output starts with a newline, each
/// line begins with the indent, and a new line is started whenever adding the
/// next item would bring the current line to 98 bytes or more. Trailing
/// whitespace is stripped from every line and the result ends with a newline.
fn fmt_list<V: std::fmt::Debug>(values: impl IntoIterator<Item = V>) -> String {
    let mut out = String::new();
    let mut line = String::from("\n ");

    for value in values {
        let piece = format!("{:?}, ", value);
        if line.len() + piece.len() >= 98 {
            // Current line is full: flush it and start a fresh indented one.
            out.push_str(line.trim_end());
            out.push('\n');
            line.clear();
            line.push_str(" ");
        }
        line.push_str(&piece);
    }

    out.push_str(line.trim_end());
    out.push('\n');
    out
}
238+
239+
/// Coalesces neighbouring ranges in place: whenever one range ends exactly
/// where the following one starts, the two are replaced by a single range
/// covering both.
///
/// Only list-adjacent ranges are considered, and the order of the list is
/// otherwise preserved. Chains such as `a..b, b..c, c..d` collapse to `a..d`.
/// An empty list is left untouched.
fn merge_ranges(ranges: &mut Vec<Range<u32>>) {
    let mut merged: Vec<Range<u32>> = Vec::with_capacity(ranges.len());

    for range in ranges.drain(..) {
        match merged.last_mut() {
            // Extends the previously emitted range instead of pushing a new one,
            // so a merged final pair never leaves a duplicate behind.
            Some(prev) if prev.end == range.start => prev.end = range.end,
            _ => merged.push(range),
        }
    }

    *ranges = merged;
}

0 commit comments

Comments
 (0)