Skip to content

Commit d864250

Browse files
committed
add implementation of UAX#29 word bounds algorithm
This patch does the following: 1. Adds three new structs in libunicode/str.rs: a. UnicodeWords: a filter on the UWordBounds iterator that yields only the "words" of a string as defined in Section 4 of Unicode Standard Annex #29 (UAX#29), http://unicode.org/reports/tr29/#Word_Boundaries b. UWordBounds: an iterator that segments a string on its word boundaries as defined in UAX#29. Note that this *only* segments the string, and does *not* drop whitespace and other non-word pieces of the text (that's what UnicodeWords does). Note that UWordBounds has both a forward and backward iterator that have total running time (that is, to segment the entire string) linear in the size of the string. It should be noted that with pathological inputs the reverse iterator could be about 2x less efficient than the forward iterator, but on reasonable inputs their costs are similar. c. UWordBoundIndices: the above iterator, but returning tuples of (offset, &str). 2. Adds three new functions in the `UnicodeStr` trait: a. words_unicode(): returns a UnicodeWords iterator. b. split_words_uax29(): returns a UWordBounds iterator. c. split_words_uax29_indices(): returns a UWordBoundIndices iterator. 3. Updates the `src/etc/unicode.py` script to generate tables necessary for running the UWordBounds iterators. 4. Adds a new script, `src/etc/unicode_gen_breaktests.py`, which processes the grapheme and word break tests published by the Unicode consortium into a format for inclusion in libcollectionstest. 5. Adds new impls in libcollections's `str` corresponding to the `UnicodeStr` functions of (2). Note that this new functionality is gated with `feature(unicode)`. 6. Adds tests in libcollectionstest to exercise this new functionality. In addition, updates the test data for the graphemes test to correspond to the output from the script of (4). (Note that at the moment this change is primarily cosmetic.)
This patch does not settle the question raised by @huonw in rust-lang/rust#15628; rather, it introduces a new function alongside `words()` that follows UAX#29. In addition, it does not address the concerns that @SimonSapin raises in rust-lang/rfcs#1054 since it leaves `words()` alone.
1 parent 6790b0e commit d864250

File tree

7 files changed

+2596
-374
lines changed

7 files changed

+2596
-374
lines changed

src/etc/unicode.py

+36-21
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
# - DerivedNormalizationProps.txt
1616
# - EastAsianWidth.txt
1717
# - auxiliary/GraphemeBreakProperty.txt
18+
# - auxiliary/WordBreakProperty.txt
1819
# - PropList.txt
1920
# - ReadMe.txt
2021
# - Scripts.txt
@@ -290,11 +291,13 @@ def emit_bsearch_range_table(f):
290291
""")
291292

292293
def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
293-
pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1]))):
294-
pub_string = ""
294+
pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])), is_const=True):
295+
pub_string = "const"
296+
if not is_const:
297+
pub_string = "let"
295298
if is_pub:
296-
pub_string = "pub "
297-
f.write(" %sconst %s: %s = &[\n" % (pub_string, name, t_type))
299+
pub_string = "pub " + pub_string
300+
f.write(" %s %s: %s = &[\n" % (pub_string, name, t_type))
298301
data = ""
299302
first = True
300303
for dat in t_data:
@@ -375,21 +378,25 @@ def emit_conversions_module(f, lowerupper, upperlower):
375378
sorted(lowerupper.iteritems(), key=operator.itemgetter(0)), is_pub=False)
376379
f.write("}\n\n")
377380

378-
def emit_grapheme_module(f, grapheme_table, grapheme_cats):
379-
f.write("""pub mod grapheme {
381+
def emit_break_module(f, break_table, break_cats, name):
382+
Name = name.capitalize()
383+
f.write("""pub mod %s {
380384
use core::slice::SliceExt;
381-
pub use self::GraphemeCat::*;
385+
pub use self::%sCat::*;
382386
use core::result::Result::{Ok, Err};
383387
384388
#[allow(non_camel_case_types)]
385-
#[derive(Clone, Copy)]
386-
pub enum GraphemeCat {
387-
""")
388-
for cat in grapheme_cats + ["Any"]:
389-
f.write(" GC_" + cat + ",\n")
389+
#[derive(Clone, Copy, PartialEq, Eq)]
390+
pub enum %sCat {
391+
""" % (name, Name, Name))
392+
393+
break_cats.append("Any")
394+
break_cats.sort()
395+
for cat in break_cats:
396+
f.write((" %sC_" % Name[0]) + cat + ",\n")
390397
f.write(""" }
391398
392-
fn bsearch_range_value_table(c: char, r: &'static [(char, char, GraphemeCat)]) -> GraphemeCat {
399+
fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> %sCat {
393400
use core::cmp::Ordering::{Equal, Less, Greater};
394401
match r.binary_search_by(|&(lo, hi, _)| {
395402
if lo <= c && c <= hi { Equal }
@@ -400,19 +407,19 @@ def emit_grapheme_module(f, grapheme_table, grapheme_cats):
400407
let (_, _, cat) = r[idx];
401408
cat
402409
}
403-
Err(_) => GC_Any
410+
Err(_) => %sC_Any
404411
}
405412
}
406413
407-
pub fn grapheme_category(c: char) -> GraphemeCat {
408-
bsearch_range_value_table(c, grapheme_cat_table)
414+
pub fn %s_category(c: char) -> %sCat {
415+
bsearch_range_value_table(c, %s_cat_table)
409416
}
410417
411-
""")
418+
""" % (Name, Name, Name[0], name, Name, name))
412419

413-
emit_table(f, "grapheme_cat_table", grapheme_table, "&'static [(char, char, GraphemeCat)]",
414-
pfun=lambda x: "(%s,%s,GC_%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]),
415-
is_pub=False)
420+
emit_table(f, "%s_cat_table" % name, break_table, "&'static [(char, char, %sCat)]" % Name,
421+
pfun=lambda x: "(%s,%s,%sC_%s)" % (escape_char(x[0]), escape_char(x[1]), Name[0], x[2]),
422+
is_pub=False, is_const=True)
416423
f.write("}\n")
417424

418425
def emit_charwidth_module(f, width_table):
@@ -690,4 +697,12 @@ def optimize_width_table(wtable):
690697
for cat in grapheme_cats:
691698
grapheme_table.extend([(x, y, cat) for (x, y) in grapheme_cats[cat]])
692699
grapheme_table.sort(key=lambda w: w[0])
693-
emit_grapheme_module(rf, grapheme_table, grapheme_cats.keys())
700+
emit_break_module(rf, grapheme_table, grapheme_cats.keys(), "grapheme")
701+
rf.write("\n")
702+
703+
word_cats = load_properties("auxiliary/WordBreakProperty.txt", [])
704+
word_table = []
705+
for cat in word_cats:
706+
word_table.extend([(x, y, cat) for (x, y) in word_cats[cat]])
707+
word_table.sort(key=lambda w: w[0])
708+
emit_break_module(rf, word_table, word_cats.keys(), "word")

src/etc/unicode_gen_breaktests.py

+196
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8
3+
#
4+
# Copyright 2015 The Rust Project Developers. See the COPYRIGHT
5+
# file at the top-level directory of this distribution and at
6+
# http://rust-lang.org/COPYRIGHT.
7+
#
8+
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
9+
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
10+
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
11+
# option. This file may not be copied, modified, or distributed
12+
# except according to those terms.
13+
14+
# This script uses the following Unicode tables:
15+
# - GraphemeBreakTest.txt
16+
# - WordBreakTest.txt
17+
#
18+
# Since this should not require frequent updates, we just store this
19+
# out-of-line and check the unicode.rs file into git.
20+
21+
import unicode, re, os, fileinput
22+
23+
def load_test_data(f, optsplit=[]):
    """Download (if necessary) and parse a Unicode break-test data file.

    f        -- path of the test file relative to the UCD root, e.g.
                "auxiliary/WordBreakTest.txt"
    optsplit -- rule numbers (e.g. ['9.1']) whose '×' (no-break) marks
                should nevertheless be treated as breaks; this list is
                only read, never mutated, so the mutable default is safe.

    Returns a list of (chars, info) tuples, where chars is a list of
    pieces (each a list of code points) and info is the list of rule
    numbers governing each break between adjacent pieces.
    """
    outls = []

    # Matches one test line: "÷ <data> ÷ # ÷ [0.2] <annotations> ÷ [0.3]".
    # Group 1 is the code-point/break data, group 2 the annotated rule info.
    # Raw string so the \s escapes are passed to the regex engine verbatim.
    testRe1 = re.compile(r"^÷\s+([^\s].*[^\s])\s+÷\s+#\s+÷\s+\[0.2\].*?([÷×].*)\s+÷\s+\[0.3\]\s*$")

    unicode.fetch(f)
    for line in fileinput.input(os.path.basename(f)):
        # lines that include a test start with the ÷ character
        if len(line) < 2 or line[0:2] != '÷':
            continue

        m = testRe1.match(line)
        if not m:
            # parenthesized form works in both Python 2 and 3
            print("error: no match on line where test was expected: %s" % line)
            continue

        # process the characters in this test case
        chars = process_split_string(m.group(1))
        # skip test case if it contains invalid characters (viz., surrogates)
        if not chars:
            continue

        # now process test cases
        (chars, info) = process_split_info(m.group(2), chars, optsplit)

        # make sure that we have break info for each break!
        assert len(chars) - 1 == len(info)

        outls.append((chars, info))

    return outls
54+
55+
def process_split_info(s, c, o):
    """Merge single-codepoint pieces according to a test's break annotations.

    s -- annotation string alternating break marks and rule numbers,
         e.g. "× [0.3] ÷ [4.0] ...".  NOTE(review): this appears to be
         Python 2 source (see the print statement above) with a UTF-8
         coding declaration, so '÷' and '×' are two bytes each — which
         is why two-byte slices are compared below.
    c -- list of pieces (lists of code point ints); consumed
         destructively via pop(0), one piece per annotation position.
    o -- rule numbers whose '×' (no-break) marks are treated as breaks
         anyway (used for the extended-grapheme-only rules).

    Returns (pieces, rules): the merged pieces and the rule number
    responsible for each of the len(pieces)-1 breaks between them.
    """
    outcs = []
    outis = []
    workcs = c.pop(0)

    # are we on a × or a ÷?
    isX = False
    if s[0:2] == '×':
        isX = True

    # find each instance of '(÷|×) [x.y] '
    while s:
        # find the currently considered rule number
        sInd = s.index('[') + 1
        eInd = s.index(']')

        # if it's '× [a.b]' where 'a.b' is in o, then
        # we consider it a split even though it's not
        # marked as one
        # if it's ÷ then it's always a split
        if not isX or s[sInd:eInd] in o:
            outis.append(s[sInd:eInd])
            outcs.append(workcs)
            workcs = c.pop(0)
        else:
            # no break here: fuse the next piece into the current one
            workcs.extend(c.pop(0))

        # advance s past the rule number to the next break mark,
        # remembering which kind of mark (break / no-break) it is
        idx = 1
        while idx < len(s):
            if s[idx:idx+2] == '×':
                isX = True
                break
            if s[idx:idx+2] == '÷':
                isX = False
                break
            idx += 1
        s = s[idx:]

    # flush the piece being accumulated when the annotations ran out
    outcs.append(workcs)
    return (outcs, outis)
95+
96+
def process_split_string(s):
    """Split a test-data string (hex code points separated by break marks)
    into a list of single-codepoint pieces.

    Both '÷' and '×' act purely as separators here; the break/no-break
    distinction is handled later by process_split_info().  Returns [] if
    the data contains a surrogate code point, since such a test case
    cannot be represented as a Rust string.
    """
    pieces = []
    current = []

    for token in s.split():
        if token in ('÷', '×'):
            pieces.append(current)
            current = []
        else:
            value = int(token, 16)
            if unicode.is_surrogate(value):
                return []
            current.append(value)

    # flush any trailing piece not followed by a separator
    if current:
        pieces.append(current)

    return pieces
119+
120+
def showfun(x):
    """Render one test tuple as a Rust tuple literal.

    x[0] is the full list of code points; each subsequent element of x
    is a list of pieces (lists of code points).  Produces e.g.
    ("\\u{61}\\u{62}",&["\\u{61}","\\u{62}"]).
    """
    def escape(codepoints):
        # Rust \u{...} escape for each code point, concatenated.
        return ''.join("\\u{%x}" % cp for cp in codepoints)

    groups = []
    for piece_list in x[1:]:
        groups.append(','.join('"' + escape(piece) + '"' for piece in piece_list))

    return '("' + escape(x[0]) + '",&[' + '],&['.join(groups) + '])'
141+
142+
def create_grapheme_data():
    """Generate graph_tests.rs from the official GraphemeBreakTest.txt data.

    Emits two tables:
      test_same -- cases where the fully-split pieces already equal the
                   extended grapheme clusters
      test_diff -- cases where they differ: (flattened string, extended
                   clusters, fully-split pieces)
    """
    # rules 9.1 and 9.2 are for extended graphemes only
    optsplits = ['9.1','9.2']
    d = load_test_data("auxiliary/GraphemeBreakTest.txt", optsplits)

    test_same = []
    test_diff = []

    for (c, i) in d:
        # all code points of this test case, flattened into one list
        allchars = [cn for s in c for cn in s]
        extgraphs = []
        extwork = []

        # re-join adjacent pieces whose separating rule is one of the
        # extended-only rules (9.1/9.2); everything else stays split
        extwork.extend(c[0])
        for n in range(0,len(i)):
            if i[n] in optsplits:
                extwork.extend(c[n+1])
            else:
                extgraphs.append(extwork)
                extwork = []
                extwork.extend(c[n+1])

        # these are the extended grapheme clusters
        extgraphs.append(extwork)

        if extgraphs == c:
            test_same.append((allchars, c))
        else:
            test_diff.append((allchars, extgraphs, c))

    # Rust element types for the generated tables
    stype = "&[(&str, &[&str])]"
    dtype = "&[(&str, &[&str], &[&str])]"
    with open("graph_tests.rs", "w") as rf:
        rf.write(" // official Unicode test data\n")
        rf.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakTest.txt\n")
        unicode.emit_table(rf, "test_same", test_same, stype, False, showfun, False)
        unicode.emit_table(rf, "test_diff", test_diff, dtype, False, showfun, False)
179+
180+
def create_words_data():
    """Generate word_tests.rs from the official WordBreakTest.txt data."""
    cases = load_test_data("auxiliary/WordBreakTest.txt")

    # pair each test's flattened code-point list with its word pieces;
    # the per-break rule info is not needed for the word tests
    test = [([cp for piece in chars for cp in piece], chars)
            for (chars, _info) in cases]

    wtype = "&[(&str, &[&str])]"
    with open("word_tests.rs", "w") as rf:
        rf.write(" // official Unicode test data\n")
        rf.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt\n")
        unicode.emit_table(rf, "test_word", test, wtype, False, showfun, False)
194+
195+
# Entry point: generate both test-data files when run as a script.
create_grapheme_data()
create_words_data()

src/libcollections/str.rs

+64
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ pub use core::str::{MatchIndices, RMatchIndices};
7979
pub use core::str::{from_utf8, Chars, CharIndices, Bytes};
8080
pub use core::str::{from_utf8_unchecked, ParseBoolError};
8181
pub use unicode::str::{Words, Graphemes, GraphemeIndices};
82+
pub use unicode::str::{UnicodeWords, UWordBounds, UWordBoundIndices};
8283
pub use core::str::pattern;
8384

8485
/*
@@ -1736,6 +1737,30 @@ impl str {
17361737
UnicodeStr::words(&self[..])
17371738
}
17381739

1740+
/// An iterator over the words of `self`, separated on
1741+
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
1742+
///
1743+
/// In this function, "words" are just those substrings which, after splitting on
1744+
/// UAX#29 word boundaries, contain any alphanumeric characters. That is, the
1745+
/// substring must contain at least one character with the
1746+
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
1747+
/// property, or with
1748+
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
1749+
///
1750+
/// # Example
1751+
/// # #![feature(unicode, core)]
1752+
/// let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
1753+
/// let uw1 = uws.words_unicode().collect::<Vec<&str>>();
1754+
/// let b: &[_] = &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"];
1755+
///
1756+
/// assert_eq!(&uw1[..], b);
1757+
/// ```
1758+
#[unstable(feature = "unicode",
1759+
reason = "questions remain regarding the naming of words() and words_unicode()")]
1760+
pub fn words_unicode(&self) -> UnicodeWords {
1761+
UnicodeStr::words_unicode(&self[..])
1762+
}
1763+
17391764
/// Returns a string's displayed width in columns.
17401765
///
17411766
/// Control characters have zero width.
@@ -1819,4 +1844,43 @@ impl str {
18191844
s.extend(self[..].chars().flat_map(|c| c.to_uppercase()));
18201845
return s;
18211846
}
1847+
1848+
/// Returns an iterator over substrings of `self` separated on
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
///
/// The concatenation of the substrings returned by this function is just the original string.
///
/// Note that this only segments the string: pieces that are not "words"
/// (whitespace, punctuation, and so on) are yielded too, as the example shows.
///
/// # Example
///
/// ```
/// # #![feature(unicode, core)]
/// let swu1 = "The quick (\"brown\") fox".split_words_uax29().collect::<Vec<&str>>();
/// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", " ", "fox"];
///
/// assert_eq!(&swu1[..], b);
/// ```
#[unstable(feature = "unicode",
           reason = "this functionality may only be provided by libunicode")]
pub fn split_words_uax29(&self) -> UWordBounds {
    // Delegate to the UnicodeStr implementation in libunicode.
    UnicodeStr::split_words_uax29(&self[..])
}
1867+
1868+
/// Returns an iterator over substrings of `self`, split on UAX#29 word boundaries,
1869+
/// and their offsets. See `split_words_uax29()` for more information.
1870+
///
1871+
/// # Example
1872+
///
1873+
/// ```
1874+
/// # #![feature(unicode, core)]
1875+
/// let swi1 = "Brr, it's 29.3°F!".split_words_uax29_indices().collect::<Vec<(usize, &str)>>();
1876+
/// let b: &[_] = &[(0, "Brr"), (3, ","), (4, " "), (5, "it's"), (9, " "), (10, "29.3"),
1877+
/// (14, "°"), (16, "F"), (17, "!")];
1878+
///
1879+
/// assert_eq!(&swi1[..], b);
1880+
/// ```
1881+
#[unstable(feature = "unicode",
1882+
reason = "this functionality may only be provided by libunicode")]
1883+
pub fn split_words_uax29_indices(&self) -> UWordBoundIndices {
1884+
UnicodeStr::split_words_uax29_indices(&self[..])
1885+
}
18221886
}

0 commit comments

Comments
 (0)