Skip to content

Commit d316487

Browse files
committed
Add char::to_titlecase
But not str::to_titlecase, which would require UAX#29 Unicode Text Segmentation, which we decided not to include in `std`: rust-lang/rfcs#1054
1 parent addaa5b commit d316487

File tree

4 files changed

+590
-15
lines changed

4 files changed

+590
-15
lines changed

src/etc/unicode.py

+27-15
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ def load_unicode_data(f):
7474
gencats = {}
7575
to_lower = {}
7676
to_upper = {}
77+
to_title = {}
7778
combines = {}
7879
canon_decomp = {}
7980
compat_decomp = {}
@@ -110,6 +111,10 @@ def load_unicode_data(f):
110111
if upcase != "" and code_org != upcase:
111112
to_upper[code] = (int(upcase, 16), 0, 0)
112113

114+
# title case
115+
if titlecase.strip() != "" and code_org != titlecase:
116+
to_title[code] = (int(titlecase, 16), 0, 0)
117+
113118
# store decomposition, if given
114119
if decomp != "":
115120
if decomp.startswith('<'):
@@ -144,9 +149,9 @@ def load_unicode_data(f):
144149
gencats = group_cats(gencats)
145150
combines = to_combines(group_cats(combines))
146151

147-
return (canon_decomp, compat_decomp, gencats, combines, to_upper, to_lower)
152+
return (canon_decomp, compat_decomp, gencats, combines, to_upper, to_lower, to_title)
148153

149-
def load_special_casing(f, to_upper, to_lower):
154+
def load_special_casing(f, to_upper, to_lower, to_title):
150155
fetch(f)
151156
for line in fileinput.input(f):
152157
data = line.split('#')[0].split(';')
@@ -163,7 +168,7 @@ def load_special_casing(f, to_upper, to_lower):
163168
title = title.strip()
164169
upper = upper.strip()
165170
key = int(code, 16)
166-
for (map_, values) in [(to_lower, lower), (to_upper, upper)]:
171+
for (map_, values) in [(to_lower, lower), (to_upper, upper), (to_title, title)]:
167172
if values != code:
168173
values = [int(i, 16) for i in values.split()]
169174
for _ in range(len(values), 3):
@@ -344,7 +349,7 @@ def emit_property_module(f, mod, tbl, emit):
344349
f.write(" }\n\n")
345350
f.write("}\n\n")
346351

347-
def emit_conversions_module(f, to_upper, to_lower):
352+
def emit_conversions_module(f, to_upper, to_lower, to_title):
348353
f.write("pub mod conversions {")
349354
f.write("""
350355
use core::cmp::Ordering::{Equal, Less, Greater};
@@ -367,6 +372,13 @@ def emit_conversions_module(f, to_upper, to_lower):
367372
}
368373
}
369374
375+
pub fn to_title(c: char) -> [char; 3] {
376+
match bsearch_case_table(c, to_titlecase_table) {
377+
None => [c, '\\0', '\\0'],
378+
Some(index) => to_titlecase_table[index].1
379+
}
380+
}
381+
370382
fn bsearch_case_table(c: char, table: &'static [(char, [char; 3])]) -> Option<usize> {
371383
match table.binary_search_by(|&(key, _)| {
372384
if c == key { Equal }
@@ -379,18 +391,18 @@ def emit_conversions_module(f, to_upper, to_lower):
379391
}
380392
381393
""")
394+
t_type = "&'static [(char, [char; 3])]"
395+
pfun = lambda x: "(%s,[%s,%s,%s])" % (
396+
escape_char(x[0]), escape_char(x[1][0]), escape_char(x[1][1]), escape_char(x[1][2]))
382397
emit_table(f, "to_lowercase_table",
383398
sorted(to_lower.iteritems(), key=operator.itemgetter(0)),
384-
is_pub=False,
385-
t_type = "&'static [(char, [char; 3])]",
386-
pfun=lambda x: "(%s,[%s,%s,%s])" % (
387-
escape_char(x[0]), escape_char(x[1][0]), escape_char(x[1][1]), escape_char(x[1][2])))
399+
is_pub=False, t_type = t_type, pfun=pfun)
388400
emit_table(f, "to_uppercase_table",
389401
sorted(to_upper.iteritems(), key=operator.itemgetter(0)),
390-
is_pub=False,
391-
t_type = "&'static [(char, [char; 3])]",
392-
pfun=lambda x: "(%s,[%s,%s,%s])" % (
393-
escape_char(x[0]), escape_char(x[1][0]), escape_char(x[1][1]), escape_char(x[1][2])))
402+
is_pub=False, t_type = t_type, pfun=pfun)
403+
emit_table(f, "to_titlecase_table",
404+
sorted(to_title.iteritems(), key=operator.itemgetter(0)),
405+
is_pub=False, t_type = t_type, pfun=pfun)
394406
f.write("}\n\n")
395407

396408
def emit_grapheme_module(f, grapheme_table, grapheme_cats):
@@ -624,8 +636,8 @@ def optimize_width_table(wtable):
624636
pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
625637
""" % unicode_version)
626638
(canon_decomp, compat_decomp, gencats, combines,
627-
to_upper, to_lower) = load_unicode_data("UnicodeData.txt")
628-
load_special_casing("SpecialCasing.txt", to_upper, to_lower)
639+
to_upper, to_lower, to_title) = load_unicode_data("UnicodeData.txt")
640+
load_special_casing("SpecialCasing.txt", to_upper, to_lower, to_title)
629641
want_derived = ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"]
630642
derived = load_properties("DerivedCoreProperties.txt", want_derived)
631643
scripts = load_properties("Scripts.txt", [])
@@ -645,7 +657,7 @@ def optimize_width_table(wtable):
645657

646658
# normalizations and conversions module
647659
emit_norm_module(rf, canon_decomp, compat_decomp, combines, norm_props)
648-
emit_conversions_module(rf, to_upper, to_lower)
660+
emit_conversions_module(rf, to_upper, to_lower, to_title)
649661

650662
### character width module
651663
width_table = []

src/libcollectionstest/char.rs

+10
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,16 @@ fn char_to_uppercase() {
2727
assert_iter_eq('ᾀ'.to_uppercase(), &['Ἀ', 'Ι']);
2828
}
2929

30+
#[test]
31+
fn char_to_titlecase() {
32+
assert_iter_eq('a'.to_titlecase(), &['A']);
33+
assert_iter_eq('é'.to_titlecase(), &['É']);
34+
assert_iter_eq('DŽ'.to_titlecase(), &['Dž']);
35+
assert_iter_eq('ß'.to_titlecase(), &['S', 's']);
36+
assert_iter_eq('fi'.to_titlecase(), &['F', 'i']);
37+
assert_iter_eq('ᾀ'.to_titlecase(), &['ᾈ']);
38+
}
39+
3040
fn assert_iter_eq<I: Iterator<Item=char>>(iter: I, expected: &[char]) {
3141
assert_eq!(iter.collect::<Vec<_>>(), expected);
3242
}

src/librustc_unicode/char.rs

+33
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,18 @@ impl Iterator for ToUppercase {
6767
fn next(&mut self) -> Option<char> { self.0.next() }
6868
}
6969

70+
/// An iterator over the titlecase mapping of a given character, returned from
71+
/// the [`to_titlecase` method](../primitive.char.html#method.to_titlecase) on
72+
/// characters.
73+
#[stable(feature = "char_to_titlecase", since = "1.2.0")]
74+
pub struct ToTitlecase(CaseMappingIter);
75+
76+
#[stable(feature = "char_to_titlecase", since = "1.2.0")]
77+
impl Iterator for ToTitlecase {
78+
type Item = char;
79+
fn next(&mut self) -> Option<char> { self.0.next() }
80+
}
81+
7082

7183
enum CaseMappingIter {
7284
Three(char, char, char),
@@ -454,6 +466,27 @@ impl char {
454466
ToLowercase(CaseMappingIter::new(conversions::to_lower(self)))
455467
}
456468

469+
/// Converts a character to its titlecase equivalent.
470+
///
471+
/// This performs complex unconditional mappings with no tailoring.
472+
/// See `to_uppercase()` for references and more information.
473+
///
474+
/// This differs from `to_uppercase()` since Unicode contains
475+
/// digraphs and ligature characters.
476+
/// For example, U+01F3 “ǳ” and U+FB01 “ﬁ”
477+
/// map to U+01F2 “ǲ” and U+0046 U+0069 “Fi”, respectively.
478+
///
479+
/// # Return value
480+
///
481+
/// Returns an iterator which yields the characters corresponding to the
482+
/// titlecase equivalent of the character. If no conversion is possible then
483+
/// an iterator with just the input character is returned.
484+
#[stable(feature = "char_to_titlecase", since = "1.2.0")]
485+
#[inline]
486+
pub fn to_titlecase(self) -> ToTitlecase {
487+
ToTitlecase(CaseMappingIter::new(conversions::to_title(self)))
488+
}
489+
457490
/// Converts a character to its uppercase equivalent.
458491
///
459492
/// This performs complex unconditional mappings with no tailoring:

0 commit comments

Comments
 (0)