Skip to content

Commit addaa5b

Browse files
committed
Add complex (but unconditional) Unicode case mapping. Fix rust-lang#25800
As a result, the iterator returned by `char::to_uppercase` sometimes yields two or three `char`s instead of just one.
1 parent 66af127 commit addaa5b

File tree

5 files changed

+1154
-670
lines changed

5 files changed

+1154
-670
lines changed

src/etc/unicode.py

+44-10
Original file line numberDiff line numberDiff line change
@@ -104,11 +104,11 @@ def load_unicode_data(f):
104104
# generate char to char direct common and simple conversions
105105
# uppercase to lowercase
106106
if lowcase != "" and code_org != lowcase:
107-
to_lower[code] = int(lowcase, 16)
107+
to_lower[code] = (int(lowcase, 16), 0, 0)
108108

109109
# lowercase to uppercase
110110
if upcase != "" and code_org != upcase:
111-
to_upper[code] = int(upcase, 16)
111+
to_upper[code] = (int(upcase, 16), 0, 0)
112112

113113
# store decomposition, if given
114114
if decomp != "":
@@ -146,6 +146,31 @@ def load_unicode_data(f):
146146

147147
return (canon_decomp, compat_decomp, gencats, combines, to_upper, to_lower)
148148

149+
def load_special_casing(f, to_upper, to_lower):
150+
fetch(f)
151+
for line in fileinput.input(f):
152+
data = line.split('#')[0].split(';')
153+
if len(data) == 5:
154+
code, lower, title, upper, _comment = data
155+
elif len(data) == 6:
156+
code, lower, title, upper, condition, _comment = data
157+
if condition.strip(): # Only keep unconditional mappings
158+
continue
159+
else:
160+
continue
161+
code = code.strip()
162+
lower = lower.strip()
163+
title = title.strip()
164+
upper = upper.strip()
165+
key = int(code, 16)
166+
for (map_, values) in [(to_lower, lower), (to_upper, upper)]:
167+
if values != code:
168+
values = [int(i, 16) for i in values.split()]
169+
for _ in range(len(values), 3):
170+
values.append(0)
171+
assert len(values) == 3
172+
map_[key] = values
173+
149174
def group_cats(cats):
150175
cats_out = {}
151176
for cat in cats:
@@ -279,7 +304,7 @@ def load_east_asian_width(want_widths, except_cats):
279304
return widths
280305

281306
def escape_char(c):
282-
return "'\\u{%x}'" % c
307+
return "'\\u{%x}'" % c if c != 0 else "'\\0'"
283308

284309
def emit_bsearch_range_table(f):
285310
f.write("""
@@ -328,21 +353,21 @@ def emit_conversions_module(f, to_upper, to_lower):
328353
use core::option::Option::{Some, None};
329354
use core::result::Result::{Ok, Err};
330355
331-
pub fn to_lower(c: char) -> char {
356+
pub fn to_lower(c: char) -> [char; 3] {
332357
match bsearch_case_table(c, to_lowercase_table) {
333-
None => c,
358+
None => [c, '\\0', '\\0'],
334359
Some(index) => to_lowercase_table[index].1
335360
}
336361
}
337362
338-
pub fn to_upper(c: char) -> char {
363+
pub fn to_upper(c: char) -> [char; 3] {
339364
match bsearch_case_table(c, to_uppercase_table) {
340-
None => c,
365+
None => [c, '\\0', '\\0'],
341366
Some(index) => to_uppercase_table[index].1
342367
}
343368
}
344369
345-
fn bsearch_case_table(c: char, table: &'static [(char, char)]) -> Option<usize> {
370+
fn bsearch_case_table(c: char, table: &'static [(char, [char; 3])]) -> Option<usize> {
346371
match table.binary_search_by(|&(key, _)| {
347372
if c == key { Equal }
348373
else if key < c { Less }
@@ -355,9 +380,17 @@ def emit_conversions_module(f, to_upper, to_lower):
355380
356381
""")
357382
emit_table(f, "to_lowercase_table",
358-
sorted(to_lower.iteritems(), key=operator.itemgetter(0)), is_pub=False)
383+
sorted(to_lower.iteritems(), key=operator.itemgetter(0)),
384+
is_pub=False,
385+
t_type = "&'static [(char, [char; 3])]",
386+
pfun=lambda x: "(%s,[%s,%s,%s])" % (
387+
escape_char(x[0]), escape_char(x[1][0]), escape_char(x[1][1]), escape_char(x[1][2])))
359388
emit_table(f, "to_uppercase_table",
360-
sorted(to_upper.iteritems(), key=operator.itemgetter(0)), is_pub=False)
389+
sorted(to_upper.iteritems(), key=operator.itemgetter(0)),
390+
is_pub=False,
391+
t_type = "&'static [(char, [char; 3])]",
392+
pfun=lambda x: "(%s,[%s,%s,%s])" % (
393+
escape_char(x[0]), escape_char(x[1][0]), escape_char(x[1][1]), escape_char(x[1][2])))
361394
f.write("}\n\n")
362395

363396
def emit_grapheme_module(f, grapheme_table, grapheme_cats):
@@ -592,6 +625,7 @@ def optimize_width_table(wtable):
592625
""" % unicode_version)
593626
(canon_decomp, compat_decomp, gencats, combines,
594627
to_upper, to_lower) = load_unicode_data("UnicodeData.txt")
628+
load_special_casing("SpecialCasing.txt", to_upper, to_lower)
595629
want_derived = ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"]
596630
derived = load_properties("DerivedCoreProperties.txt", want_derived)
597631
scripts = load_properties("Scripts.txt", [])

src/libcollectionstest/char.rs

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
2+
// file at the top-level directory of this distribution and at
3+
// http://rust-lang.org/COPYRIGHT.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
use collections::vec::Vec;
12+
13+
#[test]
14+
fn char_to_lowercase() {
15+
assert_iter_eq('A'.to_lowercase(), &['a']);
16+
assert_iter_eq('É'.to_lowercase(), &['é']);
17+
assert_iter_eq('ǅ'.to_lowercase(), &['ǆ']);
18+
}
19+
20+
#[test]
21+
fn char_to_uppercase() {
22+
assert_iter_eq('a'.to_uppercase(), &['A']);
23+
assert_iter_eq('é'.to_uppercase(), &['É']);
24+
assert_iter_eq('ǅ'.to_uppercase(), &['Ǆ']);
25+
assert_iter_eq('ß'.to_uppercase(), &['S', 'S']);
26+
assert_iter_eq('ﬁ'.to_uppercase(), &['F', 'I']);
27+
assert_iter_eq('ᾀ'.to_uppercase(), &['Ἀ', 'Ι']);
28+
}
29+
30+
fn assert_iter_eq<I: Iterator<Item=char>>(iter: I, expected: &[char]) {
31+
assert_eq!(iter.collect::<Vec<_>>(), expected);
32+
}

src/libcollectionstest/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ extern crate rustc_unicode;
3737
mod binary_heap;
3838
mod bit;
3939
mod btree;
40+
mod char; // char isn't really a collection, but didn't find a better place for this.
4041
mod enum_set;
4142
mod fmt;
4243
mod linked_list;

src/librustc_unicode/char.rs

+60-17
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
#![doc(primitive = "char")]
3030

3131
use core::char::CharExt as C;
32-
use core::option::Option::{self, Some};
32+
use core::option::Option::{self, Some, None};
3333
use core::iter::Iterator;
3434
use tables::{derived_property, property, general_category, conversions, charwidth};
3535

@@ -47,24 +47,67 @@ pub use tables::UNICODE_VERSION;
4747
/// the [`to_lowercase` method](../primitive.char.html#method.to_lowercase) on
4848
/// characters.
4949
#[stable(feature = "rust1", since = "1.0.0")]
50-
pub struct ToLowercase(Option<char>);
50+
pub struct ToLowercase(CaseMappingIter);
5151

5252
#[stable(feature = "rust1", since = "1.0.0")]
5353
impl Iterator for ToLowercase {
5454
type Item = char;
55-
fn next(&mut self) -> Option<char> { self.0.take() }
55+
fn next(&mut self) -> Option<char> { self.0.next() }
5656
}
5757

5858
/// An iterator over the uppercase mapping of a given character, returned from
5959
/// the [`to_uppercase` method](../primitive.char.html#method.to_uppercase) on
6060
/// characters.
6161
#[stable(feature = "rust1", since = "1.0.0")]
62-
pub struct ToUppercase(Option<char>);
62+
pub struct ToUppercase(CaseMappingIter);
6363

6464
#[stable(feature = "rust1", since = "1.0.0")]
6565
impl Iterator for ToUppercase {
6666
type Item = char;
67-
fn next(&mut self) -> Option<char> { self.0.take() }
67+
fn next(&mut self) -> Option<char> { self.0.next() }
68+
}
69+
70+
71+
enum CaseMappingIter {
72+
Three(char, char, char),
73+
Two(char, char),
74+
One(char),
75+
Zero
76+
}
77+
78+
impl CaseMappingIter {
79+
fn new(chars: [char; 3]) -> CaseMappingIter {
80+
if chars[2] == '\0' {
81+
if chars[1] == '\0' {
82+
CaseMappingIter::One(chars[0]) // Including if chars[0] == '\0'
83+
} else {
84+
CaseMappingIter::Two(chars[0], chars[1])
85+
}
86+
} else {
87+
CaseMappingIter::Three(chars[0], chars[1], chars[2])
88+
}
89+
}
90+
}
91+
92+
impl Iterator for CaseMappingIter {
93+
type Item = char;
94+
fn next(&mut self) -> Option<char> {
95+
match *self {
96+
CaseMappingIter::Three(a, b, c) => {
97+
*self = CaseMappingIter::Two(b, c);
98+
Some(a)
99+
}
100+
CaseMappingIter::Two(b, c) => {
101+
*self = CaseMappingIter::One(c);
102+
Some(b)
103+
}
104+
CaseMappingIter::One(c) => {
105+
*self = CaseMappingIter::Zero;
106+
Some(c)
107+
}
108+
CaseMappingIter::Zero => None,
109+
}
110+
}
68111
}
69112

70113
#[stable(feature = "rust1", since = "1.0.0")]
@@ -397,45 +440,45 @@ impl char {
397440

398441
/// Converts a character to its lowercase equivalent.
399442
///
400-
/// The case-folding performed is the common or simple mapping. See
401-
/// `to_uppercase()` for references and more information.
443+
/// This performs complex unconditional mappings with no tailoring.
444+
/// See `to_uppercase()` for references and more information.
402445
///
403446
/// # Return value
404447
///
405448
/// Returns an iterator which yields the characters corresponding to the
406449
/// lowercase equivalent of the character. If no conversion is possible then
407-
/// the input character is returned.
450+
/// an iterator with just the input character is returned.
408451
#[stable(feature = "rust1", since = "1.0.0")]
409452
#[inline]
410453
pub fn to_lowercase(self) -> ToLowercase {
411-
ToLowercase(Some(conversions::to_lower(self)))
454+
ToLowercase(CaseMappingIter::new(conversions::to_lower(self)))
412455
}
413456

414457
/// Converts a character to its uppercase equivalent.
415458
///
416-
/// The case-folding performed is the common or simple mapping: it maps
417-
/// one Unicode codepoint to its uppercase equivalent according to the
418-
/// Unicode database [1]. The additional [`SpecialCasing.txt`] is not yet
419-
/// considered here, but the iterator returned will soon support this form
420-
/// of case folding.
459+
/// This performs complex unconditional mappings with no tailoring:
460+
/// it maps one Unicode character to its uppercase equivalent
461+
/// according to the Unicode database [1]
462+
/// and the additional complex mappings [`SpecialCasing.txt`].
463+
/// Conditional mappings (based on context or language) are not considered here.
421464
///
422465
/// A full reference can be found here [2].
423466
///
424467
/// # Return value
425468
///
426469
/// Returns an iterator which yields the characters corresponding to the
427470
/// uppercase equivalent of the character. If no conversion is possible then
428-
/// the input character is returned.
471+
/// an iterator with just the input character is returned.
429472
///
430473
/// [1]: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
431474
///
432475
/// [`SpecialCasing.txt`]: ftp://ftp.unicode.org/Public/UNIDATA/SpecialCasing.txt
433476
///
434-
/// [2]: http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf#G33992
477+
/// [2]: http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G33992
435478
#[stable(feature = "rust1", since = "1.0.0")]
436479
#[inline]
437480
pub fn to_uppercase(self) -> ToUppercase {
438-
ToUppercase(Some(conversions::to_upper(self)))
481+
ToUppercase(CaseMappingIter::new(conversions::to_upper(self)))
439482
}
440483

441484
/// Returns this character's displayed width in columns, or `None` if it is a

0 commit comments

Comments
 (0)