Skip to content

Commit 4efb180

Browse files
Control characters have width 1
1 parent 86970a1 commit 4efb180

File tree

5 files changed

+547
-565
lines changed

5 files changed

+547
-565
lines changed

README.md

+4-4
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ use unicode_width::UnicodeWidthStr;
1616

1717
fn main() {
1818
let teststr = "Hello, world!";
19-
let width = UnicodeWidthStr::width(teststr);
19+
let width = teststr.width();
2020
println!("{}", teststr);
2121
println!("The above string is {} columns wide.", width);
2222
let width = teststr.width_cjk();
@@ -34,9 +34,9 @@ extern crate unicode_width;
3434
use unicode_width::UnicodeWidthStr;
3535

3636
fn main() {
37-
assert_eq!(UnicodeWidthStr::width("👩"), 2); // Woman
38-
assert_eq!(UnicodeWidthStr::width("🔬"), 2); // Microscope
39-
assert_eq!(UnicodeWidthStr::width("👩‍🔬"), 4); // Woman scientist
37+
assert_eq!("👩".width(), 2); // Woman
38+
assert_eq!("🔬".width(), 2); // Microscope
39+
assert_eq!("👩‍🔬".width(), 4); // Woman scientist
4040
}
4141
```
4242

scripts/unicode.py

+2-39
Original file line numberDiff line numberDiff line change
@@ -165,21 +165,14 @@ def load_zero_widths() -> "list[bool]":
165165
"""Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
166166
character. `c` is considered a zero-width character if
167167
168-
- it is a control character,
169-
- or if it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`),
168+
- it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`),
170169
- or if it has the `Grapheme_Extend` property (determined from `DerivedCoreProperties.txt`),
171170
- or if it is one of eight characters that should be `Grapheme_Extend` but aren't due to a Unicode spec bug,
172171
- or if it has a `Hangul_Syllable_Type` of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`).
173172
"""
174173

175174
zw_map = [False] * NUM_CODEPOINTS
176175

177-
# Control characters have width 0
178-
for c in range(0x00, 0x20):
179-
zw_map[c] = True
180-
for c in range(0x7F, 0xA0):
181-
zw_map[c] = True
182-
183176
# `Default_Ignorable_Code_Point`s also have 0 width:
184177
# https://www.unicode.org/faq/unsup_char.html#3
185178
# https://www.unicode.org/versions/Unicode15.1.0/ch05.pdf#G40095
@@ -563,7 +556,7 @@ def emit_module(
563556
/// However, if you change the *actual structure* of the lookup tables (perhaps by editing the
564557
/// `TABLE_CFGS` global in `unicode.py`) you must ensure that this code reflects those changes.
565558
#[inline]
566-
fn lookup_width(c: char, is_cjk: bool) -> usize {
559+
pub fn lookup_width(c: char, is_cjk: bool) -> usize {
567560
let cp = c as usize;
568561
569562
let t1_offset = TABLES_0[cp >> 13 & 0xFF];
@@ -664,36 +657,6 @@ def emit_module(
664657
"""
665658
)
666659

667-
module.write(
668-
"""
669-
/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or
670-
/// `None` if `c` is a control character other than `'\\x00'`.
671-
/// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise,
672-
/// they're treated as single width.
673-
#[inline]
674-
pub fn width(c: char, is_cjk: bool) -> Option<usize> {
675-
if c < '\\u{7F}' {
676-
if c >= '\\u{20}' {
677-
// U+0020 to U+007F (exclusive) are single-width ASCII codepoints
678-
Some(1)
679-
} else if c == '\\0' {
680-
// U+0000 *is* a control code, but it's special-cased
681-
Some(0)
682-
} else {
683-
// U+0001 to U+0020 (exclusive) are control codes
684-
None
685-
}
686-
} else if c >= '\\u{A0}' {
687-
// No characters >= U+00A0 are control codes, so we can consult the lookup tables
688-
Some(lookup_width(c, is_cjk))
689-
} else {
690-
// U+007F to U+00A0 (exclusive) are control codes
691-
None
692-
}
693-
}
694-
"""
695-
)
696-
697660
subtable_count = 1
698661
for i, table in enumerate(tables):
699662
new_subtable_count = len(table.buckets())

src/lib.rs

+68-37
Original file line numberDiff line numberDiff line change
@@ -34,12 +34,13 @@
3434
//!
3535
//! 1. [Emoji presentation sequences] have width 2.
3636
//! (The width of a string may therefore differ from the sum of the widths of its characters.)
37-
//! 2. Outside of an East Asian context, [text presentation sequences] have width 1
38-
//! iff their base character fulfills all the following requirements:
37+
//! 2. Outside of an East Asian context, [text presentation sequences] fulfilling all the following requirements
38+
//! have width 1:
3939
//! - Has the [`Emoji_Presentation`] property, and
4040
//! - Not in the [Enclosed Ideographic Supplement] block.
41-
//! 3. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
42-
//! 4. The following have width 0:
41+
//! 3. The sequence `"\r\n"` has width 1.
42+
//! 4. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
43+
//! 5. The following have width 0:
4344
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D)
4445
//! with the [`Default_Ignorable_Code_Point`](https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095) property.
4546
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D)
@@ -55,9 +56,6 @@
5556
//! - [`'\u{1B43}'` BALINESE VOWEL SIGN PEPET TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B43).
5657
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BHangul_Syllable_Type%3DV%7D%5Cp%7BHangul_Syllable_Type%3DT%7D)
5758
//! with a [`Hangul_Syllable_Type`] of `Vowel_Jamo` (`V`) or `Trailing_Jamo` (`T`).
58-
//! - [`'\0'` NUL](https://util.unicode.org/UnicodeJsps/character.jsp?a=0000).
59-
//! 5. The [control characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BCc%7D)
60-
//! have no defined width, and are ignored when determining the width of a string.
6159
//! 6. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
6260
//! with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2.
6361
//! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)
@@ -99,7 +97,7 @@ mod tables;
9997
/// Methods for determining displayed width of Unicode characters.
10098
pub trait UnicodeWidthChar {
10199
/// Returns the character's displayed width in columns, or `None` if the
102-
/// character is a control character other than `'\x00'`.
100+
/// character is a control character.
103101
///
104102
/// This function treats characters in the Ambiguous category according
105103
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
@@ -108,7 +106,7 @@ pub trait UnicodeWidthChar {
108106
fn width(self) -> Option<usize>;
109107

110108
/// Returns the character's displayed width in columns, or `None` if the
111-
/// character is a control character other than `'\x00'`.
109+
/// character is a control character.
112110
///
113111
/// This function treats characters in the Ambiguous category according
114112
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
@@ -120,23 +118,42 @@ pub trait UnicodeWidthChar {
120118
impl UnicodeWidthChar for char {
121119
#[inline]
122120
fn width(self) -> Option<usize> {
123-
cw::width(self, false)
121+
single_char_width(self, false)
124122
}
125123

126124
#[inline]
127125
fn width_cjk(self) -> Option<usize> {
128-
cw::width(self, true)
126+
single_char_width(self, true)
127+
}
128+
}
129+
130+
/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or
131+
/// `None` if `c` is a control character.
132+
/// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise,
133+
/// they're treated as single width.
134+
#[inline]
135+
fn single_char_width(c: char, is_cjk: bool) -> Option<usize> {
136+
if c < '\u{7F}' {
137+
if c >= '\u{20}' {
138+
// U+0020 to U+007F (exclusive) are single-width ASCII codepoints
139+
Some(1)
140+
} else {
141+
// U+0000 to U+0020 (exclusive) are control codes
142+
None
143+
}
144+
} else if c >= '\u{A0}' {
145+
// No characters >= U+00A0 are control codes, so we can consult the lookup tables
146+
Some(cw::lookup_width(c, is_cjk))
147+
} else {
148+
// U+007F to U+00A0 (exclusive) are control codes
149+
None
129150
}
130151
}
131152

132153
/// Methods for determining displayed width of Unicode strings.
133154
pub trait UnicodeWidthStr {
134155
/// Returns the string's displayed width in columns.
135156
///
136-
/// Control characters are treated as having zero width,
137-
/// and [emoji presentation sequences](https://unicode.org/reports/tr51/#def_emoji_presentation_sequence)
138-
/// are assigned width 2.
139-
///
140157
/// This function treats characters in the Ambiguous category according
141158
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
142159
/// as 1 column wide. This is consistent with the recommendations for
@@ -145,10 +162,6 @@ pub trait UnicodeWidthStr {
145162

146163
/// Returns the string's displayed width in columns.
147164
///
148-
/// Control characters are treated as having zero width,
149-
/// and [emoji presentation sequences](https://unicode.org/reports/tr51/#def_emoji_presentation_sequence)
150-
/// are assigned width 2.
151-
///
152165
/// This function treats characters in the Ambiguous category according
153166
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
154167
as 2 columns wide. This is consistent with the recommendations for
@@ -168,30 +181,48 @@ impl UnicodeWidthStr for str {
168181
}
169182
}
170183

171-
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
172-
enum VariationSelector {
184+
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
185+
enum NextCharInfo {
186+
#[default]
187+
Default,
188+
LineFeed = 0x0A,
173189
Vs15 = 0x0E,
174190
Vs16 = 0x0F,
175191
}
176192

177193
fn str_width(s: &str, is_cjk: bool) -> usize {
178194
s.chars()
179-
.rfold((0, None), |(sum, vsel), c| match c {
180-
'\u{FE0E}' => (sum, Some(VariationSelector::Vs15)),
181-
'\u{FE0F}' => (sum, Some(VariationSelector::Vs16)),
182-
_ => {
183-
let add = match vsel {
184-
Some(VariationSelector::Vs15)
185-
if !is_cjk && cw::starts_non_ideographic_text_presentation_seq(c) =>
186-
{
187-
1
188-
}
189-
190-
Some(VariationSelector::Vs16) if cw::starts_emoji_presentation_seq(c) => 2,
191-
_ => cw::width(c, is_cjk).unwrap_or(0),
192-
};
193-
(sum + add, None)
194-
}
195+
.rfold((0, NextCharInfo::Default), |(sum, next_info), c| {
196+
let (add, info) = width_in_str(c, is_cjk, next_info);
197+
(sum + add, info)
195198
})
196199
.0
197200
}
201+
202+
/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`.
203+
/// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise,
204+
/// they're treated as single width.
205+
#[inline]
206+
fn width_in_str(c: char, is_cjk: bool, next_info: NextCharInfo) -> (usize, NextCharInfo) {
207+
match next_info {
208+
NextCharInfo::Vs15 if !is_cjk && cw::starts_non_ideographic_text_presentation_seq(c) => {
209+
(1, NextCharInfo::Default)
210+
}
211+
NextCharInfo::Vs16 if cw::starts_emoji_presentation_seq(c) => (2, NextCharInfo::Default),
212+
_ => {
213+
if c <= '\u{A0}' {
214+
match c {
215+
'\n' => (1, NextCharInfo::LineFeed),
216+
'\r' if next_info == NextCharInfo::LineFeed => (0, NextCharInfo::Default),
217+
_ => (1, NextCharInfo::Default),
218+
}
219+
} else {
220+
match c {
221+
'\u{FE0E}' => (0, NextCharInfo::Vs15),
222+
'\u{FE0F}' => (0, NextCharInfo::Vs16),
223+
_ => (cw::lookup_width(c, is_cjk), NextCharInfo::Default),
224+
}
225+
}
226+
}
227+
}
228+
}

0 commit comments

Comments
 (0)