34
34
//!
35
35
//! 1. [Emoji presentation sequences] have width 2.
36
36
//! (The width of a string may therefore differ from the sum of the widths of its characters.)
37
- //! 2. Outside of an East Asian context, [text presentation sequences] have width 1
38
- //! iff their base character fulfills all the following requirements :
37
+ //! 2. Outside of an East Asian context, [text presentation sequences] whose base character fulfills all the following requirements
38
+ //! have width 1 :
39
39
//! - Has the [`Emoji_Presentation`] property, and
40
40
//! - Not in the [Enclosed Ideographic Supplement] block.
41
- //! 3. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
42
- //! 4. The following have width 0:
41
+ //! 3. The sequence `"\r\n"` has width 1.
42
+ //! 4. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
43
+ //! 5. The following have width 0:
43
44
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D)
44
45
//! with the [`Default_Ignorable_Code_Point`](https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095) property.
45
46
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D)
55
56
//! - [`'\u{1B43}'` BALINESE VOWEL SIGN PEPET TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B43).
56
57
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BHangul_Syllable_Type%3DV%7D%5Cp%7BHangul_Syllable_Type%3DT%7D)
57
58
//! with a [`Hangul_Syllable_Type`] of `Vowel_Jamo` (`V`) or `Trailing_Jamo` (`T`).
58
- //! - [`'\0'` NUL](https://util.unicode.org/UnicodeJsps/character.jsp?a=0000).
59
- //! 5. The [control characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BCc%7D)
60
- //! have no defined width, and are ignored when determining the width of a string.
61
59
//! 6. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
62
60
//! with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2.
63
61
//! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)
@@ -99,7 +97,7 @@ mod tables;
99
97
/// Methods for determining displayed width of Unicode characters.
100
98
pub trait UnicodeWidthChar {
101
99
/// Returns the character's displayed width in columns, or `None` if the
102
- /// character is a control character other than `'\x00'` .
100
+ /// character is a control character.
103
101
///
104
102
/// This function treats characters in the Ambiguous category according
105
103
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
@@ -108,7 +106,7 @@ pub trait UnicodeWidthChar {
108
106
fn width ( self ) -> Option < usize > ;
109
107
110
108
/// Returns the character's displayed width in columns, or `None` if the
111
- /// character is a control character other than `'\x00'` .
109
+ /// character is a control character.
112
110
///
113
111
/// This function treats characters in the Ambiguous category according
114
112
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
@@ -120,23 +118,42 @@ pub trait UnicodeWidthChar {
120
118
impl UnicodeWidthChar for char {
121
119
#[ inline]
122
120
fn width ( self ) -> Option < usize > {
123
- cw :: width ( self , false )
121
+ single_char_width ( self , false )
124
122
}
125
123
126
124
#[ inline]
127
125
fn width_cjk ( self ) -> Option < usize > {
128
- cw:: width ( self , true )
126
+ single_char_width ( self , true )
127
+ }
128
+ }
129
+
130
+ /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or
131
+ /// `None` if `c` is a control character.
132
+ /// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise,
133
+ /// they're treated as single width.
134
+ #[ inline]
135
+ fn single_char_width ( c : char , is_cjk : bool ) -> Option < usize > {
136
+ if c < '\u{7F}' {
137
+ if c >= '\u{20}' {
138
+ // U+0020 to U+007F (exclusive) are single-width ASCII codepoints
139
+ Some ( 1 )
140
+ } else {
141
+ // U+0000 to U+0020 (exclusive) are control codes
142
+ None
143
+ }
144
+ } else if c >= '\u{A0}' {
145
+ // No characters >= U+00A0 are control codes, so we can consult the lookup tables
146
+ Some ( cw:: lookup_width ( c, is_cjk) )
147
+ } else {
148
+ // U+007F to U+00A0 (exclusive) are control codes
149
+ None
129
150
}
130
151
}
131
152
132
153
/// Methods for determining displayed width of Unicode strings.
133
154
pub trait UnicodeWidthStr {
134
155
/// Returns the string's displayed width in columns.
135
156
///
136
- /// Control characters are treated as having zero width,
137
- /// and [emoji presentation sequences](https://unicode.org/reports/tr51/#def_emoji_presentation_sequence)
138
- /// are assigned width 2.
139
- ///
140
157
/// This function treats characters in the Ambiguous category according
141
158
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
142
159
/// as 1 column wide. This is consistent with the recommendations for
@@ -145,10 +162,6 @@ pub trait UnicodeWidthStr {
145
162
146
163
/// Returns the string's displayed width in columns.
147
164
///
148
- /// Control characters are treated as having zero width,
149
- /// and [emoji presentation sequences](https://unicode.org/reports/tr51/#def_emoji_presentation_sequence)
150
- /// are assigned width 2.
151
- ///
152
165
/// This function treats characters in the Ambiguous category according
153
166
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
154
167
/// as 2 column wide. This is consistent with the recommendations for
@@ -168,30 +181,48 @@ impl UnicodeWidthStr for str {
168
181
}
169
182
}
170
183
171
- #[ derive( Clone , Copy , Debug , PartialEq , Eq ) ]
172
- enum VariationSelector {
184
+ #[ derive( Clone , Copy , Debug , Default , PartialEq , Eq ) ]
185
+ enum NextCharInfo {
186
+ #[ default]
187
+ Default ,
188
+ LineFeed = 0x0A ,
173
189
Vs15 = 0x0E ,
174
190
Vs16 = 0x0F ,
175
191
}
176
192
177
193
fn str_width ( s : & str , is_cjk : bool ) -> usize {
178
194
s. chars ( )
179
- . rfold ( ( 0 , None ) , |( sum, vsel) , c| match c {
180
- '\u{FE0E}' => ( sum, Some ( VariationSelector :: Vs15 ) ) ,
181
- '\u{FE0F}' => ( sum, Some ( VariationSelector :: Vs16 ) ) ,
182
- _ => {
183
- let add = match vsel {
184
- Some ( VariationSelector :: Vs15 )
185
- if !is_cjk && cw:: starts_non_ideographic_text_presentation_seq ( c) =>
186
- {
187
- 1
188
- }
189
-
190
- Some ( VariationSelector :: Vs16 ) if cw:: starts_emoji_presentation_seq ( c) => 2 ,
191
- _ => cw:: width ( c, is_cjk) . unwrap_or ( 0 ) ,
192
- } ;
193
- ( sum + add, None )
194
- }
195
+ . rfold ( ( 0 , NextCharInfo :: Default ) , |( sum, next_info) , c| {
196
+ let ( add, info) = width_in_str ( c, is_cjk, next_info) ;
197
+ ( sum + add, info)
195
198
} )
196
199
. 0
197
200
}
201
+
202
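+ /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c` as it appears within a string, paired with the `NextCharInfo` state to hand to the character that precedes `c` (the string is scanned back to front, so `next_info` describes the character that follows `c`).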
203
+ /// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise,
204
+ /// they're treated as single width.
205
+ #[ inline]
206
+ fn width_in_str ( c : char , is_cjk : bool , next_info : NextCharInfo ) -> ( usize , NextCharInfo ) {
207
+ match next_info {
208
+ NextCharInfo :: Vs15 if !is_cjk && cw:: starts_non_ideographic_text_presentation_seq ( c) => {
209
+ ( 1 , NextCharInfo :: Default )
210
+ }
211
+ NextCharInfo :: Vs16 if cw:: starts_emoji_presentation_seq ( c) => ( 2 , NextCharInfo :: Default ) ,
212
+ _ => {
213
+ if c <= '\u{A0}' {
214
+ match c {
215
+ '\n' => ( 1 , NextCharInfo :: LineFeed ) ,
216
+ '\r' if next_info == NextCharInfo :: LineFeed => ( 0 , NextCharInfo :: Default ) ,
217
+ _ => ( 1 , NextCharInfo :: Default ) ,
218
+ }
219
+ } else {
220
+ match c {
221
+ '\u{FE0E}' => ( 0 , NextCharInfo :: Vs15 ) ,
222
+ '\u{FE0F}' => ( 0 , NextCharInfo :: Vs16 ) ,
223
+ _ => ( cw:: lookup_width ( c, is_cjk) , NextCharInfo :: Default ) ,
224
+ }
225
+ }
226
+ }
227
+ }
228
+ }
0 commit comments