Skip to content

Commit b3ab633

Browse files
Support Lisu tone letters
1 parent ded852c commit b3ab633

File tree

2 files changed

+59
-22
lines changed

2 files changed

+59
-22
lines changed

src/lib.rs

+35-22
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,10 @@
3838
//! - Has the [`Emoji_Presentation`] property, and
3939
//! - Is not in the [Enclosed Ideographic Supplement] block.
4040
//! 3. The sequence `"\r\n"` has width 1.
41-
//! 4. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
42-
//! 5. The following have width 0:
41+
//! 4. [Lisu tone letter] combinations consisting of a character in the range `'\u{A4F8}'..='\u{A4FB}'`
42+
//! followed by a character in the range `'\u{A4FC}'..='\u{A4FD}'` have width 1.
43+
//! 5. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
44+
//! 6. The following have width 0:
4345
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D)
4446
//! with the [`Default_Ignorable_Code_Point`] property.
4547
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D)
@@ -55,11 +57,11 @@
5557
//! - [`'\u{1B43}'` BALINESE VOWEL SIGN PEPET TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B43).
5658
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BHangul_Syllable_Type%3DV%7D%5Cp%7BHangul_Syllable_Type%3DT%7D)
5759
//! with a [`Hangul_Syllable_Type`] of `Vowel_Jamo` (`V`) or `Trailing_Jamo` (`T`).
58-
//! 6. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
60+
//! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
5961
//! with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2.
60-
//! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)
62+
//! 8. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)
6163
//! with an [`East_Asian_Width`] of [`Ambiguous`] have width 2 in an East Asian context, and width 1 otherwise.
62-
//! 8. All other characters have width 1.
64+
//! 9. All other characters have width 1.
6365
//!
6466
//! [`Default_Ignorable_Code_Point`]: https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095
6567
//! [`East_Asian_Width`]: https://www.unicode.org/reports/tr11/#ED1
@@ -76,6 +78,8 @@
7678
//!
7779
//! [Enclosed Ideographic Supplement]: https://unicode.org/charts/PDF/U1F200.pdf
7880
//!
81+
//! [Lisu tone letter]: https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G42078
82+
//!
7983
//! ## Canonical equivalence
8084
//!
8185
//! The non-CJK width methods guarantee that canonically equivalent strings are assigned the same width.
@@ -185,8 +189,14 @@ impl UnicodeWidthStr for str {
185189
// State threaded through `width_in_str`: it is received as `next_info` and a
// fresh value is returned alongside each width. Because `'\n'` produces
// `LineFeed` and `'\r'` then checks for it, the value describes the character
// that FOLLOWS the one currently being measured.
// NOTE(review): the explicit discriminants equal the low byte of the code point
// each variant stands for (0x0A, 0x0E, 0x0F) — confirm whether any code outside
// this view relies on that before renumbering.
enum NextCharInfo {
186190
#[default]
187191
// No special following character; the ordinary width rules apply.
Default,
192+
/// `'\n'`
188193
LineFeed = 0x0A,
194+
/// `'\u{A4FC}'..='\u{A4FD}'`
195+
/// <https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G42078>
196+
// No explicit discriminant: Rust assigns the previous value plus one (0x0B).
TrailingLisuToneLetter,
197+
/// `'\u{FE0E}'`
189198
Vs15 = 0x0E,
199+
/// `'\u{FE0F}'`
190200
Vs16 = 0x0F,
191201
}
192202

@@ -204,25 +214,28 @@ fn str_width(s: &str, is_cjk: bool) -> usize {
204214
/// they're treated as single width.
205215
#[inline]
206216
// Returns the width contributed by `c` together with the `NextCharInfo` to hand
// to the call for the character that precedes `c`; `next_info` describes the
// character that follows `c`.
// NOTE(review): this span is a diff rendering. Content lines under a marker
// ending in `-` appear to be the removed pre-commit `match next_info`
// implementation; lines under a `+` marker are its replacement.
fn width_in_str(c: char, is_cjk: bool, next_info: NextCharInfo) -> (usize, NextCharInfo) {
207-
match next_info {
208-
NextCharInfo::Vs15 if !is_cjk && cw::starts_non_ideographic_text_presentation_seq(c) => {
209-
(1, NextCharInfo::Default)
217+
// Base of an emoji presentation sequence (followed by U+FE0F): width 2.
// The selector itself contributes 0 in the `'\u{FE0F}'` arm further down.
if next_info == NextCharInfo::Vs16 && cw::starts_emoji_presentation_seq(c) {
218+
(2, NextCharInfo::Default)
219+
// Code points up to U+00A0 are resolved here without calling `cw::lookup_width`.
} else if c <= '\u{A0}' {
220+
match c {
221+
'\n' => (1, NextCharInfo::LineFeed),
222+
// CR directly followed by LF adds nothing, so "\r\n" totals width 1.
'\r' if next_info == NextCharInfo::LineFeed => (0, NextCharInfo::Default),
223+
_ => (1, NextCharInfo::Default),
210224
}
211-
NextCharInfo::Vs16 if cw::starts_emoji_presentation_seq(c) => (2, NextCharInfo::Default),
212-
_ => {
213-
if c <= '\u{A0}' {
214-
match c {
215-
'\n' => (1, NextCharInfo::LineFeed),
216-
'\r' if next_info == NextCharInfo::LineFeed => (0, NextCharInfo::Default),
217-
_ => (1, NextCharInfo::Default),
218-
}
219-
} else {
220-
match c {
221-
'\u{FE0E}' => (0, NextCharInfo::Vs15),
222-
'\u{FE0F}' => (0, NextCharInfo::Vs16),
223-
_ => (cw::lookup_width(c, is_cjk), NextCharInfo::Default),
224-
}
225+
} else {
226+
match (c, next_info) {
227+
// Leading Lisu tone letter followed by a trailing one: the pair's single
// column was already counted on the trailing letter, so add 0 here.
('\u{A4F8}'..='\u{A4FB}', NextCharInfo::TrailingLisuToneLetter) => {
228+
(0, NextCharInfo::Default)
229+
}
230+
// Trailing Lisu tone letter: counts 1 and flags itself for the preceding char.
('\u{A4FC}'..='\u{A4FD}', _) => (1, NextCharInfo::TrailingLisuToneLetter),
231+
// Variation selectors contribute 0 and are reported to the preceding char.
('\u{FE0E}', _) => (0, NextCharInfo::Vs15),
232+
('\u{FE0F}', _) => (0, NextCharInfo::Vs16),
233+
// Outside CJK mode, the base of a text presentation sequence
// (followed by U+FE0E) is width 1.
(_, NextCharInfo::Vs15)
234+
if !is_cjk && cw::starts_non_ideographic_text_presentation_seq(c) =>
235+
{
236+
(1, NextCharInfo::Default)
225237
}
238+
// Everything else defers to the `cw::lookup_width` lookup.
_ => (cw::lookup_width(c, is_cjk), NextCharInfo::Default),
226239
}
227240
}
228241
}

tests/tests.rs

+24
Original file line numberDiff line numberDiff line change
@@ -234,3 +234,27 @@ fn char_str_consistent() {
234234
assert_eq!(c.width().unwrap_or(1), s.width())
235235
}
236236
}
237+
238+
// Checks the Lisu tone letter rule: a character in U+A4F8..=U+A4FB followed by
// one in U+A4FC..=U+A4FD measures 1 as a pair; every other arrangement is additive.
#[test]
239+
fn test_lisu_tones() {
240+
// Each tone letter on its own has width 1, both as a `char` and as a string.
for c in '\u{A4F8}'..='\u{A4FD}' {
241+
assert_eq!(c.width(), Some(1));
242+
assert_eq!(String::from(c).width(), 1);
243+
}
// Exhaustively try every ordered pair of tone letters.
244+
for c1 in '\u{A4F8}'..='\u{A4FD}' {
245+
for c2 in '\u{A4F8}'..='\u{A4FD}' {
246+
let mut s = String::with_capacity(8);
247+
s.push(c1);
248+
s.push(c2);
249+
match (c1, c2) {
250+
// Only a leading letter followed by a trailing letter collapses to 1.
('\u{A4F8}'..='\u{A4FB}', '\u{A4FC}'..='\u{A4FD}') => assert_eq!(s.width(), 1),
251+
_ => assert_eq!(s.width(), 2),
252+
}
253+
}
254+
}
255+
// Literal-string cases. NOTE(review): presumably ꓪ is an ordinary Lisu letter,
// ꓹ is U+A4F9 (leading) and ꓼ is U+A4FC (trailing) — confirm the code points.
256+
assert_eq!("ꓪꓹ".width(), 2);
257+
// The final two characters form one combination, so three chars measure 2.
assert_eq!("ꓪꓹꓼ".width(), 2);
258+
// Two leading letters, or two trailing letters, in a row do not combine.
assert_eq!("ꓪꓹꓹ".width(), 3);
259+
assert_eq!("ꓪꓼꓼ".width(), 3);
260+
}

0 commit comments

Comments
 (0)