Support Lisu tone letters

Jules-Bertholet · Jules-Bertholet · commit b3ab63320475 · 2024-05-21T10:04:33.000-04:00
diff --git a/src/lib.rs b/src/lib.rs
@@ -38,8 +38,10 @@
 //!    - Has the [`Emoji_Presentation`] property, and
 //!    - Is not in the [Enclosed Ideographic Supplement] block.
 //! 3. The sequence `"\r\n"` has width 1.
-//! 4. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
-//! 5. The following have width 0:
+//! 4. [Lisu tone letter] combinations consisting of a character in the range `'\u{A4F8}'..='\u{A4FB}'`
+//!    followed by a character in the range `'\u{A4FC}'..='\u{A4FD}'` have width 1.
+//! 5. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
+//! 6. The following have width 0:
 //!    - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D)
 //!       with the [`Default_Ignorable_Code_Point`] property.
 //!    - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D)
@@ -55,11 +57,11 @@
 //!      - [`'\u{1B43}'` BALINESE VOWEL SIGN PEPET TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B43).
 //!    - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BHangul_Syllable_Type%3DV%7D%5Cp%7BHangul_Syllable_Type%3DT%7D)
 //!       with a [`Hangul_Syllable_Type`] of `Vowel_Jamo` (`V`) or `Trailing_Jamo` (`T`).
-//! 6. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
+//! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
 //!    with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2.
-//! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)
+//! 8. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)
 //!    with an [`East_Asian_Width`] of [`Ambiguous`] have width 2 in an East Asian context, and width 1 otherwise.
-//! 8. All other characters have width 1.
+//! 9. All other characters have width 1.
 //!
 //! [`Default_Ignorable_Code_Point`]: https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095
 //! [`East_Asian_Width`]: https://www.unicode.org/reports/tr11/#ED1
@@ -76,6 +78,8 @@
 //!
 //! [Enclosed Ideographic Supplement]: https://unicode.org/charts/PDF/U1F200.pdf
 //!
+//! [Lisu tone letter]: https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G42078
+//!
 //! ## Canonical equivalence
 //!
 //! The non-CJK width methods guarantee that canonically equivalent strings are assigned the same width.
@@ -185,8 +189,14 @@ impl UnicodeWidthStr for str {
 enum NextCharInfo {
     #[default]
     Default,
+    /// The following character is `'\n'` (LINE FEED).
     LineFeed = 0x0A,
+    /// The following character is a Lisu tone letter in `'\u{A4FC}'..='\u{A4FD}'`;
+    /// see <https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G42078>.
+    TrailingLisuToneLetter,
+    /// The following character is `'\u{FE0E}'` (VARIATION SELECTOR-15).
     Vs15 = 0x0E,
+    /// The following character is `'\u{FE0F}'` (VARIATION SELECTOR-16).
     Vs16 = 0x0F,
 }
 
@@ -204,25 +214,28 @@ fn str_width(s: &str, is_cjk: bool) -> usize {
 /// they're treated as single width.
 #[inline]
 fn width_in_str(c: char, is_cjk: bool, next_info: NextCharInfo) -> (usize, NextCharInfo) {
-    match next_info {
-        NextCharInfo::Vs15 if !is_cjk && cw::starts_non_ideographic_text_presentation_seq(c) => {
-            (1, NextCharInfo::Default)
+    if next_info == NextCharInfo::Vs16 && cw::starts_emoji_presentation_seq(c) {
+        (2, NextCharInfo::Default)
+    } else if c <= '\u{A0}' {
+        match c {
+            '\n' => (1, NextCharInfo::LineFeed),
+            '\r' if next_info == NextCharInfo::LineFeed => (0, NextCharInfo::Default),
+            _ => (1, NextCharInfo::Default),
         }
-        NextCharInfo::Vs16 if cw::starts_emoji_presentation_seq(c) => (2, NextCharInfo::Default),
-        _ => {
-            if c <= '\u{A0}' {
-                match c {
-                    '\n' => (1, NextCharInfo::LineFeed),
-                    '\r' if next_info == NextCharInfo::LineFeed => (0, NextCharInfo::Default),
-                    _ => (1, NextCharInfo::Default),
-                }
-            } else {
-                match c {
-                    '\u{FE0E}' => (0, NextCharInfo::Vs15),
-                    '\u{FE0F}' => (0, NextCharInfo::Vs16),
-                    _ => (cw::lookup_width(c, is_cjk), NextCharInfo::Default),
-                }
+    } else {
+        match (c, next_info) {
+            ('\u{A4F8}'..='\u{A4FB}', NextCharInfo::TrailingLisuToneLetter) => {
+                (0, NextCharInfo::Default)
+            }
+            ('\u{A4FC}'..='\u{A4FD}', _) => (1, NextCharInfo::TrailingLisuToneLetter),
+            ('\u{FE0E}', _) => (0, NextCharInfo::Vs15),
+            ('\u{FE0F}', _) => (0, NextCharInfo::Vs16),
+            (_, NextCharInfo::Vs15)
+                if !is_cjk && cw::starts_non_ideographic_text_presentation_seq(c) =>
+            {
+                (1, NextCharInfo::Default)
             }
+            _ => (cw::lookup_width(c, is_cjk), NextCharInfo::Default),
         }
     }
 }
diff --git a/tests/tests.rs b/tests/tests.rs
@@ -234,3 +234,27 @@ fn char_str_consistent() {
         assert_eq!(c.width().unwrap_or(1), s.width())
     }
 }
+
+#[test]
+fn test_lisu_tones() {
+    for c in '\u{A4F8}'..='\u{A4FD}' {
+        assert_eq!(c.width(), Some(1));
+        assert_eq!(String::from(c).width(), 1);
+    }
+    for c1 in '\u{A4F8}'..='\u{A4FD}' {
+        for c2 in '\u{A4F8}'..='\u{A4FD}' {
+            let mut s = String::with_capacity(8);
+            s.push(c1);
+            s.push(c2);
+            match (c1, c2) {
+                ('\u{A4F8}'..='\u{A4FB}', '\u{A4FC}'..='\u{A4FD}') => assert_eq!(s.width(), 1),
+                _ => assert_eq!(s.width(), 2),
+            }
+        }
+    }
+
+    assert_eq!("ꓪꓹ".width(), 2);
+    assert_eq!("ꓪꓹꓼ".width(), 2);
+    assert_eq!("ꓪꓹꓹ".width(), 3);
+    assert_eq!("ꓪꓼꓼ".width(), 3);
+}