@@ -593,16 +593,7 @@ impl char {
593
593
#[ stable( feature = "rust1" , since = "1.0.0" ) ]
594
594
#[ inline]
595
595
pub fn len_utf8 ( self ) -> usize {
596
- let code = self as u32 ;
597
- if code < MAX_ONE_B {
598
- 1
599
- } else if code < MAX_TWO_B {
600
- 2
601
- } else if code < MAX_THREE_B {
602
- 3
603
- } else {
604
- 4
605
- }
596
+ len_utf8 ( self as u32 )
606
597
}
607
598
608
599
/// Returns the number of 16-bit code units this `char` would need if
@@ -670,36 +661,8 @@ impl char {
670
661
#[ stable( feature = "unicode_encode_char" , since = "1.15.0" ) ]
671
662
#[ inline]
672
663
pub fn encode_utf8 ( self , dst : & mut [ u8 ] ) -> & mut str {
673
- let code = self as u32 ;
674
- let len = self . len_utf8 ( ) ;
675
- match ( len, & mut dst[ ..] ) {
676
- ( 1 , [ a, ..] ) => {
677
- * a = code as u8 ;
678
- }
679
- ( 2 , [ a, b, ..] ) => {
680
- * a = ( code >> 6 & 0x1F ) as u8 | TAG_TWO_B ;
681
- * b = ( code & 0x3F ) as u8 | TAG_CONT ;
682
- }
683
- ( 3 , [ a, b, c, ..] ) => {
684
- * a = ( code >> 12 & 0x0F ) as u8 | TAG_THREE_B ;
685
- * b = ( code >> 6 & 0x3F ) as u8 | TAG_CONT ;
686
- * c = ( code & 0x3F ) as u8 | TAG_CONT ;
687
- }
688
- ( 4 , [ a, b, c, d, ..] ) => {
689
- * a = ( code >> 18 & 0x07 ) as u8 | TAG_FOUR_B ;
690
- * b = ( code >> 12 & 0x3F ) as u8 | TAG_CONT ;
691
- * c = ( code >> 6 & 0x3F ) as u8 | TAG_CONT ;
692
- * d = ( code & 0x3F ) as u8 | TAG_CONT ;
693
- }
694
- _ => panic ! (
695
- "encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}" ,
696
- len,
697
- code,
698
- dst. len( ) ,
699
- ) ,
700
- } ;
701
- // SAFETY: We just wrote UTF-8 content in, so converting to str is fine.
702
- unsafe { from_utf8_unchecked_mut ( & mut dst[ ..len] ) }
664
+ // SAFETY: `char` is not a surrogate, so this is valid UTF-8.
665
+ unsafe { from_utf8_unchecked_mut ( encode_utf8_raw ( self as u32 , dst) ) }
703
666
}
704
667
705
668
/// Encodes this character as UTF-16 into the provided `u16` buffer,
@@ -739,28 +702,7 @@ impl char {
739
702
#[ stable( feature = "unicode_encode_char" , since = "1.15.0" ) ]
740
703
#[ inline]
741
704
pub fn encode_utf16 ( self , dst : & mut [ u16 ] ) -> & mut [ u16 ] {
742
- let mut code = self as u32 ;
743
- // SAFETY: each arm checks whether there are enough bits to write into
744
- unsafe {
745
- if ( code & 0xFFFF ) == code && !dst. is_empty ( ) {
746
- // The BMP falls through (assuming non-surrogate, as it should)
747
- * dst. get_unchecked_mut ( 0 ) = code as u16 ;
748
- slice:: from_raw_parts_mut ( dst. as_mut_ptr ( ) , 1 )
749
- } else if dst. len ( ) >= 2 {
750
- // Supplementary planes break into surrogates.
751
- code -= 0x1_0000 ;
752
- * dst. get_unchecked_mut ( 0 ) = 0xD800 | ( ( code >> 10 ) as u16 ) ;
753
- * dst. get_unchecked_mut ( 1 ) = 0xDC00 | ( ( code as u16 ) & 0x3FF ) ;
754
- slice:: from_raw_parts_mut ( dst. as_mut_ptr ( ) , 2 )
755
- } else {
756
- panic ! (
757
- "encode_utf16: need {} units to encode U+{:X}, but the buffer has {}" ,
758
- from_u32_unchecked( code) . len_utf16( ) ,
759
- code,
760
- dst. len( ) ,
761
- )
762
- }
763
- }
705
+ encode_utf16_raw ( self as u32 , dst)
764
706
}
765
707
766
708
/// Returns `true` if this `char` has the `Alphabetic` property.
@@ -1673,3 +1615,100 @@ impl char {
1673
1615
}
1674
1616
}
1675
1617
}
1618
+
1619
+ #[ inline]
1620
+ fn len_utf8 ( code : u32 ) -> usize {
1621
+ if code < MAX_ONE_B {
1622
+ 1
1623
+ } else if code < MAX_TWO_B {
1624
+ 2
1625
+ } else if code < MAX_THREE_B {
1626
+ 3
1627
+ } else {
1628
+ 4
1629
+ }
1630
+ }
1631
+
1632
+ /// Encodes a raw u32 value as UTF-8 into the provided byte buffer,
1633
+ /// and then returns the subslice of the buffer that contains the encoded character.
1634
+ ///
1635
+ /// Unlike `char::encode_utf8`, this method also handles codepoints in the surrogate range.
1636
+ /// (Creating a `char` in the surrogate range is UB.)
1637
+ /// The result is valid [generalized UTF-8] but not valid UTF-8.
1638
+ ///
1639
+ /// [generalized UTF-8]: https://simonsapin.github.io/wtf-8/#generalized-utf8
1640
+ ///
1641
+ /// # Panics
1642
+ ///
1643
+ /// Panics if the buffer is not large enough.
1644
+ /// A buffer of length four is large enough to encode any `char`.
1645
+ #[ unstable( feature = "char_internals" , reason = "exposed only for libstd" , issue = "none" ) ]
1646
+ #[ doc( hidden) ]
1647
+ #[ inline]
1648
+ pub fn encode_utf8_raw ( code : u32 , dst : & mut [ u8 ] ) -> & mut [ u8 ] {
1649
+ let len = len_utf8 ( code) ;
1650
+ match ( len, & mut dst[ ..] ) {
1651
+ ( 1 , [ a, ..] ) => {
1652
+ * a = code as u8 ;
1653
+ }
1654
+ ( 2 , [ a, b, ..] ) => {
1655
+ * a = ( code >> 6 & 0x1F ) as u8 | TAG_TWO_B ;
1656
+ * b = ( code & 0x3F ) as u8 | TAG_CONT ;
1657
+ }
1658
+ ( 3 , [ a, b, c, ..] ) => {
1659
+ * a = ( code >> 12 & 0x0F ) as u8 | TAG_THREE_B ;
1660
+ * b = ( code >> 6 & 0x3F ) as u8 | TAG_CONT ;
1661
+ * c = ( code & 0x3F ) as u8 | TAG_CONT ;
1662
+ }
1663
+ ( 4 , [ a, b, c, d, ..] ) => {
1664
+ * a = ( code >> 18 & 0x07 ) as u8 | TAG_FOUR_B ;
1665
+ * b = ( code >> 12 & 0x3F ) as u8 | TAG_CONT ;
1666
+ * c = ( code >> 6 & 0x3F ) as u8 | TAG_CONT ;
1667
+ * d = ( code & 0x3F ) as u8 | TAG_CONT ;
1668
+ }
1669
+ _ => panic ! (
1670
+ "encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}" ,
1671
+ len,
1672
+ code,
1673
+ dst. len( ) ,
1674
+ ) ,
1675
+ } ;
1676
+ & mut dst[ ..len]
1677
+ }
1678
+
1679
+ /// Encodes a raw u32 value as UTF-16 into the provided `u16` buffer,
1680
+ /// and then returns the subslice of the buffer that contains the encoded character.
1681
+ ///
1682
+ /// Unlike `char::encode_utf16`, this method also handles codepoints in the surrogate range.
1683
+ /// (Creating a `char` in the surrogate range is UB.)
1684
+ ///
1685
+ /// # Panics
1686
+ ///
1687
+ /// Panics if the buffer is not large enough.
1688
+ /// A buffer of length 2 is large enough to encode any `char`.
1689
+ #[ unstable( feature = "char_internals" , reason = "exposed only for libstd" , issue = "none" ) ]
1690
+ #[ doc( hidden) ]
1691
+ #[ inline]
1692
pub fn encode_utf16_raw(code: u32, dst: &mut [u16]) -> &mut [u16] {
    // Number of `u16` units required: one if `code` fits in 16 bits (the BMP,
    // surrogates included), otherwise two. This is derived from the raw value
    // rather than via `from_u32_unchecked`, because `code` may be a surrogate
    // and materialising a surrogate `char` is undefined behaviour.
    let len = if (code & 0xFFFF) == code { 1 } else { 2 };
    match (len, &mut dst[..]) {
        (1, [a, ..]) => {
            // The BMP is stored as-is.
            *a = code as u16;
        }
        (2, [a, b, ..]) => {
            // Supplementary planes break into a high/low surrogate pair.
            // `code` is at least 0x1_0000 on this arm, so this cannot underflow.
            let offset = code - 0x1_0000;
            *a = 0xD800 | ((offset >> 10) as u16);
            *b = 0xDC00 | ((offset as u16) & 0x3FF);
        }
        // No arm matched, so the buffer holds fewer than `len` units.
        _ => panic!(
            "encode_utf16: need {} units to encode U+{:X}, but the buffer has {}",
            len,
            code,
            dst.len(),
        ),
    };
    // SAFETY: the match above only completes without panicking when `dst`
    // holds at least `len` units, so the first `len` units are in bounds.
    unsafe { slice::from_raw_parts_mut(dst.as_mut_ptr(), len) }
}
0 commit comments