Skip to content

Commit f48c5ec

Browse files
committed
Mark and implement `char::encode_utf16` as const; rewrite `encode_utf16_raw`
1 parent bfadadf commit f48c5ec

File tree

2 files changed

+36
-25
lines changed

2 files changed

+36
-25
lines changed

library/core/src/char/methods.rs

+35-25
Original file line numberDiff line numberDiff line change
@@ -638,8 +638,7 @@ impl char {
638638
#[rustc_const_stable(feature = "const_char_len_utf", since = "1.52.0")]
639639
#[inline]
640640
pub const fn len_utf16(self) -> usize {
641-
let ch = self as u32;
642-
if (ch & 0xFFFF) == ch { 1 } else { 2 }
641+
len_utf16(self as u32)
643642
}
644643

645644
/// Encodes this character as UTF-8 into the provided byte buffer,
@@ -709,8 +708,9 @@ impl char {
709708
/// '𝕊'.encode_utf16(&mut b);
710709
/// ```
711710
#[stable(feature = "unicode_encode_char", since = "1.15.0")]
711+
#[rustc_const_unstable(feature = "const_char_encode_utf16", issue = "130660")]
712712
#[inline]
713-
pub fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16] {
713+
pub const fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16] {
714714
encode_utf16_raw(self as u32, dst)
715715
}
716716

@@ -1745,7 +1745,12 @@ const fn len_utf8(code: u32) -> usize {
17451745
}
17461746
}
17471747

1748-
/// Encodes a raw u32 value as UTF-8 into the provided byte buffer,
1748+
#[inline]
1749+
const fn len_utf16(code: u32) -> usize {
1750+
if (code & 0xFFFF) == code { 1 } else { 2 }
1751+
}
1752+
1753+
/// Encodes a raw `u32` value as UTF-8 into the provided byte buffer,
17491754
/// and then returns the subslice of the buffer that contains the encoded character.
17501755
///
17511756
/// Unlike `char::encode_utf8`, this method also handles codepoints in the surrogate range.
@@ -1799,7 +1804,7 @@ pub const fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> &mut [u8] {
17991804
unsafe { slice::from_raw_parts_mut(dst.as_mut_ptr(), len) }
18001805
}
18011806

1802-
/// Encodes a raw u32 value as UTF-16 into the provided `u16` buffer,
1807+
/// Encodes a raw `u32` value as UTF-16 into the provided `u16` buffer,
18031808
/// and then returns the subslice of the buffer that contains the encoded character.
18041809
///
18051810
/// Unlike `char::encode_utf16`, this method also handles codepoints in the surrogate range.
@@ -1810,28 +1815,33 @@ pub const fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> &mut [u8] {
18101815
/// Panics if the buffer is not large enough.
18111816
/// A buffer of length 2 is large enough to encode any `char`.
18121817
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
1818+
#[rustc_const_unstable(feature = "const_char_encode_utf16", issue = "130660")]
18131819
#[doc(hidden)]
18141820
#[inline]
1815-
pub fn encode_utf16_raw(mut code: u32, dst: &mut [u16]) -> &mut [u16] {
1816-
// SAFETY: each arm checks whether there are enough bits to write into
1817-
unsafe {
1818-
if (code & 0xFFFF) == code && !dst.is_empty() {
1819-
// The BMP falls through
1820-
*dst.get_unchecked_mut(0) = code as u16;
1821-
slice::from_raw_parts_mut(dst.as_mut_ptr(), 1)
1822-
} else if dst.len() >= 2 {
1823-
// Supplementary planes break into surrogates.
1821+
pub const fn encode_utf16_raw(mut code: u32, dst: &mut [u16]) -> &mut [u16] {
1822+
const fn panic_at_const(_code: u32, _len: usize, _dst_len: usize) {
1823+
// Note that we cannot format in constant expressions.
1824+
panic!("encode_utf16: buffer does not have enough units to encode code point");
1825+
}
1826+
fn panic_at_rt(code: u32, len: usize, dst_len: usize) {
1827+
panic!(
1828+
"encode_utf16: need {len} units to encode U+{code:04X} but buffer has just {dst_len}",
1829+
);
1830+
}
1831+
let len = len_utf16(code);
1832+
match (len, &mut *dst) {
1833+
(1, [a, ..]) => {
1834+
*a = code as u16;
1835+
}
1836+
(2, [a, b, ..]) => {
18241837
code -= 0x1_0000;
1825-
*dst.get_unchecked_mut(0) = 0xD800 | ((code >> 10) as u16);
1826-
*dst.get_unchecked_mut(1) = 0xDC00 | ((code as u16) & 0x3FF);
1827-
slice::from_raw_parts_mut(dst.as_mut_ptr(), 2)
1828-
} else {
1829-
panic!(
1830-
"encode_utf16: need {} units to encode U+{:X}, but the buffer has {}",
1831-
char::from_u32_unchecked(code).len_utf16(),
1832-
code,
1833-
dst.len(),
1834-
)
1838+
1839+
*a = (code >> 10) as u16 | 0xD800;
1840+
*b = (code & 0x3FF) as u16 | 0xDC00;
18351841
}
1836-
}
1842+
// FIXME(const-hack): We would prefer to have streamlined panics when formatters become const-friendly.
1843+
_ => const_eval_select((code, len, dst.len()), panic_at_const, panic_at_rt),
1844+
};
1845+
// SAFETY: `<&mut [u16]>::as_mut_ptr` is guaranteed to return a valid pointer and `len` has been tested to be within bounds.
1846+
unsafe { slice::from_raw_parts_mut(dst.as_mut_ptr(), len) }
18371847
}

library/core/src/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@
119119
#![feature(const_bigint_helper_methods)]
120120
#![feature(const_black_box)]
121121
#![feature(const_cell_into_inner)]
122+
#![feature(const_char_encode_utf16)]
122123
#![feature(const_char_encode_utf8)]
123124
#![feature(const_eval_select)]
124125
#![feature(const_exact_div)]

0 commit comments

Comments
 (0)