Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add {floor,ceil}_char_boundary methods to str #86497

Merged
merged 1 commit into from
Feb 8, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions library/alloc/tests/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#![feature(binary_heap_as_slice)]
#![feature(inplace_iteration)]
#![feature(iter_advance_by)]
#![feature(round_char_boundary)]
#![feature(slice_group_by)]
#![feature(slice_partition_dedup)]
#![feature(string_remove_matches)]
Expand Down
92 changes: 92 additions & 0 deletions library/alloc/tests/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2272,3 +2272,95 @@ fn utf8_char_counts() {
}
}
}

#[test]
fn floor_char_boundary() {
fn check_many(s: &str, arg: impl IntoIterator<Item = usize>, ret: usize) {
for idx in arg {
assert_eq!(
s.floor_char_boundary(idx),
ret,
"{:?}.floor_char_boundary({:?}) != {:?}",
s,
idx,
ret
);
}
}

// edge case
check_many("", [0, 1, isize::MAX as usize, usize::MAX], 0);

// basic check
check_many("x", [0], 0);
check_many("x", [1, isize::MAX as usize, usize::MAX], 1);

// 1-byte chars
check_many("jp", [0], 0);
check_many("jp", [1], 1);
check_many("jp", 2..4, 2);

// 2-byte chars
check_many("ĵƥ", 0..2, 0);
check_many("ĵƥ", 2..4, 2);
check_many("ĵƥ", 4..6, 4);

// 3-byte chars
check_many("日本", 0..3, 0);
check_many("日本", 3..6, 3);
check_many("日本", 6..8, 6);

// 4-byte chars
check_many("🇯🇵", 0..4, 0);
check_many("🇯🇵", 4..8, 4);
check_many("🇯🇵", 8..10, 8);
}

#[test]
fn ceil_char_boundary() {
fn check_many(s: &str, arg: impl IntoIterator<Item = usize>, ret: usize) {
for idx in arg {
assert_eq!(
s.ceil_char_boundary(idx),
ret,
"{:?}.ceil_char_boundary({:?}) != {:?}",
s,
idx,
ret
);
}
}

// edge case
check_many("", [0], 0);

// basic check
check_many("x", [0], 0);
check_many("x", [1], 1);

// 1-byte chars
check_many("jp", [0], 0);
check_many("jp", [1], 1);
check_many("jp", [2], 2);

// 2-byte chars
check_many("ĵƥ", 0..=0, 0);
check_many("ĵƥ", 1..=2, 2);
check_many("ĵƥ", 3..=4, 4);

// 3-byte chars
check_many("日本", 0..=0, 0);
check_many("日本", 1..=3, 3);
check_many("日本", 4..=6, 6);

// 4-byte chars
check_many("🇯🇵", 0..=0, 0);
check_many("🇯🇵", 1..=4, 4);
check_many("🇯🇵", 5..=8, 8);
}

#[test]
#[should_panic]
fn ceil_char_boundary_above_len_panic() {
let _ = "x".ceil_char_boundary(2);
}
5 changes: 5 additions & 0 deletions library/core/src/num/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -809,6 +809,11 @@ impl u8 {
pub fn escape_ascii(&self) -> ascii::EscapeDefault {
ascii::escape_default(*self)
}

/// Returns `true` if this byte is not a UTF-8 continuation byte (`0b10xx_xxxx`),
/// i.e. if a character boundary may sit directly in front of it.
// `#[inline]` so that the `#[inline]` `str` boundary methods, which call this once per
// inspected byte, do not end up with an out-of-line call after they are inlined into
// another crate.
#[inline]
pub(crate) fn is_utf8_char_boundary(self) -> bool {
    // This is bit magic equivalent to: b < 128 || b >= 192
    // Reinterpreted as `i8`, the bytes 0x80..=0xBF are exactly the values below -0x40,
    // so a single signed comparison covers both halves of the range test.
    (self as i8) >= -0x40
}
}

#[lang = "u16"]
Expand Down
88 changes: 78 additions & 10 deletions library/core/src/str/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -76,15 +76,14 @@ use iter::MatchIndicesInternal;
use iter::SplitInternal;
use iter::{MatchesInternal, SplitNInternal};

use validations::truncate_to_char_boundary;

#[inline(never)]
#[cold]
#[track_caller]
fn slice_error_fail(s: &str, begin: usize, end: usize) -> ! {
const MAX_DISPLAY_LENGTH: usize = 256;
let (truncated, s_trunc) = truncate_to_char_boundary(s, MAX_DISPLAY_LENGTH);
let ellipsis = if truncated { "[...]" } else { "" };
let trunc_len = s.floor_char_boundary(MAX_DISPLAY_LENGTH);
let s_trunc = &s[..trunc_len];
let ellipsis = if trunc_len < s.len() { "[...]" } else { "" };

// 1. out of bounds
if begin > s.len() || end > s.len() {
Expand All @@ -105,10 +104,7 @@ fn slice_error_fail(s: &str, begin: usize, end: usize) -> ! {
// 3. character boundary
let index = if !s.is_char_boundary(begin) { begin } else { end };
// find the character
let mut char_start = index;
while !s.is_char_boundary(char_start) {
char_start -= 1;
}
let char_start = s.floor_char_boundary(index);
// `char_start` must be less than len and a char boundary
let ch = s[char_start..].chars().next().unwrap();
let char_range = char_start..char_start + ch.len_utf8();
Expand Down Expand Up @@ -215,8 +211,80 @@ impl str {
// code on higher opt-levels. See PR #84751 for more details.
None => index == self.len(),

// This is bit magic equivalent to: b < 128 || b >= 192
Some(&b) => (b as i8) >= -0x40,
Some(&b) => b.is_utf8_char_boundary(),
}
}

/// Finds the closest `x` not exceeding `index` where `is_char_boundary(x)` is `true`.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a suggestion...

Suggested change
/// Finds the closest `x` not exceeding `index` where `is_char_boundary(x)` is `true`.
/// Finds the nearest char boundary at or below `index`.
///
/// See the [is_char_boundary] method.

///
/// If `index` is at or past the end of the string, the length of the string is returned.
///
/// Use this to shorten a string to at most `index` bytes while keeping it valid UTF-8.
/// The rounding only looks at `char`s, so a grapheme made of several `char`s can still
/// be cut apart visually even though no single `char` is split. For example, the emoji
/// 🧑‍🔬 (scientist) could be reduced to just 🧑 (person).
///
/// # Examples
///
/// ```
/// #![feature(round_char_boundary)]
/// let s = "❤️🧡💛💚💙💜";
/// assert_eq!(s.len(), 26);
/// assert!(!s.is_char_boundary(13));
///
/// let closest = s.floor_char_boundary(13);
/// assert_eq!(closest, 10);
/// assert_eq!(&s[..closest], "❤️🧡");
/// ```
#[unstable(feature = "round_char_boundary", issue = "93743")]
#[inline]
pub fn floor_char_boundary(&self, index: usize) -> usize {
    let len = self.len();
    if index >= len {
        return len;
    }

    // Only the (up to) four bytes ending at `index` are inspected, scanning backwards
    // for the last byte that is not a continuation byte.
    let window_start = index.saturating_sub(3);
    let window = &self.as_bytes()[window_start..=index];
    let offset = window.iter().rposition(|byte| byte.is_utf8_char_boundary());

    // SAFETY: we know that the character boundary will be within four bytes
    window_start + unsafe { offset.unwrap_unchecked() }
}

/// Finds the closest `x` not below `index` where `is_char_boundary(x)` is `true`.
///
/// This is the rounding-up counterpart of [`floor_char_boundary`]; that method's
/// documentation has more details.
///
/// [`floor_char_boundary`]: str::floor_char_boundary
///
/// # Panics
///
/// Panics if `index > self.len()`.
///
/// # Examples
///
/// ```
/// #![feature(round_char_boundary)]
/// let s = "❤️🧡💛💚💙💜";
/// assert_eq!(s.len(), 26);
/// assert!(!s.is_char_boundary(13));
///
/// let closest = s.ceil_char_boundary(13);
/// assert_eq!(closest, 14);
/// assert_eq!(&s[..closest], "❤️🧡💛");
/// ```
#[unstable(feature = "round_char_boundary", issue = "93743")]
#[inline]
pub fn ceil_char_boundary(&self, index: usize) -> usize {
    if index > self.len() {
        // Diverges: reports the out-of-range index through the shared slicing error path.
        slice_error_fail(self, index, index);
    }

    let bytes = self.as_bytes();
    // Scan at most four bytes starting at `index`, clamped to the end of the string.
    let search_end = Ord::min(index + 4, bytes.len());
    let first_boundary =
        bytes[index..search_end].iter().position(|byte| byte.is_utf8_char_boundary());

    match first_boundary {
        Some(offset) => offset + index,
        // No boundary byte inside the window: fall back to the end of the window.
        None => search_end,
    }
}

Expand Down
13 changes: 0 additions & 13 deletions library/core/src/str/validations.rs
Original file line number Diff line number Diff line change
Expand Up @@ -273,16 +273,3 @@ pub const fn utf8_char_width(b: u8) -> usize {

/// Mask of the value bits of a continuation byte.
const CONT_MASK: u8 = 0b0011_1111;

// Shortens `s` to at most `max` bytes, backing up to the nearest char boundary.
// Returns whether anything was cut off, together with the resulting str.
pub(super) fn truncate_to_char_boundary(s: &str, mut max: usize) -> (bool, &str) {
    // Nothing to cut when the whole string already fits into the budget.
    if s.len() <= max {
        return (false, s);
    }

    // Step backwards until `max` lands on a char boundary, then cut there.
    loop {
        if s.is_char_boundary(max) {
            break (true, &s[..max]);
        }
        max -= 1;
    }
}