Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit c11b3aa

Browse files
committed Sep 21, 2024
Avoid re-validating UTF-8 in FromUtf8Error::into_utf8_lossy
Refactor `into_utf8_lossy` to copy valid UTF-8 bytes into the buffer, avoiding double validation of bytes. Add tests that mirror the `String::from_utf8_lossy` tests.
1 parent b66efdd commit c11b3aa

File tree

3 files changed

+63
-1
lines changed

3 files changed

+63
-1
lines changed
 

‎alloc/src/string.rs

+25-1
Original file line number | Diff line number | Diff line change
@@ -2081,7 +2081,31 @@ impl FromUtf8Error {
20812081
#[cfg(not(no_global_oom_handling))]
20822082
#[unstable(feature = "string_from_utf8_lossy_owned", issue = "129436")]
20832083
pub fn into_utf8_lossy(self) -> String {
2084-
String::from_utf8_lossy_owned(self.bytes)
2084+
const REPLACEMENT: &str = "\u{FFFD}";
2085+
2086+
let mut res = {
2087+
let mut v = Vec::with_capacity(self.bytes.len());
2088+
2089+
// `Utf8Error::valid_up_to` returns the maximum index of validated
2090+
// UTF-8 bytes. Copy the valid bytes into the output buffer.
2091+
v.extend_from_slice(&self.bytes[..self.error.valid_up_to()]);
2092+
2093+
// SAFETY: This is safe because the only bytes present in the buffer
2094+
// were validated as UTF-8 by the call to `String::from_utf8` which
2095+
// produced this `FromUtf8Error`.
2096+
unsafe { String::from_utf8_unchecked(v) }
2097+
};
2098+
2099+
let iter = self.bytes[self.error.valid_up_to()..].utf8_chunks();
2100+
2101+
for chunk in iter {
2102+
res.push_str(chunk.valid());
2103+
if !chunk.invalid().is_empty() {
2104+
res.push_str(REPLACEMENT);
2105+
}
2106+
}
2107+
2108+
res
20852109
}
20862110

20872111
/// Returns the bytes that were attempted to convert to a `String`.

‎alloc/tests/lib.rs

+1
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,7 @@
2828
#![feature(iter_next_chunk)]
2929
#![feature(round_char_boundary)]
3030
#![feature(slice_partition_dedup)]
31+
#![feature(string_from_utf8_lossy_owned)]
3132
#![feature(string_remove_matches)]
3233
#![feature(const_btree_len)]
3334
#![feature(const_trait_impl)]

‎alloc/tests/string.rs

+37
Original file line number | Diff line number | Diff line change
@@ -114,6 +114,43 @@ fn test_from_utf8_lossy() {
114114
);
115115
}
116116

117+
#[test]
118+
fn test_fromutf8error_into_lossy() {
119+
fn func(input: &[u8]) -> String {
120+
String::from_utf8(input.to_owned()).unwrap_or_else(|e| e.into_utf8_lossy())
121+
}
122+
123+
let xs = b"hello";
124+
let ys = "hello".to_owned();
125+
assert_eq!(func(xs), ys);
126+
127+
let xs = "ศไทย中华Việt Nam".as_bytes();
128+
let ys = "ศไทย中华Việt Nam".to_owned();
129+
assert_eq!(func(xs), ys);
130+
131+
let xs = b"Hello\xC2 There\xFF Goodbye";
132+
assert_eq!(func(xs), "Hello\u{FFFD} There\u{FFFD} Goodbye".to_owned());
133+
134+
let xs = b"Hello\xC0\x80 There\xE6\x83 Goodbye";
135+
assert_eq!(func(xs), "Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye".to_owned());
136+
137+
let xs = b"\xF5foo\xF5\x80bar";
138+
assert_eq!(func(xs), "\u{FFFD}foo\u{FFFD}\u{FFFD}bar".to_owned());
139+
140+
let xs = b"\xF1foo\xF1\x80bar\xF1\x80\x80baz";
141+
assert_eq!(func(xs), "\u{FFFD}foo\u{FFFD}bar\u{FFFD}baz".to_owned());
142+
143+
let xs = b"\xF4foo\xF4\x80bar\xF4\xBFbaz";
144+
assert_eq!(func(xs), "\u{FFFD}foo\u{FFFD}bar\u{FFFD}\u{FFFD}baz".to_owned());
145+
146+
let xs = b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar";
147+
assert_eq!(func(xs), "\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}foo\u{10000}bar".to_owned());
148+
149+
// surrogates
150+
let xs = b"\xED\xA0\x80foo\xED\xBF\xBFbar";
151+
assert_eq!(func(xs), "\u{FFFD}\u{FFFD}\u{FFFD}foo\u{FFFD}\u{FFFD}\u{FFFD}bar".to_owned());
152+
}
153+
117154
#[test]
118155
fn test_from_utf16() {
119156
let pairs = [

0 commit comments

Comments
 (0)