Skip to content

Commit c2ce520

Browse files
committed
Auto merge of rust-lang#130223 - LaihoE:faster_str_replace, r=thomcc
optimize str.replace

Adds a fast path to `str::replace` for the ASCII-to-ASCII case (a single ASCII byte replaced by another single ASCII byte). This allows the compiler to autovectorize the code.

Also, should this instead be done with specialization? That way we could remove one branch. I think it is the kind of branch that is easy to predict, though.

Benchmark for the fast path (replacing all "a" with "b" in the Rust Wikipedia article, using Criterion):

| N | Speedup | Time New (ns) | Time Old (ns) |
|----------|---------|---------------|---------------|
| 2 | 2.03 | 13.567 | 27.576 |
| 8 | 1.73 | 17.478 | 30.259 |
| 11 | 2.46 | 18.296 | 45.055 |
| 16 | 2.71 | 17.181 | 46.526 |
| 37 | 4.43 | 18.526 | 81.997 |
| 64 | 8.54 | 18.670 | 159.470 |
| 200 | 9.82 | 29.634 | 291.010 |
| 2000 | 24.34 | 81.114 | 1974.300 |
| 20000 | 30.61 | 598.520 | 18318.000 |
| 1000000 | 29.31 | 33458.000 | 980540.000 |
2 parents d30c392 + 4484085 commit c2ce520

File tree

3 files changed

+63
-2
lines changed

3 files changed

+63
-2
lines changed

library/alloc/src/str.rs

+24-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ pub use core::str::SplitInclusive;
2020
pub use core::str::SplitWhitespace;
2121
#[stable(feature = "rust1", since = "1.0.0")]
2222
pub use core::str::pattern;
23-
use core::str::pattern::{DoubleEndedSearcher, Pattern, ReverseSearcher, Searcher};
23+
use core::str::pattern::{DoubleEndedSearcher, Pattern, ReverseSearcher, Searcher, Utf8Pattern};
2424
#[stable(feature = "rust1", since = "1.0.0")]
2525
pub use core::str::{Bytes, CharIndices, Chars, from_utf8, from_utf8_mut};
2626
#[stable(feature = "str_escape", since = "1.34.0")]
@@ -269,6 +269,18 @@ impl str {
269269
#[stable(feature = "rust1", since = "1.0.0")]
270270
#[inline]
271271
pub fn replace<P: Pattern>(&self, from: P, to: &str) -> String {
272+
// Fast path for ASCII to ASCII case.
273+
274+
if let Some(from_byte) = match from.as_utf8_pattern() {
275+
Some(Utf8Pattern::StringPattern([from_byte])) => Some(*from_byte),
276+
Some(Utf8Pattern::CharPattern(c)) => c.as_ascii().map(|ascii_char| ascii_char.to_u8()),
277+
_ => None,
278+
} {
279+
if let [to_byte] = to.as_bytes() {
280+
return unsafe { replace_ascii(self.as_bytes(), from_byte, *to_byte) };
281+
}
282+
}
283+
272284
let mut result = String::new();
273285
let mut last_end = 0;
274286
for (start, part) in self.match_indices(from) {
@@ -686,3 +698,14 @@ pub fn convert_while_ascii(s: &str, convert: fn(&u8) -> u8) -> (String, &str) {
686698
(ascii_string, rest)
687699
}
688700
}
701+
#[inline]
702+
#[cfg(not(test))]
703+
#[cfg(not(no_global_oom_handling))]
704+
#[allow(dead_code)]
705+
/// Faster implementation of string replacement for ASCII to ASCII cases.
706+
/// Should produce fast vectorized code.
///
/// Returns a new `String` of the same length as `utf8_bytes` in which every
/// byte equal to `from` has been replaced by `to`; all other bytes are copied
/// unchanged.
///
/// # Safety
///
/// The result is built with `String::from_utf8_unchecked`, so the caller must
/// ensure that `utf8_bytes` is valid UTF-8 and that both `from` and `to` are
/// ASCII bytes. Otherwise the returned `String` may contain invalid UTF-8.
707+
unsafe fn replace_ascii(utf8_bytes: &[u8], from: u8, to: u8) -> String {
708+
// A plain byte-wise map with no data-dependent control flow besides the select.
let result: Vec<u8> = utf8_bytes.iter().map(|b| if *b == from { to } else { *b }).collect();
709+
// SAFETY: We replaced ascii with ascii on valid utf8 strings. An ASCII byte
// never occurs inside a multi-byte UTF-8 sequence, so swapping one ASCII byte
// for another cannot break the encoding.
710+
unsafe { String::from_utf8_unchecked(result) }
711+
}

library/alloc/src/string.rs

+6-1
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ use core::ops::AddAssign;
5353
#[cfg(not(no_global_oom_handling))]
5454
use core::ops::Bound::{Excluded, Included, Unbounded};
5555
use core::ops::{self, Range, RangeBounds};
56-
use core::str::pattern::Pattern;
56+
use core::str::pattern::{Pattern, Utf8Pattern};
5757
use core::{fmt, hash, ptr, slice};
5858

5959
#[cfg(not(no_global_oom_handling))]
@@ -2424,6 +2424,11 @@ impl<'b> Pattern for &'b String {
24242424
{
24252425
self[..].strip_suffix_of(haystack)
24262426
}
2427+
2428+
/// Exposes the bytes of the `String` as a [`Utf8Pattern::StringPattern`].
/// Always returns `Some`.
#[inline]
2429+
fn as_utf8_pattern(&self) -> Option<Utf8Pattern<'_>> {
2430+
Some(Utf8Pattern::StringPattern(self.as_bytes()))
2431+
}
24272432
}
24282433

24292434
macro_rules! impl_eq {

library/core/src/str/pattern.rs

+33
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,19 @@ pub trait Pattern: Sized {
160160
None
161161
}
162162
}
163+
164+
/// Returns the pattern as utf-8 bytes if possible.
///
/// More precisely, returns a [`Utf8Pattern`] describing this pattern (either
/// the bytes of a string pattern or the `char` of a character pattern), or
/// `None` when the pattern cannot be described that way.
165+
fn as_utf8_pattern(&self) -> Option<Utf8Pattern<'_>>;
166+
}
167+
/// Result of calling [`Pattern::as_utf8_pattern()`].
168+
/// Can be used for inspecting the contents of a [`Pattern`] in cases
169+
/// where the underlying representation can be represented as UTF-8.
170+
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
171+
pub enum Utf8Pattern<'a> {
172+
/// Returned by `&str` and `&String` patterns: the bytes of the pattern string.
173+
StringPattern(&'a [u8]),
174+
/// Returned by `char` patterns: the pattern character itself.
175+
CharPattern(char),
163176
}
164177

165178
// Searcher
@@ -599,6 +612,11 @@ impl Pattern for char {
599612
{
600613
self.encode_utf8(&mut [0u8; 4]).strip_suffix_of(haystack)
601614
}
615+
616+
/// Exposes the character itself as a [`Utf8Pattern::CharPattern`].
/// Always returns `Some`.
#[inline]
617+
fn as_utf8_pattern(&self) -> Option<Utf8Pattern<'_>> {
618+
Some(Utf8Pattern::CharPattern(*self))
619+
}
602620
}
603621

604622
/////////////////////////////////////////////////////////////////////////////
@@ -657,6 +675,11 @@ impl<C: MultiCharEq> Pattern for MultiCharEqPattern<C> {
657675
fn into_searcher(self, haystack: &str) -> MultiCharEqSearcher<'_, C> {
658676
MultiCharEqSearcher { haystack, char_eq: self.0, char_indices: haystack.char_indices() }
659677
}
678+
679+
/// This pattern type provides no [`Utf8Pattern`] view; always returns `None`.
#[inline]
680+
fn as_utf8_pattern(&self) -> Option<Utf8Pattern<'_>> {
681+
None
682+
}
660683
}
661684

662685
unsafe impl<'a, C: MultiCharEq> Searcher<'a> for MultiCharEqSearcher<'a, C> {
@@ -747,6 +770,11 @@ macro_rules! pattern_methods {
747770
{
748771
($pmap)(self).strip_suffix_of(haystack)
749772
}
773+
774+
// Patterns whose methods are generated by this macro provide no
// `Utf8Pattern` view, so this always returns `None`.
#[inline]
775+
fn as_utf8_pattern(&self) -> Option<Utf8Pattern<'_>> {
776+
None
777+
}
750778
};
751779
}
752780

@@ -1022,6 +1050,11 @@ impl<'b> Pattern for &'b str {
10221050
None
10231051
}
10241052
}
1053+
1054+
/// Exposes the bytes of the string slice as a [`Utf8Pattern::StringPattern`].
/// Always returns `Some`.
#[inline]
1055+
fn as_utf8_pattern(&self) -> Option<Utf8Pattern<'_>> {
1056+
Some(Utf8Pattern::StringPattern(self.as_bytes()))
1057+
}
10251058
}
10261059

10271060
/////////////////////////////////////////////////////////////////////////////

0 commit comments

Comments
 (0)