Skip to content

Commit 9c1e162

Browse files
committed
Auto merge of rust-lang#125679 - clarfonthey:escape_ascii, r=joboet
Optimize `escape_ascii` using a lookup table Based upon my suggestion here: rust-lang#125340 (comment) Effectively, we can take advantage of the fact that ASCII only needs 7 bits to make the eighth bit store whether the value should be escaped or not. This adds a 256-byte lookup table, but 256 bytes *should* be small enough that very few people will mind, according to my probably not incontrovertible opinion. The generated assembly isn't clearly better (although has fewer branches), so, I decided to benchmark on three inputs: first on a random 200KiB, then on `/bin/cat`, then on `Cargo.toml` for this repo. In all cases, the generated code ran faster on my machine. (an old i7-8700) But, if you want to try my benchmarking code for yourself: <details><summary>Criterion code below. Replace <code>/home/ltdk/rustsrc</code> with the appropriate directory.</summary> ```rust #![feature(ascii_char)] #![feature(ascii_char_variants)] #![feature(const_option)] #![feature(let_chains)] use core::ascii; use core::ops::Range; use criterion::{criterion_group, criterion_main, Criterion}; use rand::{thread_rng, Rng}; const HEX_DIGITS: [ascii::Char; 16] = *b"0123456789abcdef".as_ascii().unwrap(); #[inline] const fn backslash<const N: usize>(a: ascii::Char) -> ([ascii::Char; N], Range<u8>) { const { assert!(N >= 2) }; let mut output = [ascii::Char::Null; N]; output[0] = ascii::Char::ReverseSolidus; output[1] = a; (output, 0..2) } #[inline] const fn hex_escape<const N: usize>(byte: u8) -> ([ascii::Char; N], Range<u8>) { const { assert!(N >= 4) }; let mut output = [ascii::Char::Null; N]; let hi = HEX_DIGITS[(byte >> 4) as usize]; let lo = HEX_DIGITS[(byte & 0xf) as usize]; output[0] = ascii::Char::ReverseSolidus; output[1] = ascii::Char::SmallX; output[2] = hi; output[3] = lo; (output, 0..4) } #[inline] const fn verbatim<const N: usize>(a: ascii::Char) -> ([ascii::Char; N], Range<u8>) { const { assert!(N >= 1) }; let mut output = [ascii::Char::Null; N]; output[0] = a; (output, 0..1) } /// 
Escapes an ASCII character. /// /// Returns a buffer and the length of the escaped representation. const fn escape_ascii_old<const N: usize>(byte: u8) -> ([ascii::Char; N], Range<u8>) { const { assert!(N >= 4) }; match byte { b'\t' => backslash(ascii::Char::SmallT), b'\r' => backslash(ascii::Char::SmallR), b'\n' => backslash(ascii::Char::SmallN), b'\\' => backslash(ascii::Char::ReverseSolidus), b'\'' => backslash(ascii::Char::Apostrophe), b'\"' => backslash(ascii::Char::QuotationMark), 0x00..=0x1F => hex_escape(byte), _ => match ascii::Char::from_u8(byte) { Some(a) => verbatim(a), None => hex_escape(byte), }, } } /// Escapes an ASCII character. /// /// Returns a buffer and the length of the escaped representation. const fn escape_ascii_new<const N: usize>(byte: u8) -> ([ascii::Char; N], Range<u8>) { /// Lookup table helps us determine how to display character. /// /// Since ASCII characters will always be 7 bits, we can exploit this to store the 8th bit to /// indicate whether the result is escaped or unescaped. /// /// We additionally use 0x80 (escaped NUL character) to indicate hex-escaped bytes, since /// escaped NUL will not occur. const LOOKUP: [u8; 256] = { let mut arr = [0; 256]; let mut idx = 0; loop { arr[idx as usize] = match idx { // use 8th bit to indicate escaped b'\t' => 0x80 | b't', b'\r' => 0x80 | b'r', b'\n' => 0x80 | b'n', b'\\' => 0x80 | b'\\', b'\'' => 0x80 | b'\'', b'"' => 0x80 | b'"', // use NUL to indicate hex-escaped 0x00..=0x1F | 0x7F..=0xFF => 0x80 | b'\0', _ => idx, }; if idx == 255 { break; } idx += 1; } arr }; let lookup = LOOKUP[byte as usize]; // 8th bit indicates escape let lookup_escaped = lookup & 0x80 != 0; // SAFETY: We explicitly mask out the eighth bit to get a 7-bit ASCII character. 
let lookup_ascii = unsafe { ascii::Char::from_u8_unchecked(lookup & 0x7F) }; if lookup_escaped { // NUL indicates hex-escaped if matches!(lookup_ascii, ascii::Char::Null) { hex_escape(byte) } else { backslash(lookup_ascii) } } else { verbatim(lookup_ascii) } } fn escape_bytes(bytes: &[u8], f: impl Fn(u8) -> ([ascii::Char; 4], Range<u8>)) -> Vec<ascii::Char> { let mut vec = Vec::new(); for b in bytes { let (buf, range) = f(*b); vec.extend_from_slice(&buf[range.start as usize..range.end as usize]); } vec } pub fn criterion_benchmark(c: &mut Criterion) { let mut group = c.benchmark_group("escape_ascii"); group.sample_size(1000); let rand_200k = &mut [0; 200 * 1024]; thread_rng().fill(&mut rand_200k[..]); let cat = include_bytes!("/bin/cat"); let cargo_toml = include_bytes!("/home/ltdk/rustsrc/Cargo.toml"); group.bench_function("old_rand", |b| { b.iter(|| escape_bytes(rand_200k, escape_ascii_old)); }); group.bench_function("new_rand", |b| { b.iter(|| escape_bytes(rand_200k, escape_ascii_new)); }); group.bench_function("old_bin", |b| { b.iter(|| escape_bytes(cat, escape_ascii_old)); }); group.bench_function("new_bin", |b| { b.iter(|| escape_bytes(cat, escape_ascii_new)); }); group.bench_function("old_cargo_toml", |b| { b.iter(|| escape_bytes(cargo_toml, escape_ascii_old)); }); group.bench_function("new_cargo_toml", |b| { b.iter(|| escape_bytes(cargo_toml, escape_ascii_new)); }); group.finish(); } criterion_group!(benches, criterion_benchmark); criterion_main!(benches); ``` </details> My benchmark results: ``` escape_ascii/old_rand time: [1.6965 ms 1.7006 ms 1.7053 ms] Found 22 outliers among 1000 measurements (2.20%) 4 (0.40%) high mild 18 (1.80%) high severe escape_ascii/new_rand time: [1.6749 ms 1.6953 ms 1.7158 ms] Found 38 outliers among 1000 measurements (3.80%) 38 (3.80%) high mild escape_ascii/old_bin time: [224.59 µs 225.40 µs 226.33 µs] Found 39 outliers among 1000 measurements (3.90%) 17 (1.70%) high mild 22 (2.20%) high severe escape_ascii/new_bin time: 
[164.86 µs 165.63 µs 166.58 µs] Found 107 outliers among 1000 measurements (10.70%) 43 (4.30%) high mild 64 (6.40%) high severe escape_ascii/old_cargo_toml time: [23.397 µs 23.699 µs 24.014 µs] Found 204 outliers among 1000 measurements (20.40%) 21 (2.10%) high mild 183 (18.30%) high severe escape_ascii/new_cargo_toml time: [16.404 µs 16.438 µs 16.483 µs] Found 88 outliers among 1000 measurements (8.80%) 56 (5.60%) high mild 32 (3.20%) high severe ``` Random: 1.7006ms => 1.6953ms (<1% speedup) Binary: 225.40µs => 165.63µs (26% speedup) Text: 23.699µs => 16.438µs (30% speedup)
2 parents 1799481 + 86c7526 commit 9c1e162

File tree

2 files changed

+110
-26
lines changed

2 files changed

+110
-26
lines changed

core/src/escape.rs

+91-23
Original file line numberDiff line numberDiff line change
@@ -18,38 +18,106 @@ const fn backslash<const N: usize>(a: ascii::Char) -> ([ascii::Char; N], Range<u
1818
(output, 0..2)
1919
}
2020

21+
/// Builds the four-character hex escape (`\x` followed by two digits) for `byte`.
///
/// Returns the buffer together with the range `0..4` that holds the escape; any
/// remaining slots of the `N`-sized buffer stay `ascii::Char::Null`.
#[inline]
22+
const fn hex_escape<const N: usize>(byte: u8) -> ([ascii::Char; N], Range<u8>) {
23+
// Evaluated at compile time: the buffer must have room for all four characters.
const { assert!(N >= 4) };
24+
25+
let mut output = [ascii::Char::Null; N];
26+
27+
// High nibble first, then low nibble.
// NOTE(review): `HEX_DIGITS` is defined outside this hunk; presumably the 16 hex digit characters — confirm.
let hi = HEX_DIGITS[(byte >> 4) as usize];
28+
let lo = HEX_DIGITS[(byte & 0xf) as usize];
29+
30+
output[0] = ascii::Char::ReverseSolidus;
31+
output[1] = ascii::Char::SmallX;
32+
output[2] = hi;
33+
output[3] = lo;
34+
35+
(output, 0..4)
36+
}
37+
38+
/// Places the character `a` unchanged into the first slot of the buffer.
///
/// Returns the buffer together with the range `0..1`; any remaining slots of the
/// `N`-sized buffer stay `ascii::Char::Null`.
#[inline]
39+
const fn verbatim<const N: usize>(a: ascii::Char) -> ([ascii::Char; N], Range<u8>) {
40+
// Evaluated at compile time: the buffer must hold at least one character.
const { assert!(N >= 1) };
41+
42+
let mut output = [ascii::Char::Null; N];
43+
44+
output[0] = a;
45+
46+
(output, 0..1)
47+
}
48+
2149
/// Escapes an ASCII character.
2250
///
2351
/// Returns a buffer and the range within it that holds the escaped representation.
2452
const fn escape_ascii<const N: usize>(byte: u8) -> ([ascii::Char; N], Range<u8>) {
2553
const { assert!(N >= 4) };
2654

27-
match byte {
28-
b'\t' => backslash(ascii::Char::SmallT),
29-
b'\r' => backslash(ascii::Char::SmallR),
30-
b'\n' => backslash(ascii::Char::SmallN),
31-
b'\\' => backslash(ascii::Char::ReverseSolidus),
32-
b'\'' => backslash(ascii::Char::Apostrophe),
33-
b'\"' => backslash(ascii::Char::QuotationMark),
34-
byte => {
35-
let mut output = [ascii::Char::Null; N];
36-
37-
if let Some(c) = byte.as_ascii()
38-
&& !byte.is_ascii_control()
39-
{
40-
output[0] = c;
41-
(output, 0..1)
42-
} else {
43-
let hi = HEX_DIGITS[(byte >> 4) as usize];
44-
let lo = HEX_DIGITS[(byte & 0xf) as usize];
55+
// With the `optimize_for_size` feature: a plain `match`, no lookup table.
#[cfg(feature = "optimize_for_size")]
56+
{
57+
match byte {
58+
b'\t' => backslash(ascii::Char::SmallT),
59+
b'\r' => backslash(ascii::Char::SmallR),
60+
b'\n' => backslash(ascii::Char::SmallN),
61+
b'\\' => backslash(ascii::Char::ReverseSolidus),
62+
b'\'' => backslash(ascii::Char::Apostrophe),
63+
b'"' => backslash(ascii::Char::QuotationMark),
64+
// remaining bytes in 0x00..=0x1F, and 0x7F, are hex-escaped
0x00..=0x1F | 0x7F => hex_escape(byte),
65+
// everything else: emitted as-is when `from_u8` yields a character, hex-escaped otherwise
_ => match ascii::Char::from_u8(byte) {
66+
Some(a) => verbatim(a),
67+
None => hex_escape(byte),
68+
},
69+
}
70+
}
71+
72+
// Without the `optimize_for_size` feature: table-driven lookup.
#[cfg(not(feature = "optimize_for_size"))]
73+
{
74+
/// Lookup table that determines how each byte value is displayed.
75+
///
76+
/// Since ASCII characters will always be 7 bits, we can exploit this to store the 8th bit to
77+
/// indicate whether the result is escaped or unescaped.
78+
///
79+
/// We additionally use 0x80 (escaped NUL character) to indicate hex-escaped bytes, since
80+
/// escaped NUL will not occur.
81+
const LOOKUP: [u8; 256] = {
82+
let mut arr = [0; 256];
83+
let mut idx = 0;
84+
// `idx` indexes `arr`, so it is a `usize`; it can reach 256 and end the loop without overflow.
while idx <= 255 {
85+
arr[idx] = match idx as u8 {
86+
// use 8th bit to indicate escaped
87+
b'\t' => 0x80 | b't',
88+
b'\r' => 0x80 | b'r',
89+
b'\n' => 0x80 | b'n',
90+
b'\\' => 0x80 | b'\\',
91+
b'\'' => 0x80 | b'\'',
92+
b'"' => 0x80 | b'"',
93+
94+
// use NUL to indicate hex-escaped
95+
0x00..=0x1F | 0x7F..=0xFF => 0x80 | b'\0',
96+
97+
// every other byte maps to itself with the 8th bit clear; this `idx` is the `u8`
// bound by the pattern and shadows the outer counter
idx => idx,
98+
};
99+
idx += 1;
100+
}
101+
arr
102+
};
45103

46-
output[0] = ascii::Char::ReverseSolidus;
47-
output[1] = ascii::Char::SmallX;
48-
output[2] = hi;
49-
output[3] = lo;
104+
let lookup = LOOKUP[byte as usize];
50105

51-
(output, 0..4)
106+
// 8th bit indicates escape
107+
let lookup_escaped = lookup & 0x80 != 0;
108+
109+
// SAFETY: We explicitly mask out the eighth bit to get a 7-bit ASCII character.
110+
let lookup_ascii = unsafe { ascii::Char::from_u8_unchecked(lookup & 0x7F) };
111+
112+
if lookup_escaped {
113+
// NUL indicates hex-escaped
114+
if matches!(lookup_ascii, ascii::Char::Null) {
115+
hex_escape(byte)
116+
} else {
117+
backslash(lookup_ascii)
52118
}
119+
} else {
120+
verbatim(lookup_ascii)
53121
}
54122
}
55123
}

core/tests/ascii.rs

+19-3
Original file line numberDiff line numberDiff line change
@@ -481,9 +481,25 @@ fn ascii_ctype_const() {
481481
}
482482

483483
#[test]
484-
fn test_ascii_display() {
485-
assert_eq!(b"foo'bar".escape_ascii().to_string(), r#"foo\'bar"#);
486-
assert_eq!(b"\0\xff".escape_ascii().to_string(), r#"\x00\xff"#);
484+
// Checks the escaped output for every byte in 0..=0x1F, both quote characters,
// the backslash, 0x7F, and the bytes 0x80 and 0xFF.
fn test_escape_ascii() {
485+
let mut buf = [0u8; 0x1F + 7]; // 32 bytes for 0..=0x1F, plus 6 more: two quotes, backslash, \x7F, \x80, \xFF
486+
for idx in 0..=0x1F {
487+
buf[idx] = idx as u8;
488+
}
489+
buf[0x20] = b'\'';
490+
buf[0x21] = b'"';
491+
buf[0x22] = b'\\';
492+
buf[0x23] = 0x7F;
493+
buf[0x24] = 0x80;
494+
buf[0x25] = 0xff;
495+
assert_eq!(
496+
buf.escape_ascii().to_string(),
497+
r#"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\'\"\\\x7f\x80\xff"#
498+
);
499+
}
500+
501+
#[test]
502+
fn test_escape_ascii_iter() {
487503
let mut it = b"\0fastpath\xffremainder\xff".escape_ascii();
488504
let _ = it.advance_by(4);
489505
let _ = it.advance_back_by(4);

0 commit comments

Comments
 (0)