Skip to content

Commit 33ad366

Browse files
authored
Use REP MOVSQ/STOSQ on x86_64 (#365)
* mem: Move mem* functions to separate directory Signed-off-by: Joe Richey <[email protected]> * memcpy: Create separate memcpy.rs file Signed-off-by: Joe Richey <[email protected]> * benches: Add benchmarks for mem* functions This allows comparing the "normal" implementations to the implementations provided by this crate. Signed-off-by: Joe Richey <[email protected]> * mem: Add REP MOVSB/STOSB implementations The assembly generated seems correct: https://rust.godbolt.org/z/GGnec8 Signed-off-by: Joe Richey <[email protected]> * mem: Add documentation for REP string instructions Signed-off-by: Joe Richey <[email protected]> * Use quad-word rep string instructions Signed-off-by: Joe Richey <[email protected]> * Prevent panic when compiled in debug mode Signed-off-by: Joe Richey <[email protected]> * Add tests for mem* functions Signed-off-by: Joe Richey <[email protected]> * Add build/test with the "asm" feature Signed-off-by: Joe Richey <[email protected]> * Add byte length to Bencher Signed-off-by: Joe Richey <[email protected]>
1 parent bc235bc commit 33ad366

File tree

6 files changed

+423
-39
lines changed

6 files changed

+423
-39
lines changed

ci/run.sh

+4
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,16 @@ else
1212
$run --release
1313
$run --features c
1414
$run --features c --release
15+
$run --features asm
16+
$run --features asm --release
1517
fi
1618

1719
cargo build --target $1
1820
cargo build --target $1 --release
1921
cargo build --target $1 --features c
2022
cargo build --target $1 --release --features c
23+
cargo build --target $1 --features asm
24+
cargo build --target $1 --release --features asm
2125

2226
PREFIX=$(echo $1 | sed -e 's/unknown-//')-
2327
case $1 in

src/mem/memcpy.rs

+41
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
use super::c_int;
2+
3+
/// Portable fallback `memcpy`: copies `n` bytes from `src` to `dest`,
/// one byte at a time, front to back. Returns `dest` per the C contract.
///
/// The regions must not overlap (use `memmove` for that case).
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
    for idx in 0..n {
        *dest.add(idx) = *src.add(idx);
    }
    dest
}
12+
13+
/// Portable fallback `memmove`: copies `n` bytes from `src` to `dest`,
/// correctly handling overlapping regions. Returns `dest` per the C contract.
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
    if (src as usize) < (dest as usize) {
        // dest starts inside (or after) src: walk back to front so that
        // bytes are read before they are overwritten.
        let mut remaining = n;
        while remaining > 0 {
            remaining -= 1;
            *dest.add(remaining) = *src.add(remaining);
        }
    } else {
        // dest is at or before src: a plain front-to-back copy is safe.
        for idx in 0..n {
            *dest.add(idx) = *src.add(idx);
        }
    }
    dest
}
32+
33+
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
34+
pub unsafe extern "C" fn memset(s: *mut u8, c: c_int, n: usize) -> *mut u8 {
35+
let mut i = 0;
36+
while i < n {
37+
*s.offset(i as isize) = c as u8;
38+
i += 1;
39+
}
40+
s
41+
}

src/mem.rs src/mem/mod.rs

+4-39
Original file line numberDiff line numberDiff line change
@@ -9,45 +9,10 @@ use core::intrinsics::{atomic_load_unordered, atomic_store_unordered, exact_div}
99
use core::mem;
1010
use core::ops::{BitOr, Shl};
1111

12-
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
13-
pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
14-
let mut i = 0;
15-
while i < n {
16-
*dest.offset(i as isize) = *src.offset(i as isize);
17-
i += 1;
18-
}
19-
dest
20-
}
21-
22-
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
23-
pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
24-
if src < dest as *const u8 {
25-
// copy from end
26-
let mut i = n;
27-
while i != 0 {
28-
i -= 1;
29-
*dest.offset(i as isize) = *src.offset(i as isize);
30-
}
31-
} else {
32-
// copy from beginning
33-
let mut i = 0;
34-
while i < n {
35-
*dest.offset(i as isize) = *src.offset(i as isize);
36-
i += 1;
37-
}
38-
}
39-
dest
40-
}
41-
42-
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
43-
pub unsafe extern "C" fn memset(s: *mut u8, c: c_int, n: usize) -> *mut u8 {
44-
let mut i = 0;
45-
while i < n {
46-
*s.offset(i as isize) = c as u8;
47-
i += 1;
48-
}
49-
s
50-
}
12+
// memcpy/memmove/memset have optimized implementations on some architectures
// With the "asm" feature on x86_64, the module body is swapped for the
// REP-string implementations in x86_64.rs; otherwise the portable
// byte-loop versions in memcpy.rs are used.
#[cfg_attr(all(feature = "asm", target_arch = "x86_64"), path = "x86_64.rs")]
mod memcpy;
pub use self::memcpy::*;
5116

5217
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
5318
pub unsafe extern "C" fn memcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 {

src/mem/x86_64.rs

+79
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
use super::c_int;
2+
3+
// On most modern Intel and AMD processors, "rep movsq" and "rep stosq" have
4+
// been enhanced to perform better than a simple qword loop, making them ideal
5+
// for implementing memcpy/memset. Note that "rep cmps" has received no such
6+
// enhancement, so it is not used to implement memcmp.
7+
//
8+
// On certain recent Intel processors, "rep movsb" and "rep stosb" have been
9+
// further enhanced to automatically select the best microarchitectural
10+
// implementation based on length and alignment. See the following features from
11+
// the "Intel® 64 and IA-32 Architectures Optimization Reference Manual":
12+
// - ERMSB - Enhanced REP MOVSB and STOSB (Ivy Bridge and later)
13+
// - FSRM - Fast Short REP MOV (Ice Lake and later)
14+
// - Fast Zero-Length MOVSB (On no current hardware)
15+
// - Fast Short STOSB (On no current hardware)
16+
// However, to avoid run-time feature detection, we don't use these byte-based
17+
// instructions for most of the copying, preferring the qword variants.
18+
19+
/// `memcpy` using x86_64 REP string instructions.
///
/// Copies the bulk of the buffer 8 bytes at a time with `rep movsq`,
/// then the trailing `count % 8` bytes with `rep movsb`.
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, count: usize) -> *mut u8 {
    let qword_count = count >> 3; // number of full 8-byte chunks
    let byte_count = count & 0b111; // 0..=7 trailing bytes
    asm!(
        // Copy qword_count qwords forward; RSI/RDI advance by 8 per step.
        "rep movsq [rdi], [rsi]",
        // Load the tail length into RCX (a 32-bit write zero-extends).
        "mov ecx, {byte_count:e}",
        // Copy the remaining 0..=7 bytes.
        "rep movsb [rdi], [rsi]",
        byte_count = in(reg) byte_count,
        inout("rcx") qword_count => _,
        inout("rdi") dest => _,
        inout("rsi") src => _,
        // preserves_flags: nothing here writes EFLAGS; the forward copy
        // relies on the direction flag already being clear per the ABI.
        options(nostack, preserves_flags)
    );
    dest
}
35+
36+
/// `memmove` using x86_64 REP string instructions.
///
/// Overlap-safe: delegates to the forward `memcpy` when that cannot
/// clobber unread source bytes, otherwise copies backwards with the
/// direction flag set.
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, count: usize) -> *mut u8 {
    // Wrapping distance from src up to dest.
    let delta = (dest as usize).wrapping_sub(src as usize);
    if delta >= count {
        // We can copy forwards because either dest is far enough ahead of src,
        // or src is ahead of dest (and delta overflowed).
        return self::memcpy(dest, src, count);
    }
    // copy backwards
    let qword_count = count >> 3; // full 8-byte chunks
    let byte_count = count & 0b111; // 0..=7 leftover bytes, copied last
    asm!(
        // Set the direction flag: string ops now decrement RSI/RDI.
        "std",
        // RSI/RDI start at the last full qword; copy qword_count qwords
        // back to front.
        "rep movsq [rdi], [rsi]",
        "mov ecx, {byte_count:e}",
        // After the qword pass RDI/RSI sit 7 bytes below the last byte of
        // the remaining head; +7 repositions them onto it for movsb.
        "add rdi, 7",
        "add rsi, 7",
        "rep movsb [rdi], [rsi]",
        // Restore the forward direction required by the ABI.
        "cld",
        byte_count = in(reg) byte_count,
        inout("rcx") qword_count => _,
        // Start at the last full qword: dest + count - 8. wrapping_sub
        // avoids a debug-mode overflow panic when count < 8.
        inout("rdi") dest.offset(count as isize).wrapping_sub(8) => _,
        inout("rsi") src.offset(count as isize).wrapping_sub(8) => _,
        // No preserves_flags: this block modifies the direction flag.
        options(nostack)
    );
    dest
}
63+
64+
/// `memset` using x86_64 REP string instructions.
///
/// Fills the bulk of the buffer 8 bytes at a time with `rep stosq`,
/// then the trailing `count % 8` bytes with `rep stosb`.
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memset(dest: *mut u8, c: c_int, count: usize) -> *mut u8 {
    let qword_count = count >> 3; // number of full 8-byte chunks
    let byte_count = count & 0b111; // 0..=7 trailing bytes
    asm!(
        // Store qword_count copies of RAX forward.
        "rep stosq [rdi], rax",
        // Load the tail length into RCX (a 32-bit write zero-extends).
        "mov ecx, {byte_count:e}",
        // Store the remaining 0..=7 bytes from AL.
        "rep stosb [rdi], al",
        byte_count = in(reg) byte_count,
        inout("rcx") qword_count => _,
        inout("rdi") dest => _,
        // Broadcast the fill byte into all 8 byte lanes of RAX.
        in("rax") (c as u8 as u64) * 0x0101010101010101,
        options(nostack, preserves_flags)
    );
    dest
}

testcrate/benches/mem.rs

+162
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
#![feature(test)]
2+
3+
extern crate test;
4+
use test::{black_box, Bencher};
5+
6+
extern crate compiler_builtins;
7+
use compiler_builtins::mem::{memcmp, memcpy, memmove, memset};
8+
9+
fn memcpy_builtin(b: &mut Bencher, n: usize) {
10+
let v1 = vec![1u8; n];
11+
let mut v2 = vec![0u8; n];
12+
b.bytes = n as u64;
13+
b.iter(|| {
14+
let src: &[u8] = black_box(&v1);
15+
let dst: &mut [u8] = black_box(&mut v2);
16+
dst.copy_from_slice(src);
17+
})
18+
}
19+
20+
fn memcpy_rust(b: &mut Bencher, n: usize) {
21+
let v1 = vec![1u8; n];
22+
let mut v2 = vec![0u8; n];
23+
b.bytes = n as u64;
24+
b.iter(|| {
25+
let src: &[u8] = black_box(&v1);
26+
let dst: &mut [u8] = black_box(&mut v2);
27+
unsafe { memcpy(dst.as_mut_ptr(), src.as_ptr(), n) }
28+
})
29+
}
30+
31+
fn memset_builtin(b: &mut Bencher, n: usize) {
32+
let mut v1 = vec![0u8; n];
33+
b.bytes = n as u64;
34+
b.iter(|| {
35+
let dst: &mut [u8] = black_box(&mut v1);
36+
let val: u8 = black_box(27);
37+
for b in dst {
38+
*b = val;
39+
}
40+
})
41+
}
42+
43+
fn memset_rust(b: &mut Bencher, n: usize) {
44+
let mut v1 = vec![0u8; n];
45+
b.bytes = n as u64;
46+
b.iter(|| {
47+
let dst: &mut [u8] = black_box(&mut v1);
48+
let val = black_box(27);
49+
unsafe { memset(dst.as_mut_ptr(), val, n) }
50+
})
51+
}
52+
53+
fn memcmp_builtin(b: &mut Bencher, n: usize) {
54+
let v1 = vec![0u8; n];
55+
let mut v2 = vec![0u8; n];
56+
v2[n - 1] = 1;
57+
b.bytes = n as u64;
58+
b.iter(|| {
59+
let s1: &[u8] = black_box(&v1);
60+
let s2: &[u8] = black_box(&v2);
61+
s1.cmp(s2)
62+
})
63+
}
64+
65+
fn memcmp_rust(b: &mut Bencher, n: usize) {
66+
let v1 = vec![0u8; n];
67+
let mut v2 = vec![0u8; n];
68+
v2[n - 1] = 1;
69+
b.bytes = n as u64;
70+
b.iter(|| {
71+
let s1: &[u8] = black_box(&v1);
72+
let s2: &[u8] = black_box(&v2);
73+
unsafe { memcmp(s1.as_ptr(), s2.as_ptr(), n) }
74+
})
75+
}
76+
77+
fn memmove_builtin(b: &mut Bencher, n: usize) {
78+
let mut v = vec![0u8; n + n / 2];
79+
b.bytes = n as u64;
80+
b.iter(|| {
81+
let s: &mut [u8] = black_box(&mut v);
82+
s.copy_within(0..n, n / 2);
83+
})
84+
}
85+
86+
fn memmove_rust(b: &mut Bencher, n: usize) {
87+
let mut v = vec![0u8; n + n / 2];
88+
b.bytes = n as u64;
89+
b.iter(|| {
90+
let dst: *mut u8 = black_box(&mut v[n / 2..]).as_mut_ptr();
91+
let src: *const u8 = black_box(&v).as_ptr();
92+
unsafe { memmove(dst, src, n) };
93+
})
94+
}
95+
96+
// Concrete #[bench] entry points. Each pairs a buffer size (4 KiB or 1 MiB)
// with either the standard-library ("builtin") or this crate's ("rust")
// implementation, so the two show up side by side in `cargo bench` output.

#[bench]
fn memcpy_builtin_4096(b: &mut Bencher) {
    memcpy_builtin(b, 4096)
}
#[bench]
fn memcpy_rust_4096(b: &mut Bencher) {
    memcpy_rust(b, 4096)
}
#[bench]
fn memcpy_builtin_1048576(b: &mut Bencher) {
    memcpy_builtin(b, 1048576)
}
#[bench]
fn memcpy_rust_1048576(b: &mut Bencher) {
    memcpy_rust(b, 1048576)
}

#[bench]
fn memset_builtin_4096(b: &mut Bencher) {
    memset_builtin(b, 4096)
}
#[bench]
fn memset_rust_4096(b: &mut Bencher) {
    memset_rust(b, 4096)
}
#[bench]
fn memset_builtin_1048576(b: &mut Bencher) {
    memset_builtin(b, 1048576)
}
#[bench]
fn memset_rust_1048576(b: &mut Bencher) {
    memset_rust(b, 1048576)
}

#[bench]
fn memcmp_builtin_4096(b: &mut Bencher) {
    memcmp_builtin(b, 4096)
}
#[bench]
fn memcmp_rust_4096(b: &mut Bencher) {
    memcmp_rust(b, 4096)
}
#[bench]
fn memcmp_builtin_1048576(b: &mut Bencher) {
    memcmp_builtin(b, 1048576)
}
#[bench]
fn memcmp_rust_1048576(b: &mut Bencher) {
    memcmp_rust(b, 1048576)
}

#[bench]
fn memmove_builtin_4096(b: &mut Bencher) {
    memmove_builtin(b, 4096)
}
#[bench]
fn memmove_rust_4096(b: &mut Bencher) {
    memmove_rust(b, 4096)
}
#[bench]
fn memmove_builtin_1048576(b: &mut Bencher) {
    memmove_builtin(b, 1048576)
}
#[bench]
fn memmove_rust_1048576(b: &mut Bencher) {
    memmove_rust(b, 1048576)
}

0 commit comments

Comments
 (0)