From 1d4668ffc45c307705dc3da7d6c66c9f446ae36b Mon Sep 17 00:00:00 2001
From: gnzlbg
Date: Wed, 11 Apr 2018 15:20:53 +0200
Subject: [PATCH 01/15] Workarounds for LLVM6 code-gen bugs in all/any reductions

This commit adds workarounds for the mask reductions: `all` and `any`.

64-bit wide mask types (`m8x8`, `m16x4`, `m32x2`)

`x86_64` with `MMX` enabled

Before this PR:

```asm
all_8x8:
 push rbp
 mov rbp, rsp
 movzx eax, byte, ptr, [rdi, +, 7]
 movd xmm0, eax
 movzx eax, byte, ptr, [rdi, +, 6]
 movd xmm1, eax
 punpcklwd xmm1, xmm0
 movzx eax, byte, ptr, [rdi, +, 5]
 movd xmm0, eax
 movzx eax, byte, ptr, [rdi, +, 4]
 movd xmm2, eax
 punpcklwd xmm2, xmm0
 punpckldq xmm2, xmm1
 movzx eax, byte, ptr, [rdi, +, 3]
 movd xmm0, eax
 movzx eax, byte, ptr, [rdi, +, 2]
 movd xmm1, eax
 punpcklwd xmm1, xmm0
 movzx eax, byte, ptr, [rdi, +, 1]
 movd xmm0, eax
 movzx eax, byte, ptr, [rdi]
 movd xmm3, eax
 punpcklwd xmm3, xmm0
 punpckldq xmm3, xmm1
 punpcklqdq xmm3, xmm2
 movdqa xmm0, xmmword, ptr, [rip, +, LCPI9_0]
 pand xmm3, xmm0
 pcmpeqw xmm3, xmm0
 pshufd xmm0, xmm3, 78
 pand xmm0, xmm3
 pshufd xmm1, xmm0, 229
 pand xmm1, xmm0
 movdqa xmm0, xmm1
 psrld xmm0, 16
 pand xmm0, xmm1
 movd eax, xmm0
 and al, 1
 pop rbp
 ret
any_8x8:
 push rbp
 mov rbp, rsp
 movzx eax, byte, ptr, [rdi, +, 7]
 movd xmm0, eax
 movzx eax, byte, ptr, [rdi, +, 6]
 movd xmm1, eax
 punpcklwd xmm1, xmm0
 movzx eax, byte, ptr, [rdi, +, 5]
 movd xmm0, eax
 movzx eax, byte, ptr, [rdi, +, 4]
 movd xmm2, eax
 punpcklwd xmm2, xmm0
 punpckldq xmm2, xmm1
 movzx eax, byte, ptr, [rdi, +, 3]
 movd xmm0, eax
 movzx eax, byte, ptr, [rdi, +, 2]
 movd xmm1, eax
 punpcklwd xmm1, xmm0
 movzx eax, byte, ptr, [rdi, +, 1]
 movd xmm0, eax
 movzx eax, byte, ptr, [rdi]
 movd xmm3, eax
 punpcklwd xmm3, xmm0
 punpckldq xmm3, xmm1
 punpcklqdq xmm3, xmm2
 movdqa xmm0, xmmword, ptr, [rip, +, LCPI8_0]
 pand xmm3, xmm0
 pcmpeqw xmm3, xmm0
 pshufd xmm0, xmm3, 78
 por xmm0, xmm3
 pshufd xmm1, xmm0, 229
 por xmm1, xmm0
 movdqa xmm0, xmm1
 psrld xmm0, 16
 por xmm0, xmm1
 movd eax, xmm0
 and al, 1
 pop rbp
 ret
```

After this PR for `m8x8`, `m16x4`, `m32x2`:

```asm
all_8x8:
 push rbp
 mov rbp, rsp
 movq mm0, qword, ptr, [rdi]
 pmovmskb eax, mm0
 cmp eax, 255
 sete al
 pop rbp
 ret
any_8x8:
 push rbp
 mov rbp, rsp
 movq mm0, qword, ptr, [rdi]
 pmovmskb eax, mm0
 test eax, eax
 setne al
 pop rbp
 ret
```

`x86` with `MMX` enabled

Before this PR:

```asm
all_8x8:
 call L9$pb
L9$pb:
 pop eax
 mov ecx, dword, ptr, [esp, +, 4]
 movzx edx, byte, ptr, [ecx, +, 7]
 movd xmm0, edx
 movzx edx, byte, ptr, [ecx, +, 6]
 movd xmm1, edx
 punpcklwd xmm1, xmm0
 movzx edx, byte, ptr, [ecx, +, 5]
 movd xmm0, edx
 movzx edx, byte, ptr, [ecx, +, 4]
 movd xmm2, edx
 punpcklwd xmm2, xmm0
 punpckldq xmm2, xmm1
 movzx edx, byte, ptr, [ecx, +, 3]
 movd xmm0, edx
 movzx edx, byte, ptr, [ecx, +, 2]
 movd xmm1, edx
 punpcklwd xmm1, xmm0
 movzx edx, byte, ptr, [ecx, +, 1]
 movd xmm0, edx
 movzx ecx, byte, ptr, [ecx]
 movd xmm3, ecx
 punpcklwd xmm3, xmm0
 punpckldq xmm3, xmm1
 punpcklqdq xmm3, xmm2
 movdqa xmm0, xmmword, ptr, [eax, +, LCPI9_0-L9$pb]
 pand xmm3, xmm0
 pcmpeqw xmm3, xmm0
 pshufd xmm0, xmm3, 78
 pand xmm0, xmm3
 pshufd xmm1, xmm0, 229
 pand xmm1, xmm0
 movdqa xmm0, xmm1
 psrld xmm0, 16
 pand xmm0, xmm1
 movd eax, xmm0
 and al, 1
 ret
any_8x8:
 call L8$pb
L8$pb:
 pop eax
 mov ecx, dword, ptr, [esp, +, 4]
 movzx edx, byte, ptr, [ecx, +, 7]
 movd xmm0, edx
 movzx edx, byte, ptr, [ecx, +, 6]
 movd xmm1, edx
 punpcklwd xmm1, xmm0
 movzx edx, byte, ptr, [ecx, +, 5]
 movd xmm0, edx
 movzx edx, byte, ptr, [ecx, +, 4]
 movd xmm2, edx
 punpcklwd xmm2, xmm0
 punpckldq xmm2, xmm1
 movzx edx, byte, ptr, [ecx, +, 3]
 movd xmm0, edx
 movzx edx, byte, ptr, [ecx, +, 2]
 movd xmm1, edx
 punpcklwd xmm1, xmm0
 movzx edx, byte, ptr, [ecx, +, 1]
 movd xmm0, edx
 movzx ecx, byte, ptr, [ecx]
 movd xmm3, ecx
 punpcklwd xmm3, xmm0
 punpckldq xmm3, xmm1
 punpcklqdq xmm3, xmm2
 movdqa xmm0, xmmword, ptr, [eax, +, LCPI8_0-L8$pb]
 pand xmm3, xmm0
 pcmpeqw xmm3, xmm0
 pshufd xmm0, xmm3, 78
 por xmm0, xmm3
 pshufd xmm1, xmm0, 229
 por xmm1, xmm0
 movdqa xmm0, xmm1
 psrld xmm0, 16
 por xmm0, xmm1
 movd eax, xmm0
 and al, 1
 ret
```

After this PR:

```asm
all_8x8:
 mov eax, dword, ptr, [esp, +, 4]
 movq mm0, qword, ptr, [eax]
 pmovmskb eax, mm0
 cmp eax, 255
 sete al
 ret
any_8x8:
 mov eax, dword, ptr, [esp, +, 4]
 movq mm0, qword, ptr, [eax]
 pmovmskb eax, mm0
 test eax, eax
 setne al
 ret
```

`aarch64`

Before this PR:

```asm
all_8x8:
 ldr d0, [x0]
 umov w8, v0.b[0]
 umov w9, v0.b[1]
 tst w8, #0xff
 umov w10, v0.b[2]
 cset w8, ne
 tst w9, #0xff
 cset w9, ne
 tst w10, #0xff
 umov w10, v0.b[3]
 and w8, w8, w9
 cset w9, ne
 tst w10, #0xff
 umov w10, v0.b[4]
 and w8, w9, w8
 cset w9, ne
 tst w10, #0xff
 umov w10, v0.b[5]
 and w8, w9, w8
 cset w9, ne
 tst w10, #0xff
 umov w10, v0.b[6]
 and w8, w9, w8
 cset w9, ne
 tst w10, #0xff
 umov w10, v0.b[7]
 and w8, w9, w8
 cset w9, ne
 tst w10, #0xff
 and w8, w9, w8
 cset w9, ne
 and w0, w9, w8
 ret
any_8x8:
 ldr d0, [x0]
 umov w8, v0.b[0]
 umov w9, v0.b[1]
 orr w8, w8, w9
 umov w9, v0.b[2]
 orr w8, w8, w9
 umov w9, v0.b[3]
 orr w8, w8, w9
 umov w9, v0.b[4]
 orr w8, w8, w9
 umov w9, v0.b[5]
 orr w8, w8, w9
 umov w9, v0.b[6]
 orr w8, w8, w9
 umov w9, v0.b[7]
 orr w8, w8, w9
 tst w8, #0xff
 cset w0, ne
 ret
```

After this PR:

```asm
all_8x8:
 ldr d0, [x0]
 mov v0.d[1], v0.d[0]
 uminv b0, v0.16b
 fmov w8, s0
 tst w8, #0xff
 cset w0, ne
 ret
any_8x8:
 ldr d0, [x0]
 mov v0.d[1], v0.d[0]
 umaxv b0, v0.16b
 fmov w8, s0
 tst w8, #0xff
 cset w0, ne
 ret
```

`ARMv7` + `neon`

Before this PR:

```asm
all_8x8:
 vmov.i8 d0, #0x1
 vldr d1, [r0]
 vtst.8 d0, d1, d0
 vext.8 d1, d0, d0, #4
 vand d0, d0, d1
 vext.8 d1, d0, d0, #2
 vand d0, d0, d1
 vdup.8 d1, d0[1]
 vand d0, d0, d1
 vmov.u8 r0, d0[0]
 and r0, r0, #1
 bx lr
any_8x8:
 vmov.i8 d0, #0x1
 vldr d1, [r0]
 vtst.8 d0, d1, d0
 vext.8 d1, d0, d0, #4
 vorr d0, d0, d1
 vext.8 d1, d0, d0, #2
 vorr d0, d0, d1
 vdup.8 d1, d0[1]
 vorr d0, d0, d1
 vmov.u8 r0, d0[0]
 and r0, r0, #1
 bx lr
```

After this PR:

```asm
all_8x8:
 vldr d0, [r0]
 b ::all
::all:
 vpmin.u8 d16, d0, d16
 vpmin.u8 d16, d16, d16
 vpmin.u8 d0, d16, d16
 b m8x8::extract
any_8x8:
 vldr d0, [r0]
 b ::any
::any:
 vpmax.u8 d16, d0, d16
 vpmax.u8 d16, d16, d16
 vpmax.u8 d0, d16, d16
 b m8x8::extract
```

(note: inlining does not work properly on ARMv7)

128-bit wide mask types (`m8x16`, `m16x8`, `m32x4`, `m64x2`)

`x86_64` with SSE2 enabled

Before this PR:

```asm
all_8x16:
 push rbp
 mov rbp, rsp
 movdqa xmm0, xmmword, ptr, [rip, +, LCPI9_0]
 movdqa xmm1, xmmword, ptr, [rdi]
 pand xmm1, xmm0
 pcmpeqb xmm1, xmm0
 pmovmskb eax, xmm1
 xor ecx, ecx
 cmp eax, 65535
 mov eax, -1
 cmovne eax, ecx
 and al, 1
 pop rbp
 ret
any_8x16:
 push rbp
 mov rbp, rsp
 movdqa xmm0, xmmword, ptr, [rip, +, LCPI8_0]
 movdqa xmm1, xmmword, ptr, [rdi]
 pand xmm1, xmm0
 pcmpeqb xmm1, xmm0
 pmovmskb eax, xmm1
 neg eax
 sbb eax, eax
 and al, 1
 pop rbp
 ret
```

After this PR:

```asm
all_8x16:
 push rbp
 mov rbp, rsp
 movdqa xmm0, xmmword, ptr, [rdi]
 pmovmskb eax, xmm0
 cmp eax, 65535
 sete al
 pop rbp
 ret
any_8x16:
 push rbp
 mov rbp, rsp
 movdqa xmm0, xmmword, ptr, [rdi]
 pmovmskb eax, xmm0
 test eax, eax
 setne al
 pop rbp
 ret
```

`aarch64`

Before this PR:

```asm
all_8x16:
 ldr q0, [x0]
 umov w8, v0.b[0]
 umov w9, v0.b[1]
 tst w8, #0xff
 umov w10, v0.b[2]
 cset w8, ne
 tst w9, #0xff
 cset w9, ne
 tst w10, #0xff
 umov w10, v0.b[3]
 and w8, w8, w9
 cset w9, ne
 tst w10, #0xff
 umov w10, v0.b[4]
 and w8, w9, w8
 cset w9, ne
 tst w10, #0xff
 umov w10, v0.b[5]
 and w8, w9, w8
 cset w9, ne
 tst w10, #0xff
 umov w10, v0.b[6]
 and w8, w9, w8
 cset w9, ne
 tst w10, #0xff
 umov w10, v0.b[7]
 and w8, w9, w8
 cset w9, ne
 tst w10, #0xff
 umov w10, v0.b[8]
 and w8, w9, w8
 cset w9, ne
 tst w10, #0xff
 umov w10, v0.b[9]
 and w8, w9, w8
 cset w9, ne
 tst w10, #0xff
 umov w10, v0.b[10]
 and w8, w9, w8
 cset w9, ne
 tst w10, #0xff
 umov w10, v0.b[11]
 and w8, w9, w8
 cset w9, ne
 tst w10, #0xff
 umov w10, v0.b[12]
 and w8, w9, w8
 cset w9, ne
 tst w10, #0xff
 umov w10, v0.b[13]
 and w8, w9, w8
 cset w9, ne
 tst w10, #0xff
 umov w10, v0.b[14]
 and w8, w9, w8
 cset w9, ne
 tst w10, #0xff
 umov w10, v0.b[15]
 and w8, w9, w8
 cset w9, ne
 tst w10, #0xff
 and w8, w9, w8
 cset w9, ne
 and w0, w9, w8
 ret
any_8x16:
 ldr q0, [x0]
 umov w8, v0.b[0]
 umov w9, v0.b[1]
 orr w8, w8, w9
 umov w9, v0.b[2]
 orr w8, w8, w9
 umov w9, v0.b[3]
 orr w8, w8, w9
 umov w9, v0.b[4]
 orr w8, w8, w9
 umov w9, v0.b[5]
 orr w8, w8, w9
 umov w9, v0.b[6]
 orr w8, w8, w9
 umov w9, v0.b[7]
 orr w8, w8, w9
 umov w9, v0.b[8]
 orr w8, w8, w9
 umov w9, v0.b[9]
 orr w8, w8, w9
 umov w9, v0.b[10]
 orr w8, w8, w9
 umov w9, v0.b[11]
 orr w8, w8, w9
 umov w9, v0.b[12]
 orr w8, w8, w9
 umov w9, v0.b[13]
 orr w8, w8, w9
 umov w9, v0.b[14]
 orr w8, w8, w9
 umov w9, v0.b[15]
 orr w8, w8, w9
 tst w8, #0xff
 cset w0, ne
 ret
```

After this PR:

```asm
all_8x16:
 ldr q0, [x0]
 uminv b0, v0.16b
 fmov w8, s0
 tst w8, #0xff
 cset w0, ne
 ret
any_8x16:
 ldr q0, [x0]
 umaxv b0, v0.16b
 fmov w8, s0
 tst w8, #0xff
 cset w0, ne
 ret
```

`ARMv7` + `neon`

Before this PR:

```asm
all_8x16:
 vmov.i8 q0, #0x1
 vld1.64 {d2, d3}, [r0]
 vtst.8 q0, q1, q0
 vext.8 q1, q0, q0, #8
 vand q0, q0, q1
 vext.8 q1, q0, q0, #4
 vand q0, q0, q1
 vext.8 q1, q0, q0, #2
 vand q0, q0, q1
 vdup.8 q1, d0[1]
 vand q0, q0, q1
 vmov.u8 r0, d0[0]
 and r0, r0, #1
 bx lr
any_8x16:
 vmov.i8 q0, #0x1
 vld1.64 {d2, d3}, [r0]
 vtst.8 q0, q1, q0
 vext.8 q1, q0, q0, #8
 vorr q0, q0, q1
 vext.8 q1, q0, q0, #4
 vorr q0, q0, q1
 vext.8 q1, q0, q0, #2
 vorr q0, q0, q1
 vdup.8 q1, d0[1]
 vorr q0, q0, q1
 vmov.u8 r0, d0[0]
 and r0, r0, #1
 bx lr
```

After this PR:

```asm
all_8x16:
 vld1.64 {d0, d1}, [r0]
 b ::all
::all:
 vpmin.u8 d0, d0, d1
 b ::all
any_8x16:
 vld1.64 {d0, d1}, [r0]
 b ::any
::any:
 vpmax.u8 d0, d0, d1
 b ::any
```

The inlining problems are pretty bad on ARMv7 + NEON.
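The idea behind the improved x86 codegen above can be written down as a minimal stand-alone sketch. This sketch uses today's `std::arch` intrinsic names for illustration only; the actual implementation added by this PR goes through the portable vector types in `coresimd/ppsv/codegen/masks_reductions.rs` in the diff below:

```rust
// Sketch only: each lane of a 16 x 8-bit mask is all-ones or all-zeros, so
// _mm_movemask_epi8 (the pmovmskb seen above) packs the most significant bit
// of every byte into one bit per lane.
#[cfg(target_arch = "x86_64")]
unsafe fn all_8x16(mask: std::arch::x86_64::__m128i) -> bool {
    use std::arch::x86_64::_mm_movemask_epi8;
    // All 16 lanes true <=> all 16 movemask bits set.
    _mm_movemask_epi8(mask) == 0xffff
}

#[cfg(target_arch = "x86_64")]
unsafe fn any_8x16(mask: std::arch::x86_64::__m128i) -> bool {
    use std::arch::x86_64::_mm_movemask_epi8;
    // Any true lane leaves at least one bit set in the movemask.
    _mm_movemask_epi8(mask) != 0
}
```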
256-bit wide mask types (`m8x32`, `m16x16`, `m32x8`, `m64x4`)

`x86_64` with SSE2 enabled

Before this PR:

```asm
all_8x32:
 push rbp
 mov rbp, rsp
 movdqa xmm0, xmmword, ptr, [rip, +, LCPI17_0]
 movdqa xmm1, xmmword, ptr, [rdi]
 pand xmm1, xmm0
 movdqa xmm2, xmmword, ptr, [rdi, +, 16]
 pand xmm2, xmm0
 pcmpeqb xmm2, xmm0
 pcmpeqb xmm1, xmm0
 pand xmm1, xmm2
 pmovmskb eax, xmm1
 xor ecx, ecx
 cmp eax, 65535
 mov eax, -1
 cmovne eax, ecx
 and al, 1
 pop rbp
 ret
any_8x32:
 push rbp
 mov rbp, rsp
 movdqa xmm0, xmmword, ptr, [rdi]
 por xmm0, xmmword, ptr, [rdi, +, 16]
 movdqa xmm1, xmmword, ptr, [rip, +, LCPI16_0]
 pand xmm0, xmm1
 pcmpeqb xmm0, xmm1
 pmovmskb eax, xmm0
 neg eax
 sbb eax, eax
 and al, 1
 pop rbp
 ret
```

After this PR:

```asm
all_8x32:
 push rbp
 mov rbp, rsp
 movdqa xmm0, xmmword, ptr, [rdi]
 pmovmskb eax, xmm0
 cmp eax, 65535
 jne LBB17_1
 movdqa xmm0, xmmword, ptr, [rdi, +, 16]
 pmovmskb ecx, xmm0
 mov al, 1
 cmp ecx, 65535
 je LBB17_3
LBB17_1:
 xor eax, eax
LBB17_3:
 pop rbp
 ret
any_8x32:
 push rbp
 mov rbp, rsp
 movdqa xmm0, xmmword, ptr, [rdi]
 pmovmskb ecx, xmm0
 mov al, 1
 test ecx, ecx
 je LBB16_1
 pop rbp
 ret
LBB16_1:
 movdqa xmm0, xmmword, ptr, [rdi, +, 16]
 pmovmskb eax, xmm0
 test eax, eax
 setne al
 pop rbp
 ret
```

`x86_64` with AVX enabled

Before this PR:

```asm
all_8x32:
 push rbp
 mov rbp, rsp
 vmovaps ymm0, ymmword, ptr, [rdi]
 vandps ymm0, ymm0, ymmword, ptr, [rip, +, LCPI25_0]
 vextractf128 xmm1, ymm0, 1
 vpxor xmm2, xmm2, xmm2
 vpcmpeqb xmm1, xmm1, xmm2
 vpcmpeqd xmm3, xmm3, xmm3
 vpxor xmm1, xmm1, xmm3
 vpcmpeqb xmm0, xmm0, xmm2
 vpxor xmm0, xmm0, xmm3
 vinsertf128 ymm0, ymm0, xmm1, 1
 vandps ymm0, ymm0, ymm1
 vpermilps xmm1, xmm0, 78
 vandps ymm0, ymm0, ymm1
 vpermilps xmm1, xmm0, 229
 vandps ymm0, ymm0, ymm1
 vpsrld xmm1, xmm0, 16
 vandps ymm0, ymm0, ymm1
 vpsrlw xmm1, xmm0, 8
 vandps ymm0, ymm0, ymm1
 vpextrb eax, xmm0, 0
 and al, 1
 pop rbp
 vzeroupper
 ret
any_8x32:
 push rbp
 mov rbp, rsp
 vmovaps ymm0, ymmword, ptr, [rdi]
 vandps ymm0, ymm0, ymmword, ptr, [rip, +, LCPI24_0]
 vextractf128 xmm1, ymm0, 1
 vpxor xmm2, xmm2, xmm2
 vpcmpeqb xmm1, xmm1, xmm2
 vpcmpeqd xmm3, xmm3, xmm3
 vpxor xmm1, xmm1, xmm3
 vpcmpeqb xmm0, xmm0, xmm2
 vpxor xmm0, xmm0, xmm3
 vinsertf128 ymm0, ymm0, xmm1, 1
 vorps ymm0, ymm0, ymm1
 vpermilps xmm1, xmm0, 78
 vorps ymm0, ymm0, ymm1
 vpermilps xmm1, xmm0, 229
 vorps ymm0, ymm0, ymm1
 vpsrld xmm1, xmm0, 16
 vorps ymm0, ymm0, ymm1
 vpsrlw xmm1, xmm0, 8
 vorps ymm0, ymm0, ymm1
 vpextrb eax, xmm0, 0
 and al, 1
 pop rbp
 vzeroupper
 ret
```

After this PR:

```asm
all_8x32:
 push rbp
 mov rbp, rsp
 vmovdqa ymm0, ymmword, ptr, [rdi]
 vxorps xmm1, xmm1, xmm1
 vcmptrueps ymm1, ymm1, ymm1
 vptest ymm0, ymm1
 setb al
 pop rbp
 vzeroupper
 ret
any_8x32:
 push rbp
 mov rbp, rsp
 vmovdqa ymm0, ymmword, ptr, [rdi]
 vptest ymm0, ymm0
 setne al
 pop rbp
 vzeroupper
 ret
```

---

Closes #362.
---
 coresimd/ppsv/api/masks_reductions.rs     |  26 +-
 coresimd/ppsv/codegen/masks_reductions.rs | 603 ++++++++++++++++++++++
 coresimd/ppsv/codegen/mod.rs              |   6 +
 coresimd/ppsv/codegen/wrapping.rs         |  42 ++
 coresimd/ppsv/mod.rs                      |  50 +-
 crates/coresimd/Cargo.toml                |   3 +
 crates/coresimd/src/lib.rs                |   2 +
 7 files changed, 660 insertions(+), 72 deletions(-)
 create mode 100644 coresimd/ppsv/codegen/masks_reductions.rs
 create mode 100644 coresimd/ppsv/codegen/mod.rs
 create mode 100644 coresimd/ppsv/codegen/wrapping.rs

diff --git a/coresimd/ppsv/api/masks_reductions.rs b/coresimd/ppsv/api/masks_reductions.rs
index e348a42d7b..bc7ac36d34 100644
--- a/coresimd/ppsv/api/masks_reductions.rs
+++ b/coresimd/ppsv/api/masks_reductions.rs
@@ -5,37 +5,15 @@ macro_rules! impl_mask_reductions {
     ($id:ident) => {
         impl $id {
             /// Are `all` vector lanes `true`?
-            #[cfg(not(target_arch = "aarch64"))]
             #[inline]
             pub fn all(self) -> bool {
-                use coresimd::simd_llvm::simd_reduce_all;
-                unsafe { simd_reduce_all(self) }
+                unsafe { super::codegen::masks_reductions::All::all(self) }
             }
-            /// Are `all` vector lanes `true`?
-            #[cfg(target_arch = "aarch64")]
-            #[inline]
-            pub fn all(self) -> bool {
-                // FIXME: Broken on AArch64
-                // https://bugs.llvm.org/show_bug.cgi?id=36796
-                self.and()
-            }
-            /// Is `any` vector lanes `true`?
-            #[cfg(not(target_arch = "aarch64"))]
             #[inline]
             pub fn any(self) -> bool {
-                use coresimd::simd_llvm::simd_reduce_any;
-                unsafe { simd_reduce_any(self) }
+                unsafe { super::codegen::masks_reductions::Any::any(self) }
             }
-            /// Is `any` vector lanes `true`?
-            #[cfg(target_arch = "aarch64")]
-            #[inline]
-            pub fn any(self) -> bool {
-                // FIXME: Broken on AArch64
-                // https://bugs.llvm.org/show_bug.cgi?id=36796
-                self.or()
-            }
-            /// Are `all` vector lanes `false`?
             #[inline]
             pub fn none(self) -> bool {
diff --git a/coresimd/ppsv/codegen/masks_reductions.rs b/coresimd/ppsv/codegen/masks_reductions.rs
new file mode 100644
index 0000000000..16c7cb8d52
--- /dev/null
+++ b/coresimd/ppsv/codegen/masks_reductions.rs
@@ -0,0 +1,603 @@
+//! LLVM6 currently generates sub-optimal code for the `all` and `any` mask
+//! reductions.
+//!
+//! See https://github.com/rust-lang-nursery/stdsimd/issues/362#issuecomment-372774371
+//! and the associated LLVM bug:
+//! https://bugs.llvm.org/show_bug.cgi?id=36702
+
+#![allow(unused)]
+
+use coresimd::simd::*;
+
+pub trait All: ::marker::Sized {
+    unsafe fn all(self) -> bool;
+}
+
+pub trait Any: ::marker::Sized {
+    unsafe fn any(self) -> bool;
+}
+
+// By default we use the simd_reduce_{all,any} intrinsics, which produce
+// sub-optimal code, except on aarch64, where those intrinsics are broken
+// due to https://bugs.llvm.org/show_bug.cgi?id=36796, so we just use a
+// full-blown bitwise and/or reduction there.
+macro_rules! default_impl {
+    ($id:ident) => {
+        impl All for $id {
+            #[inline]
+            unsafe fn all(self) -> bool {
+                #[cfg(not(target_arch = "aarch64"))] {
+                    use coresimd::simd_llvm::simd_reduce_all;
+                    simd_reduce_all(self)
+                }
+                #[cfg(target_arch = "aarch64")] {
+                    // FIXME: Broken on AArch64
+                    // https://bugs.llvm.org/show_bug.cgi?id=36796
+                    self.and()
+                }
+            }
+        }
+
+        impl Any for $id {
+            #[inline]
+            unsafe fn any(self) -> bool {
+                #[cfg(not(target_arch = "aarch64"))] {
+                    use coresimd::simd_llvm::simd_reduce_any;
+                    simd_reduce_any(self)
+                }
+                #[cfg(target_arch = "aarch64")] {
+                    // FIXME: Broken on AArch64
+                    // https://bugs.llvm.org/show_bug.cgi?id=36796
+                    self.or()
+                }
+            }
+        }
+    };
+}
+
+// On x86 both SSE2 and AVX2 provide movemask instructions that can be used
+// here. The AVX2 instructions aren't necessarily better than the AVX
+// instructions below, so they aren't implemented here.
+//
+// FIXME: for masks generated from f32x4 LLVM6 emits pmovmskb but should emit
+// movmskps. Since the masks don't track whether they were produced by integer
+// or floating point vectors, we can't currently work around this yet. The
+// performance impact for this shouldn't be large, but this is filed as:
+// https://bugs.llvm.org/show_bug.cgi?id=37087
+#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse2"))]
+macro_rules! x86_128_sse2_movemask_impl {
+    ($id:ident) => {
+        impl All for $id {
+            #[inline]
+            #[target_feature(enable = "sse2")]
+            unsafe fn all(self) -> bool {
+                #[cfg(target_arch = "x86")]
+                use ::coresimd::arch::x86::_mm_movemask_epi8;
+                #[cfg(target_arch = "x86_64")]
+                use ::coresimd::arch::x86_64::_mm_movemask_epi8;
+                // _mm_movemask_epi8(a) creates a 16bit mask containing the most
+                // significant bit of each byte of `a`. If all bits are set,
+                // then all 16 lanes of the mask are true.
+                _mm_movemask_epi8(::mem::transmute(self)) == u16::max_value() as i32
+            }
+        }
+        impl Any for $id {
+            #[inline]
+            #[target_feature(enable = "sse2")]
+            unsafe fn any(self) -> bool {
+                #[cfg(target_arch = "x86")]
+                use ::coresimd::arch::x86::_mm_movemask_epi8;
+                #[cfg(target_arch = "x86_64")]
+                use ::coresimd::arch::x86_64::_mm_movemask_epi8;
+
+                _mm_movemask_epi8(::mem::transmute(self)) != 0
+            }
+        }
+    }
+}
+
+// On x86 with AVX we use _mm256_testc_si256 and _mm256_testz_si256.
+//
+// FIXME: for masks generated from floating point vectors one should use
+// x86_mm256_testc_ps, x86_mm256_testz_ps, x86_mm256_testc_pd,
+// x86_mm256_testz_pd. Since the masks don't track whether they were produced
+// by integer or floating point vectors, we can't currently work around this
+// yet.
+//
+// TODO: investigate perf impact and file LLVM bugs as necessary.
+#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx"))]
+macro_rules! x86_256_avx_test_impl {
+    ($id:ident) => {
+        impl All for $id {
+            #[inline]
+            #[target_feature(enable = "avx")]
+            unsafe fn all(self) -> bool {
+                #[cfg(target_arch = "x86")]
+                use ::coresimd::arch::x86::_mm256_testc_si256;
+                #[cfg(target_arch = "x86_64")]
+                use ::coresimd::arch::x86_64::_mm256_testc_si256;
+                _mm256_testc_si256(::mem::transmute(self),
+                                   ::mem::transmute($id::splat(true))) != 0
+            }
+        }
+        impl Any for $id {
+            #[inline]
+            #[target_feature(enable = "avx")]
+            unsafe fn any(self) -> bool {
+                #[cfg(target_arch = "x86")]
+                use ::coresimd::arch::x86::_mm256_testz_si256;
+                #[cfg(target_arch = "x86_64")]
+                use ::coresimd::arch::x86_64::_mm256_testz_si256;
+                _mm256_testz_si256(::mem::transmute(self),
+                                   ::mem::transmute(self)) == 0
+            }
+        }
+    }
+}
+
+// On x86 with SSE2, all/any for 256-bit wide vectors is implemented by
+// executing the 128-bit algorithm on the higher and lower halves of the
+// vector independently.
+#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse2"))]
+macro_rules! x86_256_sse2_impl {
+    ($id:ident, $v128:ident) => {
+        impl All for $id {
+            #[inline]
+            #[target_feature(enable = "sse2")]
+            unsafe fn all(self) -> bool {
+                unsafe {
+                    union U {
+                        halves: ($v128, $v128),
+                        vec: $id
+                    }
+                    let halves = U { vec: self }.halves;
+                    halves.0.all() && halves.1.all()
+                }
+            }
+        }
+        impl Any for $id {
+            #[inline]
+            #[target_feature(enable = "sse2")]
+            unsafe fn any(self) -> bool {
+                unsafe {
+                    union U {
+                        halves: ($v128, $v128),
+                        vec: $id
+                    }
+                    let halves = U { vec: self }.halves;
+                    halves.0.any() || halves.1.any()
+                }
+            }
+        }
+    }
+}
+
+// Implementation for 64-bit wide masks on x86.
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+macro_rules! x86_64_mmx_movemask_impl {
+    ($id:ident, $vec128:ident) => {
+        impl All for $id {
+            #[inline]
+            #[target_feature(enable = "mmx")]
+            unsafe fn all(self) -> bool {
+                #[cfg(target_arch = "x86")]
+                use ::coresimd::arch::x86::_mm_movemask_pi8;
+                #[cfg(target_arch = "x86_64")]
+                use ::coresimd::arch::x86_64::_mm_movemask_pi8;
+                // _mm_movemask_pi8(a) creates an 8bit mask containing the most
+                // significant bit of each byte of `a`. If all bits are set,
+                // then all 8 lanes of the mask are true.
+                _mm_movemask_pi8(::mem::transmute(self)) == u8::max_value() as i32
+            }
+        }
+        impl Any for $id {
+            #[inline]
+            #[target_feature(enable = "mmx")]
+            unsafe fn any(self) -> bool {
+                #[cfg(target_arch = "x86")]
+                use ::coresimd::arch::x86::_mm_movemask_pi8;
+                #[cfg(target_arch = "x86_64")]
+                use ::coresimd::arch::x86_64::_mm_movemask_pi8;
+
+                _mm_movemask_pi8(::mem::transmute(self)) != 0
+            }
+        }
+    }
+}
+
+// Implementation for 128-bit wide masks on x86.
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+macro_rules! x86_128_impl {
+    ($id:ident) => {
+        cfg_if! {
+            if #[cfg(target_feature = "sse2")] {
+                x86_128_sse2_movemask_impl!($id);
+            } else {
+                default_impl!($id);
+            }
+        }
+    }
+}
+
+// Implementation for 256-bit wide masks on x86.
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+macro_rules! x86_256_impl {
+    ($id:ident, $half_id:ident) => {
+        cfg_if! {
+            if #[cfg(target_feature = "avx")] {
+                x86_256_avx_test_impl!($id);
+            } else if #[cfg(target_feature = "sse2")] {
+                x86_256_sse2_impl!($id, $half_id);
+            } else {
+                default_impl!($id);
+            }
+        }
+    }
+}
+
+// Implementation for ARM + v7 + NEON using vpmin and vpmax (folding
+// minimum/maximum of adjacent pairs) for 64-bit wide two-element vectors.
+#[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))]
+macro_rules! arm_64_x2_v7_neon_impl {
+    ($id:ident, $vpmin:ident, $vpmax:ident) => {
+        impl All for $id {
+            #[inline]
+            #[target_feature(enable = "v7,neon")]
+            unsafe fn all(self) -> bool {
+                use ::coresimd::arch::arm::$vpmin;
+                use ::mem::transmute;
+                // pmin((a, b), (-,-)) => (b, -).0 => b
+                let tmp: $id = transmute($vpmin(transmute(self), ::mem::uninitialized()));
+                tmp.extract(0)
+            }
+        }
+        impl Any for $id {
+            #[inline]
+            #[target_feature(enable = "v7,neon")]
+            unsafe fn any(self) -> bool {
+                use ::coresimd::arch::arm::$vpmax;
+                use ::mem::transmute;
+                // pmax((a, b), (-,-)) => (b, -).0 => b
+                let tmp: $id = transmute($vpmax(transmute(self), ::mem::uninitialized()));
+                tmp.extract(0)
+            }
+        }
+    }
+}
+
+// Implementation for ARM + v7 + NEON using vpmin and vpmax (folding
+// minimum/maximum of adjacent pairs) for 64-bit wide four-element vectors.
+#[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))]
+macro_rules! arm_64_x4_v7_neon_impl {
+    ($id:ident, $vpmin:ident, $vpmax:ident) => {
+        impl All for $id {
+            #[inline]
+            #[target_feature(enable = "v7,neon")]
+            unsafe fn all(self) -> bool {
+                use ::coresimd::arch::arm::$vpmin;
+                use ::mem::transmute;
+                // tmp = pmin((a, b, c, d), (-,-,-,-)) => (a, c, -, -)
+                let tmp = $vpmin(transmute(self), ::mem::uninitialized());
+                // tmp = pmin((a, b, -, -), (-,-,-,-)) => (c, -, -, -).0 => c
+                let tmp: $id = transmute($vpmin(tmp, ::mem::uninitialized()));
+                tmp.extract(0)
+            }
+        }
+        impl Any for $id {
+            #[inline]
+            #[target_feature(enable = "v7,neon")]
+            unsafe fn any(self) -> bool {
+                use ::coresimd::arch::arm::$vpmax;
+                use ::mem::transmute;
+                // tmp = pmax((a, b, c, d), (-,-,-,-)) => (a, c, -, -)
+                let tmp = $vpmax(transmute(self), ::mem::uninitialized());
+                // tmp = pmax((a, b, -, -), (-,-,-,-)) => (c, -, -, -).0 => c
+                let tmp: $id = transmute($vpmax(tmp, ::mem::uninitialized()));
+                tmp.extract(0)
+            }
+        }
+    }
+}
+
+// Implementation for ARM + v7 + NEON using vpmin and vpmax (folding
+// minimum/maximum of adjacent pairs) for 64-bit wide eight-element vectors.
+#[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))]
+macro_rules! arm_64_x8_v7_neon_impl {
+    ($id:ident, $vpmin:ident, $vpmax:ident) => {
+        impl All for $id {
+            #[inline]
+            #[target_feature(enable = "v7,neon")]
+            unsafe fn all(self) -> bool {
+                use ::coresimd::arch::arm::$vpmin;
+                use ::mem::transmute;
+                // tmp = pmin(
+                //     (a, b, c, d, e, f, g, h),
+                //     (-, -, -, -, -, -, -, -)
+                // ) => (a, c, e, g, -, -, -, -)
+                let tmp = $vpmin(transmute(self), ::mem::uninitialized());
+                // tmp = pmin(
+                //     (a, c, e, g, -, -, -, -),
+                //     (-, -, -, -, -, -, -, -)
+                // ) => (c, g, -, -, -, -, -, -)
+                let tmp = $vpmin(tmp, ::mem::uninitialized());
+                // tmp = pmin(
+                //     (c, g, -, -, -, -, -, -),
+                //     (-, -, -, -, -, -, -, -)
+                // ) => (g, -, -, -, -, -, -, -).0 => g
+                let tmp: $id = transmute($vpmin(tmp, ::mem::uninitialized()));
+                tmp.extract(0)
+            }
+        }
+        impl Any for $id {
+            #[inline]
+            #[target_feature(enable = "v7,neon")]
+            unsafe fn any(self) -> bool {
+                use ::coresimd::arch::arm::$vpmax;
+                use ::mem::transmute;
+                // tmp = pmax(
+                //     (a, b, c, d, e, f, g, h),
+                //     (-, -, -, -, -, -, -, -)
+                // ) => (a, c, e, g, -, -, -, -)
+                let tmp = $vpmax(transmute(self), ::mem::uninitialized());
+                // tmp = pmax(
+                //     (a, c, e, g, -, -, -, -),
+                //     (-, -, -, -, -, -, -, -)
+                // ) => (c, g, -, -, -, -, -, -)
+                let tmp = $vpmax(tmp, ::mem::uninitialized());
+                // tmp = pmax(
+                //     (c, g, -, -, -, -, -, -),
+                //     (-, -, -, -, -, -, -, -)
+                // ) => (g, -, -, -, -, -, -, -).0 => g
+                let tmp: $id = transmute($vpmax(tmp, ::mem::uninitialized()));
+                tmp.extract(0)
+            }
+        }
+    }
+}
+
+// Implementation for ARM + v7 + NEON using vpmin and vpmax (folding
+// minimum/maximum of adjacent pairs) for 64-bit or 128-bit wide vectors with
+// more than two elements.
+#[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))]
+macro_rules! arm_128_v7_neon_impl {
+    ($id:ident, $half:ident, $vpmin:ident, $vpmax:ident) => {
+        impl All for $id {
+            #[inline]
+            #[target_feature(enable = "v7,neon")]
+            unsafe fn all(self) -> bool {
+                use ::coresimd::arch::arm::$vpmin;
+                use ::mem::transmute;
+                union U {
+                    halves: ($half, $half),
+                    vec: $id
+                }
+                let halves = U { vec: self }.halves;
+                let h: $half = transmute($vpmin(transmute(halves.0), transmute(halves.1)));
+                h.all()
+            }
+        }
+        impl Any for $id {
+            #[inline]
+            #[target_feature(enable = "v7,neon")]
+            unsafe fn any(self) -> bool {
+                use ::coresimd::arch::arm::$vpmax;
+                use ::mem::transmute;
+                union U {
+                    halves: ($half, $half),
+                    vec: $id
+                }
+                let halves = U { vec: self }.halves;
+                let h: $half = transmute($vpmax(transmute(halves.0), transmute(halves.1)));
+                h.any()
+            }
+        }
+    }
+}
+
+// Implementation for AArch64 + NEON using vmin and vmax (horizontal vector
+// min/max) for 128-bit wide vectors.
+#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+macro_rules! aarch64_128_neon_impl {
+    ($id:ident, $vmin:ident, $vmax:ident) => {
+        impl All for $id {
+            #[inline]
+            #[target_feature(enable = "neon")]
+            unsafe fn all(self) -> bool {
+                use ::coresimd::arch::aarch64::$vmin;
+                $vmin(::mem::transmute(self)) != 0
+            }
+        }
+        impl Any for $id {
+            #[inline]
+            #[target_feature(enable = "neon")]
+            unsafe fn any(self) -> bool {
+                use ::coresimd::arch::aarch64::$vmax;
+                $vmax(::mem::transmute(self)) != 0
+            }
+        }
+    }
+}
+
+// Implementation for AArch64 + NEON using vmin and vmax (horizontal vector
+// min/max) for 64-bit wide vectors.
+//
+// This impl duplicates the 64-bit vector into a 128-bit one and calls
+// all/any on that.
+#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+macro_rules! aarch64_64_neon_impl {
+    ($id:ident, $vec128:ident) => {
+        impl All for $id {
+            #[inline]
+            #[target_feature(enable = "neon")]
+            unsafe fn all(self) -> bool {
+                union U {
+                    halves: ($id, $id),
+                    vec: $vec128
+                }
+                U { halves: (self, self) }.vec.all()
+            }
+        }
+        impl Any for $id {
+            #[inline]
+            #[target_feature(enable = "neon")]
+            unsafe fn any(self) -> bool {
+                union U {
+                    halves: ($id, $id),
+                    vec: $vec128
+                }
+                U { halves: (self, self) }.vec.any()
+            }
+        }
+    }
+}
+
+macro_rules! impl_mask_all_any {
+    // 64-bit wide masks
+    (m8x8) => {
+        cfg_if! {
+            if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+                x86_64_mmx_movemask_impl!(m8x8, m8x16);
+            } else if #[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))] {
+                arm_64_x8_v7_neon_impl!(m8x8, vpmin_u8, vpmax_u8);
+            } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] {
+                aarch64_64_neon_impl!(m8x8, m8x16);
+            } else {
+                default_impl!(m8x8);
+            }
+        }
+    };
+    (m16x4) => {
+        cfg_if! {
+            if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+                x86_64_mmx_movemask_impl!(m16x4, m16x8);
+            } else if #[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))] {
+                arm_64_x4_v7_neon_impl!(m16x4, vpmin_u16, vpmax_u16);
+            } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] {
+                aarch64_64_neon_impl!(m16x4, m16x8);
+            } else {
+                default_impl!(m16x4);
+            }
+        }
+    };
+    (m32x2) => {
+        cfg_if! {
+            if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+                x86_64_mmx_movemask_impl!(m32x2, m32x4);
+            } else if #[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))] {
+                arm_64_x2_v7_neon_impl!(m32x2, vpmin_u32, vpmax_u32);
+            } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] {
+                aarch64_64_neon_impl!(m32x2, m32x4);
+            } else {
+                default_impl!(m32x2);
+            }
+        }
+    };
+    // 128-bit wide masks
+    (m8x16) => {
+        cfg_if! {
+            if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+                x86_128_impl!(m8x16);
+            } else if #[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))] {
+                arm_128_v7_neon_impl!(m8x16, m8x8, vpmin_u8, vpmax_u8);
+            } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] {
+                aarch64_128_neon_impl!(m8x16, vminvq_u8, vmaxvq_u8);
+            } else {
+                default_impl!(m8x16);
+            }
+        }
+    };
+    (m16x8) => {
+        cfg_if! {
+            if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+                x86_128_impl!(m16x8);
+            } else if #[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))] {
+                arm_128_v7_neon_impl!(m16x8, m16x4, vpmin_u16, vpmax_u16);
+            } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] {
+                aarch64_128_neon_impl!(m16x8, vminvq_u16, vmaxvq_u16);
+            } else {
+                default_impl!(m16x8);
+            }
+        }
+    };
+    (m32x4) => {
+        cfg_if! {
+            if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+                x86_128_impl!(m32x4);
+            } else if #[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))] {
+                arm_128_v7_neon_impl!(m32x4, m32x2, vpmin_u32, vpmax_u32);
+            } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] {
+                aarch64_128_neon_impl!(m32x4, vminvq_u32, vmaxvq_u32);
+            } else {
+                default_impl!(m32x4);
+            }
+        }
+    };
+    (m64x2) => {
+        cfg_if! {
+            if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+                x86_128_impl!(m64x2);
+            } else {
+                default_impl!(m64x2);
+            }
+        }
+    };
+    // 256-bit wide masks:
+    (m8x32) => {
+        cfg_if! {
+            if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+                x86_256_impl!(m8x32, m8x16);
+            } else {
+                default_impl!(m8x32);
+            }
+        }
+    };
+    (m16x16) => {
+        cfg_if! {
+            if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+                x86_256_impl!(m16x16, m16x8);
+            } else {
+                default_impl!(m16x16);
+            }
+        }
+    };
+    (m32x8) => {
+        cfg_if! {
+            if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+                x86_256_impl!(m32x8, m32x4);
+            } else {
+                default_impl!(m32x8);
+            }
+        }
+    };
+    (m64x4) => {
+        cfg_if! {
+            if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+                x86_256_impl!(m64x4, m64x2);
+            } else {
+                default_impl!(m64x4);
+            }
+        }
+    };
+    // Fallback to LLVM's default code-generation:
+    ($id:ident) => { default_impl!($id); };
+}
+
+vector_impl!(
+    [impl_mask_all_any, m1x8],
+    [impl_mask_all_any, m1x16],
+    [impl_mask_all_any, m1x32],
+    [impl_mask_all_any, m1x64],
+    [impl_mask_all_any, m8x2],
+    [impl_mask_all_any, m8x4],
+    [impl_mask_all_any, m8x8],
+    [impl_mask_all_any, m8x16],
+    [impl_mask_all_any, m8x32],
+    [impl_mask_all_any, m16x2],
+    [impl_mask_all_any, m16x4],
+    [impl_mask_all_any, m16x8],
+    [impl_mask_all_any, m16x16],
+    [impl_mask_all_any, m32x2],
+    [impl_mask_all_any, m32x4],
+    [impl_mask_all_any, m32x8],
+    [impl_mask_all_any, m64x2],
+    [impl_mask_all_any, m64x4]
+);
diff --git a/coresimd/ppsv/codegen/mod.rs b/coresimd/ppsv/codegen/mod.rs
new file mode 100644
index 0000000000..448587b795
--- /dev/null
+++ b/coresimd/ppsv/codegen/mod.rs
@@ -0,0 +1,6 @@
+//! Workarounds for code generation issues
+
+#[cfg(target_arch = "aarch64")]
+pub mod wrapping;
+
+pub mod masks_reductions;
diff --git a/coresimd/ppsv/codegen/wrapping.rs b/coresimd/ppsv/codegen/wrapping.rs
new file mode 100644
index 0000000000..0e2f306eb0
--- /dev/null
+++ b/coresimd/ppsv/codegen/wrapping.rs
@@ -0,0 +1,42 @@
+//! Used by the wrapping_sum and wrapping_product algorithms for AArch64.
+
+pub(crate) trait Wrapping {
+    fn add(self, other: Self) -> Self;
+    fn mul(self, other: Self) -> Self;
+}
+
+macro_rules! int_impl {
+    ($id:ident) => {
+        impl Wrapping for $id {
+            fn add(self, other: Self) -> Self {
+                self.wrapping_add(other)
+            }
+            fn mul(self, other: Self) -> Self {
+                self.wrapping_mul(other)
+            }
+        }
+    };
+}
+int_impl!(i8);
+int_impl!(i16);
+int_impl!(i32);
+int_impl!(i64);
+int_impl!(u8);
+int_impl!(u16);
+int_impl!(u32);
+int_impl!(u64);
+
+macro_rules! float_impl {
+    ($id:ident) => {
+        impl Wrapping for $id {
+            fn add(self, other: Self) -> Self {
+                self + other
+            }
+            fn mul(self, other: Self) -> Self {
+                self * other
+            }
+        }
+    };
+}
+float_impl!(f32);
+float_impl!(f64);
diff --git a/coresimd/ppsv/mod.rs b/coresimd/ppsv/mod.rs
index 08b7ce80d6..0d48509e3a 100644
--- a/coresimd/ppsv/mod.rs
+++ b/coresimd/ppsv/mod.rs
@@ -80,51 +80,5 @@ impl FromBits for T {
     }
 }
 
-/// Workarounds code generation issues.
-#[cfg(target_arch = "aarch64")]
-mod codegen {
-    #[cfg(target_arch = "aarch64")]
-    pub mod wrapping {
-        pub trait Wrapping {
-            fn add(self, other: Self) -> Self;
-            fn mul(self, other: Self) -> Self;
-        }
-
-        macro_rules! int_impl {
-            ($id:ident) => {
-                impl Wrapping for $id {
-                    fn add(self, other: Self) -> Self {
-                        self.wrapping_add(other)
-                    }
-                    fn mul(self, other: Self) -> Self {
-                        self.wrapping_mul(other)
-                    }
-                }
-            };
-        }
-        int_impl!(i8);
-        int_impl!(i16);
-        int_impl!(i32);
-        int_impl!(i64);
-        int_impl!(u8);
-        int_impl!(u16);
-        int_impl!(u32);
-        int_impl!(u64);
-
-        macro_rules! float_impl {
-            ($id:ident) => {
-                impl Wrapping for $id {
-                    fn add(self, other: Self) -> Self {
-                        self + other
-                    }
-                    fn mul(self, other: Self) -> Self {
-                        self * other
-                    }
-                }
-            };
-        }
-        float_impl!(f32);
-        float_impl!(f64);
-    }
-
-}
+/// Workarounds for code generation issues.
+mod codegen;
diff --git a/crates/coresimd/Cargo.toml b/crates/coresimd/Cargo.toml
index 5bc2e5d7ef..3fb757c544 100644
--- a/crates/coresimd/Cargo.toml
+++ b/crates/coresimd/Cargo.toml
@@ -18,6 +18,9 @@ is-it-maintained-issue-resolution = { repository = "rust-lang-nursery/stdsimd" }
 is-it-maintained-open-issues = { repository = "rust-lang-nursery/stdsimd" }
 maintenance = { status = "experimental" }
 
+[dependencies]
+cfg-if = "0.1"
+
 [dev-dependencies]
 stdsimd-test = { version = "0.*", path = "../stdsimd-test" }
 stdsimd = { version = "0.0.3", path = "../stdsimd" }
diff --git a/crates/coresimd/src/lib.rs b/crates/coresimd/src/lib.rs
index be097b9779..a8c5a07645 100644
--- a/crates/coresimd/src/lib.rs
+++ b/crates/coresimd/src/lib.rs
@@ -45,6 +45,8 @@ extern crate stdsimd;
 extern crate stdsimd_test;
 #[cfg(test)]
 extern crate test;
+#[macro_use]
+extern crate cfg_if;
 
 macro_rules! test_v16 {
     ($item:item) => {};

From f99000b993b3d125cab81aa900d6761037e8b25d Mon Sep 17 00:00:00 2001
From: gnzlbg
Date: Fri, 13 Apr 2018 10:13:00 +0200
Subject: [PATCH 02/15] test avx on all x86 targets

---
 ci/run.sh | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/ci/run.sh b/ci/run.sh
index 3ea3ae08b5..2717d8eaa9 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -35,3 +35,12 @@ cargo_test() {
 
 cargo_test
 cargo_test "--release"
+
+case ${TARGET} in
+    x86*)
+        RUSTFLAGS="${RUSTFLAGS} -C target-feature=+avx"
+        cargo_test "--release"
+        ;;
+    *)
+        ;;
+esac

From 81641d64e2b8f82542c8cf525f17718095ab450f Mon Sep 17 00:00:00 2001
From: gnzlbg
Date: Fri, 27 Apr 2018 16:02:59 +0200
Subject: [PATCH 03/15] disable assert_instr on avx test

---
 ci/run.sh                            | 1 +
 crates/assert-instr-macro/src/lib.rs | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/ci/run.sh b/ci/run.sh
index 2717d8eaa9..fe06c876ed 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -39,6 +39,7 @@ cargo_test "--release"
 case ${TARGET} in
     x86*)
         RUSTFLAGS="${RUSTFLAGS} -C target-feature=+avx"
+        export STDSIMD_IGNORE_ASSERT_INSTR=1
         cargo_test "--release"
         ;;
     *)
diff --git a/crates/assert-instr-macro/src/lib.rs b/crates/assert-instr-macro/src/lib.rs
index 5320bcba37..17936a1fdc 100644
--- a/crates/assert-instr-macro/src/lib.rs
+++ b/crates/assert-instr-macro/src/lib.rs
@@ -33,7 +33,8 @@ pub fn assert_instr(
     };
 
     let instr = &invoc.instr;
-    let maybe_ignore = if cfg!(optimized) {
+    let maybe_ignore = if cfg!(optimized) ||
+        ::std::env::var("STDSIMD_IGNORE_ASSERT_INSTR").is_ok() {
         TokenStream::empty()
     } else {
         (quote! { #[ignore] }).into()

From 985893ec214112540f3a788edc7382833d56e5e8 Mon Sep 17 00:00:00 2001
From: gnzlbg
Date: Fri, 27 Apr 2018 17:15:54 +0200
Subject: [PATCH 04/15] enable all appropriate features

---
 ci/run.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/run.sh b/ci/run.sh
index fe06c876ed..f120df472e 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -38,7 +38,7 @@ cargo_test "--release"
 
 case ${TARGET} in
     x86*)
-        RUSTFLAGS="${RUSTFLAGS} -C target-feature=+avx"
+        RUSTFLAGS="${RUSTFLAGS} -C target-feature=+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+popcnt,+lzcnt"
         export STDSIMD_IGNORE_ASSERT_INSTR=1
         cargo_test "--release"
         ;;

From 2d6cc25e14376986bc4015dc2559e7a092a1bb20 Mon Sep 17 00:00:00 2001
From: gnzlbg
Date: Fri, 4 May 2018 13:35:22 +0200
Subject: [PATCH 05/15] disable assert_instr on x86+avx

---
 ci/run.sh                            | 4 ++--
 crates/assert-instr-macro/src/lib.rs | 8 ++++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/ci/run.sh b/ci/run.sh
index f120df472e..8a13073f86 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -36,10 +36,10 @@ cargo_test() {
 cargo_test
 cargo_test "--release"
 
+# Test x86 targets compiled with AVX.
 case ${TARGET} in
     x86*)
-        RUSTFLAGS="${RUSTFLAGS} -C target-feature=+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+popcnt,+lzcnt"
-        export STDSIMD_IGNORE_ASSERT_INSTR=1
+        RUSTFLAGS="${RUSTFLAGS} -C target-feature=+avx"
         cargo_test "--release"
         ;;
     *)
diff --git a/crates/assert-instr-macro/src/lib.rs b/crates/assert-instr-macro/src/lib.rs
index 17936a1fdc..4fe76bcf07 100644
--- a/crates/assert-instr-macro/src/lib.rs
+++ b/crates/assert-instr-macro/src/lib.rs
@@ -33,8 +33,12 @@ pub fn assert_instr(
     };
 
     let instr = &invoc.instr;
-    let maybe_ignore = if cfg!(optimized) ||
-        ::std::env::var("STDSIMD_IGNORE_ASSERT_INSTR").is_ok() {
+    // Disable assert_instr for x86 targets compiled with avx enabled, which
+    // causes LLVM to generate different intrinsics than the ones we are testing
+    // for.
+    let x86_with_avx = (cfg!(target_arch = "x86") || cfg!(target_arch = "x86_64"))
+        && cfg!(target_feature = "avx");
+    let maybe_ignore = if cfg!(optimized) && !x86_with_avx {
         TokenStream::empty()
     } else {
         (quote! { #[ignore] }).into()

From a6bd57c02d6aa51cc21e146b57a2c2b62929a3fd Mon Sep 17 00:00:00 2001
From: gnzlbg
Date: Fri, 4 May 2018 13:50:43 +0200
Subject: [PATCH 06/15] fn_must_use is stable

---
 crates/coresimd/src/lib.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/coresimd/src/lib.rs b/crates/coresimd/src/lib.rs
index a8c5a07645..a9a4d28862 100644
--- a/crates/coresimd/src/lib.rs
+++ b/crates/coresimd/src/lib.rs
@@ -13,7 +13,7 @@
     simd_ffi, asm, integer_atomics, stmt_expr_attributes, core_intrinsics,
     crate_in_paths, no_core, attr_literals, rustc_attrs, stdsimd,
-    staged_api, fn_must_use, core_float, core_slice_ext, align_offset,
+    staged_api, core_float, core_slice_ext, align_offset,
     doc_cfg, mmx_target_feature, tbm_target_feature,
     sse4a_target_feature, arm_target_feature, aarch64_target_feature,
     mips_target_feature)]

From 4462afb34eebb318cd80d69a795fc4bd1b563707 Mon Sep 17 00:00:00 2001
From: gnzlbg
Date: Fri, 4 May 2018 14:13:51 +0200
Subject: [PATCH 07/15] fix nbody example on armv7

---
 examples/nbody.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/nbody.rs b/examples/nbody.rs
index c67fbdc561..2ffa3d4e8c 100644
--- a/examples/nbody.rs
+++ b/examples/nbody.rs
@@ -59,8 +59,8 @@ impl Frsqrt for f64x2 {
                       all(target_arch = "aarch64", target_feature = "neon"))))]
         {
-            self.replace(0, 1. / self.extract(0).sqrt());
-            self.replace(1, 1. / self.extract(1).sqrt());
+            self = self.replace(0, 1. / self.extract(0).sqrt());
+            self = self.replace(1, 1. / self.extract(1).sqrt());
             *self
         }
     }

From bd8e0e83184346e3b9d1001e00243592e147c34e Mon Sep 17 00:00:00 2001
From: gnzlbg
Date: Fri, 4 May 2018 14:27:28 +0200
Subject: [PATCH 08/15] fixup

---
 examples/nbody.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/nbody.rs b/examples/nbody.rs
index 2ffa3d4e8c..83541cde60 100644
--- a/examples/nbody.rs
+++ b/examples/nbody.rs
@@ -59,8 +59,8 @@ impl Frsqrt for f64x2 {
                       all(target_arch = "aarch64", target_feature = "neon"))))]
         {
-            self = self.replace(0, 1. / self.extract(0).sqrt());
-            self = self.replace(1, 1. / self.extract(1).sqrt());
+            *self = self.replace(0, 1. / self.extract(0).sqrt());
+            *self = self.replace(1, 1. / self.extract(1).sqrt());
             *self
         }
     }

From 17466e9a7595695e80803b3de8c02fc049f30b9f Mon Sep 17 00:00:00 2001
From: gnzlbg
Date: Fri, 4 May 2018 14:33:16 +0200
Subject: [PATCH 09/15] fixup

---
 examples/nbody.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/nbody.rs b/examples/nbody.rs
index 83541cde60..8df5aa290c 100644
--- a/examples/nbody.rs
+++ b/examples/nbody.rs
@@ -59,9 +59,9 @@ impl Frsqrt for f64x2 {
                       all(target_arch = "aarch64", target_feature = "neon"))))]
         {
-            *self = self.replace(0, 1. / self.extract(0).sqrt());
-            *self = self.replace(1, 1. / self.extract(1).sqrt());
-            *self
+            let r = self.replace(0, 1. / self.extract(0).sqrt());
+            let r = r.replace(1, 1. / self.extract(1).sqrt());
+            r
         }
     }
 }

From 8819741d60cbb701d7a560c6bfc93baa8509ceee Mon Sep 17 00:00:00 2001
From: gnzlbg
Date: Fri, 4 May 2018 14:51:37 +0200
Subject: [PATCH 10/15] enable 64-bit wide mask MMX optimizations on x86_64 only

---
 coresimd/ppsv/codegen/masks_reductions.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/coresimd/ppsv/codegen/masks_reductions.rs b/coresimd/ppsv/codegen/masks_reductions.rs
index 16c7cb8d52..a202b9548d 100644
--- a/coresimd/ppsv/codegen/masks_reductions.rs
+++ b/coresimd/ppsv/codegen/masks_reductions.rs
@@ -454,7 +454,7 @@ macro_rules! impl_mask_all_any {
     // 64-bit wide masks
     (m8x8) => {
         cfg_if! {
-            if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+            if #[cfg(target_arch = "x86_64")] {
                 x86_64_mmx_movemask_impl!(m8x8, m8x16);
             } else if #[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))] {
                 arm_64_x8_v7_neon_impl!(m8x8, vpmin_u8, vpmax_u8);
@@ -467,7 +467,7 @@ macro_rules! impl_mask_all_any {
     };
     (m16x4) => {
         cfg_if! {
-            if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+            if #[cfg(target_arch = "x86_64")] {
                 x86_64_mmx_movemask_impl!(m16x4, m16x8);
             } else if #[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))] {
                 arm_64_x4_v7_neon_impl!(m16x4, vpmin_u16, vpmax_u16);
@@ -480,7 +480,7 @@ macro_rules! impl_mask_all_any {
     };
     (m32x2) => {
         cfg_if! {
-            if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+            if #[cfg(target_arch = "x86_64")] {
                 x86_64_mmx_movemask_impl!(m32x2, m32x4);
             } else if #[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))] {
                 arm_64_x2_v7_neon_impl!(m32x2, vpmin_u32, vpmax_u32);

From 68f771939671d636af8f95700bec476ed87c0217 Mon Sep 17 00:00:00 2001
From: gnzlbg
Date: Fri, 4 May 2018 16:53:49 +0200
Subject: [PATCH 11/15] remove coresimd dependency on cfg_if

---
 crates/coresimd/Cargo.toml |  3 ---
 crates/coresimd/src/lib.rs | 46 ++++++++++++++++++++++++++++++++++++--
 2 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/crates/coresimd/Cargo.toml b/crates/coresimd/Cargo.toml
index 3fb757c544..5bc2e5d7ef 100644
--- a/crates/coresimd/Cargo.toml
+++ b/crates/coresimd/Cargo.toml
@@ -18,9 +18,6 @@ is-it-maintained-issue-resolution = { repository = "rust-lang-nursery/stdsimd" }
 is-it-maintained-open-issues = { repository = "rust-lang-nursery/stdsimd" }
 maintenance = { status = "experimental" }
 
-[dependencies]
-cfg-if = "0.1"
-
 [dev-dependencies]
 stdsimd-test = { version = "0.*", path = "../stdsimd-test" }
 stdsimd = { version = "0.0.3", path = "../stdsimd" }
diff --git a/crates/coresimd/src/lib.rs b/crates/coresimd/src/lib.rs
index a9a4d28862..0a4732754d 100644
--- a/crates/coresimd/src/lib.rs
+++ b/crates/coresimd/src/lib.rs
@@ -45,8 +45,50 @@ extern crate stdsimd;
 extern crate stdsimd_test;
 #[cfg(test)]
 extern crate test;
-#[macro_use]
-extern crate cfg_if;
+
+#[doc(hidden)]
+macro_rules! cfg_if {
+    ($(
+        if #[cfg($($meta:meta),*)] { $($it:item)* }
+    ) else * else {
+        $($it2:item)*
+    }) => {
+        __cfg_if_items! {
+            () ;
+            $( ( ($($meta),*) ($($it)*) ), )*
+            ( () ($($it2)*) ),
+        }
+    };
+    (
+        if #[cfg($($i_met:meta),*)] { $($i_it:item)* }
+        $(
+            else if #[cfg($($e_met:meta),*)] { $($e_it:item)* }
+        )*
+    ) => {
+        __cfg_if_items! {
+            () ;
+            ( ($($i_met),*) ($($i_it)*) ),
+            $( ( ($($e_met),*) ($($e_it)*) ), )*
+            ( () () ),
+        }
+    }
+}
+
+#[doc(hidden)]
+macro_rules! __cfg_if_items {
+    (($($not:meta,)*) ; ) => {};
+    (($($not:meta,)*) ; ( ($($m:meta),*) ($($it:item)*) ), $($rest:tt)*) => {
+        __cfg_if_apply! { cfg(all($($m,)* not(any($($not),*)))), $($it)* }
+        __cfg_if_items! { ($($not,)* $($m,)*) ; $($rest)* }
+    }
+}
+
+#[doc(hidden)]
+macro_rules! __cfg_if_apply {
+    ($m:meta, $($it:item)*) => {
+        $(#[$m] $it)*
+    }
+}
 
 macro_rules! test_v16 {
     ($item:item) => {};

From 4ee0c5a28afe12e067c6f31f27ac61972b8215d2 Mon Sep 17 00:00:00 2001
From: gnzlbg
Date: Fri, 4 May 2018 17:25:11 +0200
Subject: [PATCH 12/15] allow wasm to fail

---
 .travis.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.travis.yml b/.travis.yml
index 45292f0bd2..a0ebd95ae4 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -55,6 +55,7 @@ matrix:
       cargo clippy --all -- -D clippy-pedantic
   allow_failures:
     - env: CLIPPY=On TARGET=x86_64-unknown-linux-gnu NO_ADD=1
+    - env: TARGET=wasm32-unknown-unknown
 
 before_install:
   # FIXME (travis-ci/travis-ci#8920) shouldn't be necessary...

From fec0a3a63efed755ef74cf42cb3c804ddb0a7961 Mon Sep 17 00:00:00 2001
From: gnzlbg
Date: Fri, 4 May 2018 18:19:01 +0200
Subject: [PATCH 13/15] use an env variable to disable assert_instr tests

---
 ci/run.sh                            | 1 +
 crates/assert-instr-macro/src/lib.rs | 9 +++++----
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/ci/run.sh b/ci/run.sh
index 8a13073f86..5ca8bbdd96 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -40,6 +40,7 @@ cargo_test "--release"
 case ${TARGET} in
     x86*)
         RUSTFLAGS="${RUSTFLAGS} -C target-feature=+avx"
+        export STDSIMD_DISABLE_ASSERT_INSTR=1
        cargo_test "--release"
         ;;
     *)
diff --git a/crates/assert-instr-macro/src/lib.rs b/crates/assert-instr-macro/src/lib.rs
index 4fe76bcf07..1963e0720c 100644
--- a/crates/assert-instr-macro/src/lib.rs
+++ b/crates/assert-instr-macro/src/lib.rs
@@ -33,17 +33,18 @@ pub fn assert_instr(
     };
 
     let instr = &invoc.instr;
+    let name = &func.ident;
+
     // Disable assert_instr for x86 targets compiled with avx enabled, which
     // causes LLVM to generate different intrinsics than the ones we are testing
     // for.
-    let x86_with_avx = (cfg!(target_arch = "x86") || cfg!(target_arch = "x86_64"))
-        && cfg!(target_feature = "avx");
-    let maybe_ignore = if cfg!(optimized) && !x86_with_avx {
+    let disable_assert_instr = std::env::var("STDSIMD_DISABLE_ASSERT_INSTR").is_ok();
+    let maybe_ignore = if cfg!(optimized) && !disable_assert_instr {
         TokenStream::empty()
     } else {
         (quote! { #[ignore] }).into()
     };
-    let name = &func.ident;
+
     use quote::ToTokens;
     let instr_str = instr
         .clone()

From c589b1c7ba52c5cc5c0f4190eb588b37a6f107bf Mon Sep 17 00:00:00 2001
From: gnzlbg
Date: Fri, 4 May 2018 18:52:47 +0200
Subject: [PATCH 14/15] disable m32x2 mask MMX optimization on macos

---
 coresimd/ppsv/codegen/masks_reductions.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/coresimd/ppsv/codegen/masks_reductions.rs b/coresimd/ppsv/codegen/masks_reductions.rs
index a202b9548d..b06c2d0a29 100644
--- a/coresimd/ppsv/codegen/masks_reductions.rs
+++ b/coresimd/ppsv/codegen/masks_reductions.rs
@@ -480,7 +480,8 @@ macro_rules! impl_mask_all_any {
     };
     (m32x2) => {
         cfg_if! {
-            if #[cfg(target_arch = "x86_64")] {
+            if #[cfg(all(target_arch = "x86_64", not(target_os = "macos")))] {
+                // FIXME: this fails on travis-ci osx build bots.
                 x86_64_mmx_movemask_impl!(m32x2, m32x4);
             } else if #[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))] {
                 arm_64_x2_v7_neon_impl!(m32x2, vpmin_u32, vpmax_u32);

From a1390beeb575279892b2b1f05fff06c17c163088 Mon Sep 17 00:00:00 2001
From: gnzlbg
Date: Fri, 4 May 2018 22:12:24 +0200
Subject: [PATCH 15/15] move cfg_if to coresimd/macros.rs

---
 coresimd/macros.rs         | 41 +++++++++++++++++++++++++++++++++++
 crates/coresimd/src/lib.rs | 44 --------------------------------------
 2 files changed, 41 insertions(+), 44 deletions(-)

diff --git a/coresimd/macros.rs b/coresimd/macros.rs
index 343f425c1a..fa96f50c81 100644
--- a/coresimd/macros.rs
+++ b/coresimd/macros.rs
@@ -13,3 +13,44 @@ macro_rules! types {
         pub struct $name($($fields)*);
     )*)
 }
+
+macro_rules! cfg_if {
+    ($(
+        if #[cfg($($meta:meta),*)] { $($it:item)* }
+    ) else * else {
+        $($it2:item)*
+    }) => {
+        __cfg_if_items! {
+            () ;
+            $( ( ($($meta),*) ($($it)*) ), )*
+            ( () ($($it2)*) ),
+        }
+    };
+    (
+        if #[cfg($($i_met:meta),*)] { $($i_it:item)* }
+        $(
+            else if #[cfg($($e_met:meta),*)] { $($e_it:item)* }
+        )*
+    ) => {
+        __cfg_if_items! {
+            () ;
+            ( ($($i_met),*) ($($i_it)*) ),
+            $( ( ($($e_met),*) ($($e_it)*) ), )*
+            ( () () ),
+        }
+    }
+}
+
+macro_rules! __cfg_if_items {
+    (($($not:meta,)*) ; ) => {};
+    (($($not:meta,)*) ; ( ($($m:meta),*) ($($it:item)*) ), $($rest:tt)*) => {
+        __cfg_if_apply! { cfg(all($($m,)* not(any($($not),*)))), $($it)* }
+        __cfg_if_items! { ($($not,)* $($m,)*) ; $($rest)* }
+    }
+}
+
+macro_rules! __cfg_if_apply {
+    ($m:meta, $($it:item)*) => {
+        $(#[$m] $it)*
+    }
+}
diff --git a/crates/coresimd/src/lib.rs b/crates/coresimd/src/lib.rs
index 0a4732754d..aaf61563bd 100644
--- a/crates/coresimd/src/lib.rs
+++ b/crates/coresimd/src/lib.rs
@@ -46,50 +46,6 @@ extern crate stdsimd_test;
 #[cfg(test)]
 extern crate test;
-
-#[doc(hidden)]
-macro_rules! cfg_if {
-    ($(
-        if #[cfg($($meta:meta),*)] { $($it:item)* }
-    ) else * else {
-        $($it2:item)*
-    }) => {
-        __cfg_if_items! {
-            () ;
-            $( ( ($($meta),*) ($($it)*) ), )*
-            ( () ($($it2)*) ),
-        }
-    };
-    (
-        if #[cfg($($i_met:meta),*)] { $($i_it:item)* }
-        $(
-            else if #[cfg($($e_met:meta),*)] { $($e_it:item)* }
-        )*
-    ) => {
-        __cfg_if_items! {
-            () ;
-            ( ($($i_met),*) ($($i_it)*) ),
-            $( ( ($($e_met),*) ($($e_it)*) ), )*
-            ( () () ),
-        }
-    }
-}
-
-#[doc(hidden)]
-macro_rules! __cfg_if_items {
-    (($($not:meta,)*) ; ) => {};
-    (($($not:meta,)*) ; ( ($($m:meta),*) ($($it:item)*) ), $($rest:tt)*) => {
-        __cfg_if_apply! { cfg(all($($m,)* not(any($($not),*)))), $($it)* }
-        __cfg_if_items! { ($($not,)* $($m,)*) ; $($rest)* }
-    }
-}
-
-#[doc(hidden)]
-macro_rules! __cfg_if_apply {
-    ($m:meta, $($it:item)*) => {
-        $(#[$m] $it)*
-    }
-}
 
 macro_rules! test_v16 {
     ($item:item) => {};
 }
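For readers unfamiliar with `cfg_if!`: a minimal usage sketch of the vendored macro moved above, in the style of `impl_mask_all_any`. Exactly one branch is compiled in, depending on the target configuration; the `chosen_impl` function and its strings are hypothetical, for illustration only:

```rust
// Hypothetical stand-alone illustration of cfg_if! (assumes the macro
// definition above is in scope): pick one item set per target at compile time.
cfg_if! {
    if #[cfg(target_arch = "x86_64")] {
        // Compiled only on x86_64, where 64-bit masks can use MMX movemask.
        fn chosen_impl() -> &'static str { "x86_64 MMX movemask" }
    } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] {
        // Compiled only on AArch64 with NEON (horizontal uminv/umaxv).
        fn chosen_impl() -> &'static str { "aarch64 NEON uminv/umaxv" }
    } else {
        // Everything else falls back to LLVM's generic reductions.
        fn chosen_impl() -> &'static str { "default simd_reduce_{all,any}" }
    }
}
```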