forked from rust-lang/stdarch
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit 1d4668f
committed
Workarounds for LLVM6 code-gen bugs in all/any reductions
This commit adds workarounds for the mask reductions: `all` and `any`.
64-bit wide mask types (`m8x8`, `m16x4`, `m32x2`)
`x86_64` with `MMX` enabled
Before this PR:
```asm
all_8x8:
push rbp
mov rbp, rsp
movzx eax, byte, ptr, [rdi, +, 7]
movd xmm0, eax
movzx eax, byte, ptr, [rdi, +, 6]
movd xmm1, eax
punpcklwd xmm1, xmm0
movzx eax, byte, ptr, [rdi, +, 5]
movd xmm0, eax
movzx eax, byte, ptr, [rdi, +, 4]
movd xmm2, eax
punpcklwd xmm2, xmm0
punpckldq xmm2, xmm1
movzx eax, byte, ptr, [rdi, +, 3]
movd xmm0, eax
movzx eax, byte, ptr, [rdi, +, 2]
movd xmm1, eax
punpcklwd xmm1, xmm0
movzx eax, byte, ptr, [rdi, +, 1]
movd xmm0, eax
movzx eax, byte, ptr, [rdi]
movd xmm3, eax
punpcklwd xmm3, xmm0
punpckldq xmm3, xmm1
punpcklqdq xmm3, xmm2
movdqa xmm0, xmmword, ptr, [rip, +, LCPI9_0]
pand xmm3, xmm0
pcmpeqw xmm3, xmm0
pshufd xmm0, xmm3, 78
pand xmm0, xmm3
pshufd xmm1, xmm0, 229
pand xmm1, xmm0
movdqa xmm0, xmm1
psrld xmm0, 16
pand xmm0, xmm1
movd eax, xmm0
and al, 1
pop rbp
ret
any_8x8:
push rbp
mov rbp, rsp
movzx eax, byte, ptr, [rdi, +, 7]
movd xmm0, eax
movzx eax, byte, ptr, [rdi, +, 6]
movd xmm1, eax
punpcklwd xmm1, xmm0
movzx eax, byte, ptr, [rdi, +, 5]
movd xmm0, eax
movzx eax, byte, ptr, [rdi, +, 4]
movd xmm2, eax
punpcklwd xmm2, xmm0
punpckldq xmm2, xmm1
movzx eax, byte, ptr, [rdi, +, 3]
movd xmm0, eax
movzx eax, byte, ptr, [rdi, +, 2]
movd xmm1, eax
punpcklwd xmm1, xmm0
movzx eax, byte, ptr, [rdi, +, 1]
movd xmm0, eax
movzx eax, byte, ptr, [rdi]
movd xmm3, eax
punpcklwd xmm3, xmm0
punpckldq xmm3, xmm1
punpcklqdq xmm3, xmm2
movdqa xmm0, xmmword, ptr, [rip, +, LCPI8_0]
pand xmm3, xmm0
pcmpeqw xmm3, xmm0
pshufd xmm0, xmm3, 78
por xmm0, xmm3
pshufd xmm1, xmm0, 229
por xmm1, xmm0
movdqa xmm0, xmm1
psrld xmm0, 16
por xmm0, xmm1
movd eax, xmm0
and al, 1
pop rbp
ret
```
After this PR for `m8x8`, `m16x4`, `m32x2`:
```asm
all_8x8:
push rbp
mov rbp, rsp
movq mm0, qword, ptr, [rdi]
pmovmskb eax, mm0
cmp eax, 255
sete al
pop rbp
ret
any_8x8:
push rbp
mov rbp, rsp
movq mm0, qword, ptr, [rdi]
pmovmskb eax, mm0
test eax, eax
setne al
pop rbp
ret
```
`x86` with `MMX` enabled
Before this PR:
```asm
all_8x8:
call L9$pb
L9$pb:
pop eax
mov ecx, dword, ptr, [esp, +, 4]
movzx edx, byte, ptr, [ecx, +, 7]
movd xmm0, edx
movzx edx, byte, ptr, [ecx, +, 6]
movd xmm1, edx
punpcklwd xmm1, xmm0
movzx edx, byte, ptr, [ecx, +, 5]
movd xmm0, edx
movzx edx, byte, ptr, [ecx, +, 4]
movd xmm2, edx
punpcklwd xmm2, xmm0
punpckldq xmm2, xmm1
movzx edx, byte, ptr, [ecx, +, 3]
movd xmm0, edx
movzx edx, byte, ptr, [ecx, +, 2]
movd xmm1, edx
punpcklwd xmm1, xmm0
movzx edx, byte, ptr, [ecx, +, 1]
movd xmm0, edx
movzx ecx, byte, ptr, [ecx]
movd xmm3, ecx
punpcklwd xmm3, xmm0
punpckldq xmm3, xmm1
punpcklqdq xmm3, xmm2
movdqa xmm0, xmmword, ptr, [eax, +, LCPI9_0-L9$pb]
pand xmm3, xmm0
pcmpeqw xmm3, xmm0
pshufd xmm0, xmm3, 78
pand xmm0, xmm3
pshufd xmm1, xmm0, 229
pand xmm1, xmm0
movdqa xmm0, xmm1
psrld xmm0, 16
pand xmm0, xmm1
movd eax, xmm0
and al, 1
ret
any_8x8:
call L8$pb
L8$pb:
pop eax
mov ecx, dword, ptr, [esp, +, 4]
movzx edx, byte, ptr, [ecx, +, 7]
movd xmm0, edx
movzx edx, byte, ptr, [ecx, +, 6]
movd xmm1, edx
punpcklwd xmm1, xmm0
movzx edx, byte, ptr, [ecx, +, 5]
movd xmm0, edx
movzx edx, byte, ptr, [ecx, +, 4]
movd xmm2, edx
punpcklwd xmm2, xmm0
punpckldq xmm2, xmm1
movzx edx, byte, ptr, [ecx, +, 3]
movd xmm0, edx
movzx edx, byte, ptr, [ecx, +, 2]
movd xmm1, edx
punpcklwd xmm1, xmm0
movzx edx, byte, ptr, [ecx, +, 1]
movd xmm0, edx
movzx ecx, byte, ptr, [ecx]
movd xmm3, ecx
punpcklwd xmm3, xmm0
punpckldq xmm3, xmm1
punpcklqdq xmm3, xmm2
movdqa xmm0, xmmword, ptr, [eax, +, LCPI8_0-L8$pb]
pand xmm3, xmm0
pcmpeqw xmm3, xmm0
pshufd xmm0, xmm3, 78
por xmm0, xmm3
pshufd xmm1, xmm0, 229
por xmm1, xmm0
movdqa xmm0, xmm1
psrld xmm0, 16
por xmm0, xmm1
movd eax, xmm0
and al, 1
ret
```
After this PR:
```asm
all_8x8:
mov eax, dword, ptr, [esp, +, 4]
movq mm0, qword, ptr, [eax]
pmovmskb eax, mm0
cmp eax, 255
sete al
ret
any_8x8:
mov eax, dword, ptr, [esp, +, 4]
movq mm0, qword, ptr, [eax]
pmovmskb eax, mm0
test eax, eax
setne al
ret
```
`aarch64`
Before this PR:
```asm
all_8x8:
ldr d0, [x0]
umov w8, v0.b[0]
umov w9, v0.b[1]
tst w8, #0xff
umov w10, v0.b[2]
cset w8, ne
tst w9, #0xff
cset w9, ne
tst w10, #0xff
umov w10, v0.b[3]
and w8, w8, w9
cset w9, ne
tst w10, #0xff
umov w10, v0.b[4]
and w8, w9, w8
cset w9, ne
tst w10, #0xff
umov w10, v0.b[5]
and w8, w9, w8
cset w9, ne
tst w10, #0xff
umov w10, v0.b[6]
and w8, w9, w8
cset w9, ne
tst w10, #0xff
umov w10, v0.b[7]
and w8, w9, w8
cset w9, ne
tst w10, #0xff
and w8, w9, w8
cset w9, ne
and w0, w9, w8
ret
any_8x8:
ldr d0, [x0]
umov w8, v0.b[0]
umov w9, v0.b[1]
orr w8, w8, w9
umov w9, v0.b[2]
orr w8, w8, w9
umov w9, v0.b[3]
orr w8, w8, w9
umov w9, v0.b[4]
orr w8, w8, w9
umov w9, v0.b[5]
orr w8, w8, w9
umov w9, v0.b[6]
orr w8, w8, w9
umov w9, v0.b[7]
orr w8, w8, w9
tst w8, #0xff
cset w0, ne
ret
```
After this PR:
```asm
all_8x8:
ldr d0, [x0]
mov v0.d[1], v0.d[0]
uminv b0, v0.16b
fmov w8, s0
tst w8, #0xff
cset w0, ne
ret
any_8x8:
ldr d0, [x0]
mov v0.d[1], v0.d[0]
umaxv b0, v0.16b
fmov w8, s0
tst w8, #0xff
cset w0, ne
ret
```
`ARMv7` + `neon`
Before this PR:
```asm
all_8x8:
vmov.i8 d0, #0x1
vldr d1, [r0]
vtst.8 d0, d1, d0
vext.8 d1, d0, d0, #4
vand d0, d0, d1
vext.8 d1, d0, d0, #2
vand d0, d0, d1
vdup.8 d1, d0[1]
vand d0, d0, d1
vmov.u8 r0, d0[0]
and r0, r0, #1
bx lr
any_8x8:
vmov.i8 d0, #0x1
vldr d1, [r0]
vtst.8 d0, d1, d0
vext.8 d1, d0, d0, #4
vorr d0, d0, d1
vext.8 d1, d0, d0, #2
vorr d0, d0, d1
vdup.8 d1, d0[1]
vorr d0, d0, d1
vmov.u8 r0, d0[0]
and r0, r0, #1
bx lr
```
After this PR:
```asm
all_8x8:
vldr d0, [r0]
b <m8x8 as All>::all
<m8x8 as All>::all:
vpmin.u8 d16, d0, d16
vpmin.u8 d16, d16, d16
vpmin.u8 d0, d16, d16
b m8x8::extract
any_8x8:
vldr d0, [r0]
b <m8x8 as Any>::any
<m8x8 as Any>::any:
vpmax.u8 d16, d0, d16
vpmax.u8 d16, d16, d16
vpmax.u8 d0, d16, d16
b m8x8::extract
```
(note: inlining does not work properly on ARMv7)
128-bit wide mask types (`m8x16`, `m16x8`, `m32x4`, `m64x2`)
`x86_64` with SSE2 enabled
Before this PR:
```asm
all_8x16:
push rbp
mov rbp, rsp
movdqa xmm0, xmmword, ptr, [rip, +, LCPI9_0]
movdqa xmm1, xmmword, ptr, [rdi]
pand xmm1, xmm0
pcmpeqb xmm1, xmm0
pmovmskb eax, xmm1
xor ecx, ecx
cmp eax, 65535
mov eax, -1
cmovne eax, ecx
and al, 1
pop rbp
ret
any_8x16:
push rbp
mov rbp, rsp
movdqa xmm0, xmmword, ptr, [rip, +, LCPI8_0]
movdqa xmm1, xmmword, ptr, [rdi]
pand xmm1, xmm0
pcmpeqb xmm1, xmm0
pmovmskb eax, xmm1
neg eax
sbb eax, eax
and al, 1
pop rbp
ret
```
After this PR:
```asm
all_8x16:
push rbp
mov rbp, rsp
movdqa xmm0, xmmword, ptr, [rdi]
pmovmskb eax, xmm0
cmp eax, 65535
sete al
pop rbp
ret
any_8x16:
push rbp
mov rbp, rsp
movdqa xmm0, xmmword, ptr, [rdi]
pmovmskb eax, xmm0
test eax, eax
setne al
pop rbp
ret
```
`aarch64`
Before this PR:
```asm
all_8x16:
ldr q0, [x0]
umov w8, v0.b[0]
umov w9, v0.b[1]
tst w8, #0xff
umov w10, v0.b[2]
cset w8, ne
tst w9, #0xff
cset w9, ne
tst w10, #0xff
umov w10, v0.b[3]
and w8, w8, w9
cset w9, ne
tst w10, #0xff
umov w10, v0.b[4]
and w8, w9, w8
cset w9, ne
tst w10, #0xff
umov w10, v0.b[5]
and w8, w9, w8
cset w9, ne
tst w10, #0xff
umov w10, v0.b[6]
and w8, w9, w8
cset w9, ne
tst w10, #0xff
umov w10, v0.b[7]
and w8, w9, w8
cset w9, ne
tst w10, #0xff
umov w10, v0.b[8]
and w8, w9, w8
cset w9, ne
tst w10, #0xff
umov w10, v0.b[9]
and w8, w9, w8
cset w9, ne
tst w10, #0xff
umov w10, v0.b[10]
and w8, w9, w8
cset w9, ne
tst w10, #0xff
umov w10, v0.b[11]
and w8, w9, w8
cset w9, ne
tst w10, #0xff
umov w10, v0.b[12]
and w8, w9, w8
cset w9, ne
tst w10, #0xff
umov w10, v0.b[13]
and w8, w9, w8
cset w9, ne
tst w10, #0xff
umov w10, v0.b[14]
and w8, w9, w8
cset w9, ne
tst w10, #0xff
umov w10, v0.b[15]
and w8, w9, w8
cset w9, ne
tst w10, #0xff
and w8, w9, w8
cset w9, ne
and w0, w9, w8
ret
any_8x16:
ldr q0, [x0]
umov w8, v0.b[0]
umov w9, v0.b[1]
orr w8, w8, w9
umov w9, v0.b[2]
orr w8, w8, w9
umov w9, v0.b[3]
orr w8, w8, w9
umov w9, v0.b[4]
orr w8, w8, w9
umov w9, v0.b[5]
orr w8, w8, w9
umov w9, v0.b[6]
orr w8, w8, w9
umov w9, v0.b[7]
orr w8, w8, w9
umov w9, v0.b[8]
orr w8, w8, w9
umov w9, v0.b[9]
orr w8, w8, w9
umov w9, v0.b[10]
orr w8, w8, w9
umov w9, v0.b[11]
orr w8, w8, w9
umov w9, v0.b[12]
orr w8, w8, w9
umov w9, v0.b[13]
orr w8, w8, w9
umov w9, v0.b[14]
orr w8, w8, w9
umov w9, v0.b[15]
orr w8, w8, w9
tst w8, #0xff
cset w0, ne
ret
```
After this PR:
```asm
all_8x16:
ldr q0, [x0]
uminv b0, v0.16b
fmov w8, s0
tst w8, #0xff
cset w0, ne
ret
any_8x16:
ldr q0, [x0]
umaxv b0, v0.16b
fmov w8, s0
tst w8, #0xff
cset w0, ne
ret
```
`ARMv7` + `neon`
Before this PR:
```asm
all_8x16:
vmov.i8 q0, #0x1
vld1.64 {d2, d3}, [r0]
vtst.8 q0, q1, q0
vext.8 q1, q0, q0, #8
vand q0, q0, q1
vext.8 q1, q0, q0, #4
vand q0, q0, q1
vext.8 q1, q0, q0, #2
vand q0, q0, q1
vdup.8 q1, d0[1]
vand q0, q0, q1
vmov.u8 r0, d0[0]
and r0, r0, #1
bx lr
any_8x16:
vmov.i8 q0, #0x1
vld1.64 {d2, d3}, [r0]
vtst.8 q0, q1, q0
vext.8 q1, q0, q0, #8
vorr q0, q0, q1
vext.8 q1, q0, q0, #4
vorr q0, q0, q1
vext.8 q1, q0, q0, #2
vorr q0, q0, q1
vdup.8 q1, d0[1]
vorr q0, q0, q1
vmov.u8 r0, d0[0]
and r0, r0, #1
bx lr
```
After this PR:
```asm
all_8x16:
vld1.64 {d0, d1}, [r0]
b <m8x16 as All>::all
<m8x16 as All>::all:
vpmin.u8 d0, d0, d1
b <m8x8 as All>::all
any_8x16:
vld1.64 {d0, d1}, [r0]
b <m8x16 as Any>::any
<m8x16 as Any>::any:
vpmax.u8 d0, d0, d1
b <m8x8 as Any>::any
```
The inlining problems are pretty bad on ARMv7 + NEON.
256-bit wide mask types (`m8x32`, `m16x16`, `m32x8`, `m64x4`)
With SSE2 enabled
Before this PR:
```asm
all_8x32:
push rbp
mov rbp, rsp
movdqa xmm0, xmmword, ptr, [rip, +, LCPI17_0]
movdqa xmm1, xmmword, ptr, [rdi]
pand xmm1, xmm0
movdqa xmm2, xmmword, ptr, [rdi, +, 16]
pand xmm2, xmm0
pcmpeqb xmm2, xmm0
pcmpeqb xmm1, xmm0
pand xmm1, xmm2
pmovmskb eax, xmm1
xor ecx, ecx
cmp eax, 65535
mov eax, -1
cmovne eax, ecx
and al, 1
pop rbp
ret
any_8x32:
push rbp
mov rbp, rsp
movdqa xmm0, xmmword, ptr, [rdi]
por xmm0, xmmword, ptr, [rdi, +, 16]
movdqa xmm1, xmmword, ptr, [rip, +, LCPI16_0]
pand xmm0, xmm1
pcmpeqb xmm0, xmm1
pmovmskb eax, xmm0
neg eax
sbb eax, eax
and al, 1
pop rbp
ret
```
After this PR:
```asm
all_8x32:
push rbp
mov rbp, rsp
movdqa xmm0, xmmword, ptr, [rdi]
pmovmskb eax, xmm0
cmp eax, 65535
jne LBB17_1
movdqa xmm0, xmmword, ptr, [rdi, +, 16]
pmovmskb ecx, xmm0
mov al, 1
cmp ecx, 65535
je LBB17_3
LBB17_1:
xor eax, eax
LBB17_3:
pop rbp
ret
any_8x32:
push rbp
mov rbp, rsp
movdqa xmm0, xmmword, ptr, [rdi]
pmovmskb ecx, xmm0
mov al, 1
test ecx, ecx
je LBB16_1
pop rbp
ret
LBB16_1:
movdqa xmm0, xmmword, ptr, [rdi, +, 16]
pmovmskb eax, xmm0
test eax, eax
setne al
pop rbp
ret
```
With AVX enabled
Before this PR:
```asm
all_8x32:
push rbp
mov rbp, rsp
vmovaps ymm0, ymmword, ptr, [rdi]
vandps ymm0, ymm0, ymmword, ptr, [rip, +, LCPI25_0]
vextractf128 xmm1, ymm0, 1
vpxor xmm2, xmm2, xmm2
vpcmpeqb xmm1, xmm1, xmm2
vpcmpeqd xmm3, xmm3, xmm3
vpxor xmm1, xmm1, xmm3
vpcmpeqb xmm0, xmm0, xmm2
vpxor xmm0, xmm0, xmm3
vinsertf128 ymm0, ymm0, xmm1, 1
vandps ymm0, ymm0, ymm1
vpermilps xmm1, xmm0, 78
vandps ymm0, ymm0, ymm1
vpermilps xmm1, xmm0, 229
vandps ymm0, ymm0, ymm1
vpsrld xmm1, xmm0, 16
vandps ymm0, ymm0, ymm1
vpsrlw xmm1, xmm0, 8
vandps ymm0, ymm0, ymm1
vpextrb eax, xmm0, 0
and al, 1
pop rbp
vzeroupper
ret
any_8x32:
push rbp
mov rbp, rsp
vmovaps ymm0, ymmword, ptr, [rdi]
vandps ymm0, ymm0, ymmword, ptr, [rip, +, LCPI24_0]
vextractf128 xmm1, ymm0, 1
vpxor xmm2, xmm2, xmm2
vpcmpeqb xmm1, xmm1, xmm2
vpcmpeqd xmm3, xmm3, xmm3
vpxor xmm1, xmm1, xmm3
vpcmpeqb xmm0, xmm0, xmm2
vpxor xmm0, xmm0, xmm3
vinsertf128 ymm0, ymm0, xmm1, 1
vorps ymm0, ymm0, ymm1
vpermilps xmm1, xmm0, 78
vorps ymm0, ymm0, ymm1
vpermilps xmm1, xmm0, 229
vorps ymm0, ymm0, ymm1
vpsrld xmm1, xmm0, 16
vorps ymm0, ymm0, ymm1
vpsrlw xmm1, xmm0, 8
vorps ymm0, ymm0, ymm1
vpextrb eax, xmm0, 0
and al, 1
pop rbp
vzeroupper
ret
```
After this PR:
```asm
all_8x32:
push rbp
mov rbp, rsp
vmovdqa ymm0, ymmword, ptr, [rdi]
vxorps xmm1, xmm1, xmm1
vcmptrueps ymm1, ymm1, ymm1
vptest ymm0, ymm1
setb al
pop rbp
vzeroupper
ret
any_8x32:
push rbp
mov rbp, rsp
vmovdqa ymm0, ymmword, ptr, [rdi]
vptest ymm0, ymm0
setne al
pop rbp
vzeroupper
ret
```
---
Closes rust-lang#362.
1 parent e7f8d4f, commit 1d4668f (copy full SHA for 1d4668f)
File tree
7 files changed
+660
-72
lines changed- coresimd/ppsv
- api
- codegen
- crates/coresimd
- src
7 files changed
+660
-72
lines changedcoresimd/ppsv/api/masks_reductions.rs
Copy file name to clipboardexpand all lines: coresimd/ppsv/api/masks_reductions.rs+2-24
Original file line number | Diff line number | Diff line change | |
---|---|---|---|
| |||
5 | 5 |
| |
6 | 6 |
| |
7 | 7 |
| |
8 |
| - | |
9 | 8 |
| |
10 | 9 |
| |
11 |
| - | |
12 |
| - | |
| 10 | + | |
13 | 11 |
| |
14 |
| - | |
15 |
| - | |
16 |
| - | |
17 |
| - | |
18 |
| - | |
19 |
| - | |
20 |
| - | |
21 |
| - | |
22 |
| - | |
23 | 12 |
| |
24 |
| - | |
25 | 13 |
| |
26 | 14 |
| |
27 |
| - | |
28 |
| - | |
| 15 | + | |
29 | 16 |
| |
30 |
| - | |
31 |
| - | |
32 |
| - | |
33 |
| - | |
34 |
| - | |
35 |
| - | |
36 |
| - | |
37 |
| - | |
38 |
| - | |
39 | 17 |
| |
40 | 18 |
| |
41 | 19 |
| |
|
0 commit comments