Skip to content

Commit f59e059

Browse files
yuyichao authored and Keno committed on Jan 16, 2017
Backport LLVM patches to fix X86 partial register stall
Fix #19976
1 parent ee84a3f commit f59e059

6 files changed

+1928
-1
lines changed
 

‎deps/llvm.mk

+6-1
Original file line number | Diff line number | Diff line change
@@ -488,7 +488,7 @@ $(eval $(call LLVM_PATCH,llvm-r282182)) # Remove for 4.0
488488
$(eval $(call LLVM_PATCH,llvm-3.9.0_cygwin)) # R283427, Remove for 4.0
489489
endif
490490
$(eval $(call LLVM_PATCH,llvm-PR22923)) # Remove for 4.0
491-
$(eval $(call LLVM_PATCH,llvm-arm-fix-prel31))
491+
$(eval $(call LLVM_PATCH,llvm-arm-fix-prel31)) # Remove for 4.0
492492
$(eval $(call LLVM_PATCH,llvm-D25865-cmakeshlib)) # Remove for 4.0
493493
# Cygwin and openSUSE still use win32-threads mingw, https://llvm.org/bugs/show_bug.cgi?id=26365
494494
$(eval $(call LLVM_PATCH,llvm-3.9.0_threads))
@@ -505,6 +505,11 @@ $(eval $(call LLVM_PATCH,llvm-D27397)) # Julia issue #19792, Remove for 4.0
505505
$(eval $(call LLVM_PATCH,llvm-D28009)) # Julia issue #19792, Remove for 4.0
506506
$(eval $(call LLVM_PATCH,llvm-D28215_FreeBSD_shlib))
507507
$(eval $(call LLVM_PATCH,llvm-D28221-avx512)) # mentioned in issue #19797
508+
$(eval $(call LLVM_PATCH,llvm-PR276266)) # Issue #19976, Remove for 4.0
509+
$(eval $(call LLVM_PATCH,llvm-PR278088)) # Issue #19976, Remove for 4.0
510+
$(eval $(call LLVM_PATCH,llvm-PR277939)) # Issue #19976, Remove for 4.0
511+
$(eval $(call LLVM_PATCH,llvm-PR278321)) # Issue #19976, Remove for 4.0
512+
$(eval $(call LLVM_PATCH,llvm-PR278923)) # Issue #19976, Remove for 4.0
508513
endif # LLVM_VER
509514

510515
ifeq ($(LLVM_VER),3.7.1)

‎deps/patches/llvm-PR276266.patch

+51
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,51 @@
1+
From 64d1e8b748bca22ce205eab7634cc5418c827f18 Mon Sep 17 00:00:00 2001
2+
From: Marina Yatsina <marina.yatsina@intel.com>
3+
Date: Thu, 21 Jul 2016 12:37:07 +0000
4+
Subject: [PATCH 3/5] ExecutionDepsFix - Fix bug in clearance calculation
5+
6+
The clearance calculation did not take into account registers defined as outputs or clobbers in inline assembly machine instructions because these register defs are implicit.
7+
8+
Differential Revision: http://reviews.llvm.org/D22580
9+
10+
11+
12+
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@276266 91177308-0d34-0410-b5e6-96231b3b80d8
13+
---
14+
lib/CodeGen/ExecutionDepsFix.cpp | 2 --
15+
test/CodeGen/X86/break-false-dep.ll | 10 ++++++++++
16+
2 files changed, 10 insertions(+), 2 deletions(-)
17+
18+
diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp
19+
index 566b8d507b2..1fe5f459b69 100644
20+
--- a/lib/CodeGen/ExecutionDepsFix.cpp
21+
+++ b/lib/CodeGen/ExecutionDepsFix.cpp
22+
@@ -520,8 +520,6 @@ void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) {
23+
MachineOperand &MO = MI->getOperand(i);
24+
if (!MO.isReg())
25+
continue;
26+
- if (MO.isImplicit())
27+
- break;
28+
if (MO.isUse())
29+
continue;
30+
for (int rx : regIndices(MO.getReg())) {
31+
diff --git a/test/CodeGen/X86/break-false-dep.ll b/test/CodeGen/X86/break-false-dep.ll
32+
index 74a0728f918..a7cda499dab 100644
33+
--- a/test/CodeGen/X86/break-false-dep.ll
34+
+++ b/test/CodeGen/X86/break-false-dep.ll
35+
@@ -199,3 +199,13 @@ for.end16: ; preds = %for.inc14
36+
;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]]
37+
;AVX-NEXT: vmovsd [[XMM0]],
38+
}
39+
+
40+
+define double @inlineasmdep(i64 %arg) {
41+
+top:
42+
+ tail call void asm sideeffect "", "~{xmm0},~{dirflag},~{fpsr},~{flags}"()
43+
+ %tmp1 = sitofp i64 %arg to double
44+
+ ret double %tmp1
45+
+;AVX-LABEL:@inlineasmdep
46+
+;AVX: vxorps [[XMM0:%xmm[0-9]+]], [[XMM0]], [[XMM0]]
47+
+;AVX-NEXT: vcvtsi2sdq {{.*}}, [[XMM0]], {{%xmm[0-9]+}}
48+
+}
49+
--
50+
2.11.0
51+

‎deps/patches/llvm-PR277939.patch

+169
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,169 @@
1+
From 9790ab8bccdbc71dfcc166860ab6ce9c369bf686 Mon Sep 17 00:00:00 2001
2+
From: Simon Pilgrim <llvm-dev@redking.me.uk>
3+
Date: Sat, 6 Aug 2016 21:21:12 +0000
4+
Subject: [PATCH 1/5] [X86][AVX2] Improve sign/zero extension on AVX2 targets
5+
6+
Split extensions to large vectors into 256-bit chunks - the equivalent of what we do with pre-AVX2 into 128-bit chunks
7+
8+
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@277939 91177308-0d34-0410-b5e6-96231b3b80d8
9+
---
10+
lib/Target/X86/X86ISelLowering.cpp | 22 +++++++++++++++-------
11+
test/CodeGen/X86/vec_int_to_fp.ll | 24 ++++++++----------------
12+
test/CodeGen/X86/vector-sext.ll | 10 ++--------
13+
3 files changed, 25 insertions(+), 31 deletions(-)
14+
15+
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
16+
index ca205335013..2bbedd4bd97 100644
17+
--- a/lib/Target/X86/X86ISelLowering.cpp
18+
+++ b/lib/Target/X86/X86ISelLowering.cpp
19+
@@ -30164,11 +30164,9 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
20+
: DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
21+
}
22+
23+
- // On pre-AVX2 targets, split into 128-bit nodes of
24+
- // ISD::*_EXTEND_VECTOR_INREG.
25+
- if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128)) {
26+
- unsigned NumVecs = VT.getSizeInBits() / 128;
27+
- unsigned NumSubElts = 128 / SVT.getSizeInBits();
28+
+ auto SplitAndExtendInReg = [&](unsigned SplitSize) {
29+
+ unsigned NumVecs = VT.getSizeInBits() / SplitSize;
30+
+ unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
31+
EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
32+
EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
33+
34+
@@ -30176,14 +30174,24 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
35+
for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
36+
SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
37+
DAG.getIntPtrConstant(Offset, DL));
38+
- SrcVec = ExtendVecSize(DL, SrcVec, 128);
39+
+ SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
40+
SrcVec = Opcode == ISD::SIGN_EXTEND
41+
? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
42+
: DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
43+
Opnds.push_back(SrcVec);
44+
}
45+
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
46+
- }
47+
+ };
48+
+
49+
+ // On pre-AVX2 targets, split into 128-bit nodes of
50+
+ // ISD::*_EXTEND_VECTOR_INREG.
51+
+ if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
52+
+ return SplitAndExtendInReg(128);
53+
+
54+
+ // On pre-AVX512 targets, split into 256-bit nodes of
55+
+ // ISD::*_EXTEND_VECTOR_INREG.
56+
+ if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
57+
+ return SplitAndExtendInReg(256);
58+
59+
return SDValue();
60+
}
61+
diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll
62+
index 43f5318a607..5d8f91385c7 100644
63+
--- a/test/CodeGen/X86/vec_int_to_fp.ll
64+
+++ b/test/CodeGen/X86/vec_int_to_fp.ll
65+
@@ -153,8 +153,7 @@ define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) {
66+
;
67+
; AVX2-LABEL: sitofp_16i8_to_2f64:
68+
; AVX2: # BB#0:
69+
-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
70+
-; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
71+
+; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
72+
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
73+
; AVX2-NEXT: # kill
74+
; AVX2-NEXT: vzeroupper
75+
@@ -325,8 +324,7 @@ define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) {
76+
;
77+
; AVX2-LABEL: sitofp_16i8_to_4f64:
78+
; AVX2: # BB#0:
79+
-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
80+
-; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
81+
+; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
82+
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
83+
; AVX2-NEXT: retq
84+
%cvt = sitofp <16 x i8> %a to <16 x double>
85+
@@ -543,8 +541,7 @@ define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) {
86+
;
87+
; AVX2-LABEL: uitofp_16i8_to_2f64:
88+
; AVX2: # BB#0:
89+
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
90+
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
91+
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
92+
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
93+
; AVX2-NEXT: # kill
94+
; AVX2-NEXT: vzeroupper
95+
@@ -778,8 +775,7 @@ define <4 x double> @uitofp_16i8_to_4f64(<16 x i8> %a) {
96+
;
97+
; AVX2-LABEL: uitofp_16i8_to_4f64:
98+
; AVX2: # BB#0:
99+
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
100+
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
101+
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
102+
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
103+
; AVX2-NEXT: retq
104+
%cvt = uitofp <16 x i8> %a to <16 x double>
105+
@@ -958,8 +954,7 @@ define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) {
106+
;
107+
; AVX2-LABEL: sitofp_16i8_to_4f32:
108+
; AVX2: # BB#0:
109+
-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
110+
-; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
111+
+; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
112+
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
113+
; AVX2-NEXT: # kill
114+
; AVX2-NEXT: vzeroupper
115+
@@ -1134,8 +1129,7 @@ define <8 x float> @sitofp_16i8_to_8f32(<16 x i8> %a) {
116+
;
117+
; AVX2-LABEL: sitofp_16i8_to_8f32:
118+
; AVX2: # BB#0:
119+
-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
120+
-; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
121+
+; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
122+
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
123+
; AVX2-NEXT: retq
124+
%cvt = sitofp <16 x i8> %a to <16 x float>
125+
@@ -1456,8 +1450,7 @@ define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) {
126+
;
127+
; AVX2-LABEL: uitofp_16i8_to_4f32:
128+
; AVX2: # BB#0:
129+
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
130+
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
131+
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
132+
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
133+
; AVX2-NEXT: # kill
134+
; AVX2-NEXT: vzeroupper
135+
@@ -1813,8 +1806,7 @@ define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) {
136+
;
137+
; AVX2-LABEL: uitofp_16i8_to_8f32:
138+
; AVX2: # BB#0:
139+
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
140+
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
141+
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
142+
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
143+
; AVX2-NEXT: retq
144+
%cvt = uitofp <16 x i8> %a to <16 x float>
145+
diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll
146+
index 018c5922a43..e29f3e5f91f 100644
147+
--- a/test/CodeGen/X86/vector-sext.ll
148+
+++ b/test/CodeGen/X86/vector-sext.ll
149+
@@ -407,15 +407,9 @@ define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp
150+
;
151+
; AVX2-LABEL: sext_16i8_to_8i64:
152+
; AVX2: # BB#0: # %entry
153+
-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
154+
-; AVX2-NEXT: vpslld $24, %xmm1, %xmm1
155+
-; AVX2-NEXT: vpsrad $24, %xmm1, %xmm1
156+
-; AVX2-NEXT: vpmovsxdq %xmm1, %ymm2
157+
+; AVX2-NEXT: vpmovsxbq %xmm0, %ymm2
158+
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
159+
-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
160+
-; AVX2-NEXT: vpslld $24, %xmm0, %xmm0
161+
-; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
162+
-; AVX2-NEXT: vpmovsxdq %xmm0, %ymm1
163+
+; AVX2-NEXT: vpmovsxbq %xmm0, %ymm1
164+
; AVX2-NEXT: vmovdqa %ymm2, %ymm0
165+
; AVX2-NEXT: retq
166+
;
167+
--
168+
2.11.0
169+

0 commit comments

Comments
 (0)
Please sign in to comment.