|
| 1 | +From 9790ab8bccdbc71dfcc166860ab6ce9c369bf686 Mon Sep 17 00:00:00 2001 |
| 2 | +From: Simon Pilgrim <llvm-dev@redking.me.uk> |
| 3 | +Date: Sat, 6 Aug 2016 21:21:12 +0000 |
| 4 | +Subject: [PATCH 1/5] [X86][AVX2] Improve sign/zero extension on AVX2 targets |
| 5 | + |
| 6 | +Split extensions to large vectors into 256-bit chunks on AVX2 targets, mirroring the 128-bit splitting already done on pre-AVX2 targets. |
| 7 | + |
| 8 | +git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@277939 91177308-0d34-0410-b5e6-96231b3b80d8 |
| 9 | +--- |
| 10 | + lib/Target/X86/X86ISelLowering.cpp | 22 +++++++++++++++------- |
| 11 | + test/CodeGen/X86/vec_int_to_fp.ll | 24 ++++++++---------------- |
| 12 | + test/CodeGen/X86/vector-sext.ll | 10 ++-------- |
| 13 | + 3 files changed, 25 insertions(+), 31 deletions(-) |
| 14 | + |
| 15 | +diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp |
| 16 | +index ca205335013..2bbedd4bd97 100644 |
| 17 | +--- a/lib/Target/X86/X86ISelLowering.cpp |
| 18 | ++++ b/lib/Target/X86/X86ISelLowering.cpp |
| 19 | +@@ -30164,11 +30164,9 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG, |
| 20 | + : DAG.getZeroExtendVectorInReg(ExOp, DL, VT); |
| 21 | + } |
| 22 | + |
| 23 | +- // On pre-AVX2 targets, split into 128-bit nodes of |
| 24 | +- // ISD::*_EXTEND_VECTOR_INREG. |
| 25 | +- if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128)) { |
| 26 | +- unsigned NumVecs = VT.getSizeInBits() / 128; |
| 27 | +- unsigned NumSubElts = 128 / SVT.getSizeInBits(); |
| 28 | ++ auto SplitAndExtendInReg = [&](unsigned SplitSize) { |
| 29 | ++ unsigned NumVecs = VT.getSizeInBits() / SplitSize; |
| 30 | ++ unsigned NumSubElts = SplitSize / SVT.getSizeInBits(); |
| 31 | + EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts); |
| 32 | + EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts); |
| 33 | + |
| 34 | +@@ -30176,14 +30174,24 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG, |
| 35 | + for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) { |
| 36 | + SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0, |
| 37 | + DAG.getIntPtrConstant(Offset, DL)); |
| 38 | +- SrcVec = ExtendVecSize(DL, SrcVec, 128); |
| 39 | ++ SrcVec = ExtendVecSize(DL, SrcVec, SplitSize); |
| 40 | + SrcVec = Opcode == ISD::SIGN_EXTEND |
| 41 | + ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT) |
| 42 | + : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT); |
| 43 | + Opnds.push_back(SrcVec); |
| 44 | + } |
| 45 | + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds); |
| 46 | +- } |
| 47 | ++ }; |
| 48 | ++ |
| 49 | ++ // On pre-AVX2 targets, split into 128-bit nodes of |
| 50 | ++ // ISD::*_EXTEND_VECTOR_INREG. |
| 51 | ++ if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128)) |
| 52 | ++ return SplitAndExtendInReg(128); |
| 53 | ++ |
| 54 | ++ // On pre-AVX512 targets, split into 256-bit nodes of |
| 55 | ++ // ISD::*_EXTEND_VECTOR_INREG. |
| 56 | ++ if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256)) |
| 57 | ++ return SplitAndExtendInReg(256); |
| 58 | + |
| 59 | + return SDValue(); |
| 60 | + } |
| 61 | +diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll |
| 62 | +index 43f5318a607..5d8f91385c7 100644 |
| 63 | +--- a/test/CodeGen/X86/vec_int_to_fp.ll |
| 64 | ++++ b/test/CodeGen/X86/vec_int_to_fp.ll |
| 65 | +@@ -153,8 +153,7 @@ define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) { |
| 66 | + ; |
| 67 | + ; AVX2-LABEL: sitofp_16i8_to_2f64: |
| 68 | + ; AVX2: # BB#0: |
| 69 | +-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 |
| 70 | +-; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 |
| 71 | ++; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 |
| 72 | + ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 |
| 73 | + ; AVX2-NEXT: # kill |
| 74 | + ; AVX2-NEXT: vzeroupper |
| 75 | +@@ -325,8 +324,7 @@ define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) { |
| 76 | + ; |
| 77 | + ; AVX2-LABEL: sitofp_16i8_to_4f64: |
| 78 | + ; AVX2: # BB#0: |
| 79 | +-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 |
| 80 | +-; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 |
| 81 | ++; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 |
| 82 | + ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 |
| 83 | + ; AVX2-NEXT: retq |
| 84 | + %cvt = sitofp <16 x i8> %a to <16 x double> |
| 85 | +@@ -543,8 +541,7 @@ define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) { |
| 86 | + ; |
| 87 | + ; AVX2-LABEL: uitofp_16i8_to_2f64: |
| 88 | + ; AVX2: # BB#0: |
| 89 | +-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero |
| 90 | +-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero |
| 91 | ++; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero |
| 92 | + ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 |
| 93 | + ; AVX2-NEXT: # kill |
| 94 | + ; AVX2-NEXT: vzeroupper |
| 95 | +@@ -778,8 +775,7 @@ define <4 x double> @uitofp_16i8_to_4f64(<16 x i8> %a) { |
| 96 | + ; |
| 97 | + ; AVX2-LABEL: uitofp_16i8_to_4f64: |
| 98 | + ; AVX2: # BB#0: |
| 99 | +-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero |
| 100 | +-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero |
| 101 | ++; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero |
| 102 | + ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 |
| 103 | + ; AVX2-NEXT: retq |
| 104 | + %cvt = uitofp <16 x i8> %a to <16 x double> |
| 105 | +@@ -958,8 +954,7 @@ define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) { |
| 106 | + ; |
| 107 | + ; AVX2-LABEL: sitofp_16i8_to_4f32: |
| 108 | + ; AVX2: # BB#0: |
| 109 | +-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 |
| 110 | +-; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 |
| 111 | ++; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 |
| 112 | + ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 |
| 113 | + ; AVX2-NEXT: # kill |
| 114 | + ; AVX2-NEXT: vzeroupper |
| 115 | +@@ -1134,8 +1129,7 @@ define <8 x float> @sitofp_16i8_to_8f32(<16 x i8> %a) { |
| 116 | + ; |
| 117 | + ; AVX2-LABEL: sitofp_16i8_to_8f32: |
| 118 | + ; AVX2: # BB#0: |
| 119 | +-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 |
| 120 | +-; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 |
| 121 | ++; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 |
| 122 | + ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 |
| 123 | + ; AVX2-NEXT: retq |
| 124 | + %cvt = sitofp <16 x i8> %a to <16 x float> |
| 125 | +@@ -1456,8 +1450,7 @@ define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) { |
| 126 | + ; |
| 127 | + ; AVX2-LABEL: uitofp_16i8_to_4f32: |
| 128 | + ; AVX2: # BB#0: |
| 129 | +-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero |
| 130 | +-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero |
| 131 | ++; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero |
| 132 | + ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 |
| 133 | + ; AVX2-NEXT: # kill |
| 134 | + ; AVX2-NEXT: vzeroupper |
| 135 | +@@ -1813,8 +1806,7 @@ define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) { |
| 136 | + ; |
| 137 | + ; AVX2-LABEL: uitofp_16i8_to_8f32: |
| 138 | + ; AVX2: # BB#0: |
| 139 | +-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero |
| 140 | +-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero |
| 141 | ++; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero |
| 142 | + ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 |
| 143 | + ; AVX2-NEXT: retq |
| 144 | + %cvt = uitofp <16 x i8> %a to <16 x float> |
| 145 | +diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll |
| 146 | +index 018c5922a43..e29f3e5f91f 100644 |
| 147 | +--- a/test/CodeGen/X86/vector-sext.ll |
| 148 | ++++ b/test/CodeGen/X86/vector-sext.ll |
| 149 | +@@ -407,15 +407,9 @@ define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp |
| 150 | + ; |
| 151 | + ; AVX2-LABEL: sext_16i8_to_8i64: |
| 152 | + ; AVX2: # BB#0: # %entry |
| 153 | +-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero |
| 154 | +-; AVX2-NEXT: vpslld $24, %xmm1, %xmm1 |
| 155 | +-; AVX2-NEXT: vpsrad $24, %xmm1, %xmm1 |
| 156 | +-; AVX2-NEXT: vpmovsxdq %xmm1, %ymm2 |
| 157 | ++; AVX2-NEXT: vpmovsxbq %xmm0, %ymm2 |
| 158 | + ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] |
| 159 | +-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero |
| 160 | +-; AVX2-NEXT: vpslld $24, %xmm0, %xmm0 |
| 161 | +-; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0 |
| 162 | +-; AVX2-NEXT: vpmovsxdq %xmm0, %ymm1 |
| 163 | ++; AVX2-NEXT: vpmovsxbq %xmm0, %ymm1 |
| 164 | + ; AVX2-NEXT: vmovdqa %ymm2, %ymm0 |
| 165 | + ; AVX2-NEXT: retq |
| 166 | + ; |
| 167 | +-- |
| 168 | +2.11.0 |
| 169 | + |
0 commit comments