diff --git a/deps/llvm.mk b/deps/llvm.mk index 92e6a5e2f5236..d878d6d470418 100644 --- a/deps/llvm.mk +++ b/deps/llvm.mk @@ -488,7 +488,7 @@ $(eval $(call LLVM_PATCH,llvm-r282182)) # Remove for 4.0 $(eval $(call LLVM_PATCH,llvm-3.9.0_cygwin)) # R283427, Remove for 4.0 endif $(eval $(call LLVM_PATCH,llvm-PR22923)) # Remove for 4.0 -$(eval $(call LLVM_PATCH,llvm-arm-fix-prel31)) +$(eval $(call LLVM_PATCH,llvm-arm-fix-prel31)) # Remove for 4.0 $(eval $(call LLVM_PATCH,llvm-D25865-cmakeshlib)) # Remove for 4.0 # Cygwin and openSUSE still use win32-threads mingw, https://llvm.org/bugs/show_bug.cgi?id=26365 $(eval $(call LLVM_PATCH,llvm-3.9.0_threads)) @@ -505,6 +505,13 @@ $(eval $(call LLVM_PATCH,llvm-D27397)) # Julia issue #19792, Remove for 4.0 $(eval $(call LLVM_PATCH,llvm-D28009)) # Julia issue #19792, Remove for 4.0 $(eval $(call LLVM_PATCH,llvm-D28215_FreeBSD_shlib)) $(eval $(call LLVM_PATCH,llvm-D28221-avx512)) # mentioned in issue #19797 +$(eval $(call LLVM_PATCH,llvm-PR276266)) # Issue #19976, Remove for 4.0 +$(eval $(call LLVM_PATCH,llvm-PR278088)) # Issue #19976, Remove for 4.0 +$(eval $(call LLVM_PATCH,llvm-PR277939)) # Issue #19976, Remove for 4.0 +$(eval $(call LLVM_PATCH,llvm-PR278321)) # Issue #19976, Remove for 4.0 +$(eval $(call LLVM_PATCH,llvm-PR278923)) # Issue #19976, Remove for 4.0 +$(eval $(call LLVM_PATCH,llvm-D28759-loopclearance)) +$(eval $(call LLVM_PATCH,llvm-D28786-callclearance)) $(eval $(call LLVM_PATCH,llvm-rL293230-icc17-cmake)) # Remove for 4.0 endif # LLVM_VER diff --git a/deps/patches/llvm-D28759-loopclearance.patch b/deps/patches/llvm-D28759-loopclearance.patch new file mode 100644 index 0000000000000..036ca4d83103c --- /dev/null +++ b/deps/patches/llvm-D28759-loopclearance.patch @@ -0,0 +1,480 @@ +From e3621af0115a851d0ed02f0b436deec62ec3e99c Mon Sep 17 00:00:00 2001 +From: Keno Fischer +Date: Sun, 15 Jan 2017 23:59:07 -0500 +Subject: [PATCH] [ExecutionDepsFix] Improve clearance calculation for loops + +In revision rL278321, ExecutionDepsFix learned how to pick a better +register for undef register reads, e.g. for instructions such as +`vcvtsi2sdq`. While this revision improved performance on a good number +of our benchmarks, it unfortunately also caused significant regressions +(up to 3x) on others. This regression turned out to be caused by loops +such as: + +PH -> A -> B (xmm -> xmm) -> C -> D -> EXIT + ^ | + +----------------------------------+ + +In the previous version of the clearance calculation, we would visit +the blocks in order, remembering for each whether there were any +incoming backedges from blocks that we hadn't processed yet and if +so queuing up the block to be re-processed. However, for loop structures +such as the above, this is clearly insufficient, since the block B +does not have any unknown backedges, so we do not see the false +dependency from the previous interation's Def of xmm registers in B. + +To fix this, we need to consider all blocks that are part of the loop +and reprocess them one the correct clearance values are known. As +an optimization, we also want to avoid reprocessing any later blocks +that are not part of the loop. + +In summary, the iteration order is as follows: +Before: PH A B C D A' +Corrected (Naive): PH A B C D A' B' C' D' +Corrected (w/ optimization): PH A B C A' B' C' D + +To facilitate this optimization we introduce two new counters for each +basic block. The first counts how many of it's predecssors have +completed primary processing. 
The second counts how many of its +predecessors have completed all processing (we will call such a block +*done*. Now, the criteria to reprocess a block is as follows: + - All Predecessors have completed primary processing + - For x the number of predecessors that have completed primary + processing *at the time of primary processing of this block*, + the number of predecessors that are done has reached x. + +The intuition behind this criterion is as follows: +We need to perform primary processing on all predecessors in order to +find out any direct defs in those predecessors. When predecessors are +done, we also know that we have information about indirect defs (e.g. +in block B though that were inherited through B->C->A->B). However, +we can't wait for all predecessors to be done, since that would +cause cyclic dependencies. However, it is guaranteed that all those +predecessors that are prior to us in reverse postorder will be done +before us. Since we iterate of the basic blocks in reverse postorder, +the number x above, is precisely the count of the number of predecessors +prior to us in reverse postorder. +--- + lib/CodeGen/ExecutionDepsFix.cpp | 223 ++++++++++++++++++++++-------------- + test/CodeGen/X86/break-false-dep.ll | 57 +++++++++ + 2 files changed, 197 insertions(+), 83 deletions(-) + +diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp +index e7c6b03..6ac1db4 100644 +--- a/lib/CodeGen/ExecutionDepsFix.cpp ++++ b/lib/CodeGen/ExecutionDepsFix.cpp +@@ -142,8 +142,26 @@ class ExeDepsFix : public MachineFunctionPass { + std::vector> AliasMap; + const unsigned NumRegs; + LiveReg *LiveRegs; +- typedef DenseMap LiveOutMap; +- LiveOutMap LiveOuts; ++ struct MBBInfo { ++ // Keeps clearance and domain information for all registers. Not that this ++ // is different from the usual definition notion of liveness. The CPU ++ // doesn't care whether or not we consider a register killed. ++ LiveReg *OutRegs; ++ ++ // Whether we have gotten to this block in primary processing yet. ++ bool PrimaryCompleted; ++ ++ // The number of predecessors for which primary processing has completed ++ unsigned IncomingProcessed; ++ ++ // The value of `IncomingProcessed` at the start of primary processing ++ unsigned PrimaryIncoming; ++ ++ // The number of predecessors for which all processing steps are done. ++ unsigned IncomingCompleted; ++ }; ++ typedef DenseMap MBBInfoMap; ++ MBBInfoMap MBBInfos; + + /// List of undefined register reads in this block in forward order. + std::vector > UndefReads; +@@ -154,11 +172,6 @@ class ExeDepsFix : public MachineFunctionPass { + /// Current instruction number. + /// The first instruction in each basic block is 0. + int CurInstr; +- +- /// True when the current block has a predecessor that hasn't been visited +- /// yet. +- bool SeenUnknownBackEdge; +- + public: + ExeDepsFix(const TargetRegisterClass *rc) + : MachineFunctionPass(ID), RC(rc), NumRegs(RC->getNumRegs()) {} +@@ -180,7 +193,6 @@ public: + private: + iterator_range::const_iterator> + regIndices(unsigned Reg) const; +- + // DomainValue allocation. 
+ DomainValue *alloc(int domain = -1); + DomainValue *retain(DomainValue *DV) { +@@ -199,8 +211,11 @@ private: + + void enterBasicBlock(MachineBasicBlock*); + void leaveBasicBlock(MachineBasicBlock*); +- void visitInstr(MachineInstr*); +- void processDefs(MachineInstr*, bool Kill); ++ bool isBlockDone(MachineBasicBlock *); ++ void processBasicBlock(MachineBasicBlock *MBB, bool PrimaryPass, bool Done); ++ void updateSuccessors(MachineBasicBlock *MBB, bool Primary, bool Done); ++ bool visitInstr(MachineInstr *); ++ void processDefs(MachineInstr *, bool BlockDone, bool Kill); + void visitSoftInstr(MachineInstr*, unsigned mask); + void visitHardInstr(MachineInstr*, unsigned domain); + void pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, +@@ -360,9 +375,6 @@ bool ExeDepsFix::merge(DomainValue *A, DomainValue *B) { + + /// Set up LiveRegs by merging predecessor live-out values. + void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) { +- // Detect back-edges from predecessors we haven't processed yet. +- SeenUnknownBackEdge = false; +- + // Reset instruction counter in each basic block. + CurInstr = 0; + +@@ -397,18 +409,18 @@ void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) { + // Try to coalesce live-out registers from predecessors. + for (MachineBasicBlock::const_pred_iterator pi = MBB->pred_begin(), + pe = MBB->pred_end(); pi != pe; ++pi) { +- LiveOutMap::const_iterator fi = LiveOuts.find(*pi); +- if (fi == LiveOuts.end()) { +- SeenUnknownBackEdge = true; ++ auto fi = MBBInfos.find(*pi); ++ assert(fi != MBBInfos.end()); ++ LiveReg *Incoming = fi->second.OutRegs; ++ if (Incoming == nullptr) { + continue; + } +- assert(fi->second && "Can't have NULL entries"); + + for (unsigned rx = 0; rx != NumRegs; ++rx) { + // Use the most recent predecessor def for each register. +- LiveRegs[rx].Def = std::max(LiveRegs[rx].Def, fi->second[rx].Def); ++ LiveRegs[rx].Def = std::max(LiveRegs[rx].Def, Incoming[rx].Def); + +- DomainValue *pdv = resolve(fi->second[rx].Value); ++ DomainValue *pdv = resolve(Incoming[rx].Value); + if (!pdv) + continue; + if (!LiveRegs[rx].Value) { +@@ -432,35 +444,33 @@ void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) { + force(rx, pdv->getFirstDomain()); + } + } +- DEBUG(dbgs() << "BB#" << MBB->getNumber() +- << (SeenUnknownBackEdge ? ": incomplete\n" : ": all preds known\n")); ++ DEBUG( ++ dbgs() << "BB#" << MBB->getNumber() ++ << (!isBlockDone(MBB) ? ": incomplete\n" : ": all preds known\n")); + } + + void ExeDepsFix::leaveBasicBlock(MachineBasicBlock *MBB) { + assert(LiveRegs && "Must enter basic block first."); ++ LiveReg *OldOutRegs = MBBInfos[MBB].OutRegs; + // Save live registers at end of MBB - used by enterBasicBlock(). + // Also use LiveOuts as a visited set to detect back-edges. +- bool First = LiveOuts.insert(std::make_pair(MBB, LiveRegs)).second; +- +- if (First) { +- // LiveRegs was inserted in LiveOuts. Adjust all defs to be relative to +- // the end of this block instead of the beginning. +- for (unsigned i = 0, e = NumRegs; i != e; ++i) +- LiveRegs[i].Def -= CurInstr; +- } else { +- // Insertion failed, this must be the second pass. ++ MBBInfos[MBB].OutRegs = LiveRegs; ++ ++ // LiveRegs was inserted in LiveOuts. Adjust all defs to be relative to ++ // the end of this block instead of the beginning. ++ for (unsigned i = 0, e = NumRegs; i != e; ++i) ++ LiveRegs[i].Def -= CurInstr; ++ if (OldOutRegs) { ++ // This must be the second pass. + // Release all the DomainValues instead of keeping them. 
+ for (unsigned i = 0, e = NumRegs; i != e; ++i) +- release(LiveRegs[i].Value); +- delete[] LiveRegs; ++ release(OldOutRegs[i].Value); ++ delete[] OldOutRegs; + } + LiveRegs = nullptr; + } + +-void ExeDepsFix::visitInstr(MachineInstr *MI) { +- if (MI->isDebugValue()) +- return; +- ++bool ExeDepsFix::visitInstr(MachineInstr *MI) { + // Update instructions with explicit execution domains. + std::pair DomP = TII->getExecutionDomain(*MI); + if (DomP.first) { +@@ -470,9 +480,7 @@ void ExeDepsFix::visitInstr(MachineInstr *MI) { + visitHardInstr(MI, DomP.first); + } + +- // Process defs to track register ages, and kill values clobbered by generic +- // instructions. +- processDefs(MI, !DomP.first); ++ return !DomP.first; + } + + /// \brief Helps avoid false dependencies on undef registers by updating the +@@ -542,14 +550,7 @@ bool ExeDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx, + DEBUG(dbgs() << ": Break dependency.\n"); + continue; + } +- // The current clearance seems OK, but we may be ignoring a def from a +- // back-edge. +- if (!SeenUnknownBackEdge || Pref <= unsigned(CurInstr)) { +- DEBUG(dbgs() << ": OK .\n"); +- return false; +- } +- // A def from an unprocessed back-edge may make us break this dependency. +- DEBUG(dbgs() << ": Wait for back-edge to resolve.\n"); ++ DEBUG(dbgs() << ": OK .\n"); + return false; + } + return true; +@@ -559,16 +560,21 @@ bool ExeDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx, + // If Kill is set, also kill off DomainValues clobbered by the defs. + // + // Also break dependencies on partial defs and undef uses. +-void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) { ++void ExeDepsFix::processDefs(MachineInstr *MI, bool BlockDone, bool Kill) { + assert(!MI->isDebugValue() && "Won't process debug values"); + + // Break dependence on undef uses. Do this before updating LiveRegs below. + unsigned OpNum; +- unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI); +- if (Pref) { +- pickBestRegisterForUndef(MI, OpNum, Pref); +- if (shouldBreakDependence(MI, OpNum, Pref)) +- UndefReads.push_back(std::make_pair(MI, OpNum)); ++ // If this block is not done, it makes little sense to make any decisions ++ // based on clearance information. We need to make a second pass anyway, ++ // and by then we'll have better information, so we can avoid this work now. ++ if (BlockDone) { ++ unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI); ++ if (Pref) { ++ pickBestRegisterForUndef(MI, OpNum, Pref); ++ if (shouldBreakDependence(MI, OpNum, Pref)) ++ UndefReads.push_back(std::make_pair(MI, OpNum)); ++ } + } + const MCInstrDesc &MCID = MI->getDesc(); + for (unsigned i = 0, +@@ -584,11 +590,13 @@ void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) { + DEBUG(dbgs() << TRI->getName(RC->getRegister(rx)) << ":\t" << CurInstr + << '\t' << *MI); + +- // Check clearance before partial register updates. +- // Call breakDependence before setting LiveRegs[rx].Def. +- unsigned Pref = TII->getPartialRegUpdateClearance(*MI, i, TRI); +- if (Pref && shouldBreakDependence(MI, i, Pref)) +- TII->breakPartialRegDependency(*MI, i, TRI); ++ if (BlockDone) { ++ // Check clearance before partial register updates. ++ // Call breakDependence before setting LiveRegs[rx].Def. ++ unsigned Pref = TII->getPartialRegUpdateClearance(*MI, i, TRI); ++ if (Pref && shouldBreakDependence(MI, i, Pref)) ++ TII->breakPartialRegDependency(*MI, i, TRI); ++ } + + // How many instructions since rx was last written? 
+ LiveRegs[rx].Def = CurInstr; +@@ -780,6 +788,45 @@ void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) { + } + } + ++void ExeDepsFix::processBasicBlock(MachineBasicBlock *MBB, bool PrimaryPass, ++ bool Done) { ++ enterBasicBlock(MBB); ++ for (MachineInstr &MI : *MBB) { ++ if (!MI.isDebugValue()) { ++ bool Kill = false; ++ if (PrimaryPass) ++ Kill = visitInstr(&MI); ++ processDefs(&MI, isBlockDone(MBB), Kill); ++ } ++ } ++ processUndefReads(MBB); ++ leaveBasicBlock(MBB); ++} ++ ++bool ExeDepsFix::isBlockDone(MachineBasicBlock *MBB) { ++ return MBBInfos[MBB].PrimaryCompleted && ++ MBBInfos[MBB].IncomingCompleted == MBBInfos[MBB].PrimaryIncoming && ++ MBBInfos[MBB].IncomingProcessed == MBB->pred_size(); ++} ++ ++void ExeDepsFix::updateSuccessors(MachineBasicBlock *MBB, bool Primary, ++ bool Done) { ++ for (auto *Succ : MBB->successors()) { ++ if (!isBlockDone(Succ)) { ++ if (Primary) { ++ MBBInfos[Succ].IncomingProcessed++; ++ } ++ if (Done) { ++ MBBInfos[Succ].IncomingCompleted++; ++ } ++ if (isBlockDone(Succ)) { ++ processBasicBlock(Succ, false, true); ++ updateSuccessors(Succ, false, true); ++ } ++ } ++ } ++} ++ + bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) { + if (skipFunction(*mf.getFunction())) + return false; +@@ -816,44 +863,54 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) { + AliasMap[*AI].push_back(i); + } + ++ // Initialize the MMBInfos ++ for (auto &MBB : mf) { ++ MBBInfo InitialInfo{nullptr, false, 0, 0, 0}; ++ MBBInfos.insert(std::make_pair(&MBB, InitialInfo)); ++ } ++ + MachineBasicBlock *Entry = &*MF->begin(); + ReversePostOrderTraversal RPOT(Entry); +- SmallVector Loops; + for (ReversePostOrderTraversal::rpo_iterator + MBBI = RPOT.begin(), MBBE = RPOT.end(); MBBI != MBBE; ++MBBI) { + MachineBasicBlock *MBB = *MBBI; +- enterBasicBlock(MBB); +- if (SeenUnknownBackEdge) +- Loops.push_back(MBB); +- for (MachineInstr &MI : *MBB) +- visitInstr(&MI); +- processUndefReads(MBB); +- leaveBasicBlock(MBB); +- } +- +- // Visit all the loop blocks again in order to merge DomainValues from +- // back-edges. +- for (MachineBasicBlock *MBB : Loops) { +- enterBasicBlock(MBB); +- for (MachineInstr &MI : *MBB) +- if (!MI.isDebugValue()) +- processDefs(&MI, false); +- processUndefReads(MBB); +- leaveBasicBlock(MBB); ++ MBBInfos[MBB].PrimaryCompleted = true; ++ MBBInfos[MBB].PrimaryIncoming = MBBInfos[MBB].IncomingProcessed; ++ bool PrimaryDone = isBlockDone(MBB); ++ processBasicBlock(MBB, true, PrimaryDone); ++ updateSuccessors(MBB, true, PrimaryDone); ++ } ++ ++ // We need to go through again and finalize any blocks that are not done yet. ++ // This is possible if blocks have dead predecessors, so we didn't visit them ++ // above. N.B.: The reason we update succesors immidately above, rather than ++ // doing everything in one go here, is to avoid having to do two passes on ++ // basic block between loops (with the scheme above, the whole loop will be ++ // completed before moving on to the blocks after it). ++ for (ReversePostOrderTraversal::rpo_iterator ++ MBBI = RPOT.begin(), ++ MBBE = RPOT.end(); ++ MBBI != MBBE; ++MBBI) { ++ MachineBasicBlock *MBB = *MBBI; ++ if (!isBlockDone(MBB)) { ++ processBasicBlock(MBB, false, true); ++ // Don't update successors here. We'll get to them anyway through this ++ // loop. ++ } + } + + // Clear the LiveOuts vectors and collapse any remaining DomainValues. 
+ for (ReversePostOrderTraversal::rpo_iterator + MBBI = RPOT.begin(), MBBE = RPOT.end(); MBBI != MBBE; ++MBBI) { +- LiveOutMap::const_iterator FI = LiveOuts.find(*MBBI); +- if (FI == LiveOuts.end() || !FI->second) ++ auto FI = MBBInfos.find(*MBBI); ++ if (FI == MBBInfos.end() || !FI->second.OutRegs) + continue; + for (unsigned i = 0, e = NumRegs; i != e; ++i) +- if (FI->second[i].Value) +- release(FI->second[i].Value); +- delete[] FI->second; ++ if (FI->second.OutRegs[i].Value) ++ release(FI->second.OutRegs[i].Value); ++ delete[] FI->second.OutRegs; + } +- LiveOuts.clear(); ++ MBBInfos.clear(); + UndefReads.clear(); + Avail.clear(); + Allocator.DestroyAll(); +diff --git a/test/CodeGen/X86/break-false-dep.ll b/test/CodeGen/X86/break-false-dep.ll +index 4c5e747..0ba1825 100644 +--- a/test/CodeGen/X86/break-false-dep.ll ++++ b/test/CodeGen/X86/break-false-dep.ll +@@ -277,3 +277,60 @@ ret: + ;AVX: vcvtsi2sdq {{.*}}, [[XMM4_7:%xmm[4-7]]], {{%xmm[0-9]+}} + ;AVX-NOT: [[XMM4_7]] + } ++ ++; Make sure we are making a smart choice regarding undef registers even for more ++; complicated loop structures. This example is the inner loop from ++; julia> a = falses(10000); a[1:4:end] = true ++; julia> linspace(1.0,2.0,10000)[a] ++define void @loopclearance2(double* nocapture %y, i64* %x, double %c1, double %c2, double %c3, double %c4, i64 %size) { ++entry: ++ tail call void asm sideeffect "", "~{xmm7},~{dirflag},~{fpsr},~{flags}"() ++ tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"() ++ tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"() ++ br label %loop ++ ++loop: ++ %phi_i = phi i64 [ 1, %entry ], [ %nexti, %loop_end ] ++ %phi_j = phi i64 [ 1, %entry ], [ %nextj, %loop_end ] ++ %phi_k = phi i64 [ 0, %entry ], [ %nextk, %loop_end ] ++ br label %inner_loop ++ ++inner_loop: ++ %phi = phi i64 [ %phi_k, %loop ], [ %nextk, %inner_loop ] ++ %idx = lshr i64 %phi, 6 ++ %inputptr = getelementptr i64, i64* %x, i64 %idx ++ %input = load i64, i64* %inputptr, align 8 ++ %masked = and i64 %phi, 63 ++ %shiftedmasked = shl i64 1, %masked ++ %maskedinput = and i64 %input, %shiftedmasked ++ %cmp = icmp eq i64 %maskedinput, 0 ++ %nextk = add i64 %phi, 1 ++ br i1 %cmp, label %inner_loop, label %loop_end ++ ++loop_end: ++ %nexti = add i64 %phi_i, 1 ++ %nextj = add i64 %phi_j, 1 ++ ; Register use, plus us clobbering 7-15 above, basically forces xmm7 here as ++ ; the only reasonable choice. The primary thing we care about is that it's ++ ; not one of the registers used in the loop (e.g. 
not the output reg here) ++;AVX-NOT: %xmm6 ++;AVX: vcvtsi2sdq {{.*}}, %xmm6, {{%xmm[0-9]+}} ++;AVX-NOT: %xmm6 ++ %nexti_f = sitofp i64 %nexti to double ++ %sub = fsub double %c1, %nexti_f ++ %mul = fmul double %sub, %c2 ++;AVX: vcvtsi2sdq {{.*}}, %xmm6, {{%xmm[0-9]+}} ++;AVX-NOT: %xmm6 ++ %phi_f = sitofp i64 %phi to double ++ %mul2 = fmul double %phi_f, %c3 ++ %add2 = fadd double %mul, %mul2 ++ %div = fdiv double %add2, %c4 ++ %prev_j = add i64 %phi_j, -1 ++ %outptr = getelementptr double, double* %y, i64 %prev_j ++ store double %div, double* %outptr, align 8 ++ %done = icmp slt i64 %size, %nexti ++ br i1 %done, label %loopdone, label %loop ++ ++loopdone: ++ ret void ++} +-- +2.9.3 diff --git a/deps/patches/llvm-D28786-callclearance.patch b/deps/patches/llvm-D28786-callclearance.patch new file mode 100644 index 0000000000000..fa7dfd3f22d38 --- /dev/null +++ b/deps/patches/llvm-D28786-callclearance.patch @@ -0,0 +1,344 @@ +diff --git a/include/llvm/Target/TargetInstrInfo.h b/include/llvm/Target/TargetInstrInfo.h +index 83515bc..65b435a 100644 +--- a/include/llvm/Target/TargetInstrInfo.h ++++ b/include/llvm/Target/TargetInstrInfo.h +@@ -1440,6 +1440,17 @@ public: + virtual void breakPartialRegDependency(MachineInstr &MI, unsigned OpNum, + const TargetRegisterInfo *TRI) const {} + ++ /// May return true if the instruction in question is a dependency breaking ++ /// instruction. If so, the register number for which it is dependency ++ /// breaking should be returned in `OutReg`. It is prefereable to return ++ /// false if the result cannot be determined. This would at worst result ++ /// in the insertion of an unnecessary instruction, while the other ++ /// alternative could result in significant false-dependency penalties. ++ virtual bool isDependencyBreak(MachineInstr &MI, ++ unsigned *OutReg = nullptr) const { ++ return false; ++ } ++ + /// Create machine specific model for scheduling. + virtual DFAPacketizer * + CreateTargetScheduleState(const TargetSubtargetInfo &) const { +diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp +index 6ac1db4..63065ea 100644 +--- a/lib/CodeGen/ExecutionDepsFix.cpp ++++ b/lib/CodeGen/ExecutionDepsFix.cpp +@@ -214,13 +214,18 @@ private: + bool isBlockDone(MachineBasicBlock *); + void processBasicBlock(MachineBasicBlock *MBB, bool PrimaryPass, bool Done); + void updateSuccessors(MachineBasicBlock *MBB, bool Primary, bool Done); +- bool visitInstr(MachineInstr *); ++ bool visitInstr(MachineInstr *, bool PrimaryPass); + void processDefs(MachineInstr *, bool BlockDone, bool Kill); + void visitSoftInstr(MachineInstr*, unsigned mask); + void visitHardInstr(MachineInstr*, unsigned domain); +- void pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, +- unsigned Pref); ++ void pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, unsigned Pref, ++ bool &TrueDependency); + bool shouldBreakDependence(MachineInstr*, unsigned OpIdx, unsigned Pref); ++ ++ // Undef Reads ++ void collapseUndefReads(unsigned from, unsigned to, unsigned Reg); ++ unsigned updateChooseableRegs(SparseSet &, ++ const TargetRegisterClass *, bool); + void processUndefReads(MachineBasicBlock*); + }; + } +@@ -394,11 +399,19 @@ void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) { + + // This is the entry block. + if (MBB->pred_empty()) { ++ // Treat all registers as being defined just before the first instruction. ++ // Howver, we want the logic later to prefer non live-ins over live-ins, ++ // so pretend the live-ins were defined slightly later. 
++ // We used to only do this for live-ins, but that's a bit of a gamble. ++ // If our caller does arithmetic with these registers is is quite likely ++ // that it will have used registers beyond the ones that are live here. ++ // Given the immense penalty for getting this wrong, being conservative ++ // here seems worth it. ++ for (unsigned rx = 0; rx != NumRegs; ++rx) { ++ LiveRegs[rx].Def = -2; ++ } + for (const auto &LI : MBB->liveins()) { + for (int rx : regIndices(LI.PhysReg)) { +- // Treat function live-ins as if they were defined just before the first +- // instruction. Usually, function arguments are set up immediately +- // before the call. + LiveRegs[rx].Def = -1; + } + } +@@ -470,24 +483,36 @@ void ExeDepsFix::leaveBasicBlock(MachineBasicBlock *MBB) { + LiveRegs = nullptr; + } + +-bool ExeDepsFix::visitInstr(MachineInstr *MI) { +- // Update instructions with explicit execution domains. +- std::pair DomP = TII->getExecutionDomain(*MI); +- if (DomP.first) { +- if (DomP.second) +- visitSoftInstr(MI, DomP.second); +- else +- visitHardInstr(MI, DomP.first); ++bool ExeDepsFix::visitInstr(MachineInstr *MI, bool PrimaryPass) { ++ bool Kill = false; ++ ++ if (PrimaryPass) { ++ // Update instructions with explicit execution domains. ++ std::pair DomP = TII->getExecutionDomain(*MI); ++ if (DomP.first) { ++ if (DomP.second) ++ visitSoftInstr(MI, DomP.second); ++ else ++ visitHardInstr(MI, DomP.first); ++ } ++ Kill = !DomP.first; + } + +- return !DomP.first; ++ // If this is a call, pretend all registers we are considering are def'd here. ++ // We have no idea which registers the callee may use. ++ if (MI->isCall()) { ++ for (unsigned i = 0, e = NumRegs; i != e; ++i) ++ LiveRegs[i].Def = CurInstr; ++ } ++ ++ return Kill; + } + + /// \brief Helps avoid false dependencies on undef registers by updating the + /// machine instructions' undef operand to use a register that the instruction + /// is truly dependent on, or use a register with clearance higher than Pref. + void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, +- unsigned Pref) { ++ unsigned Pref, bool &TrueDependency) { + MachineOperand &MO = MI->getOperand(OpIdx); + assert(MO.isUndef() && "Expected undef machine operand"); + +@@ -510,6 +535,7 @@ void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, + // We found a true dependency - replace the undef register with the true + // dependency. + MO.setReg(CurrMO.getReg()); ++ TrueDependency = true; + return; + } + +@@ -571,9 +597,14 @@ void ExeDepsFix::processDefs(MachineInstr *MI, bool BlockDone, bool Kill) { + if (BlockDone) { + unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI); + if (Pref) { +- pickBestRegisterForUndef(MI, OpNum, Pref); +- if (shouldBreakDependence(MI, OpNum, Pref)) ++ bool TrueDependency = false; ++ pickBestRegisterForUndef(MI, OpNum, Pref, TrueDependency); ++ // Don't bother adding true dependencies to UndefReads. All we'd find out ++ // is that the register is live (since this very instruction depends on ++ // it), so we can't do anything. 
++ if (!TrueDependency && shouldBreakDependence(MI, OpNum, Pref)) { + UndefReads.push_back(std::make_pair(MI, OpNum)); ++ } + } + } + const MCInstrDesc &MCID = MI->getDesc(); +@@ -606,9 +637,52 @@ void ExeDepsFix::processDefs(MachineInstr *MI, bool BlockDone, bool Kill) { + kill(rx); + } + } ++ unsigned DepReg = 0; ++ if (TII->isDependencyBreak(*MI, &DepReg)) { ++ for (int rx : regIndices(DepReg)) { ++ // This instruction is a dependency break, so there are no clearance ++ // issues, reset the counter. ++ LiveRegs[rx].Def = -(1 << 20); ++ } ++ } + ++CurInstr; + } + ++// Set the undef read register to `Reg` for all UndefReads in the range ++// [from,to). ++void ExeDepsFix::collapseUndefReads(unsigned from, unsigned to, unsigned Reg) { ++ if (from >= to) ++ return; ++ for (unsigned i = from; i < to; ++i) { ++ MachineInstr *MI = std::get<0>(UndefReads[i]); ++ unsigned OpIdx = std::get<1>(UndefReads[i]); ++ MachineOperand &MO = MI->getOperand(OpIdx); ++ MO.setReg(Reg); ++ } ++ TII->breakPartialRegDependency(*std::get<0>(UndefReads[from]), ++ std::get<1>(UndefReads[from]), TRI); ++} ++ ++unsigned ExeDepsFix::updateChooseableRegs(SparseSet &ChoosableRegs, ++ const TargetRegisterClass *OpRC, ++ bool add) { ++ unsigned LowestValid = (unsigned)-1; ++ ++ for (auto Reg : OpRC->getRegisters()) { ++ if (LiveRegSet.contains(Reg)) ++ ChoosableRegs.erase(Reg); ++ else if (add) { ++ ChoosableRegs.insert(Reg); ++ if (LowestValid == (unsigned)-1) ++ LowestValid = Reg; ++ } else if (ChoosableRegs.count(Reg) == 1) { ++ if (LowestValid == (unsigned)-1) ++ LowestValid = Reg; ++ } ++ } ++ return LowestValid; ++} ++ + /// \break Break false dependencies on undefined register reads. + /// + /// Walk the block backward computing precise liveness. This is expensive, so we +@@ -619,31 +693,87 @@ void ExeDepsFix::processUndefReads(MachineBasicBlock *MBB) { + if (UndefReads.empty()) + return; + ++ // We want to be slightly clever here, to avoid the following common pattern: ++ // Suppose we have some instruction `vrandom %in, %out` and the following code ++ // vrandom %xmm0, %xmm0 ++ // vrandom %xmm1, %xmm1 ++ // vrandom %xmm2, %xmm2 ++ // vrandom %xmm3, %xmm3 ++ // The earlier logic likes to produce these, because it picks the first ++ // register ++ // to break ties in clearance. However, most register allocators pick the dest ++ // register the same way. Naively, we'd have to insert a dependency break, ++ // before every instruction above. However, what we really want is ++ // vxorps %xmm3, %xmm3, %xmm3 ++ // vrandom %xmm3, %xmm0 ++ // vrandom %xmm3, %xmm1 ++ // vrandom %xmm3, %xmm2 ++ // vrandom %xmm3, %xmm3 ++ // To do so, we walk backwards and cumulatively keep track of which registers ++ // we can use to break the dependency. Then, once the set has collapsed, we ++ // reset the undef read register for all following instructions. ++ + // Collect this block's live out register units. + LiveRegSet.init(TRI); + // We do not need to care about pristine registers as they are just preserved + // but not actually used in the function. 
+ LiveRegSet.addLiveOutsNoPristines(*MBB); + +- MachineInstr *UndefMI = UndefReads.back().first; +- unsigned OpIdx = UndefReads.back().second; ++ SparseSet ChoosableRegs; ++ ChoosableRegs.setUniverse(TRI->getNumRegs()); ++ ++ unsigned LastValid = (unsigned)-1; ++ const TargetRegisterClass *LastOpRC = nullptr; ++ size_t i, LastInit; ++ i = LastInit = UndefReads.size() - 1; ++ MachineInstr *UndefMI = std::get<0>(UndefReads[i]); + + for (MachineInstr &I : make_range(MBB->rbegin(), MBB->rend())) { + // Update liveness, including the current instruction's defs. + LiveRegSet.stepBackward(I); + ++ // This ensures that we don't accidentally pick a register whose live region ++ // lies entirely between two undef reads (since that would defeat the ++ // purpose of breaking the dependency). ++ for (auto LiveReg : LiveRegSet) ++ ChoosableRegs.erase(LiveReg); ++ + if (UndefMI == &I) { +- if (!LiveRegSet.contains(UndefMI->getOperand(OpIdx).getReg())) +- TII->breakPartialRegDependency(*UndefMI, OpIdx, TRI); ++ unsigned OpIdx = std::get<1>(UndefReads[i]); ++ // Get the undef operand's register class ++ const TargetRegisterClass *OpRC = ++ TII->getRegClass(UndefMI->getDesc(), OpIdx, TRI, *MF); ++ if (OpRC != LastOpRC || ChoosableRegs.size() == 0) { ++ if (LastInit != i) { ++ if (LastValid != (unsigned)-1) ++ collapseUndefReads(i + 1, LastInit + 1, LastValid); ++ ChoosableRegs.clear(); ++ LastInit = i; ++ } ++ } ++ ++ unsigned LowestValid = ++ updateChooseableRegs(ChoosableRegs, OpRC, LastInit == i); ++ ++ if (ChoosableRegs.size() == 0) { ++ if (LastInit != i) { ++ if (LastValid != (unsigned)-1) ++ collapseUndefReads(i + 1, LastInit + 1, LastValid); ++ LowestValid = updateChooseableRegs(ChoosableRegs, OpRC, true); ++ LastInit = i; ++ } ++ } ++ LastValid = LowestValid; ++ LastOpRC = OpRC; + +- UndefReads.pop_back(); +- if (UndefReads.empty()) +- return; ++ if (i == 0) ++ break; + +- UndefMI = UndefReads.back().first; +- OpIdx = UndefReads.back().second; ++ UndefMI = std::get<0>(UndefReads[--i]); + } + } ++ if (LastValid != (unsigned)-1) ++ collapseUndefReads(0, LastInit + 1, LastValid); + } + + // A hard instruction only works in one domain. 
All input registers will be +@@ -793,9 +923,7 @@ void ExeDepsFix::processBasicBlock(MachineBasicBlock *MBB, bool PrimaryPass, + enterBasicBlock(MBB); + for (MachineInstr &MI : *MBB) { + if (!MI.isDebugValue()) { +- bool Kill = false; +- if (PrimaryPass) +- Kill = visitInstr(&MI); ++ bool Kill = visitInstr(&MI, PrimaryPass); + processDefs(&MI, isBlockDone(MBB), Kill); + } + } +diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp +index 5793597..f31c97e 100644 +--- a/lib/Target/X86/X86InstrInfo.cpp ++++ b/lib/Target/X86/X86InstrInfo.cpp +@@ -7496,6 +7496,23 @@ void X86InstrInfo::breakPartialRegDependency( + } + } + ++bool X86InstrInfo::isDependencyBreak(MachineInstr &MI, unsigned *OutReg) const { ++ unsigned Opc = MI.getOpcode(); ++ if (!(Opc == X86::VXORPSrr || Opc == X86::VXORPDrr || Opc == X86::XORPSrr || ++ Opc == X86::XORPDrr)) ++ return false; ++ unsigned Reg = 0; ++ for (unsigned i = 0; i < MI.getNumOperands(); ++i) { ++ const MachineOperand &MO = MI.getOperand(i); ++ if (!MO.isReg() || (Reg != 0 && MO.getReg() != Reg)) ++ return false; ++ Reg = MO.getReg(); ++ } ++ if (OutReg) ++ *OutReg = Reg; ++ return true; ++} ++ + MachineInstr * + X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, + ArrayRef Ops, +diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h +index 8d74617..fa86882 100644 +--- a/lib/Target/X86/X86InstrInfo.h ++++ b/lib/Target/X86/X86InstrInfo.h +@@ -484,6 +484,7 @@ public: + const TargetRegisterInfo *TRI) const override; + void breakPartialRegDependency(MachineInstr &MI, unsigned OpNum, + const TargetRegisterInfo *TRI) const override; ++ bool isDependencyBreak(MachineInstr &MI, unsigned *OutReg) const override; + + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, + unsigned OpNum, diff --git a/deps/patches/llvm-PR276266.patch b/deps/patches/llvm-PR276266.patch new file mode 100644 index 0000000000000..576e96e5836d3 --- /dev/null +++ b/deps/patches/llvm-PR276266.patch @@ -0,0 +1,51 @@ +From 64d1e8b748bca22ce205eab7634cc5418c827f18 Mon Sep 17 00:00:00 2001 +From: Marina Yatsina +Date: Thu, 21 Jul 2016 12:37:07 +0000 +Subject: [PATCH 3/5] ExecutionDepsFix - Fix bug in clearance calculation + +The clearance calculation did not take into account registers defined as outputs or clobbers in inline assembly machine instructions because these register defs are implicit. 
+ +Differential Revision: http://reviews.llvm.org/D22580 + + + +git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@276266 91177308-0d34-0410-b5e6-96231b3b80d8 +--- + lib/CodeGen/ExecutionDepsFix.cpp | 2 -- + test/CodeGen/X86/break-false-dep.ll | 10 ++++++++++ + 2 files changed, 10 insertions(+), 2 deletions(-) + +diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp +index 566b8d507b2..1fe5f459b69 100644 +--- a/lib/CodeGen/ExecutionDepsFix.cpp ++++ b/lib/CodeGen/ExecutionDepsFix.cpp +@@ -520,8 +520,6 @@ void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg()) + continue; +- if (MO.isImplicit()) +- break; + if (MO.isUse()) + continue; + for (int rx : regIndices(MO.getReg())) { +diff --git a/test/CodeGen/X86/break-false-dep.ll b/test/CodeGen/X86/break-false-dep.ll +index 74a0728f918..a7cda499dab 100644 +--- a/test/CodeGen/X86/break-false-dep.ll ++++ b/test/CodeGen/X86/break-false-dep.ll +@@ -199,3 +199,13 @@ for.end16: ; preds = %for.inc14 + ;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]] + ;AVX-NEXT: vmovsd [[XMM0]], + } ++ ++define double @inlineasmdep(i64 %arg) { ++top: ++ tail call void asm sideeffect "", "~{xmm0},~{dirflag},~{fpsr},~{flags}"() ++ %tmp1 = sitofp i64 %arg to double ++ ret double %tmp1 ++;AVX-LABEL:@inlineasmdep ++;AVX: vxorps [[XMM0:%xmm[0-9]+]], [[XMM0]], [[XMM0]] ++;AVX-NEXT: vcvtsi2sdq {{.*}}, [[XMM0]], {{%xmm[0-9]+}} ++} +-- +2.11.0 + diff --git a/deps/patches/llvm-PR277939.patch b/deps/patches/llvm-PR277939.patch new file mode 100644 index 0000000000000..65e46c32b4848 --- /dev/null +++ b/deps/patches/llvm-PR277939.patch @@ -0,0 +1,169 @@ +From 9790ab8bccdbc71dfcc166860ab6ce9c369bf686 Mon Sep 17 00:00:00 2001 +From: Simon Pilgrim +Date: Sat, 6 Aug 2016 21:21:12 +0000 +Subject: [PATCH 1/5] [X86][AVX2] Improve sign/zero extension on AVX2 targets + +Split extensions to large vectors into 256-bit chunks - the equivalent of what we do with pre-AVX2 into 128-bit chunks + +git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@277939 91177308-0d34-0410-b5e6-96231b3b80d8 +--- + lib/Target/X86/X86ISelLowering.cpp | 22 +++++++++++++++------- + test/CodeGen/X86/vec_int_to_fp.ll | 24 ++++++++---------------- + test/CodeGen/X86/vector-sext.ll | 10 ++-------- + 3 files changed, 25 insertions(+), 31 deletions(-) + +diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp +index ca205335013..2bbedd4bd97 100644 +--- a/lib/Target/X86/X86ISelLowering.cpp ++++ b/lib/Target/X86/X86ISelLowering.cpp +@@ -30164,11 +30164,9 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG, + : DAG.getZeroExtendVectorInReg(ExOp, DL, VT); + } + +- // On pre-AVX2 targets, split into 128-bit nodes of +- // ISD::*_EXTEND_VECTOR_INREG. 
+- if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128)) { +- unsigned NumVecs = VT.getSizeInBits() / 128; +- unsigned NumSubElts = 128 / SVT.getSizeInBits(); ++ auto SplitAndExtendInReg = [&](unsigned SplitSize) { ++ unsigned NumVecs = VT.getSizeInBits() / SplitSize; ++ unsigned NumSubElts = SplitSize / SVT.getSizeInBits(); + EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts); + EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts); + +@@ -30176,14 +30174,24 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG, + for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) { + SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0, + DAG.getIntPtrConstant(Offset, DL)); +- SrcVec = ExtendVecSize(DL, SrcVec, 128); ++ SrcVec = ExtendVecSize(DL, SrcVec, SplitSize); + SrcVec = Opcode == ISD::SIGN_EXTEND + ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT) + : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT); + Opnds.push_back(SrcVec); + } + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds); +- } ++ }; ++ ++ // On pre-AVX2 targets, split into 128-bit nodes of ++ // ISD::*_EXTEND_VECTOR_INREG. ++ if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128)) ++ return SplitAndExtendInReg(128); ++ ++ // On pre-AVX512 targets, split into 256-bit nodes of ++ // ISD::*_EXTEND_VECTOR_INREG. ++ if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256)) ++ return SplitAndExtendInReg(256); + + return SDValue(); + } +diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll +index 43f5318a607..5d8f91385c7 100644 +--- a/test/CodeGen/X86/vec_int_to_fp.ll ++++ b/test/CodeGen/X86/vec_int_to_fp.ll +@@ -153,8 +153,7 @@ define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) { + ; + ; AVX2-LABEL: sitofp_16i8_to_2f64: + ; AVX2: # BB#0: +-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 +-; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 ++; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 + ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 + ; AVX2-NEXT: # kill + ; AVX2-NEXT: vzeroupper +@@ -325,8 +324,7 @@ define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) { + ; + ; AVX2-LABEL: sitofp_16i8_to_4f64: + ; AVX2: # BB#0: +-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 +-; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 ++; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 + ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 + ; AVX2-NEXT: retq + %cvt = sitofp <16 x i8> %a to <16 x double> +@@ -543,8 +541,7 @@ define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) { + ; + ; AVX2-LABEL: uitofp_16i8_to_2f64: + ; AVX2: # BB#0: +-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ++; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero + ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 + ; AVX2-NEXT: # kill + ; AVX2-NEXT: vzeroupper +@@ -778,8 +775,7 @@ define <4 x double> @uitofp_16i8_to_4f64(<16 x i8> %a) { + ; + ; AVX2-LABEL: uitofp_16i8_to_4f64: + ; AVX2: # BB#0: +-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ++; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero + ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 + ; AVX2-NEXT: retq + %cvt = uitofp <16 x i8> %a to <16 x double> +@@ -958,8 +954,7 @@ define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) { + ; + ; AVX2-LABEL: sitofp_16i8_to_4f32: + ; AVX2: # BB#0: +-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 +-; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 ++; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 + ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 + ; AVX2-NEXT: # kill + ; AVX2-NEXT: vzeroupper +@@ -1134,8 +1129,7 @@ define <8 x float> @sitofp_16i8_to_8f32(<16 x i8> %a) { + ; + ; AVX2-LABEL: sitofp_16i8_to_8f32: + ; AVX2: # BB#0: +-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 +-; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 ++; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 + ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 + ; AVX2-NEXT: retq + %cvt = sitofp <16 x i8> %a to <16 x float> +@@ -1456,8 +1450,7 @@ define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) { + ; + ; AVX2-LABEL: uitofp_16i8_to_4f32: + ; AVX2: # BB#0: +-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ++; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero + ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 + ; AVX2-NEXT: # kill + ; AVX2-NEXT: vzeroupper +@@ -1813,8 +1806,7 @@ define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) { + ; + ; AVX2-LABEL: uitofp_16i8_to_8f32: + ; AVX2: # BB#0: +-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ++; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero + ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 + ; AVX2-NEXT: retq + %cvt = uitofp <16 x i8> %a to <16 x float> +diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll +index 018c5922a43..e29f3e5f91f 100644 +--- a/test/CodeGen/X86/vector-sext.ll ++++ b/test/CodeGen/X86/vector-sext.ll +@@ -407,15 +407,9 @@ define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp + ; + ; AVX2-LABEL: sext_16i8_to_8i64: + ; AVX2: # BB#0: # %entry +-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +-; AVX2-NEXT: vpslld $24, %xmm1, %xmm1 +-; AVX2-NEXT: vpsrad $24, %xmm1, %xmm1 +-; AVX2-NEXT: vpmovsxdq %xmm1, %ymm2 ++; AVX2-NEXT: vpmovsxbq %xmm0, %ymm2 + ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +-; AVX2-NEXT: vpslld $24, %xmm0, %xmm0 +-; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0 +-; AVX2-NEXT: vpmovsxdq %xmm0, %ymm1 ++; AVX2-NEXT: vpmovsxbq %xmm0, %ymm1 + ; AVX2-NEXT: vmovdqa %ymm2, %ymm0 + ; AVX2-NEXT: retq + ; +-- +2.11.0 + diff --git a/deps/patches/llvm-PR278088.patch b/deps/patches/llvm-PR278088.patch new file mode 100644 index 0000000000000..325069326b3ed --- /dev/null +++ b/deps/patches/llvm-PR278088.patch @@ -0,0 +1,224 @@ +From b01ff685400365f55c5333c29c2227842d61e984 Mon Sep 17 00:00:00 2001 +From: Craig Topper +Date: Tue, 9 Aug 2016 03:06:26 +0000 +Subject: [PATCH 2/5] [X86] Remove unnecessary bitcast from the front of + AVX1Only 256-bit logical operation patterns. + +git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@278088 91177308-0d34-0410-b5e6-96231b3b80d8 +--- + lib/Target/X86/X86InstrSSE.td | 8 +++---- + test/CodeGen/X86/WidenArith.ll | 2 +- + test/CodeGen/X86/merge-consecutive-loads-256.ll | 26 ++++++--------------- + test/CodeGen/X86/v8i1-masks.ll | 4 ++-- + test/CodeGen/X86/vec_int_to_fp.ll | 30 ++++++++++++------------- + test/CodeGen/X86/vec_uint_to_fp-fastmath.ll | 26 ++++++++++----------- + 6 files changed, 42 insertions(+), 54 deletions(-) + +diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td +index f91764a67d1..77da22de4d3 100644 +--- a/lib/Target/X86/X86InstrSSE.td ++++ b/lib/Target/X86/X86InstrSSE.td +@@ -2950,13 +2950,13 @@ let isCommutable = 0 in + // AVX1 requires type coercions in order to fold loads directly into logical + // operations. 
+ let Predicates = [HasAVX1Only] in { +- def : Pat<(bc_v8f32 (and VR256:$src1, (loadv4i64 addr:$src2))), ++ def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)), + (VANDPSYrm VR256:$src1, addr:$src2)>; +- def : Pat<(bc_v8f32 (or VR256:$src1, (loadv4i64 addr:$src2))), ++ def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)), + (VORPSYrm VR256:$src1, addr:$src2)>; +- def : Pat<(bc_v8f32 (xor VR256:$src1, (loadv4i64 addr:$src2))), ++ def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)), + (VXORPSYrm VR256:$src1, addr:$src2)>; +- def : Pat<(bc_v8f32 (X86andnp VR256:$src1, (loadv4i64 addr:$src2))), ++ def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)), + (VANDNPSYrm VR256:$src1, addr:$src2)>; + } + +diff --git a/test/CodeGen/X86/WidenArith.ll b/test/CodeGen/X86/WidenArith.ll +index cdd1a2818b2..cc5fcba6670 100644 +--- a/test/CodeGen/X86/WidenArith.ll ++++ b/test/CodeGen/X86/WidenArith.ll +@@ -9,8 +9,8 @@ define <8 x i32> @test(<8 x float> %a, <8 x float> %b) { + ; CHECK-NEXT: vsubps %ymm2, %ymm1, %ymm3 + ; CHECK-NEXT: vcmpltps %ymm1, %ymm0, %ymm0 + ; CHECK-NEXT: vcmpltps %ymm3, %ymm2, %ymm1 +-; CHECK-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 + ; CHECK-NEXT: vandps %ymm1, %ymm0, %ymm0 ++; CHECK-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 + ; CHECK-NEXT: retq + %c1 = fadd <8 x float> %a, %b + %b1 = fmul <8 x float> %b, %a +diff --git a/test/CodeGen/X86/merge-consecutive-loads-256.ll b/test/CodeGen/X86/merge-consecutive-loads-256.ll +index 8c2e9372900..dc268d9bdf8 100644 +--- a/test/CodeGen/X86/merge-consecutive-loads-256.ll ++++ b/test/CodeGen/X86/merge-consecutive-loads-256.ll +@@ -547,29 +547,17 @@ define <16 x i16> @merge_16i16_i16_0uu3uuuuuuuuCuEF(i16* %ptr) nounwind uwtable + } + + define <16 x i16> @merge_16i16_i16_0uu3zzuuuuuzCuEF(i16* %ptr) nounwind uwtable noinline ssp { +-; AVX1-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF: +-; AVX1: # BB#0: +-; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [65535,0,0,65535,0,0,0,0,0,0,0,0,65535,0,65535,65535] +-; AVX1-NEXT: vandps (%rdi), %ymm0, %ymm0 +-; AVX1-NEXT: retq +-; +-; AVX2-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF: +-; AVX2: # BB#0: +-; AVX2-NEXT: vmovups (%rdi), %ymm0 +-; AVX2-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +-; AVX2-NEXT: retq +-; +-; AVX512F-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF: +-; AVX512F: # BB#0: +-; AVX512F-NEXT: vmovups (%rdi), %ymm0 +-; AVX512F-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +-; AVX512F-NEXT: retq ++; AVX-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF: ++; AVX: # BB#0: ++; AVX-NEXT: vmovups (%rdi), %ymm0 ++; AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ++; AVX-NEXT: retq + ; + ; X32-AVX-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF: + ; X32-AVX: # BB#0: + ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +-; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65535,0,0,65535,0,0,0,0,0,0,0,0,65535,0,65535,65535] +-; X32-AVX-NEXT: vandps (%eax), %ymm0, %ymm0 ++; X32-AVX-NEXT: vmovups (%eax), %ymm0 ++; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 + ; X32-AVX-NEXT: retl + %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 0 + %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 3 +diff --git a/test/CodeGen/X86/v8i1-masks.ll b/test/CodeGen/X86/v8i1-masks.ll +index 0135832ad92..d5c31506e98 100644 +--- a/test/CodeGen/X86/v8i1-masks.ll ++++ b/test/CodeGen/X86/v8i1-masks.ll +@@ -13,8 +13,8 @@ define void @and_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi + ; X32-NEXT: vcmpltps %ymm0, %ymm1, %ymm1 + ; X32-NEXT: vmovups (%eax), %ymm2 + ; X32-NEXT: vcmpltps %ymm0, %ymm2, %ymm0 +-; X32-NEXT: vandps LCPI0_0, %ymm1, %ymm1 + ; X32-NEXT: vandps %ymm1, %ymm0, %ymm0 
++; X32-NEXT: vandps LCPI0_0, %ymm0, %ymm0 + ; X32-NEXT: vmovaps %ymm0, (%eax) + ; X32-NEXT: vzeroupper + ; X32-NEXT: retl +@@ -26,8 +26,8 @@ define void @and_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi + ; X64-NEXT: vcmpltps %ymm0, %ymm1, %ymm1 + ; X64-NEXT: vmovups (%rdx), %ymm2 + ; X64-NEXT: vcmpltps %ymm0, %ymm2, %ymm0 +-; X64-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 + ; X64-NEXT: vandps %ymm1, %ymm0, %ymm0 ++; X64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 + ; X64-NEXT: vmovaps %ymm0, (%rax) + ; X64-NEXT: vzeroupper + ; X64-NEXT: retq +diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll +index 5d8f91385c7..8ea7243664a 100644 +--- a/test/CodeGen/X86/vec_int_to_fp.ll ++++ b/test/CodeGen/X86/vec_int_to_fp.ll +@@ -1694,15 +1694,15 @@ define <8 x float> @uitofp_8i32_to_8f32(<8 x i32> %a) { + ; + ; AVX1-LABEL: uitofp_8i32_to_8f32: + ; AVX1: # BB#0: +-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1 ++; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ++; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ++; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 ++; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 + ; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1 +-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2 +-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ++; AVX1-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 ++; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 + ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +-; AVX1-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +-; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0 ++; AVX1-NEXT: vaddps %ymm0, %ymm1, %ymm0 + ; AVX1-NEXT: retq + ; + ; AVX2-LABEL: uitofp_8i32_to_8f32: +@@ -3372,16 +3372,16 @@ define <8 x float> @uitofp_load_8i32_to_8f32(<8 x i32> *%a) { + ; + ; AVX1-LABEL: uitofp_load_8i32_to_8f32: + ; AVX1: # BB#0: +-; AVX1-NEXT: vmovaps (%rdi), %ymm0 +-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1 ++; AVX1-NEXT: vmovdqa (%rdi), %ymm0 ++; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ++; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ++; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 ++; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 + ; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1 +-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2 +-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ++; AVX1-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 ++; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 + ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +-; AVX1-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +-; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0 ++; AVX1-NEXT: vaddps %ymm0, %ymm1, %ymm0 + ; AVX1-NEXT: retq + ; + ; AVX2-LABEL: uitofp_load_8i32_to_8f32: +diff --git a/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll b/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll +index c0e02bd1599..cb8e2096585 100644 +--- a/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll ++++ b/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll +@@ -78,18 +78,18 @@ define <4 x float> @test_uitofp_v4i32_to_v4f32(<4 x i32> %arg) { + ret <4 x float> %tmp + } + +-; AVX: [[MASKCSTADDR_v8:.LCPI[0-9_]+]]: +-; AVX-NEXT: .long 65535 # 0xffff +-; AVX-NEXT: .long 65535 # 0xffff +-; AVX-NEXT: .long 65535 # 0xffff +-; AVX-NEXT: .long 65535 # 0xffff +- + ; AVX: [[FPMASKCSTADDR_v8:.LCPI[0-9_]+]]: + ; AVX-NEXT: .long 1199570944 # float 65536 + ; AVX-NEXT: .long 1199570944 # float 65536 + ; AVX-NEXT: .long 1199570944 # float 65536 + ; AVX-NEXT: .long 1199570944 # float 65536 + ++; AVX: [[MASKCSTADDR_v8:.LCPI[0-9_]+]]: ++; AVX-NEXT: .long 65535 # 0xffff ++; AVX-NEXT: .long 65535 # 
0xffff ++; AVX-NEXT: .long 65535 # 0xffff ++; AVX-NEXT: .long 65535 # 0xffff ++ + ; AVX2: [[FPMASKCSTADDR_v8:.LCPI[0-9_]+]]: + ; AVX2-NEXT: .long 1199570944 # float 65536 + +@@ -119,15 +119,15 @@ define <8 x float> @test_uitofp_v8i32_to_v8f32(<8 x i32> %arg) { + ; + ; AVX-LABEL: test_uitofp_v8i32_to_v8f32: + ; AVX: # BB#0: +-; AVX-NEXT: vandps [[MASKCSTADDR_v8]](%rip), %ymm0, %ymm1 ++; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 ++; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 ++; AVX-NEXT: vpsrld $16, %xmm2, %xmm2 ++; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 + ; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1 +-; AVX-NEXT: vpsrld $16, %xmm0, %xmm2 +-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +-; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 +-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ++; AVX-NEXT: vmulps [[FPMASKCSTADDR_v8]](%rip), %ymm1, %ymm1 ++; AVX-NEXT: vandps [[MASKCSTADDR_v8]](%rip), %ymm0, %ymm0 + ; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 +-; AVX-NEXT: vmulps [[FPMASKCSTADDR_v8]](%rip), %ymm0, %ymm0 +-; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 ++; AVX-NEXT: vaddps %ymm0, %ymm1, %ymm0 + ; AVX-NEXT: retq + ; + ; AVX2-LABEL: test_uitofp_v8i32_to_v8f32: +-- +2.11.0 + diff --git a/deps/patches/llvm-PR278321.patch b/deps/patches/llvm-PR278321.patch new file mode 100644 index 0000000000000..709436536f01a --- /dev/null +++ b/deps/patches/llvm-PR278321.patch @@ -0,0 +1,1409 @@ +From a4ec9b3d6c2c53eb463284db0aa54158fad32701 Mon Sep 17 00:00:00 2001 +From: Marina Yatsina +Date: Thu, 11 Aug 2016 07:32:08 +0000 +Subject: [PATCH 4/5] Avoid false dependencies of undef machine operands + +This patch helps avoid false dependencies on undef registers by updating the machine instructions' undef operand to use a register that the instruction is truly dependent on, or use a register with clearance higher than Pref. + +Pseudo example: + +loop: +xmm0 = ... +xmm1 = vcvtsi2sdl eax, xmm0 +... = inst xmm0 +jmp loop + +In this example, selecting xmm0 as the undef register creates false dependency between loop iterations. +This false dependency cannot be solved by inserting an xor before vcvtsi2sdl because xmm0 is alive at the point of the vcvtsi2sdl instruction. +Selecting a different register instead of xmm0, especially a register that is not used in the loop, will eliminate this problem. 
+ +Differential Revision: https://reviews.llvm.org/D22466 + +git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@278321 91177308-0d34-0410-b5e6-96231b3b80d8 +--- + lib/CodeGen/ExecutionDepsFix.cpp | 53 ++++ + lib/Target/X86/X86InstrInfo.cpp | 2 +- + test/CodeGen/X86/break-false-dep.ll | 72 ++++- + test/CodeGen/X86/copy-propagation.ll | 3 +- + test/CodeGen/X86/half.ll | 2 +- + test/CodeGen/X86/vec_int_to_fp.ll | 579 ++++++++++++++++++++--------------- + 6 files changed, 467 insertions(+), 244 deletions(-) + +diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp +index 1fe5f459b69..5f91db9251c 100644 +--- a/lib/CodeGen/ExecutionDepsFix.cpp ++++ b/lib/CodeGen/ExecutionDepsFix.cpp +@@ -203,6 +203,8 @@ private: + void processDefs(MachineInstr*, bool Kill); + void visitSoftInstr(MachineInstr*, unsigned mask); + void visitHardInstr(MachineInstr*, unsigned domain); ++ void pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, ++ unsigned Pref); + bool shouldBreakDependence(MachineInstr*, unsigned OpIdx, unsigned Pref); + void processUndefReads(MachineBasicBlock*); + }; +@@ -473,6 +475,56 @@ void ExeDepsFix::visitInstr(MachineInstr *MI) { + processDefs(MI, !DomP.first); + } + ++/// \brief Helps avoid false dependencies on undef registers by updating the ++/// machine instructions' undef operand to use a register that the instruction ++/// is truly dependent on, or use a register with clearance higher than Pref. ++void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, ++ unsigned Pref) { ++ MachineOperand &MO = MI->getOperand(OpIdx); ++ assert(MO.isUndef() && "Expected undef machine operand"); ++ ++ unsigned OriginalReg = MO.getReg(); ++ ++ // Update only undef operands that are mapped to one register. ++ if (AliasMap[OriginalReg].size() != 1) ++ return; ++ ++ // Get the undef operand's register class ++ const TargetRegisterClass *OpRC = ++ TII->getRegClass(MI->getDesc(), OpIdx, TRI, *MF); ++ ++ // If the instruction has a true dependency, we can hide the false depdency ++ // behind it. ++ for (MachineOperand &CurrMO : MI->operands()) { ++ if (!CurrMO.isReg() || CurrMO.isDef() || CurrMO.isUndef() || ++ !OpRC->contains(CurrMO.getReg())) ++ continue; ++ // We found a true dependency - replace the undef register with the true ++ // dependency. ++ MO.setReg(CurrMO.getReg()); ++ return; ++ } ++ ++ // Go over all registers in the register class and find the register with ++ // max clearance or clearance higher than Pref. ++ unsigned MaxClearance = 0; ++ unsigned MaxClearanceReg = OriginalReg; ++ for (unsigned rx = 0; rx < OpRC->getNumRegs(); ++rx) { ++ unsigned Clearance = CurInstr - LiveRegs[rx].Def; ++ if (Clearance <= MaxClearance) ++ continue; ++ MaxClearance = Clearance; ++ MaxClearanceReg = OpRC->getRegister(rx); ++ ++ if (MaxClearance > Pref) ++ break; ++ } ++ ++ // Update the operand if we found a register with better clearance. ++ if (MaxClearanceReg != OriginalReg) ++ MO.setReg(MaxClearanceReg); ++} ++ + /// \brief Return true to if it makes sense to break dependence on a partial def + /// or undef use. 
+ bool ExeDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx, +@@ -510,6 +562,7 @@ void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) { + unsigned OpNum; + unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI); + if (Pref) { ++ pickBestRegisterForUndef(MI, OpNum, Pref); + if (shouldBreakDependence(MI, OpNum, Pref)) + UndefReads.push_back(std::make_pair(MI, OpNum)); + } +diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp +index 5f0aab9ddc6..9bfe25973ae 100644 +--- a/lib/Target/X86/X86InstrInfo.cpp ++++ b/lib/Target/X86/X86InstrInfo.cpp +@@ -68,7 +68,7 @@ static cl::opt + UndefRegClearance("undef-reg-clearance", + cl::desc("How many idle instructions we would like before " + "certain undef register reads"), +- cl::init(64), cl::Hidden); ++ cl::init(128), cl::Hidden); + + enum { + // Select which memory operand is being unfolded. +diff --git a/test/CodeGen/X86/break-false-dep.ll b/test/CodeGen/X86/break-false-dep.ll +index a7cda499dab..4c5e747f9ca 100644 +--- a/test/CodeGen/X86/break-false-dep.ll ++++ b/test/CodeGen/X86/break-false-dep.ll +@@ -126,6 +126,7 @@ loop: + %i = phi i64 [ 1, %entry ], [ %inc, %loop ] + %s1 = phi i64 [ %vx, %entry ], [ %s2, %loop ] + %fi = sitofp i64 %i to double ++ tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"() + %vy = load double, double* %y + %fipy = fadd double %fi, %vy + %iipy = fptosi double %fipy to i64 +@@ -174,6 +175,7 @@ for.body3: + store double %mul11, double* %arrayidx13, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 ++ tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"() + br i1 %exitcond, label %for.inc14, label %for.body3 + + for.inc14: ; preds = %for.body3 +@@ -193,7 +195,7 @@ for.end16: ; preds = %for.inc14 + ;SSE-NEXT: movsd [[XMM0]], + ;AVX-LABEL:@loopdep3 + ;AVX: vxorps [[XMM0:%xmm[0-9]+]], [[XMM0]] +-;AVX-NEXT: vcvtsi2sdl {{.*}}, [[XMM0]], [[XMM0]] ++;AVX-NEXT: vcvtsi2sdl {{.*}}, [[XMM0]], {{%xmm[0-9]+}} + ;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]] + ;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]] + ;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]] +@@ -202,10 +204,76 @@ for.end16: ; preds = %for.inc14 + + define double @inlineasmdep(i64 %arg) { + top: +- tail call void asm sideeffect "", "~{xmm0},~{dirflag},~{fpsr},~{flags}"() ++ tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"() ++ tail call void asm sideeffect "", "~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{dirflag},~{fpsr},~{flags}"() ++ tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"() ++ tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"() + %tmp1 = sitofp i64 %arg to double + ret double %tmp1 + ;AVX-LABEL:@inlineasmdep + ;AVX: vxorps [[XMM0:%xmm[0-9]+]], [[XMM0]], [[XMM0]] + ;AVX-NEXT: vcvtsi2sdq {{.*}}, [[XMM0]], {{%xmm[0-9]+}} + } ++ ++; Make sure we are making a smart choice regarding undef registers and ++; hiding the false dependency behind a true dependency ++define double @truedeps(float %arg) { ++top: ++ tail call void asm sideeffect "", "~{xmm6},~{dirflag},~{fpsr},~{flags}"() ++ tail call void asm sideeffect "", 
"~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"() ++ tail call void asm sideeffect "", "~{xmm4},~{xmm5},~{xmm7},~{dirflag},~{fpsr},~{flags}"() ++ tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"() ++ tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"() ++ %tmp1 = fpext float %arg to double ++ ret double %tmp1 ++;AVX-LABEL:@truedeps ++;AVX-NOT: vxorps ++;AVX: vcvtss2sd [[XMM0:%xmm[0-9]+]], [[XMM0]], {{%xmm[0-9]+}} ++} ++ ++; Make sure we are making a smart choice regarding undef registers and ++; choosing the register with the highest clearence ++define double @clearence(i64 %arg) { ++top: ++ tail call void asm sideeffect "", "~{xmm6},~{dirflag},~{fpsr},~{flags}"() ++ tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"() ++ tail call void asm sideeffect "", "~{xmm4},~{xmm5},~{xmm7},~{dirflag},~{fpsr},~{flags}"() ++ tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"() ++ tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"() ++ %tmp1 = sitofp i64 %arg to double ++ ret double %tmp1 ++;AVX-LABEL:@clearence ++;AVX: vxorps [[XMM6:%xmm6]], [[XMM6]], [[XMM6]] ++;AVX-NEXT: vcvtsi2sdq {{.*}}, [[XMM6]], {{%xmm[0-9]+}} ++} ++ ++; Make sure we are making a smart choice regarding undef registers in order to ++; avoid a cyclic dependence on a write to the same register in a previous ++; iteration, especially when we cannot zero out the undef register because it ++; is alive. ++define i64 @loopclearence(i64* nocapture %x, double* nocapture %y) nounwind { ++entry: ++ %vx = load i64, i64* %x ++ br label %loop ++loop: ++ %i = phi i64 [ 1, %entry ], [ %inc, %loop ] ++ %s1 = phi i64 [ %vx, %entry ], [ %s2, %loop ] ++ %fi = sitofp i64 %i to double ++ tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"() ++ tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"() ++ tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"() ++ %vy = load double, double* %y ++ %fipy = fadd double %fi, %vy ++ %iipy = fptosi double %fipy to i64 ++ %s2 = add i64 %s1, %iipy ++ %inc = add nsw i64 %i, 1 ++ %exitcond = icmp eq i64 %inc, 156250000 ++ br i1 %exitcond, label %ret, label %loop ++ret: ++ ret i64 %s2 ++;AVX-LABEL:@loopclearence ++;Registers 4-7 are not used and therefore one of them should be chosen ++;AVX-NOT: {{%xmm[4-7]}} ++;AVX: vcvtsi2sdq {{.*}}, [[XMM4_7:%xmm[4-7]]], {{%xmm[0-9]+}} ++;AVX-NOT: [[XMM4_7]] ++} +diff --git a/test/CodeGen/X86/copy-propagation.ll b/test/CodeGen/X86/copy-propagation.ll +index 19421a06fa8..dac46c17382 100644 +--- a/test/CodeGen/X86/copy-propagation.ll ++++ b/test/CodeGen/X86/copy-propagation.ll +@@ -26,7 +26,7 @@ target triple = "x86_64-pc-win32-elf" + ; Copy the result in a temporary. + ; Note: Technically the regalloc could have been smarter and this move not required, + ; which would have hidden the bug. +-; CHECK-NEXT: vmovapd %xmm0, [[TMP:%xmm[0-9]+]] ++; CHECK: vmovapd %xmm0, [[TMP:%xmm[0-9]+]] + ; Crush xmm0. 
+ ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 + ; CHECK: movl $339772768, %e[[INDIRECT_CALL2:[a-z]+]] +@@ -37,6 +37,7 @@ target triple = "x86_64-pc-win32-elf" + define double @foo(i64 %arg) { + top: + %tmp = call double inttoptr (i64 339752784 to double (double, double)*)(double 1.000000e+00, double 0.000000e+00) ++ tail call void asm sideeffect "", "x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"(double %tmp) + %tmp1 = sitofp i64 %arg to double + call void inttoptr (i64 339772768 to void (double, double)*)(double %tmp, double %tmp1) + %tmp3 = fadd double %tmp1, %tmp +diff --git a/test/CodeGen/X86/half.ll b/test/CodeGen/X86/half.ll +index 717ddbfa6fd..739bb146e3a 100644 +--- a/test/CodeGen/X86/half.ll ++++ b/test/CodeGen/X86/half.ll +@@ -299,7 +299,7 @@ define half @test_f80trunc_nodagcombine() #0 { + ; CHECK-F16C-NEXT: movswl (%rsi), %eax + ; CHECK-F16C-NEXT: vmovd %eax, %xmm0 + ; CHECK-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 +-; CHECK-F16C-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm1 ++; CHECK-F16C-NEXT: vcvtsi2ssl %edi, %xmm1, %xmm1 + ; CHECK-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1 + ; CHECK-F16C-NEXT: vcvtph2ps %xmm1, %xmm1 + ; CHECK-F16C-NEXT: vaddss %xmm1, %xmm0, %xmm0 +diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll +index 8ea7243664a..bb0a93dc848 100644 +--- a/test/CodeGen/X86/vec_int_to_fp.ll ++++ b/test/CodeGen/X86/vec_int_to_fp.ll +@@ -27,10 +27,9 @@ define <2 x double> @sitofp_2i64_to_2f64(<2 x i64> %a) { + ; AVX-LABEL: sitofp_2i64_to_2f64: + ; AVX: # BB#0: + ; AVX-NEXT: vpextrq $1, %xmm0, %rax +-; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1 ++; AVX-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1 + ; AVX-NEXT: vmovq %xmm0, %rax +-; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +-; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0 ++; AVX-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0 + ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] + ; AVX-NEXT: retq + %cvt = sitofp <2 x i64> %a to <2 x double> +@@ -188,15 +187,14 @@ define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) { + ; AVX1: # BB#0: + ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 + ; AVX1-NEXT: vpextrq $1, %xmm1, %rax +-; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 ++; AVX1-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 + ; AVX1-NEXT: vmovq %xmm1, %rax +-; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1 ++; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 + ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] + ; AVX1-NEXT: vpextrq $1, %xmm0, %rax +-; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 ++; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 + ; AVX1-NEXT: vmovq %xmm0, %rax +-; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +-; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0 ++; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 + ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] + ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 + ; AVX1-NEXT: retq +@@ -205,18 +203,33 @@ define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) { + ; AVX2: # BB#0: + ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 + ; AVX2-NEXT: vpextrq $1, %xmm1, %rax +-; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 ++; AVX2-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 + ; AVX2-NEXT: vmovq %xmm1, %rax +-; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1 ++; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 + ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] + ; AVX2-NEXT: vpextrq $1, %xmm0, %rax +-; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 ++; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 + ; AVX2-NEXT: vmovq %xmm0, %rax +-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 
+-; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0 ++; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 + ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] + ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 + ; AVX2-NEXT: retq ++; ++; AVX512-LABEL: sitofp_4i64_to_4f64: ++; AVX512: # BB#0: ++; AVX512-NEXT: vextracti32x4 $1, %ymm0, %xmm1 ++; AVX512-NEXT: vpextrq $1, %xmm1, %rax ++; AVX512-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 ++; AVX512-NEXT: vmovq %xmm1, %rax ++; AVX512-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 ++; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ++; AVX512-NEXT: vpextrq $1, %xmm0, %rax ++; AVX512-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 ++; AVX512-NEXT: vmovq %xmm0, %rax ++; AVX512-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 ++; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ++; AVX512-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 ++; AVX512-NEXT: retq + %cvt = sitofp <4 x i64> %a to <4 x double> + ret <4 x double> %cvt + } +@@ -803,12 +816,11 @@ define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) { + ; AVX-LABEL: sitofp_2i64_to_4f32: + ; AVX: # BB#0: + ; AVX-NEXT: vpextrq $1, %xmm0, %rax +-; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 ++; AVX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 + ; AVX-NEXT: vmovq %xmm0, %rax +-; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +-; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ++; AVX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 + ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +-; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 ++; AVX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 + ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] + ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] + ; AVX-NEXT: retq +@@ -836,12 +848,11 @@ define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) { + ; AVX-LABEL: sitofp_4i64_to_4f32_undef: + ; AVX: # BB#0: + ; AVX-NEXT: vpextrq $1, %xmm0, %rax +-; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 ++; AVX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 + ; AVX-NEXT: vmovq %xmm0, %rax +-; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +-; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ++; AVX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 + ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +-; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 ++; AVX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 + ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] + ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] + ; AVX-NEXT: retq +@@ -988,17 +999,16 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) { + ; AVX1-LABEL: sitofp_4i64_to_4f32: + ; AVX1: # BB#0: + ; AVX1-NEXT: vpextrq $1, %xmm0, %rax +-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 ++; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 + ; AVX1-NEXT: vmovq %xmm0, %rax +-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 ++; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 + ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] + ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 + ; AVX1-NEXT: vmovq %xmm0, %rax +-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 ++; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 + ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] + ; AVX1-NEXT: vpextrq $1, %xmm0, %rax +-; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ++; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 + ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] + ; AVX1-NEXT: vzeroupper + ; AVX1-NEXT: retq +@@ -1006,20 +1016,35 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) { + ; AVX2-LABEL: sitofp_4i64_to_4f32: + ; AVX2: # BB#0: + ; AVX2-NEXT: vpextrq $1, %xmm0, %rax +-; AVX2-NEXT: vcvtsi2ssq 
%rax, %xmm0, %xmm1 ++; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 + ; AVX2-NEXT: vmovq %xmm0, %rax +-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 ++; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 + ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] + ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 + ; AVX2-NEXT: vmovq %xmm0, %rax +-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 ++; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 + ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] + ; AVX2-NEXT: vpextrq $1, %xmm0, %rax +-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ++; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 + ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] + ; AVX2-NEXT: vzeroupper + ; AVX2-NEXT: retq ++; ++; AVX512-LABEL: sitofp_4i64_to_4f32: ++; AVX512: # BB#0: ++; AVX512-NEXT: vpextrq $1, %xmm0, %rax ++; AVX512-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ++; AVX512-NEXT: vmovq %xmm0, %rax ++; AVX512-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ++; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ++; AVX512-NEXT: vextracti32x4 $1, %ymm0, %xmm0 ++; AVX512-NEXT: vmovq %xmm0, %rax ++; AVX512-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ++; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ++; AVX512-NEXT: vpextrq $1, %xmm0, %rax ++; AVX512-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 ++; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ++; AVX512-NEXT: retq + %cvt = sitofp <4 x i64> %a to <4 x float> + ret <4 x float> %cvt + } +@@ -1181,48 +1206,58 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) { + ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] + ; SSE-NEXT: retq + ; +-; AVX-LABEL: uitofp_2i64_to_4f32: +-; AVX: # BB#0: +-; AVX-NEXT: vpextrq $1, %xmm0, %rax +-; AVX-NEXT: movl %eax, %ecx +-; AVX-NEXT: andl $1, %ecx +-; AVX-NEXT: testq %rax, %rax +-; AVX-NEXT: js .LBB38_1 +-; AVX-NEXT: # BB#2: +-; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 +-; AVX-NEXT: jmp .LBB38_3 +-; AVX-NEXT: .LBB38_1: +-; AVX-NEXT: shrq %rax +-; AVX-NEXT: orq %rax, %rcx +-; AVX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1 +-; AVX-NEXT: vaddss %xmm1, %xmm1, %xmm1 +-; AVX-NEXT: .LBB38_3: +-; AVX-NEXT: vmovq %xmm0, %rax +-; AVX-NEXT: movl %eax, %ecx +-; AVX-NEXT: andl $1, %ecx +-; AVX-NEXT: testq %rax, %rax +-; AVX-NEXT: js .LBB38_4 +-; AVX-NEXT: # BB#5: +-; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +-; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 +-; AVX-NEXT: jmp .LBB38_6 +-; AVX-NEXT: .LBB38_4: +-; AVX-NEXT: shrq %rax +-; AVX-NEXT: orq %rax, %rcx +-; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +-; AVX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0 +-; AVX-NEXT: vaddss %xmm0, %xmm0, %xmm0 +-; AVX-NEXT: .LBB38_6: +-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +-; AVX-NEXT: testq %rax, %rax +-; AVX-NEXT: js .LBB38_8 +-; AVX-NEXT: # BB#7: +-; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 +-; AVX-NEXT: .LBB38_8: +-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +-; AVX-NEXT: retq ++; AVX1-LABEL: uitofp_2i64_to_4f32: ++; AVX1: # BB#0: ++; AVX1-NEXT: vpextrq $1, %xmm0, %rax ++; AVX1-NEXT: movl %eax, %ecx ++; AVX1-NEXT: andl $1, %ecx ++; AVX1-NEXT: testq %rax, %rax ++; AVX1-NEXT: js .LBB38_1 ++; AVX1-NEXT: # BB#2: ++; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ++; AVX1-NEXT: jmp .LBB38_3 ++; AVX1-NEXT: .LBB38_1: ++; AVX1-NEXT: shrq %rax ++; AVX1-NEXT: orq %rax, %rcx ++; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1 ++; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 
++; AVX1-NEXT: .LBB38_3: ++; AVX1-NEXT: vmovq %xmm0, %rax ++; AVX1-NEXT: movl %eax, %ecx ++; AVX1-NEXT: andl $1, %ecx ++; AVX1-NEXT: testq %rax, %rax ++; AVX1-NEXT: js .LBB38_4 ++; AVX1-NEXT: # BB#5: ++; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 ++; AVX1-NEXT: jmp .LBB38_6 ++; AVX1-NEXT: .LBB38_4: ++; AVX1-NEXT: shrq %rax ++; AVX1-NEXT: orq %rax, %rcx ++; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm2, %xmm0 ++; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 ++; AVX1-NEXT: .LBB38_6: ++; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ++; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ++; AVX1-NEXT: testq %rax, %rax ++; AVX1-NEXT: js .LBB38_8 ++; AVX1-NEXT: # BB#7: ++; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 ++; AVX1-NEXT: .LBB38_8: ++; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] ++; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ++; AVX1-NEXT: retq ++; ++; AVX512-LABEL: uitofp_2i64_to_4f32: ++; AVX512: # BB#0: ++; AVX512-NEXT: vpextrq $1, %xmm0, %rax ++; AVX512-NEXT: vcvtusi2ssq %rax, %xmm0, %xmm1 ++; AVX512-NEXT: vmovq %xmm0, %rax ++; AVX512-NEXT: vcvtusi2ssq %rax, %xmm0, %xmm0 ++; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ++; AVX512-NEXT: vcvtusi2ssq %rax, %xmm0, %xmm1 ++; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] ++; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ++; AVX512-NEXT: retq + %cvt = uitofp <2 x i64> %a to <2 x float> + %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> + ret <4 x float> %ext +@@ -1277,48 +1312,58 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) { + ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] + ; SSE-NEXT: retq + ; +-; AVX-LABEL: uitofp_4i64_to_4f32_undef: +-; AVX: # BB#0: +-; AVX-NEXT: vpextrq $1, %xmm0, %rax +-; AVX-NEXT: movl %eax, %ecx +-; AVX-NEXT: andl $1, %ecx +-; AVX-NEXT: testq %rax, %rax +-; AVX-NEXT: js .LBB39_1 +-; AVX-NEXT: # BB#2: +-; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 +-; AVX-NEXT: jmp .LBB39_3 +-; AVX-NEXT: .LBB39_1: +-; AVX-NEXT: shrq %rax +-; AVX-NEXT: orq %rax, %rcx +-; AVX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1 +-; AVX-NEXT: vaddss %xmm1, %xmm1, %xmm1 +-; AVX-NEXT: .LBB39_3: +-; AVX-NEXT: vmovq %xmm0, %rax +-; AVX-NEXT: movl %eax, %ecx +-; AVX-NEXT: andl $1, %ecx +-; AVX-NEXT: testq %rax, %rax +-; AVX-NEXT: js .LBB39_4 +-; AVX-NEXT: # BB#5: +-; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +-; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 +-; AVX-NEXT: jmp .LBB39_6 +-; AVX-NEXT: .LBB39_4: +-; AVX-NEXT: shrq %rax +-; AVX-NEXT: orq %rax, %rcx +-; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +-; AVX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0 +-; AVX-NEXT: vaddss %xmm0, %xmm0, %xmm0 +-; AVX-NEXT: .LBB39_6: +-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +-; AVX-NEXT: testq %rax, %rax +-; AVX-NEXT: js .LBB39_8 +-; AVX-NEXT: # BB#7: +-; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 +-; AVX-NEXT: .LBB39_8: +-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +-; AVX-NEXT: retq ++; AVX1-LABEL: uitofp_4i64_to_4f32_undef: ++; AVX1: # BB#0: ++; AVX1-NEXT: vpextrq $1, %xmm0, %rax ++; AVX1-NEXT: movl %eax, %ecx ++; AVX1-NEXT: andl $1, %ecx ++; AVX1-NEXT: testq %rax, %rax ++; AVX1-NEXT: js .LBB39_1 ++; AVX1-NEXT: # BB#2: ++; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ++; AVX1-NEXT: jmp .LBB39_3 ++; AVX1-NEXT: .LBB39_1: ++; AVX1-NEXT: shrq %rax ++; AVX1-NEXT: orq %rax, %rcx ++; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1 ++; AVX1-NEXT: 
vaddss %xmm1, %xmm1, %xmm1 ++; AVX1-NEXT: .LBB39_3: ++; AVX1-NEXT: vmovq %xmm0, %rax ++; AVX1-NEXT: movl %eax, %ecx ++; AVX1-NEXT: andl $1, %ecx ++; AVX1-NEXT: testq %rax, %rax ++; AVX1-NEXT: js .LBB39_4 ++; AVX1-NEXT: # BB#5: ++; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 ++; AVX1-NEXT: jmp .LBB39_6 ++; AVX1-NEXT: .LBB39_4: ++; AVX1-NEXT: shrq %rax ++; AVX1-NEXT: orq %rax, %rcx ++; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm2, %xmm0 ++; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 ++; AVX1-NEXT: .LBB39_6: ++; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ++; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ++; AVX1-NEXT: testq %rax, %rax ++; AVX1-NEXT: js .LBB39_8 ++; AVX1-NEXT: # BB#7: ++; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 ++; AVX1-NEXT: .LBB39_8: ++; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] ++; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ++; AVX1-NEXT: retq ++; ++; AVX512-LABEL: uitofp_4i64_to_4f32_undef: ++; AVX512: # BB#0: ++; AVX512-NEXT: vpextrq $1, %xmm0, %rax ++; AVX512-NEXT: vcvtusi2ssq %rax, %xmm0, %xmm1 ++; AVX512-NEXT: vmovq %xmm0, %rax ++; AVX512-NEXT: vcvtusi2ssq %rax, %xmm0, %xmm0 ++; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ++; AVX512-NEXT: vcvtusi2ssq %rax, %xmm0, %xmm1 ++; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] ++; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ++; AVX512-NEXT: retq + %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> + %cvt = uitofp <4 x i64> %ext to <4 x float> + ret <4 x float> %cvt +@@ -1539,12 +1584,12 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { + ; AVX1-NEXT: testq %rax, %rax + ; AVX1-NEXT: js .LBB45_1 + ; AVX1-NEXT: # BB#2: +-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 ++; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 + ; AVX1-NEXT: jmp .LBB45_3 + ; AVX1-NEXT: .LBB45_1: + ; AVX1-NEXT: shrq %rax + ; AVX1-NEXT: orq %rax, %rcx +-; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1 ++; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1 + ; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 + ; AVX1-NEXT: .LBB45_3: + ; AVX1-NEXT: vmovq %xmm0, %rax +@@ -1553,12 +1598,12 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { + ; AVX1-NEXT: testq %rax, %rax + ; AVX1-NEXT: js .LBB45_4 + ; AVX1-NEXT: # BB#5: +-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 ++; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 + ; AVX1-NEXT: jmp .LBB45_6 + ; AVX1-NEXT: .LBB45_4: + ; AVX1-NEXT: shrq %rax + ; AVX1-NEXT: orq %rax, %rcx +-; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 ++; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm2, %xmm2 + ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 + ; AVX1-NEXT: .LBB45_6: + ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +@@ -1569,12 +1614,12 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { + ; AVX1-NEXT: testq %rax, %rax + ; AVX1-NEXT: js .LBB45_7 + ; AVX1-NEXT: # BB#8: +-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 ++; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 + ; AVX1-NEXT: jmp .LBB45_9 + ; AVX1-NEXT: .LBB45_7: + ; AVX1-NEXT: shrq %rax + ; AVX1-NEXT: orq %rax, %rcx +-; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 ++; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm2 + ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 + ; AVX1-NEXT: .LBB45_9: + ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +@@ -1584,16 +1629,14 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { + ; AVX1-NEXT: testq %rax, %rax + ; AVX1-NEXT: js .LBB45_10 + ; AVX1-NEXT: # BB#11: +-; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ++; AVX1-NEXT: vcvtsi2ssq %rax, 
%xmm3, %xmm0 + ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] + ; AVX1-NEXT: vzeroupper + ; AVX1-NEXT: retq + ; AVX1-NEXT: .LBB45_10: + ; AVX1-NEXT: shrq %rax + ; AVX1-NEXT: orq %rax, %rcx +-; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +-; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0 ++; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm0 + ; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 + ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] + ; AVX1-NEXT: vzeroupper +@@ -1607,12 +1650,12 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { + ; AVX2-NEXT: testq %rax, %rax + ; AVX2-NEXT: js .LBB45_1 + ; AVX2-NEXT: # BB#2: +-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 ++; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 + ; AVX2-NEXT: jmp .LBB45_3 + ; AVX2-NEXT: .LBB45_1: + ; AVX2-NEXT: shrq %rax + ; AVX2-NEXT: orq %rax, %rcx +-; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1 ++; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1 + ; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1 + ; AVX2-NEXT: .LBB45_3: + ; AVX2-NEXT: vmovq %xmm0, %rax +@@ -1621,12 +1664,12 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { + ; AVX2-NEXT: testq %rax, %rax + ; AVX2-NEXT: js .LBB45_4 + ; AVX2-NEXT: # BB#5: +-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 ++; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 + ; AVX2-NEXT: jmp .LBB45_6 + ; AVX2-NEXT: .LBB45_4: + ; AVX2-NEXT: shrq %rax + ; AVX2-NEXT: orq %rax, %rcx +-; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 ++; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm2, %xmm2 + ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 + ; AVX2-NEXT: .LBB45_6: + ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +@@ -1637,12 +1680,12 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { + ; AVX2-NEXT: testq %rax, %rax + ; AVX2-NEXT: js .LBB45_7 + ; AVX2-NEXT: # BB#8: +-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 ++; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 + ; AVX2-NEXT: jmp .LBB45_9 + ; AVX2-NEXT: .LBB45_7: + ; AVX2-NEXT: shrq %rax + ; AVX2-NEXT: orq %rax, %rcx +-; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 ++; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm2 + ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 + ; AVX2-NEXT: .LBB45_9: + ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +@@ -1652,16 +1695,14 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { + ; AVX2-NEXT: testq %rax, %rax + ; AVX2-NEXT: js .LBB45_10 + ; AVX2-NEXT: # BB#11: +-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ++; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 + ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] + ; AVX2-NEXT: vzeroupper + ; AVX2-NEXT: retq + ; AVX2-NEXT: .LBB45_10: + ; AVX2-NEXT: shrq %rax + ; AVX2-NEXT: orq %rax, %rcx +-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +-; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0 ++; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm0 + ; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0 + ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] + ; AVX2-NEXT: vzeroupper +@@ -1831,16 +1872,25 @@ define <2 x double> @sitofp_load_2i64_to_2f64(<2 x i64> *%a) { + ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] + ; SSE-NEXT: retq + ; +-; AVX-LABEL: sitofp_load_2i64_to_2f64: +-; AVX: # BB#0: +-; AVX-NEXT: vmovdqa (%rdi), %xmm0 +-; AVX-NEXT: vpextrq $1, %xmm0, %rax +-; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1 +-; AVX-NEXT: vmovq %xmm0, %rax +-; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +-; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0 +-; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +-; AVX-NEXT: retq ++; AVX1-LABEL: sitofp_load_2i64_to_2f64: ++; AVX1: # BB#0: ++; AVX1-NEXT: vmovdqa (%rdi), 
%xmm0 ++; AVX1-NEXT: vpextrq $1, %xmm0, %rax ++; AVX1-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1 ++; AVX1-NEXT: vmovq %xmm0, %rax ++; AVX1-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0 ++; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ++; AVX1-NEXT: retq ++; ++; AVX512-LABEL: sitofp_load_2i64_to_2f64: ++; AVX512: # BB#0: ++; AVX512-NEXT: vmovdqa64 (%rdi), %xmm0 ++; AVX512-NEXT: vpextrq $1, %xmm0, %rax ++; AVX512-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1 ++; AVX512-NEXT: vmovq %xmm0, %rax ++; AVX512-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0 ++; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ++; AVX512-NEXT: retq + %ld = load <2 x i64>, <2 x i64> *%a + %cvt = sitofp <2 x i64> %ld to <2 x double> + ret <2 x double> %cvt +@@ -1930,15 +1980,14 @@ define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) { + ; AVX1-NEXT: vmovaps (%rdi), %ymm0 + ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 + ; AVX1-NEXT: vpextrq $1, %xmm1, %rax +-; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 ++; AVX1-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 + ; AVX1-NEXT: vmovq %xmm1, %rax +-; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1 ++; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 + ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] + ; AVX1-NEXT: vpextrq $1, %xmm0, %rax +-; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 ++; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 + ; AVX1-NEXT: vmovq %xmm0, %rax +-; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +-; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0 ++; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 + ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] + ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 + ; AVX1-NEXT: retq +@@ -1948,18 +1997,34 @@ define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) { + ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 + ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 + ; AVX2-NEXT: vpextrq $1, %xmm1, %rax +-; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 ++; AVX2-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 + ; AVX2-NEXT: vmovq %xmm1, %rax +-; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1 ++; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 + ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] + ; AVX2-NEXT: vpextrq $1, %xmm0, %rax +-; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 ++; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 + ; AVX2-NEXT: vmovq %xmm0, %rax +-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +-; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0 ++; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 + ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] + ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 + ; AVX2-NEXT: retq ++; ++; AVX512-LABEL: sitofp_load_4i64_to_4f64: ++; AVX512: # BB#0: ++; AVX512-NEXT: vmovdqa64 (%rdi), %ymm0 ++; AVX512-NEXT: vextracti32x4 $1, %ymm0, %xmm1 ++; AVX512-NEXT: vpextrq $1, %xmm1, %rax ++; AVX512-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 ++; AVX512-NEXT: vmovq %xmm1, %rax ++; AVX512-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 ++; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ++; AVX512-NEXT: vpextrq $1, %xmm0, %rax ++; AVX512-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 ++; AVX512-NEXT: vmovq %xmm0, %rax ++; AVX512-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 ++; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ++; AVX512-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 ++; AVX512-NEXT: retq + %ld = load <4 x i64>, <4 x i64> *%a + %cvt = sitofp <4 x i64> %ld to <4 x double> + ret <4 x double> %cvt +@@ -2365,17 +2430,16 @@ define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) { + ; AVX1: # BB#0: + ; AVX1-NEXT: vmovdqa (%rdi), %ymm0 + ; AVX1-NEXT: vpextrq $1, %xmm0, %rax +-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 ++; AVX1-NEXT: 
vcvtsi2ssq %rax, %xmm1, %xmm1 + ; AVX1-NEXT: vmovq %xmm0, %rax +-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 ++; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 + ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] + ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 + ; AVX1-NEXT: vmovq %xmm0, %rax +-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 ++; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 + ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] + ; AVX1-NEXT: vpextrq $1, %xmm0, %rax +-; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ++; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 + ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] + ; AVX1-NEXT: vzeroupper + ; AVX1-NEXT: retq +@@ -2384,20 +2448,36 @@ define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) { + ; AVX2: # BB#0: + ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 + ; AVX2-NEXT: vpextrq $1, %xmm0, %rax +-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 ++; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 + ; AVX2-NEXT: vmovq %xmm0, %rax +-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 ++; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 + ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] + ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 + ; AVX2-NEXT: vmovq %xmm0, %rax +-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 ++; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 + ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] + ; AVX2-NEXT: vpextrq $1, %xmm0, %rax +-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ++; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 + ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] + ; AVX2-NEXT: vzeroupper + ; AVX2-NEXT: retq ++; ++; AVX512-LABEL: sitofp_load_4i64_to_4f32: ++; AVX512: # BB#0: ++; AVX512-NEXT: vmovdqa64 (%rdi), %ymm0 ++; AVX512-NEXT: vpextrq $1, %xmm0, %rax ++; AVX512-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ++; AVX512-NEXT: vmovq %xmm0, %rax ++; AVX512-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ++; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ++; AVX512-NEXT: vextracti32x4 $1, %ymm0, %xmm0 ++; AVX512-NEXT: vmovq %xmm0, %rax ++; AVX512-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ++; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ++; AVX512-NEXT: vpextrq $1, %xmm0, %rax ++; AVX512-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 ++; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ++; AVX512-NEXT: retq + %ld = load <4 x i64>, <4 x i64> *%a + %cvt = sitofp <4 x i64> %ld to <4 x float> + ret <4 x float> %cvt +@@ -2503,29 +2583,28 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) { + ; AVX1-NEXT: vmovdqa (%rdi), %ymm0 + ; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1 + ; AVX1-NEXT: vpextrq $1, %xmm1, %rax +-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 ++; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 + ; AVX1-NEXT: vmovq %xmm1, %rax +-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 ++; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 + ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] + ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 + ; AVX1-NEXT: vmovq %xmm1, %rax +-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 ++; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 + ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] + ; AVX1-NEXT: vpextrq $1, %xmm1, %rax +-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 ++; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm1 + ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] + ; AVX1-NEXT: vpextrq $1, %xmm0, %rax +-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 ++; AVX1-NEXT: vcvtsi2ssq 
%rax, %xmm4, %xmm2 + ; AVX1-NEXT: vmovq %xmm0, %rax +-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 ++; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 + ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] + ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 + ; AVX1-NEXT: vmovq %xmm0, %rax +-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 ++; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 + ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] + ; AVX1-NEXT: vpextrq $1, %xmm0, %rax +-; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ++; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0 + ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] + ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 + ; AVX1-NEXT: retq +@@ -2535,32 +2614,62 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) { + ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 + ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 + ; AVX2-NEXT: vpextrq $1, %xmm1, %rax +-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 ++; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 + ; AVX2-NEXT: vmovq %xmm1, %rax +-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 ++; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 + ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] + ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 + ; AVX2-NEXT: vmovq %xmm1, %rax +-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 ++; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 + ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] + ; AVX2-NEXT: vpextrq $1, %xmm1, %rax +-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 ++; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm1 + ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] + ; AVX2-NEXT: vpextrq $1, %xmm0, %rax +-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 ++; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 + ; AVX2-NEXT: vmovq %xmm0, %rax +-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 ++; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 + ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] + ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 + ; AVX2-NEXT: vmovq %xmm0, %rax +-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 ++; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 + ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] + ; AVX2-NEXT: vpextrq $1, %xmm0, %rax +-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ++; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0 + ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] + ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 + ; AVX2-NEXT: retq ++; ++; AVX512-LABEL: sitofp_load_8i64_to_8f32: ++; AVX512: # BB#0: ++; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ++; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm1 ++; AVX512-NEXT: vpextrq $1, %xmm1, %rax ++; AVX512-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ++; AVX512-NEXT: vmovq %xmm1, %rax ++; AVX512-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 ++; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] ++; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm2 ++; AVX512-NEXT: vmovq %xmm2, %rax ++; AVX512-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 ++; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ++; AVX512-NEXT: vpextrq $1, %xmm2, %rax ++; AVX512-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 ++; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] ++; AVX512-NEXT: vpextrq $1, %xmm0, %rax ++; AVX512-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 ++; AVX512-NEXT: vmovq %xmm0, %rax ++; AVX512-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 ++; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ++; AVX512-NEXT: vextracti32x4 $1, %zmm0, %xmm0 ++; 
AVX512-NEXT: vmovq %xmm0, %rax ++; AVX512-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 ++; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] ++; AVX512-NEXT: vpextrq $1, %xmm0, %rax ++; AVX512-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0 ++; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] ++; AVX512-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 ++; AVX512-NEXT: retq + %ld = load <8 x i64>, <8 x i64> *%a + %cvt = sitofp <8 x i64> %ld to <8 x float> + ret <8 x float> %cvt +@@ -2733,12 +2842,12 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { + ; AVX1-NEXT: testq %rax, %rax + ; AVX1-NEXT: js .LBB74_1 + ; AVX1-NEXT: # BB#2: +-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 ++; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 + ; AVX1-NEXT: jmp .LBB74_3 + ; AVX1-NEXT: .LBB74_1: + ; AVX1-NEXT: shrq %rax + ; AVX1-NEXT: orq %rax, %rcx +-; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1 ++; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1 + ; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 + ; AVX1-NEXT: .LBB74_3: + ; AVX1-NEXT: vmovq %xmm0, %rax +@@ -2747,12 +2856,12 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { + ; AVX1-NEXT: testq %rax, %rax + ; AVX1-NEXT: js .LBB74_4 + ; AVX1-NEXT: # BB#5: +-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 ++; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 + ; AVX1-NEXT: jmp .LBB74_6 + ; AVX1-NEXT: .LBB74_4: + ; AVX1-NEXT: shrq %rax + ; AVX1-NEXT: orq %rax, %rcx +-; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 ++; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm2, %xmm2 + ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 + ; AVX1-NEXT: .LBB74_6: + ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +@@ -2763,12 +2872,12 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { + ; AVX1-NEXT: testq %rax, %rax + ; AVX1-NEXT: js .LBB74_7 + ; AVX1-NEXT: # BB#8: +-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 ++; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 + ; AVX1-NEXT: jmp .LBB74_9 + ; AVX1-NEXT: .LBB74_7: + ; AVX1-NEXT: shrq %rax + ; AVX1-NEXT: orq %rax, %rcx +-; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 ++; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm2 + ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 + ; AVX1-NEXT: .LBB74_9: + ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +@@ -2778,16 +2887,14 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { + ; AVX1-NEXT: testq %rax, %rax + ; AVX1-NEXT: js .LBB74_10 + ; AVX1-NEXT: # BB#11: +-; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ++; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 + ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] + ; AVX1-NEXT: vzeroupper + ; AVX1-NEXT: retq + ; AVX1-NEXT: .LBB74_10: + ; AVX1-NEXT: shrq %rax + ; AVX1-NEXT: orq %rax, %rcx +-; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +-; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0 ++; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm0 + ; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 + ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] + ; AVX1-NEXT: vzeroupper +@@ -2802,12 +2909,12 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { + ; AVX2-NEXT: testq %rax, %rax + ; AVX2-NEXT: js .LBB74_1 + ; AVX2-NEXT: # BB#2: +-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 ++; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 + ; AVX2-NEXT: jmp .LBB74_3 + ; AVX2-NEXT: .LBB74_1: + ; AVX2-NEXT: shrq %rax + ; AVX2-NEXT: orq %rax, %rcx +-; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1 ++; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1 + ; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1 + ; AVX2-NEXT: .LBB74_3: + ; AVX2-NEXT: vmovq %xmm0, %rax +@@ -2816,12 +2923,12 @@ 
define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { + ; AVX2-NEXT: testq %rax, %rax + ; AVX2-NEXT: js .LBB74_4 + ; AVX2-NEXT: # BB#5: +-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 ++; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 + ; AVX2-NEXT: jmp .LBB74_6 + ; AVX2-NEXT: .LBB74_4: + ; AVX2-NEXT: shrq %rax + ; AVX2-NEXT: orq %rax, %rcx +-; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 ++; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm2, %xmm2 + ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 + ; AVX2-NEXT: .LBB74_6: + ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +@@ -2832,12 +2939,12 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { + ; AVX2-NEXT: testq %rax, %rax + ; AVX2-NEXT: js .LBB74_7 + ; AVX2-NEXT: # BB#8: +-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 ++; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 + ; AVX2-NEXT: jmp .LBB74_9 + ; AVX2-NEXT: .LBB74_7: + ; AVX2-NEXT: shrq %rax + ; AVX2-NEXT: orq %rax, %rcx +-; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 ++; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm2 + ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 + ; AVX2-NEXT: .LBB74_9: + ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +@@ -2847,16 +2954,14 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { + ; AVX2-NEXT: testq %rax, %rax + ; AVX2-NEXT: js .LBB74_10 + ; AVX2-NEXT: # BB#11: +-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ++; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 + ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] + ; AVX2-NEXT: vzeroupper + ; AVX2-NEXT: retq + ; AVX2-NEXT: .LBB74_10: + ; AVX2-NEXT: shrq %rax + ; AVX2-NEXT: orq %rax, %rcx +-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +-; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0 ++; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm0 + ; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0 + ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] + ; AVX2-NEXT: vzeroupper +@@ -3094,12 +3199,12 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { + ; AVX1-NEXT: testq %rax, %rax + ; AVX1-NEXT: js .LBB78_1 + ; AVX1-NEXT: # BB#2: +-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 ++; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 + ; AVX1-NEXT: jmp .LBB78_3 + ; AVX1-NEXT: .LBB78_1: + ; AVX1-NEXT: shrq %rax + ; AVX1-NEXT: orq %rax, %rcx +-; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1 ++; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1 + ; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 + ; AVX1-NEXT: .LBB78_3: + ; AVX1-NEXT: vmovq %xmm2, %rax +@@ -3108,12 +3213,12 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { + ; AVX1-NEXT: testq %rax, %rax + ; AVX1-NEXT: js .LBB78_4 + ; AVX1-NEXT: # BB#5: +-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 ++; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 + ; AVX1-NEXT: jmp .LBB78_6 + ; AVX1-NEXT: .LBB78_4: + ; AVX1-NEXT: shrq %rax + ; AVX1-NEXT: orq %rax, %rcx +-; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm3 ++; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm3 + ; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3 + ; AVX1-NEXT: .LBB78_6: + ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +@@ -3123,12 +3228,12 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { + ; AVX1-NEXT: testq %rax, %rax + ; AVX1-NEXT: js .LBB78_7 + ; AVX1-NEXT: # BB#8: +-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm4 ++; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 + ; AVX1-NEXT: jmp .LBB78_9 + ; AVX1-NEXT: .LBB78_7: + ; AVX1-NEXT: shrq %rax + ; AVX1-NEXT: orq %rax, %rcx +-; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm4 ++; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm4, %xmm4 + ; AVX1-NEXT: vaddss %xmm4, %xmm4, %xmm4 + ; AVX1-NEXT: 
.LBB78_9: + ; AVX1-NEXT: vpextrq $1, %xmm2, %rax +@@ -3137,12 +3242,12 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { + ; AVX1-NEXT: testq %rax, %rax + ; AVX1-NEXT: js .LBB78_10 + ; AVX1-NEXT: # BB#11: +-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 ++; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2 + ; AVX1-NEXT: jmp .LBB78_12 + ; AVX1-NEXT: .LBB78_10: + ; AVX1-NEXT: shrq %rax + ; AVX1-NEXT: orq %rax, %rcx +-; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 ++; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm5, %xmm2 + ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 + ; AVX1-NEXT: .LBB78_12: + ; AVX1-NEXT: vpextrq $1, %xmm0, %rax +@@ -3151,12 +3256,12 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { + ; AVX1-NEXT: testq %rax, %rax + ; AVX1-NEXT: js .LBB78_13 + ; AVX1-NEXT: # BB#14: +-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5 ++; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5 + ; AVX1-NEXT: jmp .LBB78_15 + ; AVX1-NEXT: .LBB78_13: + ; AVX1-NEXT: shrq %rax + ; AVX1-NEXT: orq %rax, %rcx +-; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm5 ++; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm5, %xmm5 + ; AVX1-NEXT: vaddss %xmm5, %xmm5, %xmm5 + ; AVX1-NEXT: .LBB78_15: + ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3] +@@ -3166,12 +3271,12 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { + ; AVX1-NEXT: testq %rax, %rax + ; AVX1-NEXT: js .LBB78_16 + ; AVX1-NEXT: # BB#17: +-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 ++; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 + ; AVX1-NEXT: jmp .LBB78_18 + ; AVX1-NEXT: .LBB78_16: + ; AVX1-NEXT: shrq %rax + ; AVX1-NEXT: orq %rax, %rcx +-; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm3 ++; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm6, %xmm3 + ; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3 + ; AVX1-NEXT: .LBB78_18: + ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3] +@@ -3183,14 +3288,12 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { + ; AVX1-NEXT: testq %rax, %rax + ; AVX1-NEXT: js .LBB78_19 + ; AVX1-NEXT: # BB#20: +-; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5 ++; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5 + ; AVX1-NEXT: jmp .LBB78_21 + ; AVX1-NEXT: .LBB78_19: + ; AVX1-NEXT: shrq %rax + ; AVX1-NEXT: orq %rax, %rcx +-; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +-; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0 ++; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm6, %xmm0 + ; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm5 + ; AVX1-NEXT: .LBB78_21: + ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0] +@@ -3201,12 +3304,12 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { + ; AVX1-NEXT: testq %rax, %rax + ; AVX1-NEXT: js .LBB78_22 + ; AVX1-NEXT: # BB#23: +-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 ++; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2 + ; AVX1-NEXT: jmp .LBB78_24 + ; AVX1-NEXT: .LBB78_22: + ; AVX1-NEXT: shrq %rax + ; AVX1-NEXT: orq %rax, %rcx +-; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 ++; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm6, %xmm2 + ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 + ; AVX1-NEXT: .LBB78_24: + ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +@@ -3223,12 +3326,12 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { + ; AVX2-NEXT: testq %rax, %rax + ; AVX2-NEXT: js .LBB78_1 + ; AVX2-NEXT: # BB#2: +-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 ++; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 + ; AVX2-NEXT: jmp .LBB78_3 + ; AVX2-NEXT: .LBB78_1: + ; AVX2-NEXT: shrq %rax + ; AVX2-NEXT: orq %rax, %rcx +-; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1 ++; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1 + ; AVX2-NEXT: vaddss 
%xmm1, %xmm1, %xmm1 + ; AVX2-NEXT: .LBB78_3: + ; AVX2-NEXT: vmovq %xmm2, %rax +@@ -3237,12 +3340,12 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { + ; AVX2-NEXT: testq %rax, %rax + ; AVX2-NEXT: js .LBB78_4 + ; AVX2-NEXT: # BB#5: +-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 ++; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 + ; AVX2-NEXT: jmp .LBB78_6 + ; AVX2-NEXT: .LBB78_4: + ; AVX2-NEXT: shrq %rax + ; AVX2-NEXT: orq %rax, %rcx +-; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm3 ++; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm3 + ; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3 + ; AVX2-NEXT: .LBB78_6: + ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 +@@ -3252,12 +3355,12 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { + ; AVX2-NEXT: testq %rax, %rax + ; AVX2-NEXT: js .LBB78_7 + ; AVX2-NEXT: # BB#8: +-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm4 ++; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 + ; AVX2-NEXT: jmp .LBB78_9 + ; AVX2-NEXT: .LBB78_7: + ; AVX2-NEXT: shrq %rax + ; AVX2-NEXT: orq %rax, %rcx +-; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm4 ++; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm4, %xmm4 + ; AVX2-NEXT: vaddss %xmm4, %xmm4, %xmm4 + ; AVX2-NEXT: .LBB78_9: + ; AVX2-NEXT: vpextrq $1, %xmm2, %rax +@@ -3266,12 +3369,12 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { + ; AVX2-NEXT: testq %rax, %rax + ; AVX2-NEXT: js .LBB78_10 + ; AVX2-NEXT: # BB#11: +-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 ++; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2 + ; AVX2-NEXT: jmp .LBB78_12 + ; AVX2-NEXT: .LBB78_10: + ; AVX2-NEXT: shrq %rax + ; AVX2-NEXT: orq %rax, %rcx +-; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 ++; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm5, %xmm2 + ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 + ; AVX2-NEXT: .LBB78_12: + ; AVX2-NEXT: vpextrq $1, %xmm0, %rax +@@ -3280,12 +3383,12 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { + ; AVX2-NEXT: testq %rax, %rax + ; AVX2-NEXT: js .LBB78_13 + ; AVX2-NEXT: # BB#14: +-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5 ++; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5 + ; AVX2-NEXT: jmp .LBB78_15 + ; AVX2-NEXT: .LBB78_13: + ; AVX2-NEXT: shrq %rax + ; AVX2-NEXT: orq %rax, %rcx +-; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm5 ++; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm5, %xmm5 + ; AVX2-NEXT: vaddss %xmm5, %xmm5, %xmm5 + ; AVX2-NEXT: .LBB78_15: + ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3] +@@ -3295,12 +3398,12 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { + ; AVX2-NEXT: testq %rax, %rax + ; AVX2-NEXT: js .LBB78_16 + ; AVX2-NEXT: # BB#17: +-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 ++; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 + ; AVX2-NEXT: jmp .LBB78_18 + ; AVX2-NEXT: .LBB78_16: + ; AVX2-NEXT: shrq %rax + ; AVX2-NEXT: orq %rax, %rcx +-; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm3 ++; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm6, %xmm3 + ; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3 + ; AVX2-NEXT: .LBB78_18: + ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3] +@@ -3312,14 +3415,12 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { + ; AVX2-NEXT: testq %rax, %rax + ; AVX2-NEXT: js .LBB78_19 + ; AVX2-NEXT: # BB#20: +-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5 ++; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5 + ; AVX2-NEXT: jmp .LBB78_21 + ; AVX2-NEXT: .LBB78_19: + ; AVX2-NEXT: shrq %rax + ; AVX2-NEXT: orq %rax, %rcx +-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +-; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0 ++; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm6, %xmm0 + ; AVX2-NEXT: vaddss %xmm0, %xmm0, 
%xmm5 + ; AVX2-NEXT: .LBB78_21: + ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0] +@@ -3330,12 +3431,12 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { + ; AVX2-NEXT: testq %rax, %rax + ; AVX2-NEXT: js .LBB78_22 + ; AVX2-NEXT: # BB#23: +-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 ++; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2 + ; AVX2-NEXT: jmp .LBB78_24 + ; AVX2-NEXT: .LBB78_22: + ; AVX2-NEXT: shrq %rax + ; AVX2-NEXT: orq %rax, %rcx +-; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 ++; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm6, %xmm2 + ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 + ; AVX2-NEXT: .LBB78_24: + ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +-- +2.11.0 + diff --git a/deps/patches/llvm-PR278923.patch b/deps/patches/llvm-PR278923.patch new file mode 100644 index 0000000000000..486777711193a --- /dev/null +++ b/deps/patches/llvm-PR278923.patch @@ -0,0 +1,69 @@ +From 77eee1c0f05c587e7fb8a9a2064908d7333dcfb9 Mon Sep 17 00:00:00 2001 +From: Marina Yatsina +Date: Wed, 17 Aug 2016 11:40:21 +0000 +Subject: [PATCH 5/5] Fixing bug committed in rev. 278321 + +In theory the indices of RC (and thus the index used for LiveRegs) may differ from the indices of OpRC. +Fixed the code to extract the correct RC index. +OpRC contains the first X consecutive elements of RC, and thus their indices are currently de facto the same, therefore a test cannot be added at this point. + +Differential Revision: https://reviews.llvm.org/D23491 + + + +git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@278923 91177308-0d34-0410-b5e6-96231b3b80d8 +--- + include/llvm/Target/TargetRegisterInfo.h | 6 ++++++ + lib/CodeGen/ExecutionDepsFix.cpp | 9 ++++++--- + 2 files changed, 12 insertions(+), 3 deletions(-) + +diff --git a/include/llvm/Target/TargetRegisterInfo.h b/include/llvm/Target/TargetRegisterInfo.h +index e5a6c8ed2f2..e5642493928 100644 +--- a/include/llvm/Target/TargetRegisterInfo.h ++++ b/include/llvm/Target/TargetRegisterInfo.h +@@ -17,6 +17,7 @@ + #define LLVM_TARGET_TARGETREGISTERINFO_H + + #include "llvm/ADT/ArrayRef.h" ++#include "llvm/ADT/iterator_range.h" + #include "llvm/CodeGen/MachineBasicBlock.h" + #include "llvm/CodeGen/MachineValueType.h" + #include "llvm/IR/CallingConv.h" +@@ -86,6 +87,11 @@ public: + + /// Return the number of registers in this class. + unsigned getNumRegs() const { return MC->getNumRegs(); } ++ ++ iterator_range::const_iterator> ++ getRegisters() const { ++ return make_range(MC->begin(), MC->end()); ++ } + + /// Return the specified register in the class. + unsigned getRegister(unsigned i) const { +diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp +index 5f91db9251c..213dd58a31d 100644 +--- a/lib/CodeGen/ExecutionDepsFix.cpp ++++ b/lib/CodeGen/ExecutionDepsFix.cpp +@@ -509,12 +509,15 @@ void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, + // max clearance or clearance higher than Pref. + unsigned MaxClearance = 0; + unsigned MaxClearanceReg = OriginalReg; +- for (unsigned rx = 0; rx < OpRC->getNumRegs(); ++rx) { +- unsigned Clearance = CurInstr - LiveRegs[rx].Def; ++ for (auto Reg : OpRC->getRegisters()) { ++ assert(AliasMap[Reg].size() == 1 && ++ "Reg is expected to be mapped to a single index"); ++ int RCrx = *regIndices(Reg).begin(); ++ unsigned Clearance = CurInstr - LiveRegs[RCrx].Def; + if (Clearance <= MaxClearance) + continue; + MaxClearance = Clearance; +- MaxClearanceReg = OpRC->getRegister(rx); ++ MaxClearanceReg = Reg; + + if (MaxClearance > Pref) + break; +-- +2.11.0 +
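
For the PR278923 fix, the point is that LiveRegs is indexed by a register's position in the pass's register class (RC), while the loop iterates over the operand's class (OpRC), which may be a sub-class whose members sit at arbitrary RC positions; using the loop counter as the LiveRegs index therefore reads the wrong clearance whenever the two orderings differ. The following tiny standalone C++ illustration of that mismatch uses made-up register classes and numbers, not LLVM code.

#include <cstdio>
#include <vector>

int main() {
  // Hypothetical layout: RC = {R0, R1, R2, R3} and OpRC = {R2, R3}, i.e. the
  // operand's class contains the 3rd and 4th members of the pass's class.
  std::vector<int> LastDefByRCIndex = {90, 10, 40, 5}; // last def per RC index
  int CurInstr = 100;
  std::vector<unsigned> OpRCToRCIndex = {2, 3}; // RC index of each OpRC member

  for (unsigned pos = 0; pos < OpRCToRCIndex.size(); ++pos) {
    unsigned rcIdx = OpRCToRCIndex[pos];
    // Pre-fix behaviour: index the clearance table with the loop counter.
    int wrong = CurInstr - LastDefByRCIndex[pos];
    // Post-fix behaviour: map the register back to its RC index first.
    int correct = CurInstr - LastDefByRCIndex[rcIdx];
    std::printf("OpRC member %u: clearance by position = %d, by RC index = %d\n",
                pos, wrong, correct);
  }
  return 0;
}

As the commit message notes, in current LLVM the OpRC members happen to occupy the leading RC positions, so the two indexings coincide and no regression test could be written; the sketch simply exaggerates the layout to make the difference visible.
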