
Commit 2b1c6df

Harsha Jagasia authored and Krzysztof Parzyszek committed
[Hexagon] Performance regression with b2b

For the code below:

  {
    r7 = addasl(r3,r0,#2)
    r8 = addasl(r3,r2,#2)
    r5 = memw(r3+r0<<#2)
    r6 = memw(r3+r2<<#2)
  }
  {
    p1 = cmp.gtu(r6,r5)
    if (p1.new) memw(r8+#0) = r5
    if (p1.new) memw(r7+#0) = r6
  }
  {
    r0 = mux(p1,r2,r4)
  }

In the packetizer, a new packet is created for the cmp instruction since there aren't enough resources in the previous packet. The cmp is also determined to stall by 2 cycles because it depends on the prior load of r5. In the current packetizer implementation, the predicated store is evaluated for whether it can go in the same packet as the compare, and since the compare already stalls, the stall of the predicated store is deemed not to matter, so the store is placed in the same packet as the cmp. However, the predicated store stalls for even more cycles because of its dependence on the addasl instruction, and that stall can be avoided by putting the store in a new packet.

Improve the packetizer to check whether an instruction being added to a packet will stall longer than the instructions already in the packet, and if so, create a new packet.
1 parent d5b6e30 commit 2b1c6df
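Editorial note: the rule described in the commit message can be condensed into a small standalone C++ sketch. This is a model for illustration only, not the packetizer's actual interface; the 3-cycle stall assumed for the predicated store is a made-up figure, the commit only says it stalls longer than the cmp's 2 cycles.

#include <cassert>

// Minimal model of the per-packet stall state added by this commit.
// StallCycles stands in for what calcStall(MI) would compute.
struct PacketState {
  bool PacketStalls = false;      // packet already contains a stalling insn
  unsigned PacketStallCycles = 0; // worst stall accepted into the packet
};

// Old producesStall() behavior: once the packet stalls, any further stall
// from a candidate instruction is ignored.
bool oldProducesStall(const PacketState &P, unsigned StallCycles) {
  if (P.PacketStalls)
    return false;
  return StallCycles > 0;
}

// New behavior: a candidate's stall still counts when it is longer than the
// worst stall the packet already absorbs, so the packetizer can end the
// packet instead of hiding the longer stall behind the existing one.
bool newProducesStall(const PacketState &P, unsigned StallCycles) {
  if (StallCycles == 0)
    return false;
  if (P.PacketStalls)
    return StallCycles > P.PacketStallCycles;
  return true;
}

int main() {
  // The cmp already stalls the packet by 2 cycles; assume the predicated
  // store would stall 3 cycles on the addasl (illustrative value).
  PacketState P{true, 2};
  assert(!oldProducesStall(P, 3)); // old: store's stall ignored, same packet
  assert(newProducesStall(P, 3));  // new: stall reported, store can start a new packet
  return 0;
}

The actual implementation, shown in the diff below, keeps the same comparison inside producesStall() and derives the cycle counts from the scheduling DAG via calcStall().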

File tree

3 files changed: +89 -11 lines changed


llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp

+21 -11
@@ -1696,9 +1696,12 @@ HexagonPacketizerList::addToPacket(MachineInstr &MI) {
   MachineBasicBlock::iterator MII = MI.getIterator();
   MachineBasicBlock *MBB = MI.getParent();
 
-  if (CurrentPacketMIs.empty())
+  if (CurrentPacketMIs.empty()) {
     PacketStalls = false;
+    PacketStallCycles = 0;
+  }
   PacketStalls |= producesStall(MI);
+  PacketStallCycles = std::max(PacketStallCycles, calcStall(MI));
 
   if (MI.isImplicitDef()) {
     // Add to the packet to allow subsequent instructions to be checked
@@ -1878,12 +1881,7 @@ bool HexagonPacketizerList::isPureSlot0InsnWithNoSlot1Store(
 }
 
 // V60 forward scheduling.
-bool HexagonPacketizerList::producesStall(const MachineInstr &I) {
-  // If the packet already stalls, then ignore the stall from a subsequent
-  // instruction in the same packet.
-  if (PacketStalls)
-    return false;
-
+unsigned int HexagonPacketizerList::calcStall(const MachineInstr &I) {
   // Check whether the previous packet is in a different loop. If this is the
   // case, there is little point in trying to avoid a stall because that would
   // favor the rare case (loop entry) over the common case (loop iteration).
@@ -1895,10 +1893,12 @@ bool HexagonPacketizerList::producesStall(const MachineInstr &I) {
     auto *OldBB = OldPacketMIs.front()->getParent();
     auto *ThisBB = I.getParent();
     if (MLI->getLoopFor(OldBB) != MLI->getLoopFor(ThisBB))
-      return false;
+      return 0;
   }
 
   SUnit *SUI = MIToSUnit[const_cast<MachineInstr *>(&I)];
+  if (!SUI)
+    return 0;
 
   // If the latency is 0 and there is a data dependence between this
   // instruction and any instruction in the current packet, we disregard any
@@ -1927,7 +1927,7 @@ bool HexagonPacketizerList::producesStall(const MachineInstr &I) {
       if (Pred.getSUnit() == SUJ)
         if ((Pred.getLatency() == 0 && Pred.isAssignedRegDep()) ||
             HII->isNewValueJump(I) || HII->isToBeScheduledASAP(*J, I))
-          return false;
+          return 0;
   }
 
   // Check if the latency is greater than one between this instruction and any
@@ -1936,10 +1936,20 @@ bool HexagonPacketizerList::producesStall(const MachineInstr &I) {
     SUnit *SUJ = MIToSUnit[J];
     for (auto &Pred : SUI->Preds)
       if (Pred.getSUnit() == SUJ && Pred.getLatency() > 1)
-        return true;
+        return Pred.getLatency();
   }
 
-  return false;
+  return 0;
+}
+
+bool HexagonPacketizerList::producesStall(const MachineInstr &I) {
+  unsigned int Latency = calcStall(I);
+  if (Latency == 0)
+    return false;
+  // Ignore stall unless it stalls more than previous instruction in packet
+  if (PacketStalls)
+    return Latency > PacketStallCycles;
+  return true;
 }
 
 //===----------------------------------------------------------------------===//

llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h

+4
@@ -56,6 +56,9 @@ class HexagonPacketizerList : public VLIWPacketizerList {
   // Set to true if the packet contains an instruction that stalls with an
   // instruction from the previous packet.
   bool PacketStalls = false;
+  // Set to the number of cycles of stall a given instruction will incur
+  // because of dependence on instruction in previous packet.
+  unsigned int PacketStallCycles = 0;
 
   // Set to true if the packet has a duplex pair of sub-instructions.
   bool PacketHasDuplex = false;
@@ -157,6 +160,7 @@ class HexagonPacketizerList : public VLIWPacketizerList {
   bool hasDualStoreDependence(const MachineInstr &I, const MachineInstr &J);
   bool producesStall(const MachineInstr &MI);
  bool isPureSlot0InsnWithNoSlot1Store(const MachineInstr &MI);
+  unsigned int calcStall(const MachineInstr &MI);
 };
 
 } // end namespace llvm
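Editorial note: viewed from addToPacket(), the new state members are reset when a packet starts and accumulate with std::max as instructions are added. The short simulation below walks the two candidates from the motivating example through that bookkeeping; stall values are supplied directly rather than computed by calcStall(), the 3-cycle store stall is assumed, and packet termination is reduced to the stall rule alone, ignoring slot and resource constraints.

#include <algorithm>
#include <cstdio>
#include <vector>

// Trace of the per-packet state kept by addToPacket() after this change.
// Each entry is the stall (in cycles) a candidate would incur against the
// previous packet, i.e. the value calcStall() would return for it.
int main() {
  // Candidates after the addasl/load packet: the cmp (2-cycle stall on the
  // load) and the predicated store (assumed 3-cycle stall on the addasl).
  std::vector<unsigned> Stalls = {2, 3};

  bool PacketStalls = false;
  unsigned PacketStallCycles = 0;
  unsigned Packet = 1;

  for (unsigned Stall : Stalls) {
    // New rule: the candidate's stall matters if the packet does not stall
    // yet, or if the candidate stalls longer than the packet already does.
    bool Matters = Stall != 0 && (!PacketStalls || Stall > PacketStallCycles);
    if (Matters && PacketStalls) {
      ++Packet;             // end the packet, start a new one
      PacketStalls = false; // state is reset when a packet begins
      PacketStallCycles = 0;
    }
    PacketStalls |= Matters;
    PacketStallCycles = std::max(PacketStallCycles, Stall);
    std::printf("stall %u cycles -> packet %u\n", Stall, Packet);
  }
  // Prints packet 1 for the cmp and packet 2 for the predicated store.
  return 0;
}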

llvm/test/CodeGen/Hexagon/nbench1.ll

+64
@@ -0,0 +1,64 @@
+; RUN: llc -march=hexagon -O3 < %s | FileCheck %s
+
+; if instruction being considered for addition to packet has higher latency,
+; end existing packet and start a new one.
+
+; CHECK: .LBB0_4:
+; CHECK: p{{[0-3]+}} = cmp.gtu(r{{[0-9]+}},r{{[0-9]+}})
+; CHECK-NEXT: }
+
+@array = external dso_local local_unnamed_addr global i32*, align 4
+
+; Function Attrs: nofree norecurse nounwind
+define dso_local void @NumSift(i32 %i, i32 %j) local_unnamed_addr #0 {
+entry:
+  %add36 = shl i32 %i, 1
+  %cmp.not37 = icmp ugt i32 %add36, %j
+  br i1 %cmp.not37, label %while.end, label %while.body.lr.ph
+
+while.body.lr.ph:                                 ; preds = %entry
+  %0 = load i32*, i32** @array, align 4
+  %add16 = add i32 %j, 1
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.lr.ph, %if.end17
+  %add39 = phi i32 [ %add36, %while.body.lr.ph ], [ %add, %if.end17 ]
+  %i.addr.038 = phi i32 [ %i, %while.body.lr.ph ], [ %i.addr.1, %if.end17 ]
+  %cmp2 = icmp ult i32 %add39, %j
+  br i1 %cmp2, label %if.then, label %if.end7
+
+if.then:                                          ; preds = %while.body
+  %arrayidx = getelementptr inbounds i32, i32* %0, i32 %add39
+  %1 = load i32, i32* %arrayidx, align 4
+  %add3 = or i32 %add39, 1
+  %arrayidx4 = getelementptr inbounds i32, i32* %0, i32 %add3
+  %2 = load i32, i32* %arrayidx4, align 4
+  %cmp5 = icmp ult i32 %1, %2
+  %spec.select = select i1 %cmp5, i32 %add3, i32 %add39
+  br label %if.end7
+
+if.end7:                                          ; preds = %if.then, %while.body
+  %k.0 = phi i32 [ %add39, %while.body ], [ %spec.select, %if.then ]
+  %arrayidx8 = getelementptr inbounds i32, i32* %0, i32 %i.addr.038
+  %3 = load i32, i32* %arrayidx8, align 4
+  %arrayidx9 = getelementptr inbounds i32, i32* %0, i32 %k.0
+  %4 = load i32, i32* %arrayidx9, align 4
+  %cmp10 = icmp ult i32 %3, %4
+  br i1 %cmp10, label %if.then11, label %if.end17
+
+if.then11:                                        ; preds = %if.end7
+  store i32 %3, i32* %arrayidx9, align 4
+  store i32 %4, i32* %arrayidx8, align 4
+  br label %if.end17
+
+if.end17:                                         ; preds = %if.end7, %if.then11
+  %i.addr.1 = phi i32 [ %k.0, %if.then11 ], [ %add16, %if.end7 ]
+  %add = shl i32 %i.addr.1, 1
+  %cmp.not = icmp ugt i32 %add, %j
+  br i1 %cmp.not, label %while.end, label %while.body
+
+while.end:                                        ; preds = %if.end17, %entry
+  ret void
+}
+
+attributes #0 = { "target-cpu"="hexagonv65" }
