
Commit 2b1c6df

Harsha Jagasia authored and Krzysztof Parzyszek committed
[Hexagon] Performance regression with b2b

For the code below:

  {
    r7 = addasl(r3,r0,#2)
    r8 = addasl(r3,r2,#2)
    r5 = memw(r3+r0<<#2)
    r6 = memw(r3+r2<<#2)
  }
  {
    p1 = cmp.gtu(r6,r5)
    if (p1.new) memw(r8+#0) = r5
    if (p1.new) memw(r7+#0) = r6
  }
  {
    r0 = mux(p1,r2,r4)
  }

In the packetizer, a new packet is created for the cmp instruction since there aren't enough resources in the previous packet. The cmp is also determined to stall by 2 cycles because it depends on the prior load of r5. In the current packetizer implementation, the predicated store is evaluated for whether it can go in the same packet as the compare, and since the compare already stalls, the stall of the predicated store is deemed not to matter, so the store is placed in the same packet as the cmp. However, the predicated store stalls for even more cycles because of its dependence on the addasl instruction, and that stall can be avoided by putting the store in a new packet.

Improve the packetizer to check whether an instruction being added to a packet will stall longer than the instructions already in the packet, and if so, create a new packet.
1 parent d5b6e30 commit 2b1c6df
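Editorial note: the rule described in the commit message can be condensed into a small standalone C++ sketch. This is a model for illustration only, not the packetizer's actual interface; the 3-cycle stall assumed for the predicated store is a made-up figure, the commit only says it stalls longer than the cmp's 2 cycles.

#include <cassert>

// Minimal model of the per-packet stall state added by this commit.
// StallCycles stands in for what calcStall(MI) would compute.
struct PacketState {
  bool PacketStalls = false;      // packet already contains a stalling insn
  unsigned PacketStallCycles = 0; // worst stall accepted into the packet
};

// Old producesStall() behavior: once the packet stalls, any further stall
// from a candidate instruction is ignored.
bool oldProducesStall(const PacketState &P, unsigned StallCycles) {
  if (P.PacketStalls)
    return false;
  return StallCycles > 0;
}

// New behavior: a candidate's stall still counts when it is longer than the
// worst stall the packet already absorbs, so the packetizer can end the
// packet instead of hiding the longer stall behind the existing one.
bool newProducesStall(const PacketState &P, unsigned StallCycles) {
  if (StallCycles == 0)
    return false;
  if (P.PacketStalls)
    return StallCycles > P.PacketStallCycles;
  return true;
}

int main() {
  // The cmp already stalls the packet by 2 cycles; assume the predicated
  // store would stall 3 cycles on the addasl (illustrative value).
  PacketState P{true, 2};
  assert(!oldProducesStall(P, 3)); // old: store's stall ignored, same packet
  assert(newProducesStall(P, 3));  // new: stall reported, store can start a new packet
  return 0;
}

The actual implementation, shown in the diff below, keeps the same comparison inside producesStall() and derives the cycle counts from the scheduling DAG via calcStall().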

File tree

3 files changed: +89 -11 lines changed


llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp

+21 -11
@@ -1696,9 +1696,12 @@ HexagonPacketizerList::addToPacket(MachineInstr &MI) {
   MachineBasicBlock::iterator MII = MI.getIterator();
   MachineBasicBlock *MBB = MI.getParent();
 
-  if (CurrentPacketMIs.empty())
+  if (CurrentPacketMIs.empty()) {
     PacketStalls = false;
+    PacketStallCycles = 0;
+  }
   PacketStalls |= producesStall(MI);
+  PacketStallCycles = std::max(PacketStallCycles, calcStall(MI));
 
   if (MI.isImplicitDef()) {
     // Add to the packet to allow subsequent instructions to be checked
@@ -1878,12 +1881,7 @@ bool HexagonPacketizerList::isPureSlot0InsnWithNoSlot1Store(
 }
 
 // V60 forward scheduling.
-bool HexagonPacketizerList::producesStall(const MachineInstr &I) {
-  // If the packet already stalls, then ignore the stall from a subsequent
-  // instruction in the same packet.
-  if (PacketStalls)
-    return false;
-
+unsigned int HexagonPacketizerList::calcStall(const MachineInstr &I) {
   // Check whether the previous packet is in a different loop. If this is the
   // case, there is little point in trying to avoid a stall because that would
   // favor the rare case (loop entry) over the common case (loop iteration).
@@ -1895,10 +1893,12 @@ bool HexagonPacketizerList::producesStall(const MachineInstr &I) {
     auto *OldBB = OldPacketMIs.front()->getParent();
     auto *ThisBB = I.getParent();
     if (MLI->getLoopFor(OldBB) != MLI->getLoopFor(ThisBB))
-      return false;
+      return 0;
   }
 
   SUnit *SUI = MIToSUnit[const_cast<MachineInstr *>(&I)];
+  if (!SUI)
+    return 0;
 
   // If the latency is 0 and there is a data dependence between this
   // instruction and any instruction in the current packet, we disregard any
@@ -1927,7 +1927,7 @@ bool HexagonPacketizerList::producesStall(const MachineInstr &I) {
       if (Pred.getSUnit() == SUJ)
         if ((Pred.getLatency() == 0 && Pred.isAssignedRegDep()) ||
             HII->isNewValueJump(I) || HII->isToBeScheduledASAP(*J, I))
-          return false;
+          return 0;
   }
 
   // Check if the latency is greater than one between this instruction and any
@@ -1936,10 +1936,20 @@ bool HexagonPacketizerList::producesStall(const MachineInstr &I) {
     SUnit *SUJ = MIToSUnit[J];
     for (auto &Pred : SUI->Preds)
       if (Pred.getSUnit() == SUJ && Pred.getLatency() > 1)
-        return true;
+        return Pred.getLatency();
   }
 
-  return false;
+  return 0;
+}
+
+bool HexagonPacketizerList::producesStall(const MachineInstr &I) {
+  unsigned int Latency = calcStall(I);
+  if (Latency == 0)
+    return false;
+  // Ignore stall unless it stalls more than previous instruction in packet
+  if (PacketStalls)
+    return Latency > PacketStallCycles;
+  return true;
 }
 
 //===----------------------------------------------------------------------===//

llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h

+4
@@ -56,6 +56,9 @@ class HexagonPacketizerList : public VLIWPacketizerList {
   // Set to true if the packet contains an instruction that stalls with an
   // instruction from the previous packet.
   bool PacketStalls = false;
+  // Set to the number of cycles of stall a given instruction will incur
+  // because of dependence on instruction in previous packet.
+  unsigned int PacketStallCycles = 0;
 
   // Set to true if the packet has a duplex pair of sub-instructions.
   bool PacketHasDuplex = false;
@@ -157,6 +160,7 @@ class HexagonPacketizerList : public VLIWPacketizerList {
   bool hasDualStoreDependence(const MachineInstr &I, const MachineInstr &J);
   bool producesStall(const MachineInstr &MI);
  bool isPureSlot0InsnWithNoSlot1Store(const MachineInstr &MI);
+  unsigned int calcStall(const MachineInstr &MI);
 };
 
 } // end namespace llvm
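Editorial note: viewed from addToPacket(), the new state members are reset when a packet starts and accumulate with std::max as instructions are added. The short simulation below walks the two candidates from the motivating example through that bookkeeping; stall values are supplied directly rather than computed by calcStall(), the 3-cycle store stall is assumed, and packet termination is reduced to the stall rule alone, ignoring slot and resource constraints.

#include <algorithm>
#include <cstdio>
#include <vector>

// Trace of the per-packet state kept by addToPacket() after this change.
// Each entry is the stall (in cycles) a candidate would incur against the
// previous packet, i.e. the value calcStall() would return for it.
int main() {
  // Candidates after the addasl/load packet: the cmp (2-cycle stall on the
  // load) and the predicated store (assumed 3-cycle stall on the addasl).
  std::vector<unsigned> Stalls = {2, 3};

  bool PacketStalls = false;
  unsigned PacketStallCycles = 0;
  unsigned Packet = 1;

  for (unsigned Stall : Stalls) {
    // New rule: the candidate's stall matters if the packet does not stall
    // yet, or if the candidate stalls longer than the packet already does.
    bool Matters = Stall != 0 && (!PacketStalls || Stall > PacketStallCycles);
    if (Matters && PacketStalls) {
      ++Packet;             // end the packet, start a new one
      PacketStalls = false; // state is reset when a packet begins
      PacketStallCycles = 0;
    }
    PacketStalls |= Matters;
    PacketStallCycles = std::max(PacketStallCycles, Stall);
    std::printf("stall %u cycles -> packet %u\n", Stall, Packet);
  }
  // Prints packet 1 for the cmp and packet 2 for the predicated store.
  return 0;
}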

llvm/test/CodeGen/Hexagon/nbench1.ll

+64
@@ -0,0 +1,64 @@
+; RUN: llc -march=hexagon -O3 < %s | FileCheck %s
+
+; if instruction being considered for addition to packet has higher latency,
+; end existing packet and start a new one.
+
+; CHECK: .LBB0_4:
+; CHECK: p{{[0-3]+}} = cmp.gtu(r{{[0-9]+}},r{{[0-9]+}})
+; CHECK-NEXT: }
+
+@array = external dso_local local_unnamed_addr global i32*, align 4
+
+; Function Attrs: nofree norecurse nounwind
+define dso_local void @NumSift(i32 %i, i32 %j) local_unnamed_addr #0 {
+entry:
+  %add36 = shl i32 %i, 1
+  %cmp.not37 = icmp ugt i32 %add36, %j
+  br i1 %cmp.not37, label %while.end, label %while.body.lr.ph
+
+while.body.lr.ph:                                 ; preds = %entry
+  %0 = load i32*, i32** @array, align 4
+  %add16 = add i32 %j, 1
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.lr.ph, %if.end17
+  %add39 = phi i32 [ %add36, %while.body.lr.ph ], [ %add, %if.end17 ]
+  %i.addr.038 = phi i32 [ %i, %while.body.lr.ph ], [ %i.addr.1, %if.end17 ]
+  %cmp2 = icmp ult i32 %add39, %j
+  br i1 %cmp2, label %if.then, label %if.end7
+
+if.then:                                          ; preds = %while.body
+  %arrayidx = getelementptr inbounds i32, i32* %0, i32 %add39
+  %1 = load i32, i32* %arrayidx, align 4
+  %add3 = or i32 %add39, 1
+  %arrayidx4 = getelementptr inbounds i32, i32* %0, i32 %add3
+  %2 = load i32, i32* %arrayidx4, align 4
+  %cmp5 = icmp ult i32 %1, %2
+  %spec.select = select i1 %cmp5, i32 %add3, i32 %add39
+  br label %if.end7
+
+if.end7:                                          ; preds = %if.then, %while.body
+  %k.0 = phi i32 [ %add39, %while.body ], [ %spec.select, %if.then ]
+  %arrayidx8 = getelementptr inbounds i32, i32* %0, i32 %i.addr.038
+  %3 = load i32, i32* %arrayidx8, align 4
+  %arrayidx9 = getelementptr inbounds i32, i32* %0, i32 %k.0
+  %4 = load i32, i32* %arrayidx9, align 4
+  %cmp10 = icmp ult i32 %3, %4
+  br i1 %cmp10, label %if.then11, label %if.end17
+
+if.then11:                                        ; preds = %if.end7
+  store i32 %3, i32* %arrayidx9, align 4
+  store i32 %4, i32* %arrayidx8, align 4
+  br label %if.end17
+
+if.end17:                                         ; preds = %if.end7, %if.then11
+  %i.addr.1 = phi i32 [ %k.0, %if.then11 ], [ %add16, %if.end7 ]
+  %add = shl i32 %i.addr.1, 1
+  %cmp.not = icmp ugt i32 %add, %j
+  br i1 %cmp.not, label %while.end, label %while.body
+
+while.end:                                        ; preds = %if.end17, %entry
+  ret void
+}
+
+attributes #0 = { "target-cpu"="hexagonv65" }
