-
Notifications
You must be signed in to change notification settings - Fork 13.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU][True16][CodeGen] true16 codegen pattern for fma #127240
[AMDGPU][True16][CodeGen] true16 codegen pattern for fma #127240
Conversation
@llvm/pr-subscribers-backend-amdgpu Author: Brox Chen (broxigarchen) ChangesPrevious PR #122950 get reverted since one test is failing repen this PR and fixed the problem Patch is 66.11 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/127240.diff 11 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index d8f3f9c54abc1..9cc74a7acd8ae 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -198,6 +198,8 @@ static unsigned macToMad(unsigned Opc) {
return AMDGPU::V_FMA_F32_e64;
case AMDGPU::V_FMAC_F16_e64:
return AMDGPU::V_FMA_F16_gfx9_e64;
+ case AMDGPU::V_FMAC_F16_t16_e64:
+ return AMDGPU::V_FMA_F16_gfx9_t16_e64;
case AMDGPU::V_FMAC_F16_fake16_e64:
return AMDGPU::V_FMA_F16_gfx9_fake16_e64;
case AMDGPU::V_FMAC_LEGACY_F32_e64:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index baacb5d3d5455..0a01ee1dc3a71 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3544,6 +3544,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
+ Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64) {
// Don't fold if we are using source or output modifiers. The new VOP2
// instructions don't have them.
@@ -3564,6 +3565,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
bool IsFMA =
Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
+ Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64;
MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
@@ -3597,16 +3599,19 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
unsigned NewOpc =
IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
- : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
+ : ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
+ ? AMDGPU::V_FMAMK_F16_t16
+ : AMDGPU::V_FMAMK_F16_fake16
: AMDGPU::V_FMAMK_F16)
: (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
if (pseudoToMCOpcode(NewOpc) == -1)
return false;
- // V_FMAMK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
- // would also require restricting their register classes. For now
- // just bail out.
- if (NewOpc == AMDGPU::V_FMAMK_F16_fake16)
+ // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16
+ // takes VGPR_32_Lo128 operands, so the rewrite would also require
+ // restricting their register classes. For now just bail out.
+ if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
+ NewOpc == AMDGPU::V_FMAMK_F16_fake16)
return false;
const int64_t Imm = getImmFor(RegSrc == Src1 ? *Src0 : *Src1);
@@ -3621,7 +3626,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
Src0->setIsKill(RegSrc->isKill());
if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
- Opc == AMDGPU::V_FMAC_F32_e64 ||
+ Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
UseMI.untieRegOperand(
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
@@ -3676,23 +3681,26 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
unsigned NewOpc =
IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
- : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
+ : ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
+ ? AMDGPU::V_FMAAK_F16_t16
+ : AMDGPU::V_FMAAK_F16_fake16
: AMDGPU::V_FMAAK_F16)
: (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
if (pseudoToMCOpcode(NewOpc) == -1)
return false;
- // V_FMAAK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
- // would also require restricting their register classes. For now
- // just bail out.
- if (NewOpc == AMDGPU::V_FMAAK_F16_fake16)
+ // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
+ // takes VGPR_32_Lo128 operands, so the rewrite would also require
+ // restricting their register classes. For now just bail out.
+ if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
+ NewOpc == AMDGPU::V_FMAAK_F16_fake16)
return false;
// FIXME: This would be a lot easier if we could return a new instruction
// instead of having to modify in place.
if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
- Opc == AMDGPU::V_FMAC_F32_e64 ||
+ Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
UseMI.untieRegOperand(
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
@@ -3879,8 +3887,11 @@ static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
return AMDGPU::V_FMA_LEGACY_F32_e64;
case AMDGPU::V_FMAC_F16_e32:
case AMDGPU::V_FMAC_F16_e64:
+ case AMDGPU::V_FMAC_F16_t16_e64:
case AMDGPU::V_FMAC_F16_fake16_e64:
- return ST.hasTrue16BitInsts() ? AMDGPU::V_FMA_F16_gfx9_fake16_e64
+ return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
+ ? AMDGPU::V_FMA_F16_gfx9_t16_e64
+ : AMDGPU::V_FMA_F16_gfx9_fake16_e64
: AMDGPU::V_FMA_F16_gfx9_e64;
case AMDGPU::V_FMAC_F32_e32:
case AMDGPU::V_FMAC_F32_e64:
@@ -3946,19 +3957,22 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
return MIB;
}
- assert(
- Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
- "V_FMAC_F16_fake16_e32 is not supported and not expected to be present "
- "pre-RA");
+ assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
+ Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
+ "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
+ "present "
+ "pre-RA");
// Handle MAC/FMAC.
bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
+ Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64;
bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
+ Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
@@ -3973,6 +3987,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
return nullptr;
case AMDGPU::V_MAC_F16_e64:
case AMDGPU::V_FMAC_F16_e64:
+ case AMDGPU::V_FMAC_F16_t16_e64:
case AMDGPU::V_FMAC_F16_fake16_e64:
case AMDGPU::V_MAC_F32_e64:
case AMDGPU::V_MAC_LEGACY_F32_e64:
@@ -4058,8 +4073,11 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
int64_t Imm;
if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
unsigned NewOpc =
- IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
- : AMDGPU::V_FMAAK_F16)
+ IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts()
+ ? ST.useRealTrue16Insts()
+ ? AMDGPU::V_FMAAK_F16_t16
+ : AMDGPU::V_FMAAK_F16_fake16
+ : AMDGPU::V_FMAAK_F16)
: AMDGPU::V_FMAAK_F32)
: (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
if (pseudoToMCOpcode(NewOpc) != -1) {
@@ -4076,11 +4094,14 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
return MIB;
}
}
- unsigned NewOpc =
- IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
- : AMDGPU::V_FMAMK_F16)
- : AMDGPU::V_FMAMK_F32)
- : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
+ unsigned NewOpc = IsFMA
+ ? (IsF16 ? (ST.hasTrue16BitInsts()
+ ? ST.useRealTrue16Insts()
+ ? AMDGPU::V_FMAMK_F16_t16
+ : AMDGPU::V_FMAMK_F16_fake16
+ : AMDGPU::V_FMAMK_F16)
+ : AMDGPU::V_FMAMK_F32)
+ : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
if (pseudoToMCOpcode(NewOpc) != -1) {
MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
@@ -4526,6 +4547,7 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI,
case AMDGPU::V_MAC_F32_e64:
case AMDGPU::V_MAC_LEGACY_F32_e64:
case AMDGPU::V_FMAC_F16_e64:
+ case AMDGPU::V_FMAC_F16_t16_e64:
case AMDGPU::V_FMAC_F16_fake16_e64:
case AMDGPU::V_FMAC_F32_e64:
case AMDGPU::V_FMAC_F64_e64:
@@ -5582,7 +5604,9 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
- case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_fake16_e64;
+ case AMDGPU::S_FMAC_F16:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
+ : AMDGPU::V_FMAC_F16_fake16_e64;
case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 6e08aff24ec23..3faf0795157dc 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3287,6 +3287,14 @@ def : GCNPat <
(V_FMAC_F16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
SRCMODS.NONE, $src2)
>;
+let True16Predicate = UseRealTrue16Insts in
+def : GCNPat <
+ (fma (f16 (VOP3NoMods f16:$src0)),
+ (f16 (VOP3NoMods f16:$src1)),
+ (f16 (VOP3NoMods f16:$src2))),
+ (V_FMAC_F16_t16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
+ SRCMODS.NONE, $src2)
+>;
let True16Predicate = UseFakeTrue16Insts in
def : GCNPat <
(fma (f16 (VOP3NoMods f16:$src0)),
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 979812e07fc3f..f03cde455f295 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -455,9 +455,13 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
break;
case AMDGPU::V_FMA_F16_e64:
case AMDGPU::V_FMA_F16_gfx9_e64:
+ NewOpcode = AMDGPU::V_FMAAK_F16;
+ break;
+ case AMDGPU::V_FMA_F16_gfx9_t16_e64:
+ NewOpcode = AMDGPU::V_FMAAK_F16_t16;
+ break;
case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
- NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
- : AMDGPU::V_FMAAK_F16;
+ NewOpcode = AMDGPU::V_FMAAK_F16_fake16;
break;
}
}
@@ -485,9 +489,13 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
break;
case AMDGPU::V_FMA_F16_e64:
case AMDGPU::V_FMA_F16_gfx9_e64:
+ NewOpcode = AMDGPU::V_FMAMK_F16;
+ break;
+ case AMDGPU::V_FMA_F16_gfx9_t16_e64:
+ NewOpcode = AMDGPU::V_FMAMK_F16_t16;
+ break;
case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
- NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
- : AMDGPU::V_FMAMK_F16;
+ NewOpcode = AMDGPU::V_FMAMK_F16_fake16;
break;
}
}
@@ -959,6 +967,7 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
MI.getOpcode() == AMDGPU::V_MAD_F16_e64 ||
MI.getOpcode() == AMDGPU::V_FMA_F16_e64 ||
MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64 ||
+ MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_t16_e64 ||
MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_fake16_e64) {
shrinkMadFma(MI);
continue;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
index 99e6c5d06a0e1..0b09cabf25a16 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
@@ -3,7 +3,8 @@
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
define float @v_fma_f32(float %x, float %y, float %z) {
; GFX6-LABEL: v_fma_f32:
@@ -107,11 +108,18 @@ define half @v_fma_f16(half %x, half %y, half %z) {
; GFX10-NEXT: v_fma_f16 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fma_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fma_f16 v0, v0, v1, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_fma_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_fma_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fma = call half @llvm.fma.f16(half %x, half %y, half %z)
ret half %fma
}
@@ -145,11 +153,17 @@ define half @v_fma_f16_fneg_lhs(half %x, half %y, half %z) {
; GFX10-NEXT: v_fma_f16 v0, -v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fma_f16_fneg_lhs:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fma_f16 v0, -v0, v1, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_fma_f16_fneg_lhs:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v1.l, v2.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_fma_f16_fneg_lhs:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_fma_f16 v0, -v0, v1, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%neg.x = fneg half %x
%fma = call half @llvm.fma.f16(half %neg.x, half %y, half %z)
ret half %fma
@@ -184,11 +198,17 @@ define half @v_fma_f16_fneg_rhs(half %x, half %y, half %z) {
; GFX10-NEXT: v_fma_f16 v0, v0, -v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fma_f16_fneg_rhs:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fma_f16 v0, v0, -v1, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_fma_f16_fneg_rhs:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, -v1.l, v2.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_fma_f16_fneg_rhs:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, -v1, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%neg.y = fneg half %y
%fma = call half @llvm.fma.f16(half %x, half %neg.y, half %z)
ret half %fma
@@ -223,11 +243,17 @@ define half @v_fma_f16_fneg_add(half %x, half %y, half %z) {
; GFX10-NEXT: v_fma_f16 v0, v0, v1, -v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fma_f16_fneg_add:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fma_f16 v0, v0, v1, -v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_fma_f16_fneg_add:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v1.l, -v2.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_fma_f16_fneg_add:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, v1, -v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%neg.z = fneg half %z
%fma = call half @llvm.fma.f16(half %x, half %y, half %neg.z)
ret half %fma
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
index ac7944f25fe37..23e4b80b61f69 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,REAL16 %s
+# FIXME-TRUE16. reenable after fix-sgpr-copies is fixed for true16 flow
+# XUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,REAL16 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,FAKE16 %s
---
diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
index 52a23690dcf53..a33fd03e0ce03 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
@@ -3,8 +3,10 @@
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9,GFX9-GISEL
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10,GFX10-SDAG
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10,GFX10-GISEL
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-SDAG-TRUE16
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-SDAG-FAKE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx...
[truncated]
|
@llvm/pr-subscribers-llvm-globalisel Author: Brox Chen (broxigarchen) ChangesPrevious PR #122950 get reverted since one test is failing repen this PR and fixed the problem Patch is 66.11 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/127240.diff 11 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index d8f3f9c54abc1..9cc74a7acd8ae 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -198,6 +198,8 @@ static unsigned macToMad(unsigned Opc) {
return AMDGPU::V_FMA_F32_e64;
case AMDGPU::V_FMAC_F16_e64:
return AMDGPU::V_FMA_F16_gfx9_e64;
+ case AMDGPU::V_FMAC_F16_t16_e64:
+ return AMDGPU::V_FMA_F16_gfx9_t16_e64;
case AMDGPU::V_FMAC_F16_fake16_e64:
return AMDGPU::V_FMA_F16_gfx9_fake16_e64;
case AMDGPU::V_FMAC_LEGACY_F32_e64:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index baacb5d3d5455..0a01ee1dc3a71 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3544,6 +3544,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
+ Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64) {
// Don't fold if we are using source or output modifiers. The new VOP2
// instructions don't have them.
@@ -3564,6 +3565,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
bool IsFMA =
Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
+ Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64;
MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
@@ -3597,16 +3599,19 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
unsigned NewOpc =
IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
- : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
+ : ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
+ ? AMDGPU::V_FMAMK_F16_t16
+ : AMDGPU::V_FMAMK_F16_fake16
: AMDGPU::V_FMAMK_F16)
: (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
if (pseudoToMCOpcode(NewOpc) == -1)
return false;
- // V_FMAMK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
- // would also require restricting their register classes. For now
- // just bail out.
- if (NewOpc == AMDGPU::V_FMAMK_F16_fake16)
+ // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16
+ // takes VGPR_32_Lo128 operands, so the rewrite would also require
+ // restricting their register classes. For now just bail out.
+ if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
+ NewOpc == AMDGPU::V_FMAMK_F16_fake16)
return false;
const int64_t Imm = getImmFor(RegSrc == Src1 ? *Src0 : *Src1);
@@ -3621,7 +3626,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
Src0->setIsKill(RegSrc->isKill());
if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
- Opc == AMDGPU::V_FMAC_F32_e64 ||
+ Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
UseMI.untieRegOperand(
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
@@ -3676,23 +3681,26 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
unsigned NewOpc =
IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
- : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
+ : ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
+ ? AMDGPU::V_FMAAK_F16_t16
+ : AMDGPU::V_FMAAK_F16_fake16
: AMDGPU::V_FMAAK_F16)
: (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
if (pseudoToMCOpcode(NewOpc) == -1)
return false;
- // V_FMAAK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
- // would also require restricting their register classes. For now
- // just bail out.
- if (NewOpc == AMDGPU::V_FMAAK_F16_fake16)
+ // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
+ // takes VGPR_32_Lo128 operands, so the rewrite would also require
+ // restricting their register classes. For now just bail out.
+ if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
+ NewOpc == AMDGPU::V_FMAAK_F16_fake16)
return false;
// FIXME: This would be a lot easier if we could return a new instruction
// instead of having to modify in place.
if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
- Opc == AMDGPU::V_FMAC_F32_e64 ||
+ Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
UseMI.untieRegOperand(
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
@@ -3879,8 +3887,11 @@ static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
return AMDGPU::V_FMA_LEGACY_F32_e64;
case AMDGPU::V_FMAC_F16_e32:
case AMDGPU::V_FMAC_F16_e64:
+ case AMDGPU::V_FMAC_F16_t16_e64:
case AMDGPU::V_FMAC_F16_fake16_e64:
- return ST.hasTrue16BitInsts() ? AMDGPU::V_FMA_F16_gfx9_fake16_e64
+ return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
+ ? AMDGPU::V_FMA_F16_gfx9_t16_e64
+ : AMDGPU::V_FMA_F16_gfx9_fake16_e64
: AMDGPU::V_FMA_F16_gfx9_e64;
case AMDGPU::V_FMAC_F32_e32:
case AMDGPU::V_FMAC_F32_e64:
@@ -3946,19 +3957,22 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
return MIB;
}
- assert(
- Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
- "V_FMAC_F16_fake16_e32 is not supported and not expected to be present "
- "pre-RA");
+ assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
+ Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
+ "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
+ "present "
+ "pre-RA");
// Handle MAC/FMAC.
bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
+ Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64;
bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
+ Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
@@ -3973,6 +3987,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
return nullptr;
case AMDGPU::V_MAC_F16_e64:
case AMDGPU::V_FMAC_F16_e64:
+ case AMDGPU::V_FMAC_F16_t16_e64:
case AMDGPU::V_FMAC_F16_fake16_e64:
case AMDGPU::V_MAC_F32_e64:
case AMDGPU::V_MAC_LEGACY_F32_e64:
@@ -4058,8 +4073,11 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
int64_t Imm;
if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
unsigned NewOpc =
- IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
- : AMDGPU::V_FMAAK_F16)
+ IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts()
+ ? ST.useRealTrue16Insts()
+ ? AMDGPU::V_FMAAK_F16_t16
+ : AMDGPU::V_FMAAK_F16_fake16
+ : AMDGPU::V_FMAAK_F16)
: AMDGPU::V_FMAAK_F32)
: (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
if (pseudoToMCOpcode(NewOpc) != -1) {
@@ -4076,11 +4094,14 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
return MIB;
}
}
- unsigned NewOpc =
- IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
- : AMDGPU::V_FMAMK_F16)
- : AMDGPU::V_FMAMK_F32)
- : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
+ unsigned NewOpc = IsFMA
+ ? (IsF16 ? (ST.hasTrue16BitInsts()
+ ? ST.useRealTrue16Insts()
+ ? AMDGPU::V_FMAMK_F16_t16
+ : AMDGPU::V_FMAMK_F16_fake16
+ : AMDGPU::V_FMAMK_F16)
+ : AMDGPU::V_FMAMK_F32)
+ : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
if (pseudoToMCOpcode(NewOpc) != -1) {
MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
@@ -4526,6 +4547,7 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI,
case AMDGPU::V_MAC_F32_e64:
case AMDGPU::V_MAC_LEGACY_F32_e64:
case AMDGPU::V_FMAC_F16_e64:
+ case AMDGPU::V_FMAC_F16_t16_e64:
case AMDGPU::V_FMAC_F16_fake16_e64:
case AMDGPU::V_FMAC_F32_e64:
case AMDGPU::V_FMAC_F64_e64:
@@ -5582,7 +5604,9 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
- case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_fake16_e64;
+ case AMDGPU::S_FMAC_F16:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
+ : AMDGPU::V_FMAC_F16_fake16_e64;
case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 6e08aff24ec23..3faf0795157dc 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3287,6 +3287,14 @@ def : GCNPat <
(V_FMAC_F16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
SRCMODS.NONE, $src2)
>;
+let True16Predicate = UseRealTrue16Insts in
+def : GCNPat <
+ (fma (f16 (VOP3NoMods f16:$src0)),
+ (f16 (VOP3NoMods f16:$src1)),
+ (f16 (VOP3NoMods f16:$src2))),
+ (V_FMAC_F16_t16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
+ SRCMODS.NONE, $src2)
+>;
let True16Predicate = UseFakeTrue16Insts in
def : GCNPat <
(fma (f16 (VOP3NoMods f16:$src0)),
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 979812e07fc3f..f03cde455f295 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -455,9 +455,13 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
break;
case AMDGPU::V_FMA_F16_e64:
case AMDGPU::V_FMA_F16_gfx9_e64:
+ NewOpcode = AMDGPU::V_FMAAK_F16;
+ break;
+ case AMDGPU::V_FMA_F16_gfx9_t16_e64:
+ NewOpcode = AMDGPU::V_FMAAK_F16_t16;
+ break;
case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
- NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
- : AMDGPU::V_FMAAK_F16;
+ NewOpcode = AMDGPU::V_FMAAK_F16_fake16;
break;
}
}
@@ -485,9 +489,13 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
break;
case AMDGPU::V_FMA_F16_e64:
case AMDGPU::V_FMA_F16_gfx9_e64:
+ NewOpcode = AMDGPU::V_FMAMK_F16;
+ break;
+ case AMDGPU::V_FMA_F16_gfx9_t16_e64:
+ NewOpcode = AMDGPU::V_FMAMK_F16_t16;
+ break;
case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
- NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
- : AMDGPU::V_FMAMK_F16;
+ NewOpcode = AMDGPU::V_FMAMK_F16_fake16;
break;
}
}
@@ -959,6 +967,7 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
MI.getOpcode() == AMDGPU::V_MAD_F16_e64 ||
MI.getOpcode() == AMDGPU::V_FMA_F16_e64 ||
MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64 ||
+ MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_t16_e64 ||
MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_fake16_e64) {
shrinkMadFma(MI);
continue;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
index 99e6c5d06a0e1..0b09cabf25a16 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
@@ -3,7 +3,8 @@
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
define float @v_fma_f32(float %x, float %y, float %z) {
; GFX6-LABEL: v_fma_f32:
@@ -107,11 +108,18 @@ define half @v_fma_f16(half %x, half %y, half %z) {
; GFX10-NEXT: v_fma_f16 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fma_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fma_f16 v0, v0, v1, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_fma_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_fma_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fma = call half @llvm.fma.f16(half %x, half %y, half %z)
ret half %fma
}
@@ -145,11 +153,17 @@ define half @v_fma_f16_fneg_lhs(half %x, half %y, half %z) {
; GFX10-NEXT: v_fma_f16 v0, -v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fma_f16_fneg_lhs:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fma_f16 v0, -v0, v1, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_fma_f16_fneg_lhs:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v1.l, v2.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_fma_f16_fneg_lhs:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_fma_f16 v0, -v0, v1, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%neg.x = fneg half %x
%fma = call half @llvm.fma.f16(half %neg.x, half %y, half %z)
ret half %fma
@@ -184,11 +198,17 @@ define half @v_fma_f16_fneg_rhs(half %x, half %y, half %z) {
; GFX10-NEXT: v_fma_f16 v0, v0, -v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fma_f16_fneg_rhs:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fma_f16 v0, v0, -v1, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_fma_f16_fneg_rhs:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, -v1.l, v2.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_fma_f16_fneg_rhs:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, -v1, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%neg.y = fneg half %y
%fma = call half @llvm.fma.f16(half %x, half %neg.y, half %z)
ret half %fma
@@ -223,11 +243,17 @@ define half @v_fma_f16_fneg_add(half %x, half %y, half %z) {
; GFX10-NEXT: v_fma_f16 v0, v0, v1, -v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fma_f16_fneg_add:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fma_f16 v0, v0, v1, -v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_fma_f16_fneg_add:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v1.l, -v2.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_fma_f16_fneg_add:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, v1, -v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%neg.z = fneg half %z
%fma = call half @llvm.fma.f16(half %x, half %y, half %neg.z)
ret half %fma
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
index ac7944f25fe37..23e4b80b61f69 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,REAL16 %s
+# FIXME-TRUE16. reenable after fix-sgpr-copies is fixed for true16 flow
+# XUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,REAL16 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,FAKE16 %s
---
diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
index 52a23690dcf53..a33fd03e0ce03 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
@@ -3,8 +3,10 @@
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9,GFX9-GISEL
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10,GFX10-SDAG
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10,GFX10-GISEL
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-SDAG-TRUE16
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-SDAG-FAKE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx...
[truncated]
|
unsigned NewOpc = IsFMA | ||
? (IsF16 ? (ST.hasTrue16BitInsts() | ||
? ST.useRealTrue16Insts() | ||
? AMDGPU::V_FMAMK_F16_t16 | ||
: AMDGPU::V_FMAMK_F16_fake16 | ||
: AMDGPU::V_FMAMK_F16) | ||
: AMDGPU::V_FMAMK_F32) | ||
: (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Factor both of these nasty nested ternary operators into a helper function
true16 codegen pattern for f16 fma. created a duplicated shrink-mad-fma-gfx10.mir from shrink-mad-fma to seperate pre-GFX11 and GFX11 mir test.
507cf0d
to
57d79ed
Compare
"V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be " | ||
"present " | ||
"pre-RA"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be " | |
"present " | |
"pre-RA"); | |
"V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be present pre-RA"); |
Unnecessary literal break
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Wouldn't clang-format fix it. This can be a long string for a single line.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think it handles merge or split of string literals. It didn't put together the tiny "pre-RA" part back together with the previous piece
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it splits string literals if it's too long, but it does not merge automatically. Fixed
Previous PR #122950 get reverted since it hit the buildbot failure. Another patch get merged when this PR is under review, and thus causing one test not up to date.
repen this PR and fixed the issue.