Skip to content

Commit 1382a3a

Browse files
committed
Revert "AMDGPU: Fix handling of alignment padding in DAG argument lowering"
This reverts commit r337021. WARNING: MemorySanitizer: use-of-uninitialized-value #0 0x1415cd65 in void write_signed<long>(llvm::raw_ostream&, long, unsigned long, llvm::IntegerStyle) /code/llvm-project/llvm/lib/Support/NativeFormatting.cpp:95:7 brson#1 0x1415c900 in llvm::write_integer(llvm::raw_ostream&, long, unsigned long, llvm::IntegerStyle) /code/llvm-project/llvm/lib/Support/NativeFormatting.cpp:121:3 brson#2 0x1472357f in llvm::raw_ostream::operator<<(long) /code/llvm-project/llvm/lib/Support/raw_ostream.cpp:117:3 brson#3 0x13bb9d4 in llvm::raw_ostream::operator<<(int) /code/llvm-project/llvm/include/llvm/Support/raw_ostream.h:210:18 brson#4 0x3c2bc18 in void printField<unsigned int, &(amd_kernel_code_s::amd_kernel_code_version_major)>(llvm::StringRef, amd_kernel_code_s const&, llvm::raw_ostream&) /code/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp:78:23 brson#5 0x3c250ba in llvm::printAmdKernelCodeField(amd_kernel_code_s const&, int, llvm::raw_ostream&) /code/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp:104:5 brson#6 0x3c27ca3 in llvm::dumpAmdKernelCode(amd_kernel_code_s const*, llvm::raw_ostream&, char const*) /code/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp:113:5 brson#7 0x3a46e6c in llvm::AMDGPUTargetAsmStreamer::EmitAMDKernelCodeT(amd_kernel_code_s const&) /code/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp:161:3 brson#8 0xd371e4 in llvm::AMDGPUAsmPrinter::EmitFunctionBodyStart() /code/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp:204:26 [...] Uninitialized value was created by an allocation of 'KernelCode' in the stack frame of function '_ZN4llvm16AMDGPUAsmPrinter21EmitFunctionBodyStartEv' #0 0xd36650 in llvm::AMDGPUAsmPrinter::EmitFunctionBodyStart() /code/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp:192 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@337079 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent 383081c commit 1382a3a

17 files changed

+214
-420
lines changed

lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

+4-10
Original file line numberDiff line numberDiff line change
@@ -1128,13 +1128,6 @@ static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
11281128
void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
11291129
const SIProgramInfo &CurrentProgramInfo,
11301130
const MachineFunction &MF) const {
1131-
const Function &F = MF.getFunction();
1132-
1133-
// Avoid asserting on erroneous cases.
1134-
if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
1135-
F.getCallingConv() != CallingConv::SPIR_KERNEL)
1136-
return;
1137-
11381131
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
11391132
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
11401133

@@ -1181,8 +1174,9 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
11811174
if (STM.isXNACKEnabled())
11821175
Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
11831176

1184-
unsigned MaxKernArgAlign;
1185-
Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
1177+
// FIXME: Should use getKernArgSize
1178+
Out.kernarg_segment_byte_size =
1179+
STM.getKernArgSegmentSize(MF.getFunction(), MFI->getExplicitKernArgSize());
11861180
Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
11871181
Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
11881182
Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
@@ -1191,7 +1185,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
11911185
// These alignment values are specified in powers of two, so alignment =
11921186
// 2^n. The minimum alignment is 2^4 = 16.
11931187
Out.kernarg_segment_alignment = std::max((size_t)4,
1194-
countTrailingZeros(MaxKernArgAlign));
1188+
countTrailingZeros(MFI->getMaxKernArgAlign()));
11951189

11961190
if (STM.debuggerEmitPrologue()) {
11971191
Out.debug_wavefront_private_segment_offset_sgpr =

lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp

+5-6
Original file line numberDiff line numberDiff line change
@@ -209,16 +209,15 @@ Kernel::CodeProps::Metadata MetadataStreamer::getHSACodeProps(
209209
const Function &F = MF.getFunction();
210210

211211
// Avoid asserting on erroneous cases.
212-
if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
213-
F.getCallingConv() != CallingConv::SPIR_KERNEL)
212+
if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL)
214213
return HSACodeProps;
215214

216-
unsigned MaxKernArgAlign;
217-
HSACodeProps.mKernargSegmentSize = STM.getKernArgSegmentSize(F,
218-
MaxKernArgAlign);
215+
HSACodeProps.mKernargSegmentSize =
216+
STM.getKernArgSegmentSize(F, MFI.getExplicitKernArgSize());
219217
HSACodeProps.mGroupSegmentFixedSize = ProgramInfo.LDSSize;
220218
HSACodeProps.mPrivateSegmentFixedSize = ProgramInfo.ScratchSize;
221-
HSACodeProps.mKernargSegmentAlign = std::max(MaxKernArgAlign, 4u);
219+
HSACodeProps.mKernargSegmentAlign =
220+
std::max(uint32_t(4), MFI.getMaxKernArgAlign());
222221
HSACodeProps.mWavefrontSize = STM.getWavefrontSize();
223222
HSACodeProps.mNumSGPRs = ProgramInfo.NumSGPR;
224223
HSACodeProps.mNumVGPRs = ProgramInfo.NumVGPR;

lib/Target/AMDGPU/AMDGPUISelLowering.cpp

+75-108
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@
3030
#include "SIInstrInfo.h"
3131
#include "SIMachineFunctionInfo.h"
3232
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
33-
#include "llvm/CodeGen/Analysis.h"
3433
#include "llvm/CodeGen/CallingConvLower.h"
3534
#include "llvm/CodeGen/MachineFunction.h"
3635
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -41,6 +40,18 @@
4140
#include "llvm/Support/KnownBits.h"
4241
using namespace llvm;
4342

43+
static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,
44+
CCValAssign::LocInfo LocInfo,
45+
ISD::ArgFlagsTy ArgFlags, CCState &State) {
46+
MachineFunction &MF = State.getMachineFunction();
47+
AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
48+
49+
uint64_t Offset = MFI->allocateKernArg(LocVT.getStoreSize(),
50+
ArgFlags.getOrigAlign());
51+
State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
52+
return true;
53+
}
54+
4455
static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
4556
CCValAssign::LocInfo LocInfo,
4657
ISD::ArgFlagsTy ArgFlags, CCState &State,
@@ -899,118 +910,74 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
899910
/// for each individual part is i8. We pass the memory type as LocVT to the
900911
/// calling convention analysis function and the register type (Ins[x].VT) as
901912
/// the ValVT.
902-
void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
903-
CCState &State,
904-
const SmallVectorImpl<ISD::InputArg> &Ins) const {
905-
const MachineFunction &MF = State.getMachineFunction();
906-
const Function &Fn = MF.getFunction();
907-
LLVMContext &Ctx = Fn.getParent()->getContext();
908-
const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
909-
const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
910-
911-
unsigned MaxAlign = 1;
912-
uint64_t ExplicitArgOffset = 0;
913-
const DataLayout &DL = Fn.getParent()->getDataLayout();
914-
915-
unsigned InIndex = 0;
916-
917-
for (const Argument &Arg : Fn.args()) {
918-
Type *BaseArgTy = Arg.getType();
919-
unsigned Align = DL.getABITypeAlignment(BaseArgTy);
920-
MaxAlign = std::max(Align, MaxAlign);
921-
unsigned AllocSize = DL.getTypeAllocSize(BaseArgTy);
922-
923-
uint64_t ArgOffset = alignTo(ExplicitArgOffset, Align) + ExplicitOffset;
924-
ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;
925-
926-
// We're basically throwing away everything passed into us and starting over
927-
// to get accurate in-memory offsets. The "PartOffset" is completely useless
928-
// to us as computed in Ins.
929-
//
930-
// We also need to figure out what type legalization is trying to do to get
931-
// the correct memory offsets.
932-
933-
SmallVector<EVT, 16> ValueVTs;
934-
SmallVector<uint64_t, 16> Offsets;
935-
ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
936-
937-
for (unsigned Value = 0, NumValues = ValueVTs.size();
938-
Value != NumValues; ++Value) {
939-
uint64_t BasePartOffset = Offsets[Value];
940-
941-
EVT ArgVT = ValueVTs[Value];
942-
EVT MemVT = ArgVT;
943-
MVT RegisterVT =
944-
getRegisterTypeForCallingConv(Ctx, ArgVT);
945-
unsigned NumRegs =
946-
getNumRegistersForCallingConv(Ctx, ArgVT);
947-
948-
if (!Subtarget->isAmdHsaOS() &&
949-
(ArgVT == MVT::i16 || ArgVT == MVT::i8 || ArgVT == MVT::f16)) {
950-
// The ABI says the caller will extend these values to 32-bits.
951-
MemVT = ArgVT.isInteger() ? MVT::i32 : MVT::f32;
952-
} else if (NumRegs == 1) {
953-
// This argument is not split, so the IR type is the memory type.
954-
if (ArgVT.isExtended()) {
955-
// We have an extended type, like i24, so we should just use the
956-
// register type.
957-
MemVT = RegisterVT;
958-
} else {
959-
MemVT = ArgVT;
960-
}
961-
} else if (ArgVT.isVector() && RegisterVT.isVector() &&
962-
ArgVT.getScalarType() == RegisterVT.getScalarType()) {
963-
assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
964-
// We have a vector value which has been split into a vector with
965-
// the same scalar type, but fewer elements. This should handle
966-
// all the floating-point vector types.
967-
MemVT = RegisterVT;
968-
} else if (ArgVT.isVector() &&
969-
ArgVT.getVectorNumElements() == NumRegs) {
970-
// This arg has been split so that each element is stored in a separate
971-
// register.
972-
MemVT = ArgVT.getScalarType();
973-
} else if (ArgVT.isExtended()) {
974-
// We have an extended type, like i65.
975-
MemVT = RegisterVT;
913+
void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State,
914+
const SmallVectorImpl<ISD::InputArg> &Ins) const {
915+
for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
916+
const ISD::InputArg &In = Ins[i];
917+
EVT MemVT;
918+
919+
unsigned NumRegs = getNumRegisters(State.getContext(), In.ArgVT);
920+
921+
if (!Subtarget->isAmdHsaOS() &&
922+
(In.ArgVT == MVT::i16 || In.ArgVT == MVT::i8 || In.ArgVT == MVT::f16)) {
923+
// The ABI says the caller will extend these values to 32-bits.
924+
MemVT = In.ArgVT.isInteger() ? MVT::i32 : MVT::f32;
925+
} else if (NumRegs == 1) {
926+
// This argument is not split, so the IR type is the memory type.
927+
assert(!In.Flags.isSplit());
928+
if (In.ArgVT.isExtended()) {
929+
// We have an extended type, like i24, so we should just use the register type
930+
MemVT = In.VT;
976931
} else {
977-
unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
978-
assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
979-
if (RegisterVT.isInteger()) {
980-
MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
981-
} else if (RegisterVT.isVector()) {
982-
assert(!RegisterVT.getScalarType().isFloatingPoint());
983-
unsigned NumElements = RegisterVT.getVectorNumElements();
984-
assert(MemoryBits % NumElements == 0);
985-
// This vector type has been split into another vector type with
986-
// a different elements size.
987-
EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
988-
MemoryBits / NumElements);
989-
MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
990-
} else {
991-
llvm_unreachable("cannot deduce memory type.");
992-
}
932+
MemVT = In.ArgVT;
993933
}
994-
995-
// Convert one element vectors to scalar.
996-
if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
997-
MemVT = MemVT.getScalarType();
998-
999-
if (MemVT.isExtended()) {
1000-
// This should really only happen if we have vec3 arguments
1001-
assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
1002-
MemVT = MemVT.getPow2VectorType(State.getContext());
934+
} else if (In.ArgVT.isVector() && In.VT.isVector() &&
935+
In.ArgVT.getScalarType() == In.VT.getScalarType()) {
936+
assert(In.ArgVT.getVectorNumElements() > In.VT.getVectorNumElements());
937+
// We have a vector value which has been split into a vector with
938+
// the same scalar type, but fewer elements. This should handle
939+
// all the floating-point vector types.
940+
MemVT = In.VT;
941+
} else if (In.ArgVT.isVector() &&
942+
In.ArgVT.getVectorNumElements() == NumRegs) {
943+
// This arg has been split so that each element is stored in a separate
944+
// register.
945+
MemVT = In.ArgVT.getScalarType();
946+
} else if (In.ArgVT.isExtended()) {
947+
// We have an extended type, like i65.
948+
MemVT = In.VT;
949+
} else {
950+
unsigned MemoryBits = In.ArgVT.getStoreSizeInBits() / NumRegs;
951+
assert(In.ArgVT.getStoreSizeInBits() % NumRegs == 0);
952+
if (In.VT.isInteger()) {
953+
MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
954+
} else if (In.VT.isVector()) {
955+
assert(!In.VT.getScalarType().isFloatingPoint());
956+
unsigned NumElements = In.VT.getVectorNumElements();
957+
assert(MemoryBits % NumElements == 0);
958+
// This vector type has been split into another vector type with
959+
// a different elements size.
960+
EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
961+
MemoryBits / NumElements);
962+
MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
963+
} else {
964+
llvm_unreachable("cannot deduce memory type.");
1003965
}
966+
}
1004967

1005-
unsigned PartOffset = 0;
1006-
for (unsigned i = 0; i != NumRegs; ++i) {
1007-
State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1008-
BasePartOffset + PartOffset,
1009-
MemVT.getSimpleVT(),
1010-
CCValAssign::Full));
1011-
PartOffset += MemVT.getStoreSize();
1012-
}
968+
// Convert one element vectors to scalar.
969+
if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
970+
MemVT = MemVT.getScalarType();
971+
972+
if (MemVT.isExtended()) {
973+
// This should really only happen if we have vec3 arguments
974+
assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
975+
MemVT = MemVT.getPow2VectorType(State.getContext());
1013976
}
977+
978+
assert(MemVT.isSimple());
979+
allocateKernArg(i, In.VT, MemVT.getSimpleVT(), CCValAssign::Full, In.Flags,
980+
State);
1014981
}
1015982
}
1016983

lib/Target/AMDGPU/AMDGPUISelLowering.h

+2-5
Original file line numberDiff line numberDiff line change
@@ -122,11 +122,8 @@ class AMDGPUTargetLowering : public TargetLowering {
122122
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const;
123123
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG,
124124
SmallVectorImpl<SDValue> &Results) const;
125-
126-
void analyzeFormalArgumentsCompute(
127-
CCState &State,
128-
const SmallVectorImpl<ISD::InputArg> &Ins) const;
129-
125+
void analyzeFormalArgumentsCompute(CCState &State,
126+
const SmallVectorImpl<ISD::InputArg> &Ins) const;
130127
public:
131128
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI);
132129

lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp

+3-2
Original file line numberDiff line numberDiff line change
@@ -77,9 +77,8 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
7777
const unsigned KernArgBaseAlign = 16; // FIXME: Increase if necessary
7878
const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F);
7979

80-
unsigned MaxAlign;
8180
// FIXME: Alignment is broken broken with explicit arg offset.;
82-
const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign);
81+
const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F);
8382
if (TotalKernArgSize == 0)
8483
return false;
8584

@@ -92,11 +91,13 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
9291
Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));
9392

9493
unsigned AS = KernArgSegment->getType()->getPointerAddressSpace();
94+
unsigned MaxAlign = 1;
9595
uint64_t ExplicitArgOffset = 0;
9696

9797
for (Argument &Arg : F.args()) {
9898
Type *ArgTy = Arg.getType();
9999
unsigned Align = DL.getABITypeAlignment(ArgTy);
100+
MaxAlign = std::max(Align, MaxAlign);
100101
unsigned Size = DL.getTypeSizeInBits(ArgTy);
101102
unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
102103

lib/Target/AMDGPU/AMDGPUMachineFunction.cpp

+2-9
Original file line numberDiff line numberDiff line change
@@ -24,23 +24,16 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
2424
NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath),
2525
MemoryBound(false),
2626
WaveLimiter(false) {
27-
const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
28-
2927
// FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset,
3028
// except reserved size is not correctly aligned.
31-
const Function &F = MF.getFunction();
3229

3330
if (auto *Resolver = MF.getMMI().getResolver()) {
3431
if (AMDGPUPerfHintAnalysis *PHA = static_cast<AMDGPUPerfHintAnalysis*>(
3532
Resolver->getAnalysisIfAvailable(&AMDGPUPerfHintAnalysisID, true))) {
36-
MemoryBound = PHA->isMemoryBound(&F);
37-
WaveLimiter = PHA->needsWaveLimiter(&F);
33+
MemoryBound = PHA->isMemoryBound(&MF.getFunction());
34+
WaveLimiter = PHA->needsWaveLimiter(&MF.getFunction());
3835
}
3936
}
40-
41-
CallingConv::ID CC = F.getCallingConv();
42-
if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL)
43-
ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign);
4437
}
4538

4639
unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,

lib/Target/AMDGPU/AMDGPUMachineFunction.h

+13-2
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
2323
SmallDenseMap<const GlobalValue *, unsigned, 4> LocalMemoryObjects;
2424

2525
protected:
26-
uint64_t ExplicitKernArgSize; // Cache for this.
27-
unsigned MaxKernArgAlign; // Cache for this.
26+
uint64_t ExplicitKernArgSize;
27+
unsigned MaxKernArgAlign;
2828

2929
/// Number of bytes in the LDS that are being used.
3030
unsigned LDSSize;
@@ -44,6 +44,17 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
4444
public:
4545
AMDGPUMachineFunction(const MachineFunction &MF);
4646

47+
uint64_t allocateKernArg(uint64_t Size, unsigned Align) {
48+
assert(isPowerOf2_32(Align));
49+
ExplicitKernArgSize = alignTo(ExplicitKernArgSize, Align);
50+
51+
uint64_t Result = ExplicitKernArgSize;
52+
ExplicitKernArgSize += Size;
53+
54+
MaxKernArgAlign = std::max(Align, MaxKernArgAlign);
55+
return Result;
56+
}
57+
4758
uint64_t getExplicitKernArgSize() const {
4859
return ExplicitKernArgSize;
4960
}

0 commit comments

Comments
 (0)