From da4504b2d3c6629fbd58634bf76f1b85939d07cf Mon Sep 17 00:00:00 2001
From: Keno Fischer <[email protected]>
Date: Fri, 15 Sep 2017 18:30:59 -0400
Subject: [PATCH] [Mem2Reg] Also handle memcpy

Summary:
In Julia, when we know we're moving data between two memory locations,
we always emit that as a memcpy rather than a load/store pair. However,
this can give worse optimization results in certain cases, because some
optimizations that can handle load/store pairs cannot handle memcpys.
Mem2reg is one of these optimizations. This patch adds rudimentary
support to mem2reg for recognizing memcpys that cover the whole alloca
we're promoting. While several more sophisticated passes (SROA, GVN)
can achieve similar optimizations, it is preferable to catch these
kinds of cases early, exposing optimization opportunities before we
reach those later passes. The approach taken here is to split the
memcpy into a load/store pair early (after the legality analysis) and
run the rest of the analysis only on loads and stores. It would of
course be possible to leave the memcpy as is and generate the leftover
load or store only on demand, but that would entail a significantly
larger patch for unclear benefit.

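To illustrate (a minimal sketch mirroring the test_cpy_same case in the
new test; value names are illustrative), a memcpy that covers the whole
alloca, such as

  %a = alloca i64
  %b = alloca i64
  store i64 %x, i64* %a
  call void @llvm.memcpy.p0i64.p0i64.i32(i64* %b, i64* %a, i32 8, i32 0, i1 0)

is split into the equivalent load/store pair

  %val = load i64, i64* %a
  store i64 %val, i64* %b

which the rest of mem2reg then promotes as usual.
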
Reviewers: chandlerc, dberlin

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D37939
---
 lib/Transforms/Utils/PromoteMemoryToRegister.cpp | 166 ++++++++++++++++++++---
 test/Transforms/Mem2Reg/memcpy.ll                | 101 ++++++++++++++
 2 files changed, 251 insertions(+), 16 deletions(-)
 create mode 100644 test/Transforms/Mem2Reg/memcpy.ll

diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index ac28f59..b08a0a1 100644
--- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -49,6 +49,58 @@ STATISTIC(NumSingleStore, "Number of alloca's promoted with a single store");
 STATISTIC(NumDeadAlloca, "Number of dead alloca's removed");
 STATISTIC(NumPHIInsert, "Number of PHI nodes inserted");
 
+static bool isSplittableMemCpy(const MemCpyInst *MCI, const AllocaInst *AI) {
+  // Punt if this alloca is an array allocation.
+  if (AI->isArrayAllocation())
+    return false;
+  if (MCI->isVolatile())
+    return false;
+  Value *Length = MCI->getLength();
+  if (!isa<ConstantInt>(Length))
+    return false;
+  // Anything less than the full alloca, we leave for SROA.
+  const DataLayout &DL = AI->getModule()->getDataLayout();
+  size_t AIElSize = DL.getTypeAllocSize(AI->getAllocatedType());
+  if (cast<ConstantInt>(Length)->getZExtValue() != AIElSize)
+    return false;
+  // If the other argument is also an alloca, we need to be sure that either
+  // the types are bitcastable, or the other alloca is not eligible for
+  // promotion (e.g. because the memcpy covers less than the whole size of
+  // that alloca); otherwise we risk turning a promotable alloca into a
+  // non-promotable one when splitting the memcpy.
+  AllocaInst *OtherAI = dyn_cast<AllocaInst>(
+      AI == MCI->getSource() ? MCI->getDest() : MCI->getSource());
+  if (OtherAI) {
+    if (!CastInst::isBitCastable(AI->getAllocatedType(),
+                                 OtherAI->getAllocatedType()) &&
+        DL.getTypeAllocSize(OtherAI->getAllocatedType()) == AIElSize)
+      return false;
+  }
+  return true;
+}
+
+/// Look at the result of a bitcast and see if it's only used by lifetime
+/// intrinsics or splittable memcpys. This is needed because IRBuilder
+/// will always insert a bitcast to i8* for these intrinsics.
+static bool onlyHasCanonicalizableUsers(const AllocaInst *AI, const Value *V) {
+  for (const User *U : V->users()) {
+    const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U);
+    if (!II)
+      return false;
+
+    if (isa<MemCpyInst>(II)) {
+      if (!isSplittableMemCpy(cast<MemCpyInst>(II), AI))
+        return false;
+      continue;
+    }
+
+    if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
+        II->getIntrinsicID() != Intrinsic::lifetime_end)
+      return false;
+  }
+  return true;
+}
+
 bool llvm::isAllocaPromotable(const AllocaInst *AI) {
   // FIXME: If the memory unit is of pointer or integer type, we can permit
   // assignments to subsections of the memory unit.
@@ -68,6 +120,9 @@ bool llvm::isAllocaPromotable(const AllocaInst *AI) {
       // not have any meaning for a local alloca.
       if (SI->isVolatile())
         return false;
+    } else if (const MemCpyInst *MCI = dyn_cast<MemCpyInst>(U)) {
+      if (!isSplittableMemCpy(MCI, AI))
+        return false;
     } else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
       if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
           II->getIntrinsicID() != Intrinsic::lifetime_end)
@@ -75,7 +130,7 @@ bool llvm::isAllocaPromotable(const AllocaInst *AI) {
     } else if (const BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
       if (BCI->getType() != Type::getInt8PtrTy(U->getContext(), AS))
         return false;
-      if (!onlyUsedByLifetimeMarkers(BCI))
+      if (!onlyHasCanonicalizableUsers(AI, BCI))
         return false;
     } else if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) {
       if (GEPI->getType() != Type::getInt8PtrTy(U->getContext(), AS))
@@ -181,7 +235,13 @@ public:
   /// This code only looks at accesses to allocas.
   static bool isInterestingInstruction(const Instruction *I) {
+    if (isa<MemCpyInst>(I)) {
+      const MemCpyInst *MCI = cast<MemCpyInst>(I);
+      return isa<AllocaInst>(MCI->getSource()) ||
+             isa<AllocaInst>(MCI->getDest());
+    } else {
     return (isa<LoadInst>(I) && isa<AllocaInst>(I->getOperand(0))) ||
            (isa<StoreInst>(I) && isa<AllocaInst>(I->getOperand(1)));
   }
+  }
 
   /// Get or calculate the index of the specified instruction.
@@ -208,6 +264,25 @@ public:
     return It->second;
   }
 
+  // When we split a memcpy intrinsic, we need to update the numbering in this
+  // struct. To make sure the relative ordering remains the same, we give both
+  // the LI and the SI the number that the MCI used to have (if they are both
+  // interesting). This means that they will have equal numbers, which usually
+  // can't happen. However, this is fine here, because the two can never
+  // reference the same alloca (memcpy operands may not overlap), and we only
+  // ever compare instruction indices for instructions that operate on the
+  // same alloca.
+  void splitMemCpy(MemCpyInst *MCI, LoadInst *LI, StoreInst *SI) {
+    DenseMap<const Instruction *, unsigned>::iterator It =
+        InstNumbers.find(MCI);
+    if (It == InstNumbers.end())
+      return;
+    unsigned MemCpyNumber = It->second;
+    InstNumbers[LI] = MemCpyNumber;
+    InstNumbers[SI] = MemCpyNumber;
+    deleteValue(MCI);
+  }
+
   void deleteValue(const Instruction *I) { InstNumbers.erase(I); }
 
   void clear() { InstNumbers.clear(); }
@@ -305,9 +380,58 @@ static void addAssumeNonNull(AssumptionCache *AC, LoadInst *LI) {
   AC->registerAssumption(CI);
 }
 
-static void removeLifetimeIntrinsicUsers(AllocaInst *AI) {
-  // Knowing that this alloca is promotable, we know that it's safe to kill all
-  // instructions except for load and store.
+/// Split a memcpy instruction into the corresponding load/store pair. This is
+/// a little more complicated than one might imagine, because the side of the
+/// copy we're not currently processing might also be a promotable alloca, and
+/// we need to be careful not to break the promotable predicate for that other
+/// alloca (if any).
+static void doMemCpySplit(LargeBlockInfo &LBI, MemCpyInst *MCI,
+                          AllocaInst *AI) {
+  AAMDNodes AA;
+  MCI->getAAMetadata(AA);
+  Value *MCISrc = MCI->getSource();
+  Type *LoadType = AI->getAllocatedType();
+  AllocaInst *SrcAI = dyn_cast<AllocaInst>(MCISrc);
+  if (SrcAI && SrcAI->getType() != AI->getType()) {
+    if (CastInst::isBitCastable(SrcAI->getAllocatedType(), LoadType))
+      LoadType = SrcAI->getAllocatedType();
+  }
+  if (cast<PointerType>(MCISrc->getType())->getElementType() != LoadType)
+    MCISrc = CastInst::Create(
+        Instruction::BitCast, MCISrc,
+        LoadType->getPointerTo(
+            cast<PointerType>(MCISrc->getType())->getAddressSpace()),
+        "", MCI);
+  // This might add to the end of the use list, but that's fine. At worst,
+  // we'd not visit the instructions we insert here, but we don't care
+  // about them in this loop anyway.
+  LoadInst *LI = new LoadInst(LoadType, MCISrc, "", MCI->isVolatile(),
+                              MCI->getAlignment(), MCI);
+  Value *Val = LI;
+  Value *MCIDest = MCI->getDest();
+  AllocaInst *DestAI = dyn_cast<AllocaInst>(MCIDest);
+  Type *DestElTy = DestAI ? DestAI->getAllocatedType() : AI->getAllocatedType();
+  if (LI->getType() != DestElTy &&
+      CastInst::isBitCastable(LI->getType(), DestElTy))
+    Val = CastInst::Create(Instruction::BitCast, Val, DestElTy, "", MCI);
+  if (cast<PointerType>(MCIDest->getType())->getElementType() != Val->getType())
+    MCIDest = CastInst::Create(
+        Instruction::BitCast, MCIDest,
+        Val->getType()->getPointerTo(
+            cast<PointerType>(MCIDest->getType())->getAddressSpace()),
+        "", MCI);
+  StoreInst *SI =
+      new StoreInst(Val, MCIDest, MCI->isVolatile(), MCI->getAlignment(), MCI);
+  LI->setAAMetadata(AA);
+  SI->setAAMetadata(AA);
+  LBI.splitMemCpy(MCI, LI, SI);
+  MCI->eraseFromParent();
+}
+
+static void canonicalizeUsers(LargeBlockInfo &LBI, AllocaInst *AI) {
+  // Knowing that this alloca is promotable, we know that it's safe to split
+  // memcpys into load/store pairs and to kill all other instructions except
+  // for loads and stores.
 
   for (auto UI = AI->user_begin(), UE = AI->user_end(); UI != UE;) {
     Instruction *I = cast<Instruction>(*UI);
@@ -315,14 +439,24 @@ static void removeLifetimeIntrinsicUsers(AllocaInst *AI) {
     if (isa<LoadInst>(I) || isa<StoreInst>(I))
       continue;
 
+    if (isa<MemCpyInst>(I)) {
+      MemCpyInst *MCI = cast<MemCpyInst>(I);
+      doMemCpySplit(LBI, MCI, AI);
+      continue;
+    }
+
     if (!I->getType()->isVoidTy()) {
-      // The only users of this bitcast/GEP instruction are lifetime intrinsics.
-      // Follow the use/def chain to erase them now instead of leaving it for
-      // dead code elimination later.
+      // The only users of this bitcast/GEP instruction are lifetime/memcpy
+      // intrinsics. Split memcpys and delete lifetime intrinsics.
       for (auto UUI = I->user_begin(), UUE = I->user_end(); UUI != UUE;) {
         Instruction *Inst = cast<Instruction>(*UUI);
         ++UUI;
-        Inst->eraseFromParent();
+        if (isa<MemCpyInst>(Inst)) {
+          doMemCpySplit(LBI, cast<MemCpyInst>(Inst), AI);
+        } else {
+          // Must be a lifetime intrinsic.
+          Inst->eraseFromParent();
+        }
       }
     }
     I->eraseFromParent();
@@ -542,7 +676,7 @@ void PromoteMem2Reg::run() {
     assert(AI->getParent()->getParent() == &F &&
            "All allocas should be in the same function, which is same as DF!");
 
-    removeLifetimeIntrinsicUsers(AI);
+    canonicalizeUsers(LBI, AI);
 
     if (AI->use_empty()) {
       // If there are no uses of the alloca, just delete it now.
diff --git a/test/Transforms/Mem2Reg/memcpy.ll b/test/Transforms/Mem2Reg/memcpy.ll
new file mode 100644
index 0000000..fbc4096
--- /dev/null
+++ b/test/Transforms/Mem2Reg/memcpy.ll
@@ -0,0 +1,101 @@
+; RUN: opt < %s -mem2reg -S | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+declare void @llvm.memcpy.p0i128.p0i64.i32(i128 *, i64 *, i32, i32, i1)
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8 *, i8 *, i32, i32, i1)
+declare void @llvm.memcpy.p0i64.p0i64.i32(i64 *, i64 *, i32, i32, i1)
+declare void @llvm.memcpy.p0f64.p0i64.i32(double *, i64 *, i32, i32, i1)
+
+define i128 @test_cpy_different(i64) {
+; CHECK-LABEL: @test_cpy_different
+; CHECK-NOT: alloca i64
+; CHECK: store i64 %0
+  %a = alloca i64
+  %b = alloca i128
+  store i128 0, i128 *%b
+  store i64 %0, i64 *%a
+  call void @llvm.memcpy.p0i128.p0i64.i32(i128 *%b, i64 *%a, i32 8, i32 0, i1 0)
+  %loaded = load i128, i128 *%b
+  ret i128 %loaded
+}
+
+define i64 @test_cpy_same(i64) {
+; CHECK-LABEL: @test_cpy_same
+; CHECK-NOT: alloca
+; CHECK: ret i64 %0
+  %a = alloca i64
+  %b = alloca i64
+  store i64 %0, i64 *%a
+  call void @llvm.memcpy.p0i64.p0i64.i32(i64 *%b, i64 *%a, i32 8, i32 0, i1 0)
+  %loaded = load i64, i64 *%b
+  ret i64 %loaded
+}
+
+define double @test_cpy_different_type(i64) {
+; CHECK-LABEL: @test_cpy_different_type
+; CHECK-NOT: alloca
+; CHECK: bitcast i64 %0 to double
+  %a = alloca i64
+  %b = alloca double
+  store i64 %0, i64 *%a
+  call void @llvm.memcpy.p0f64.p0i64.i32(double *%b, i64 *%a, i32 8, i32 0, i1 0)
+  %loaded = load double, double *%b
+  ret double %loaded
+}
+
+define i128 @test_cpy_differenti8(i64) {
+; CHECK-LABEL: @test_cpy_differenti8
+; CHECK-NOT: alloca i64
+; CHECK: store i64 %0
+  %a = alloca i64
+  %b = alloca i128
+  store i128 0, i128 *%b
+  store i64 %0, i64 *%a
+  %acast = bitcast i64* %a to i8*
+  %bcast = bitcast i128* %b to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8 *%bcast, i8 *%acast, i32 8, i32 0, i1 0)
+  %loaded = load i128, i128 *%b
+  ret i128 %loaded
+}
+
+define i64 @test_cpy_samei8(i64) {
+; CHECK-LABEL: @test_cpy_samei8
+; CHECK-NOT: alloca
+; CHECK: ret i64 %0
+  %a = alloca i64
+  %b = alloca i64
+  store i64 %0, i64 *%a
+  %acast = bitcast i64* %a to i8*
+  %bcast = bitcast i64* %b to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8 *%bcast, i8 *%acast, i32 8, i32 0, i1 0)
+  %loaded = load i64, i64 *%b
+  ret i64 %loaded
+}
+
+define double @test_cpy_different_typei8(i64) {
+; CHECK-LABEL: @test_cpy_different_typei8
+; CHECK-NOT: alloca
+; CHECK: bitcast i64 %0 to double
+  %a = alloca i64
+  %b = alloca double
+  store i64 %0, i64 *%a
+  %acast = bitcast i64* %a to i8*
+  %bcast = bitcast double* %b to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8 *%bcast, i8 *%acast, i32 8, i32 0, i1 0)
+  %loaded = load double, double *%b
+  ret double %loaded
+}
+
+define i64 @test_cpy_differenti8_reverse(i128) {
+; CHECK-LABEL: @test_cpy_differenti8_reverse
+; CHECK-NOT: alloca i64
+  %a = alloca i64
+  %b = alloca i128
+  store i128 %0, i128 *%b
+  %acast = bitcast i64* %a to i8*
+  %bcast = bitcast i128* %b to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8 *%acast, i8 *%bcast, i32 8, i32 0, i1 0)
+  %loaded = load i64, i64 *%a
+  ret i64 %loaded
+}
-- 
2.9.3

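The new test can also be run by hand once the patch is applied to an LLVM
checkout, mirroring the RUN line above (a sketch; it assumes opt and
FileCheck are built and on PATH):

  opt -mem2reg -S test/Transforms/Mem2Reg/memcpy.ll | FileCheck test/Transforms/Mem2Reg/memcpy.ll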