Diffstat (limited to 'lib/Transforms/Scalar')
39 files changed, 2453 insertions, 1023 deletions
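Nearly every hunk below is the same mechanical migration: DataLayout stops being an optional analysis (DataLayoutPass) and becomes a mandatory property of the Module, so passes drop their cached, possibly-null `const DataLayout *DL` members and fetch it by reference at the point of use. A condensed sketch of the before/after shape, with F, BB, and I standing for a Function, BasicBlock, and Instruction already in scope:

    // Before: optional, cached per pass, may be null.
    DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
    const DataLayout *OldDL = DLP ? &DLP->getDataLayout() : nullptr;

    // After: mandatory, reached through the IR itself.
    const DataLayout &FromFunction = F.getParent()->getDataLayout();
    const DataLayout &FromBlock    = BB.getModule()->getDataLayout();
    const DataLayout &FromInstr    = I->getModule()->getDataLayout();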
diff --git a/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp index 5c74885..5aa2b97 100644 --- a/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp +++ b/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp @@ -23,15 +23,15 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" -#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -71,7 +71,6 @@ struct AlignmentFromAssumptions : public FunctionPass { ScalarEvolution *SE; DominatorTree *DT; - const DataLayout *DL; bool extractAlignmentInfo(CallInst *I, Value *&AAPtr, const SCEV *&AlignSCEV, const SCEV *&OffSCEV); @@ -123,7 +122,7 @@ static unsigned getNewAlignmentDiff(const SCEV *DiffSCEV, // If the displacement is not an exact multiple, but the remainder is a // constant, then return this remainder (but only if it is a power of 2). - uint64_t DiffUnitsAbs = abs64(DiffUnits); + uint64_t DiffUnitsAbs = std::abs(DiffUnits); if (isPowerOf2_64(DiffUnitsAbs)) return (unsigned) DiffUnitsAbs; } @@ -316,7 +315,7 @@ bool AlignmentFromAssumptions::processAssumption(CallInst *ACall) { continue; if (Instruction *K = dyn_cast<Instruction>(J)) - if (isValidAssumeForContext(ACall, K, DL, DT)) + if (isValidAssumeForContext(ACall, K, DT)) WorkList.push_back(K); } @@ -400,7 +399,7 @@ bool AlignmentFromAssumptions::processAssumption(CallInst *ACall) { Visited.insert(J); for (User *UJ : J->users()) { Instruction *K = cast<Instruction>(UJ); - if (!Visited.count(K) && isValidAssumeForContext(ACall, K, DL, DT)) + if (!Visited.count(K) && isValidAssumeForContext(ACall, K, DT)) WorkList.push_back(K); } } @@ -413,8 +412,6 @@ bool AlignmentFromAssumptions::runOnFunction(Function &F) { auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); SE = &getAnalysis<ScalarEvolution>(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : nullptr; NewDestAlignments.clear(); NewSrcAlignments.clear(); diff --git a/lib/Transforms/Scalar/Android.mk b/lib/Transforms/Scalar/Android.mk index ed803cd..cf30f39 100644 --- a/lib/Transforms/Scalar/Android.mk +++ b/lib/Transforms/Scalar/Android.mk @@ -20,6 +20,7 @@ transforms_scalar_SRC_FILES := \ LoopDeletion.cpp \ LoopIdiomRecognize.cpp \ LoopInstSimplify.cpp \ + LoopInterchange.cpp \ LoopRerollPass.cpp \ LoopRotation.cpp \ LoopStrengthReduce.cpp \ diff --git a/lib/Transforms/Scalar/BDCE.cpp b/lib/Transforms/Scalar/BDCE.cpp index c7bd79d..09c605e 100644 --- a/lib/Transforms/Scalar/BDCE.cpp +++ b/lib/Transforms/Scalar/BDCE.cpp @@ -64,7 +64,6 @@ struct BDCE : public FunctionPass { APInt &KnownZero2, APInt &KnownOne2); AssumptionCache *AC; - const DataLayout *DL; DominatorTree *DT; }; } @@ -95,20 +94,21 @@ void BDCE::determineLiveOperandBits(const Instruction *UserI, // however, want to do this twice, so we cache the result in APInts that live // in the caller. For the two-relevant-operands case, both operand values are // provided here. 
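The ComputeKnownBits lambda rewritten just below also picks up the updated computeKnownBits signature, which takes the DataLayout by reference rather than by pointer. A minimal sketch of the new call shape, reusing the names from this hunk:

    const DataLayout &DL = I->getModule()->getDataLayout();
    APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
    // Depth 0, threading through the assumption cache, context instruction,
    // and dominator tree as before; only the DataLayout parameter changed.
    computeKnownBits(const_cast<Value *>(V1), KnownZero, KnownOne, DL,
                     /*Depth=*/0, AC, UserI, DT);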
- auto ComputeKnownBits = [&](unsigned BitWidth, const Value *V1, - const Value *V2) { - KnownZero = APInt(BitWidth, 0); - KnownOne = APInt(BitWidth, 0); - computeKnownBits(const_cast<Value*>(V1), KnownZero, KnownOne, DL, 0, AC, - UserI, DT); - - if (V2) { - KnownZero2 = APInt(BitWidth, 0); - KnownOne2 = APInt(BitWidth, 0); - computeKnownBits(const_cast<Value*>(V2), KnownZero2, KnownOne2, DL, 0, AC, - UserI, DT); - } - }; + auto ComputeKnownBits = + [&](unsigned BitWidth, const Value *V1, const Value *V2) { + const DataLayout &DL = I->getModule()->getDataLayout(); + KnownZero = APInt(BitWidth, 0); + KnownOne = APInt(BitWidth, 0); + computeKnownBits(const_cast<Value *>(V1), KnownZero, KnownOne, DL, 0, + AC, UserI, DT); + + if (V2) { + KnownZero2 = APInt(BitWidth, 0); + KnownOne2 = APInt(BitWidth, 0); + computeKnownBits(const_cast<Value *>(V2), KnownZero2, KnownOne2, DL, + 0, AC, UserI, DT); + } + }; switch (UserI->getOpcode()) { default: break; @@ -263,7 +263,6 @@ bool BDCE::runOnFunction(Function& F) { return false; AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - DL = F.getParent()->getDataLayout(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); DenseMap<Instruction *, APInt> AliveBits; diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index d297eb1..d12fdb7 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -18,6 +18,7 @@ add_llvm_library(LLVMScalarOpts LoopDeletion.cpp LoopIdiomRecognize.cpp LoopInstSimplify.cpp + LoopInterchange.cpp LoopRerollPass.cpp LoopRotation.cpp LoopStrengthReduce.cpp diff --git a/lib/Transforms/Scalar/ConstantHoisting.cpp b/lib/Transforms/Scalar/ConstantHoisting.cpp index e3aab4b..4288742 100644 --- a/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -43,6 +43,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include <tuple> using namespace llvm; diff --git a/lib/Transforms/Scalar/ConstantProp.cpp b/lib/Transforms/Scalar/ConstantProp.cpp index 29d4e05..c974ebb 100644 --- a/lib/Transforms/Scalar/ConstantProp.cpp +++ b/lib/Transforms/Scalar/ConstantProp.cpp @@ -22,7 +22,6 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/IR/Constant.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" #include "llvm/Pass.h" @@ -68,8 +67,7 @@ bool ConstantPropagation::runOnFunction(Function &F) { WorkList.insert(&*i); } bool Changed = false; - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - const DataLayout *DL = DLP ? 
&DLP->getDataLayout() : nullptr; + const DataLayout &DL = F.getParent()->getDataLayout(); TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 5a3b5cf..912d527 100644 --- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -19,6 +19,7 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -126,8 +127,9 @@ bool CorrelatedValuePropagation::processPHI(PHINode *P) { Changed = true; } - // FIXME: Provide DL, TLI, DT, AT to SimplifyInstruction. - if (Value *V = SimplifyInstruction(P)) { + // FIXME: Provide TLI, DT, AT to SimplifyInstruction. + const DataLayout &DL = BB->getModule()->getDataLayout(); + if (Value *V = SimplifyInstruction(P, DL)) { P->replaceAllUsesWith(V); P->eraseFromParent(); Changed = true; diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp index c2ce1d5..cb8981b 100644 --- a/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -23,6 +23,7 @@ #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -33,7 +34,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" -#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -78,7 +79,8 @@ namespace { bool HandleFree(CallInst *F); bool handleEndBlock(BasicBlock &BB); void RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc, - SmallSetVector<Value*, 16> &DeadStackObjects); + SmallSetVector<Value *, 16> &DeadStackObjects, + const DataLayout &DL); void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); @@ -194,18 +196,12 @@ static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo *TLI) { /// describe the memory operations for this instruction. static AliasAnalysis::Location getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { - const DataLayout *DL = AA.getDataLayout(); if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) return AA.getLocation(SI); if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Inst)) { // memcpy/memmove/memset. AliasAnalysis::Location Loc = AA.getLocationForDest(MI); - // If we don't have target data around, an unknown size in Location means - // that we should use the size of the pointee type. This isn't valid for - // memset/memcpy, which writes more than an i8. - if (Loc.Size == AliasAnalysis::UnknownSize && DL == nullptr) - return AliasAnalysis::Location(); return Loc; } @@ -215,11 +211,6 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { switch (II->getIntrinsicID()) { default: return AliasAnalysis::Location(); // Unhandled intrinsic. case Intrinsic::init_trampoline: - // If we don't have target data around, an unknown size in Location means - // that we should use the size of the pointee type. This isn't valid for - // init.trampoline, which writes more than an i8. 
- if (!DL) return AliasAnalysis::Location(); - // FIXME: We don't know the size of the trampoline, so we can't really // handle it here. return AliasAnalysis::Location(II->getArgOperand(0)); @@ -321,9 +312,10 @@ static Value *getStoredPointerOperand(Instruction *I) { return CS.getArgument(0); } -static uint64_t getPointerSize(const Value *V, AliasAnalysis &AA) { +static uint64_t getPointerSize(const Value *V, const DataLayout &DL, + const TargetLibraryInfo *TLI) { uint64_t Size; - if (getObjectSize(V, Size, AA.getDataLayout(), AA.getTargetLibraryInfo())) + if (getObjectSize(V, Size, DL, TLI)) return Size; return AliasAnalysis::UnknownSize; } @@ -343,10 +335,9 @@ namespace { /// overwritten by 'Later', or 'OverwriteUnknown' if nothing can be determined static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, const AliasAnalysis::Location &Earlier, - AliasAnalysis &AA, - int64_t &EarlierOff, - int64_t &LaterOff) { - const DataLayout *DL = AA.getDataLayout(); + const DataLayout &DL, + const TargetLibraryInfo *TLI, + int64_t &EarlierOff, int64_t &LaterOff) { const Value *P1 = Earlier.Ptr->stripPointerCasts(); const Value *P2 = Later.Ptr->stripPointerCasts(); @@ -367,7 +358,7 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, // Otherwise, we have to have size information, and the later store has to be // larger than the earlier one. if (Later.Size == AliasAnalysis::UnknownSize || - Earlier.Size == AliasAnalysis::UnknownSize || DL == nullptr) + Earlier.Size == AliasAnalysis::UnknownSize) return OverwriteUnknown; // Check to see if the later store is to the entire object (either a global, @@ -382,7 +373,7 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, return OverwriteUnknown; // If the "Later" store is to a recognizable object, get its size. - uint64_t ObjectSize = getPointerSize(UO2, AA); + uint64_t ObjectSize = getPointerSize(UO2, DL, TLI); if (ObjectSize != AliasAnalysis::UnknownSize) if (ObjectSize == Later.Size && ObjectSize >= Earlier.Size) return OverwriteComplete; @@ -560,8 +551,10 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { if (isRemovable(DepWrite) && !isPossibleSelfRead(Inst, Loc, DepWrite, *AA)) { int64_t InstWriteOffset, DepWriteOffset; - OverwriteResult OR = isOverwrite(Loc, DepLoc, *AA, - DepWriteOffset, InstWriteOffset); + const DataLayout &DL = BB.getModule()->getDataLayout(); + OverwriteResult OR = + isOverwrite(Loc, DepLoc, DL, AA->getTargetLibraryInfo(), + DepWriteOffset, InstWriteOffset); if (OR == OverwriteComplete) { DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DepWrite << "\n KILLER: " << *Inst << '\n'); @@ -655,6 +648,7 @@ bool DSE::HandleFree(CallInst *F) { AliasAnalysis::Location Loc = AliasAnalysis::Location(F->getOperand(0)); SmallVector<BasicBlock *, 16> Blocks; Blocks.push_back(F->getParent()); + const DataLayout &DL = F->getModule()->getDataLayout(); while (!Blocks.empty()) { BasicBlock *BB = Blocks.pop_back_val(); @@ -668,7 +662,7 @@ bool DSE::HandleFree(CallInst *F) { break; Value *DepPointer = - GetUnderlyingObject(getStoredPointerOperand(Dependency)); + GetUnderlyingObject(getStoredPointerOperand(Dependency), DL); // Check for aliasing. 
if (!AA->isMustAlias(F->getArgOperand(0), DepPointer)) @@ -728,6 +722,8 @@ bool DSE::handleEndBlock(BasicBlock &BB) { if (AI->hasByValOrInAllocaAttr()) DeadStackObjects.insert(AI); + const DataLayout &DL = BB.getModule()->getDataLayout(); + // Scan the basic block backwards for (BasicBlock::iterator BBI = BB.end(); BBI != BB.begin(); ){ --BBI; @@ -736,7 +732,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { if (hasMemoryWrite(BBI, TLI) && isRemovable(BBI)) { // See through pointer-to-pointer bitcasts SmallVector<Value *, 4> Pointers; - GetUnderlyingObjects(getStoredPointerOperand(BBI), Pointers); + GetUnderlyingObjects(getStoredPointerOperand(BBI), Pointers, DL); // Stores to stack values are valid candidates for removal. bool AllDead = true; @@ -799,8 +795,8 @@ bool DSE::handleEndBlock(BasicBlock &BB) { // the call is live. DeadStackObjects.remove_if([&](Value *I) { // See if the call site touches the value. - AliasAnalysis::ModRefResult A = - AA->getModRefInfo(CS, I, getPointerSize(I, *AA)); + AliasAnalysis::ModRefResult A = AA->getModRefInfo( + CS, I, getPointerSize(I, DL, AA->getTargetLibraryInfo())); return A == AliasAnalysis::ModRef || A == AliasAnalysis::Ref; }); @@ -835,7 +831,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { // Remove any allocas from the DeadPointer set that are loaded, as this // makes any stores above the access live. - RemoveAccessedObjects(LoadedLoc, DeadStackObjects); + RemoveAccessedObjects(LoadedLoc, DeadStackObjects, DL); // If all of the allocas were clobbered by the access then we're not going // to find anything else to process. @@ -850,8 +846,9 @@ bool DSE::handleEndBlock(BasicBlock &BB) { /// of the stack objects in the DeadStackObjects set. If so, they become live /// because the location is being loaded. void DSE::RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc, - SmallSetVector<Value*, 16> &DeadStackObjects) { - const Value *UnderlyingPointer = GetUnderlyingObject(LoadedLoc.Ptr); + SmallSetVector<Value *, 16> &DeadStackObjects, + const DataLayout &DL) { + const Value *UnderlyingPointer = GetUnderlyingObject(LoadedLoc.Ptr, DL); // A constant can't be in the dead pointer set. if (isa<Constant>(UnderlyingPointer)) @@ -867,7 +864,8 @@ void DSE::RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc, // Remove objects that could alias LoadedLoc. DeadStackObjects.remove_if([&](Value *I) { // See if the loaded location could alias the stack location. 
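// Note: AliasAnalysis::Location pairs a pointer with an access size. That size
// now comes from getObjectSize() using the module's DataLayout plus the
// TargetLibraryInfo (see getPointerSize above), which is what lets DSE stop
// going through AA.getDataLayout().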
- AliasAnalysis::Location StackLoc(I, getPointerSize(I, *AA)); + AliasAnalysis::Location StackLoc( + I, getPointerSize(I, DL, AA->getTargetLibraryInfo())); return !AA->isNoAlias(StackLoc, LoadedLoc); }); } diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp index 9309623..d5b9e03 100644 --- a/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/lib/Transforms/Scalar/EarlyCSE.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" @@ -27,7 +28,7 @@ #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/RecyclingAllocator.h" -#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" #include <deque> @@ -263,7 +264,6 @@ namespace { class EarlyCSE { public: Function &F; - const DataLayout *DL; const TargetLibraryInfo &TLI; const TargetTransformInfo &TTI; DominatorTree &DT; @@ -308,11 +308,10 @@ public: unsigned CurrentGeneration; /// \brief Set up the EarlyCSE runner for a particular function. - EarlyCSE(Function &F, const DataLayout *DL, const TargetLibraryInfo &TLI, + EarlyCSE(Function &F, const TargetLibraryInfo &TLI, const TargetTransformInfo &TTI, DominatorTree &DT, AssumptionCache &AC) - : F(F), DL(DL), TLI(TLI), TTI(TTI), DT(DT), AC(AC), CurrentGeneration(0) { - } + : F(F), TLI(TLI), TTI(TTI), DT(DT), AC(AC), CurrentGeneration(0) {} bool run(); @@ -469,6 +468,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { Instruction *LastStore = nullptr; bool Changed = false; + const DataLayout &DL = BB->getModule()->getDataLayout(); // See if any instructions in the block can be eliminated. If so, do it. If // not, add them to AvailableValues. @@ -685,14 +685,12 @@ bool EarlyCSE::run() { PreservedAnalyses EarlyCSEPass::run(Function &F, AnalysisManager<Function> *AM) { - const DataLayout *DL = F.getParent()->getDataLayout(); - auto &TLI = AM->getResult<TargetLibraryAnalysis>(F); auto &TTI = AM->getResult<TargetIRAnalysis>(F); auto &DT = AM->getResult<DominatorTreeAnalysis>(F); auto &AC = AM->getResult<AssumptionAnalysis>(F); - EarlyCSE CSE(F, DL, TLI, TTI, DT, AC); + EarlyCSE CSE(F, TLI, TTI, DT, AC); if (!CSE.run()) return PreservedAnalyses::all(); @@ -724,14 +722,12 @@ public: if (skipOptnoneFunction(F)) return false; - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - auto *DL = DLP ? 
&DLP->getDataLayout() : nullptr; auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - EarlyCSE CSE(F, DL, TLI, TTI, DT, AC); + EarlyCSE CSE(F, TLI, TTI, DT, AC); return CSE.run(); } diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 73a1f25..c73e60f 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -33,6 +33,7 @@ #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/PHITransAddr.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" @@ -45,7 +46,7 @@ #include "llvm/Support/Allocator.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" @@ -584,14 +585,13 @@ namespace { /// Emit code into this block to adjust the value defined here to the /// specified type. This handles various coercion cases. - Value *MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) const; + Value *MaterializeAdjustedValue(LoadInst *LI, GVN &gvn) const; }; class GVN : public FunctionPass { bool NoLoads; MemoryDependenceAnalysis *MD; DominatorTree *DT; - const DataLayout *DL; const TargetLibraryInfo *TLI; AssumptionCache *AC; SetVector<BasicBlock *> DeadBlocks; @@ -630,7 +630,6 @@ namespace { InstrsToErase.push_back(I); } - const DataLayout *getDataLayout() const { return DL; } DominatorTree &getDominatorTree() const { return *DT; } AliasAnalysis *getAliasAnalysis() const { return VN.getAliasAnalysis(); } MemoryDependenceAnalysis &getMemDep() const { return *MD; } @@ -956,8 +955,9 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr, return -1; int64_t StoreOffset = 0, LoadOffset = 0; - Value *StoreBase = GetPointerBaseWithConstantOffset(WritePtr,StoreOffset,&DL); - Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, &DL); + Value *StoreBase = + GetPointerBaseWithConstantOffset(WritePtr, StoreOffset, DL); + Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, DL); if (StoreBase != LoadBase) return -1; @@ -1021,13 +1021,13 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr, /// This function is called when we have a /// memdep query of a load that ends up being a clobbering store. static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr, - StoreInst *DepSI, - const DataLayout &DL) { + StoreInst *DepSI) { // Cannot handle reading from store of first-class aggregate yet. if (DepSI->getValueOperand()->getType()->isStructTy() || DepSI->getValueOperand()->getType()->isArrayTy()) return -1; + const DataLayout &DL = DepSI->getModule()->getDataLayout(); Value *StorePtr = DepSI->getPointerOperand(); uint64_t StoreSize =DL.getTypeSizeInBits(DepSI->getValueOperand()->getType()); return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, @@ -1052,11 +1052,11 @@ static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, // then we should widen it! 
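// Illustration (hypothetical IR): given two adjacent narrow loads
//   %a = load i8* %P
//   %b = load i8* %P1      ; where %P1 == %P + 1
// memdep can report that widening %a to an i16 load also covers %b;
// getLoadLoadClobberFullWidthSize (which no longer takes an explicit
// DataLayout argument) returns the widened size, and %b's byte is then
// extracted from the wide value instead of being reloaded.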
int64_t LoadOffs = 0; const Value *LoadBase = - GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, &DL); + GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, DL); unsigned LoadSize = DL.getTypeStoreSize(LoadTy); - unsigned Size = MemoryDependenceAnalysis:: - getLoadLoadClobberFullWidthSize(LoadBase, LoadOffs, LoadSize, DepLI, DL); + unsigned Size = MemoryDependenceAnalysis::getLoadLoadClobberFullWidthSize( + LoadBase, LoadOffs, LoadSize, DepLI); if (Size == 0) return -1; return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, Size*8, DL); @@ -1086,7 +1086,7 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, Constant *Src = dyn_cast<Constant>(MTI->getSource()); if (!Src) return -1; - GlobalVariable *GV = dyn_cast<GlobalVariable>(GetUnderlyingObject(Src, &DL)); + GlobalVariable *GV = dyn_cast<GlobalVariable>(GetUnderlyingObject(Src, DL)); if (!GV || !GV->isConstant()) return -1; // See if the access is within the bounds of the transfer. @@ -1104,7 +1104,7 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset); Src = ConstantExpr::getGetElementPtr(Src, OffsetCst); Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS)); - if (ConstantFoldLoadFromConstPtr(Src, &DL)) + if (ConstantFoldLoadFromConstPtr(Src, DL)) return Offset; return -1; } @@ -1157,7 +1157,7 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, Type *LoadTy, Instruction *InsertPt, GVN &gvn) { - const DataLayout &DL = *gvn.getDataLayout(); + const DataLayout &DL = SrcVal->getModule()->getDataLayout(); // If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to // widen SrcVal out to a larger load. unsigned SrcValSize = DL.getTypeStoreSize(SrcVal->getType()); @@ -1265,7 +1265,7 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset); Src = ConstantExpr::getGetElementPtr(Src, OffsetCst); Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS)); - return ConstantFoldLoadFromConstPtr(Src, &DL); + return ConstantFoldLoadFromConstPtr(Src, DL); } @@ -1281,7 +1281,7 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, gvn.getDominatorTree().properlyDominates(ValuesPerBlock[0].BB, LI->getParent())) { assert(!ValuesPerBlock[0].isUndefValue() && "Dead BB dominate this block"); - return ValuesPerBlock[0].MaterializeAdjustedValue(LI->getType(), gvn); + return ValuesPerBlock[0].MaterializeAdjustedValue(LI, gvn); } // Otherwise, we have to construct SSA form. @@ -1289,8 +1289,6 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, SSAUpdater SSAUpdate(&NewPHIs); SSAUpdate.Initialize(LI->getType(), LI->getName()); - Type *LoadTy = LI->getType(); - for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i) { const AvailableValueInBlock &AV = ValuesPerBlock[i]; BasicBlock *BB = AV.BB; @@ -1298,7 +1296,7 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, if (SSAUpdate.HasValueForBlock(BB)) continue; - SSAUpdate.AddAvailableValue(BB, AV.MaterializeAdjustedValue(LoadTy, gvn)); + SSAUpdate.AddAvailableValue(BB, AV.MaterializeAdjustedValue(LI, gvn)); } // Perform PHI construction. 
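For context, the PHI construction above is the standard SSAUpdater pattern; the next hunk changes MaterializeAdjustedValue to take the LoadInst itself rather than just its type, so the callee can reach the module's DataLayout. A condensed sketch, with names as in this file:

    SmallVector<PHINode *, 8> NewPHIs;
    SSAUpdater SSAUpdate(&NewPHIs);
    SSAUpdate.Initialize(LI->getType(), LI->getName());
    for (const AvailableValueInBlock &AV : ValuesPerBlock)
      if (!SSAUpdate.HasValueForBlock(AV.BB))
        SSAUpdate.AddAvailableValue(AV.BB, AV.MaterializeAdjustedValue(LI, gvn));
    // Ask for the value in the load's own block, inserting PHIs as needed.
    Value *V = SSAUpdate.GetValueInMiddleOfBlock(LI->getParent());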
@@ -1326,16 +1324,16 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, return V; } -Value *AvailableValueInBlock::MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) const { +Value *AvailableValueInBlock::MaterializeAdjustedValue(LoadInst *LI, + GVN &gvn) const { Value *Res; + Type *LoadTy = LI->getType(); + const DataLayout &DL = LI->getModule()->getDataLayout(); if (isSimpleValue()) { Res = getSimpleValue(); if (Res->getType() != LoadTy) { - const DataLayout *DL = gvn.getDataLayout(); - assert(DL && "Need target data to handle type mismatch case"); - Res = GetStoreValueForLoad(Res, Offset, LoadTy, BB->getTerminator(), - *DL); - + Res = GetStoreValueForLoad(Res, Offset, LoadTy, BB->getTerminator(), DL); + DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << " " << *getSimpleValue() << '\n' << *Res << '\n' << "\n\n\n"); @@ -1353,10 +1351,8 @@ Value *AvailableValueInBlock::MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) c << *Res << '\n' << "\n\n\n"); } } else if (isMemIntrinValue()) { - const DataLayout *DL = gvn.getDataLayout(); - assert(DL && "Need target data to handle type mismatch case"); - Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset, - LoadTy, BB->getTerminator(), *DL); + Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset, LoadTy, + BB->getTerminator(), DL); DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset << " " << *getMemIntrinValue() << '\n' << *Res << '\n' << "\n\n\n"); @@ -1383,6 +1379,7 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, // dependencies that produce an unknown value for the load (such as a call // that could potentially clobber the load). unsigned NumDeps = Deps.size(); + const DataLayout &DL = LI->getModule()->getDataLayout(); for (unsigned i = 0, e = NumDeps; i != e; ++i) { BasicBlock *DepBB = Deps[i].getBB(); MemDepResult DepInfo = Deps[i].getResult(); @@ -1409,9 +1406,9 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, // read by the load, we can extract the bits we need for the load from the // stored value. if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInfo.getInst())) { - if (DL && Address) { - int Offset = AnalyzeLoadFromClobberingStore(LI->getType(), Address, - DepSI, *DL); + if (Address) { + int Offset = + AnalyzeLoadFromClobberingStore(LI->getType(), Address, DepSI); if (Offset != -1) { ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB, DepSI->getValueOperand(), @@ -1428,9 +1425,9 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInfo.getInst())) { // If this is a clobber and L is the first instruction in its block, then // we have the first instruction in the entry block. - if (DepLI != LI && Address && DL) { - int Offset = AnalyzeLoadFromClobberingLoad(LI->getType(), Address, - DepLI, *DL); + if (DepLI != LI && Address) { + int Offset = + AnalyzeLoadFromClobberingLoad(LI->getType(), Address, DepLI, DL); if (Offset != -1) { ValuesPerBlock.push_back(AvailableValueInBlock::getLoad(DepBB,DepLI, @@ -1443,9 +1440,9 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, // If the clobbering value is a memset/memcpy/memmove, see if we can // forward a value on from it. 
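// Example of the memset case: after "memset(%P, 42, 64)", a 4-byte load at
// offset 8 from %P can be synthesized directly as the splat constant
// 0x2A2A2A2A (0x2A == 42) rather than reloaded from memory.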
if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInfo.getInst())) { - if (DL && Address) { + if (Address) { int Offset = AnalyzeLoadFromClobberingMemInst(LI->getType(), Address, - DepMI, *DL); + DepMI, DL); if (Offset != -1) { ValuesPerBlock.push_back(AvailableValueInBlock::getMI(DepBB, DepMI, Offset)); @@ -1484,8 +1481,8 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, if (S->getValueOperand()->getType() != LI->getType()) { // If the stored value is larger or equal to the loaded value, we can // reuse it. - if (!DL || !CanCoerceMustAliasedValueToLoad(S->getValueOperand(), - LI->getType(), *DL)) { + if (!CanCoerceMustAliasedValueToLoad(S->getValueOperand(), + LI->getType(), DL)) { UnavailableBlocks.push_back(DepBB); continue; } @@ -1501,7 +1498,7 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, if (LD->getType() != LI->getType()) { // If the stored value is larger or equal to the loaded value, we can // reuse it. - if (!DL || !CanCoerceMustAliasedValueToLoad(LD, LI->getType(),*DL)) { + if (!CanCoerceMustAliasedValueToLoad(LD, LI->getType(), DL)) { UnavailableBlocks.push_back(DepBB); continue; } @@ -1613,6 +1610,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // Check if the load can safely be moved to all the unavailable predecessors. bool CanDoPRE = true; + const DataLayout &DL = LI->getModule()->getDataLayout(); SmallVector<Instruction*, 8> NewInsts; for (auto &PredLoad : PredLoads) { BasicBlock *UnavailablePred = PredLoad.first; @@ -1833,10 +1831,11 @@ bool GVN::processLoad(LoadInst *L) { // ... to a pointer that has been loaded from before... MemDepResult Dep = MD->getDependency(L); + const DataLayout &DL = L->getModule()->getDataLayout(); // If we have a clobber and target data is around, see if this is a clobber // that we can fix up through code synthesis. - if (Dep.isClobber() && DL) { + if (Dep.isClobber()) { // Check to see if we have something like this: // store i32 123, i32* %P // %A = bitcast i32* %P to i8* @@ -1849,12 +1848,11 @@ bool GVN::processLoad(LoadInst *L) { // access code. Value *AvailVal = nullptr; if (StoreInst *DepSI = dyn_cast<StoreInst>(Dep.getInst())) { - int Offset = AnalyzeLoadFromClobberingStore(L->getType(), - L->getPointerOperand(), - DepSI, *DL); + int Offset = AnalyzeLoadFromClobberingStore( + L->getType(), L->getPointerOperand(), DepSI); if (Offset != -1) AvailVal = GetStoreValueForLoad(DepSI->getValueOperand(), Offset, - L->getType(), L, *DL); + L->getType(), L, DL); } // Check to see if we have something like this: @@ -1867,9 +1865,8 @@ bool GVN::processLoad(LoadInst *L) { if (DepLI == L) return false; - int Offset = AnalyzeLoadFromClobberingLoad(L->getType(), - L->getPointerOperand(), - DepLI, *DL); + int Offset = AnalyzeLoadFromClobberingLoad( + L->getType(), L->getPointerOperand(), DepLI, DL); if (Offset != -1) AvailVal = GetLoadValueForLoad(DepLI, Offset, L->getType(), L, *this); } @@ -1877,11 +1874,10 @@ bool GVN::processLoad(LoadInst *L) { // If the clobbering value is a memset/memcpy/memmove, see if we can forward // a value on from it. 
if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(Dep.getInst())) { - int Offset = AnalyzeLoadFromClobberingMemInst(L->getType(), - L->getPointerOperand(), - DepMI, *DL); + int Offset = AnalyzeLoadFromClobberingMemInst( + L->getType(), L->getPointerOperand(), DepMI, DL); if (Offset != -1) - AvailVal = GetMemInstValueForLoad(DepMI, Offset, L->getType(), L, *DL); + AvailVal = GetMemInstValueForLoad(DepMI, Offset, L->getType(), L, DL); } if (AvailVal) { @@ -1932,17 +1928,13 @@ bool GVN::processLoad(LoadInst *L) { // actually have the same type. See if we know how to reuse the stored // value (depending on its type). if (StoredVal->getType() != L->getType()) { - if (DL) { - StoredVal = CoerceAvailableValueToLoadType(StoredVal, L->getType(), - L, *DL); - if (!StoredVal) - return false; - - DEBUG(dbgs() << "GVN COERCED STORE:\n" << *DepSI << '\n' << *StoredVal - << '\n' << *L << "\n\n\n"); - } - else + StoredVal = + CoerceAvailableValueToLoadType(StoredVal, L->getType(), L, DL); + if (!StoredVal) return false; + + DEBUG(dbgs() << "GVN COERCED STORE:\n" << *DepSI << '\n' << *StoredVal + << '\n' << *L << "\n\n\n"); } // Remove it! @@ -1961,17 +1953,12 @@ bool GVN::processLoad(LoadInst *L) { // the same type. See if we know how to reuse the previously loaded value // (depending on its type). if (DepLI->getType() != L->getType()) { - if (DL) { - AvailableVal = CoerceAvailableValueToLoadType(DepLI, L->getType(), - L, *DL); - if (!AvailableVal) - return false; - - DEBUG(dbgs() << "GVN COERCED LOAD:\n" << *DepLI << "\n" << *AvailableVal - << "\n" << *L << "\n\n\n"); - } - else + AvailableVal = CoerceAvailableValueToLoadType(DepLI, L->getType(), L, DL); + if (!AvailableVal) return false; + + DEBUG(dbgs() << "GVN COERCED LOAD:\n" << *DepLI << "\n" << *AvailableVal + << "\n" << *L << "\n\n\n"); } // Remove it! @@ -2239,6 +2226,7 @@ bool GVN::processInstruction(Instruction *I) { // to value numbering it. Value numbering often exposes redundancies, for // example if it determines that %y is equal to %x then the instruction // "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify. + const DataLayout &DL = I->getModule()->getDataLayout(); if (Value *V = SimplifyInstruction(I, DL, TLI, DT, AC)) { I->replaceAllUsesWith(V); if (MD && V->getType()->getScalarType()->isPointerTy()) @@ -2357,8 +2345,6 @@ bool GVN::runOnFunction(Function& F) { if (!NoLoads) MD = &getAnalysis<MemoryDependenceAnalysis>(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? 
&DLP->getDataLayout() : nullptr; AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); VN.setAliasAnalysis(&getAnalysis<AliasAnalysis>()); diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index f99ebbc..51e8041 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -31,6 +31,7 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" @@ -44,7 +45,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SimplifyIndVar.h" @@ -73,7 +73,6 @@ namespace { LoopInfo *LI; ScalarEvolution *SE; DominatorTree *DT; - const DataLayout *DL; TargetLibraryInfo *TLI; const TargetTransformInfo *TTI; @@ -82,8 +81,8 @@ namespace { public: static char ID; // Pass identification, replacement for typeid - IndVarSimplify() : LoopPass(ID), LI(nullptr), SE(nullptr), DT(nullptr), - DL(nullptr), Changed(false) { + IndVarSimplify() + : LoopPass(ID), LI(nullptr), SE(nullptr), DT(nullptr), Changed(false) { initializeIndVarSimplifyPass(*PassRegistry::getPassRegistry()); } @@ -663,14 +662,14 @@ namespace { /// extended by this sign or zero extend operation. This is used to determine /// the final width of the IV before actually widening it. static void visitIVCast(CastInst *Cast, WideIVInfo &WI, ScalarEvolution *SE, - const DataLayout *DL, const TargetTransformInfo *TTI) { + const TargetTransformInfo *TTI) { bool IsSigned = Cast->getOpcode() == Instruction::SExt; if (!IsSigned && Cast->getOpcode() != Instruction::ZExt) return; Type *Ty = Cast->getType(); uint64_t Width = SE->getTypeSizeInBits(Ty); - if (DL && !DL->isLegalInteger(Width)) + if (!Cast->getModule()->getDataLayout().isLegalInteger(Width)) return; // Cast is either an sext or zext up to this point. @@ -1201,7 +1200,6 @@ PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) { namespace { class IndVarSimplifyVisitor : public IVVisitor { ScalarEvolution *SE; - const DataLayout *DL; const TargetTransformInfo *TTI; PHINode *IVPhi; @@ -1209,9 +1207,9 @@ namespace { WideIVInfo WI; IndVarSimplifyVisitor(PHINode *IV, ScalarEvolution *SCEV, - const DataLayout *DL, const TargetTransformInfo *TTI, + const TargetTransformInfo *TTI, const DominatorTree *DTree) - : SE(SCEV), DL(DL), TTI(TTI), IVPhi(IV) { + : SE(SCEV), TTI(TTI), IVPhi(IV) { DT = DTree; WI.NarrowIV = IVPhi; if (ReduceLiveIVs) @@ -1219,9 +1217,7 @@ namespace { } // Implement the interface used by simplifyUsersOfIV. - void visitCast(CastInst *Cast) override { - visitIVCast(Cast, WI, SE, DL, TTI); - } + void visitCast(CastInst *Cast) override { visitIVCast(Cast, WI, SE, TTI); } }; } @@ -1255,7 +1251,7 @@ void IndVarSimplify::SimplifyAndExtend(Loop *L, PHINode *CurrIV = LoopPhis.pop_back_val(); // Information about sign/zero extensions of CurrIV. 
- IndVarSimplifyVisitor Visitor(CurrIV, SE, DL, TTI, DT); + IndVarSimplifyVisitor Visitor(CurrIV, SE, TTI, DT); Changed |= simplifyUsersOfIV(CurrIV, SE, &LPM, DeadInsts, &Visitor); @@ -1521,9 +1517,8 @@ static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) { /// FIXME: Accept non-unit stride as long as SCEV can reduce BECount * Stride. /// This is difficult in general for SCEV because of potential overflow. But we /// could at least handle constant BECounts. -static PHINode * -FindLoopCounter(Loop *L, const SCEV *BECount, - ScalarEvolution *SE, DominatorTree *DT, const DataLayout *DL) { +static PHINode *FindLoopCounter(Loop *L, const SCEV *BECount, + ScalarEvolution *SE, DominatorTree *DT) { uint64_t BCWidth = SE->getTypeSizeInBits(BECount->getType()); Value *Cond = @@ -1552,7 +1547,8 @@ FindLoopCounter(Loop *L, const SCEV *BECount, // AR may be wider than BECount. With eq/ne tests overflow is immaterial. // AR may not be a narrower type, or we may never exit. uint64_t PhiWidth = SE->getTypeSizeInBits(AR->getType()); - if (PhiWidth < BCWidth || (DL && !DL->isLegalInteger(PhiWidth))) + if (PhiWidth < BCWidth || + !L->getHeader()->getModule()->getDataLayout().isLegalInteger(PhiWidth)) continue; const SCEV *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE)); @@ -1705,51 +1701,15 @@ LinearFunctionTestReplace(Loop *L, // compare against the post-incremented value, otherwise we must compare // against the preincremented value. if (L->getExitingBlock() == L->getLoopLatch()) { + // Add one to the "backedge-taken" count to get the trip count. + // This addition may overflow, which is valid as long as the comparison is + // truncated to BackedgeTakenCount->getType(). + IVCount = SE->getAddExpr(BackedgeTakenCount, + SE->getConstant(BackedgeTakenCount->getType(), 1)); // The BackedgeTaken expression contains the number of times that the // backedge branches to the loop header. This is one less than the // number of times the loop executes, so use the incremented indvar. - llvm::Value *IncrementedIndvar = - IndVar->getIncomingValueForBlock(L->getExitingBlock()); - const auto *IncrementedIndvarSCEV = - cast<SCEVAddRecExpr>(SE->getSCEV(IncrementedIndvar)); - // It is unsafe to use the incremented indvar if it has a wrapping flag, we - // don't want to compare against a poison value. Check the SCEV that - // corresponds to the incremented indvar, the SCEVExpander will only insert - // flags in the IR if the SCEV originally had wrapping flags. - // FIXME: In theory, SCEV could drop flags even though they exist in IR. - // A more robust solution would involve getting a new expression for - // CmpIndVar by applying non-NSW/NUW AddExprs. - auto WrappingFlags = - ScalarEvolution::setFlags(SCEV::FlagNUW, SCEV::FlagNSW); - const SCEV *IVInit = IncrementedIndvarSCEV->getStart(); - if (SE->getTypeSizeInBits(IVInit->getType()) > - SE->getTypeSizeInBits(IVCount->getType())) - IVInit = SE->getTruncateExpr(IVInit, IVCount->getType()); - unsigned BitWidth = SE->getTypeSizeInBits(IVCount->getType()); - Type *WideTy = IntegerType::get(SE->getContext(), BitWidth + 1); - // Check if InitIV + BECount+1 requires sign/zero extension. - // If not, clear the corresponding flag from WrappingFlags because it is not - // necessary for those flags in the IncrementedIndvarSCEV expression. 
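// Concretely, for the IVCount addition introduced above: with an i8 induction
// variable and BackedgeTakenCount == 255, IVCount wraps to 0, yet the exit
// compare still fires on the correct iteration because both sides are
// evaluated in the same 8-bit type.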
- if (SE->getSignExtendExpr(SE->getAddExpr(IVInit, BackedgeTakenCount), - WideTy) == - SE->getAddExpr(SE->getSignExtendExpr(IVInit, WideTy), - SE->getSignExtendExpr(BackedgeTakenCount, WideTy))) - WrappingFlags = ScalarEvolution::clearFlags(WrappingFlags, SCEV::FlagNSW); - if (SE->getZeroExtendExpr(SE->getAddExpr(IVInit, BackedgeTakenCount), - WideTy) == - SE->getAddExpr(SE->getZeroExtendExpr(IVInit, WideTy), - SE->getZeroExtendExpr(BackedgeTakenCount, WideTy))) - WrappingFlags = ScalarEvolution::clearFlags(WrappingFlags, SCEV::FlagNUW); - if (!ScalarEvolution::maskFlags(IncrementedIndvarSCEV->getNoWrapFlags(), - WrappingFlags)) { - // Add one to the "backedge-taken" count to get the trip count. - // This addition may overflow, which is valid as long as the comparison is - // truncated to BackedgeTakenCount->getType(). - IVCount = - SE->getAddExpr(BackedgeTakenCount, - SE->getConstant(BackedgeTakenCount->getType(), 1)); - CmpIndVar = IncrementedIndvar; - } + CmpIndVar = IndVar->getIncomingValueForBlock(L->getExitingBlock()); } Value *ExitCnt = genLoopLimit(IndVar, IVCount, L, Rewriter, SE); @@ -1932,12 +1892,11 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); SE = &getAnalysis<ScalarEvolution>(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : nullptr; auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); TLI = TLIP ? &TLIP->getTLI() : nullptr; auto *TTIP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>(); TTI = TTIP ? &TTIP->getTTI(*L->getHeader()->getParent()) : nullptr; + const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); DeadInsts.clear(); Changed = false; @@ -1949,7 +1908,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L); // Create a rewriter object which we'll use to transform the code with. - SCEVExpander Rewriter(*SE, "indvars"); + SCEVExpander Rewriter(*SE, DL, "indvars"); #ifndef NDEBUG Rewriter.setDebugType(DEBUG_TYPE); #endif @@ -1978,7 +1937,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // If we have a trip count expression, rewrite the loop's exit condition // using it. We can currently only handle loops with a single exit. if (canExpandBackedgeTakenCount(L, SE) && needsLFTR(L, DT)) { - PHINode *IndVar = FindLoopCounter(L, BackedgeTakenCount, SE, DT, DL); + PHINode *IndVar = FindLoopCounter(L, BackedgeTakenCount, SE, DT); if (IndVar) { // Check preconditions for proper SCEVExpander operation. SCEV does not // express SCEVExpander's dependencies, such as LoopSimplify. 
Instead any diff --git a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp index 8559e63..cbdacad 100644 --- a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp +++ b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp @@ -42,7 +42,6 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/Optional.h" - #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" @@ -51,27 +50,23 @@ #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ValueTracking.h" - #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" -#include "llvm/IR/Instructions.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ValueHandle.h" #include "llvm/IR/Verifier.h" - +#include "llvm/Pass.h" #include "llvm/Support/Debug.h" - +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SimplifyIndVar.h" #include "llvm/Transforms/Utils/UnrollLoop.h" - -#include "llvm/Pass.h" - #include <array> using namespace llvm; @@ -82,6 +77,9 @@ static cl::opt<unsigned> LoopSizeCutoff("irce-loop-size-cutoff", cl::Hidden, static cl::opt<bool> PrintChangedLoops("irce-print-changed-loops", cl::Hidden, cl::init(false)); +static cl::opt<bool> PrintRangeChecks("irce-print-range-checks", cl::Hidden, + cl::init(false)); + static cl::opt<int> MaxExitProbReciprocal("irce-max-exit-prob-reciprocal", cl::Hidden, cl::init(10)); @@ -96,23 +94,41 @@ namespace { /// /// and /// -/// 2. a condition that is provably true for some range of values taken by the -/// containing loop's induction variable. -/// -/// Currently all inductive range checks are branches conditional on an -/// expression of the form +/// 2. a condition that is provably true for some contiguous range of values +/// taken by the containing loop's induction variable. /// -/// 0 <= (Offset + Scale * I) < Length -/// -/// where `I' is the canonical induction variable of a loop to which Offset and -/// Scale are loop invariant, and Length is >= 0. Currently the 'false' branch -/// is considered cold, looking at profiling data to verify that is a TODO. - class InductiveRangeCheck { + // Classifies a range check + enum RangeCheckKind : unsigned { + // Range check of the form "0 <= I". + RANGE_CHECK_LOWER = 1, + + // Range check of the form "I < L" where L is known positive. + RANGE_CHECK_UPPER = 2, + + // The logical and of the RANGE_CHECK_LOWER and RANGE_CHECK_UPPER + // conditions. + RANGE_CHECK_BOTH = RANGE_CHECK_LOWER | RANGE_CHECK_UPPER, + + // Unrecognized range check condition. 
+ RANGE_CHECK_UNKNOWN = (unsigned)-1 + }; + + static const char *rangeCheckKindToStr(RangeCheckKind); + const SCEV *Offset; const SCEV *Scale; Value *Length; BranchInst *Branch; + RangeCheckKind Kind; + + static RangeCheckKind parseRangeCheckICmp(Loop *L, ICmpInst *ICI, + ScalarEvolution &SE, Value *&Index, + Value *&Length); + + static InductiveRangeCheck::RangeCheckKind + parseRangeCheck(Loop *L, ScalarEvolution &SE, Value *Condition, + const SCEV *&Index, Value *&UpperLimit); InductiveRangeCheck() : Offset(nullptr), Scale(nullptr), Length(nullptr), Branch(nullptr) { } @@ -124,13 +140,17 @@ public: void print(raw_ostream &OS) const { OS << "InductiveRangeCheck:\n"; + OS << " Kind: " << rangeCheckKindToStr(Kind) << "\n"; OS << " Offset: "; Offset->print(OS); OS << " Scale: "; Scale->print(OS); OS << " Length: "; - Length->print(OS); - OS << " Branch: "; + if (Length) + Length->print(OS); + else + OS << "(null)"; + OS << "\n Branch: "; getBranch()->print(OS); OS << "\n"; } @@ -207,160 +227,156 @@ char InductiveRangeCheckElimination::ID = 0; INITIALIZE_PASS(InductiveRangeCheckElimination, "irce", "Inductive range check elimination", false, false) -static bool IsLowerBoundCheck(Value *Check, Value *&IndexV) { - using namespace llvm::PatternMatch; +const char *InductiveRangeCheck::rangeCheckKindToStr( + InductiveRangeCheck::RangeCheckKind RCK) { + switch (RCK) { + case InductiveRangeCheck::RANGE_CHECK_UNKNOWN: + return "RANGE_CHECK_UNKNOWN"; - ICmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE; - Value *LHS = nullptr, *RHS = nullptr; + case InductiveRangeCheck::RANGE_CHECK_UPPER: + return "RANGE_CHECK_UPPER"; - if (!match(Check, m_ICmp(Pred, m_Value(LHS), m_Value(RHS)))) - return false; + case InductiveRangeCheck::RANGE_CHECK_LOWER: + return "RANGE_CHECK_LOWER"; + + case InductiveRangeCheck::RANGE_CHECK_BOTH: + return "RANGE_CHECK_BOTH"; + } + + llvm_unreachable("unknown range check type!"); +} + +/// Parse a single ICmp instruction, `ICI`, into a range check. If `ICI` +/// cannot +/// be interpreted as a range check, return `RANGE_CHECK_UNKNOWN` and set +/// `Index` and `Length` to `nullptr`. Otherwise set `Index` to the value +/// being +/// range checked, and set `Length` to the upper limit `Index` is being range +/// checked with if (and only if) the range check type is stronger or equal to +/// RANGE_CHECK_UPPER. 
+/// +InductiveRangeCheck::RangeCheckKind +InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI, + ScalarEvolution &SE, Value *&Index, + Value *&Length) { + + auto IsNonNegativeAndNotLoopVarying = [&SE, L](Value *V) { + const SCEV *S = SE.getSCEV(V); + if (isa<SCEVCouldNotCompute>(S)) + return false; + + return SE.getLoopDisposition(S, L) == ScalarEvolution::LoopInvariant && + SE.isKnownNonNegative(S); + }; + + using namespace llvm::PatternMatch; + + ICmpInst::Predicate Pred = ICI->getPredicate(); + Value *LHS = ICI->getOperand(0); + Value *RHS = ICI->getOperand(1); switch (Pred) { default: - return false; + return RANGE_CHECK_UNKNOWN; case ICmpInst::ICMP_SLE: std::swap(LHS, RHS); // fallthrough case ICmpInst::ICMP_SGE: - if (!match(RHS, m_ConstantInt<0>())) - return false; - IndexV = LHS; - return true; + if (match(RHS, m_ConstantInt<0>())) { + Index = LHS; + return RANGE_CHECK_LOWER; + } + return RANGE_CHECK_UNKNOWN; case ICmpInst::ICMP_SLT: std::swap(LHS, RHS); // fallthrough case ICmpInst::ICMP_SGT: - if (!match(RHS, m_ConstantInt<-1>())) - return false; - IndexV = LHS; - return true; - } -} - -static bool IsUpperBoundCheck(Value *Check, Value *Index, Value *&UpperLimit) { - using namespace llvm::PatternMatch; - - ICmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE; - Value *LHS = nullptr, *RHS = nullptr; - - if (!match(Check, m_ICmp(Pred, m_Value(LHS), m_Value(RHS)))) - return false; + if (match(RHS, m_ConstantInt<-1>())) { + Index = LHS; + return RANGE_CHECK_LOWER; + } - switch (Pred) { - default: - return false; + if (IsNonNegativeAndNotLoopVarying(LHS)) { + Index = RHS; + Length = LHS; + return RANGE_CHECK_UPPER; + } + return RANGE_CHECK_UNKNOWN; - case ICmpInst::ICMP_SGT: + case ICmpInst::ICMP_ULT: std::swap(LHS, RHS); // fallthrough - case ICmpInst::ICMP_SLT: - if (LHS != Index) - return false; - UpperLimit = RHS; - return true; - case ICmpInst::ICMP_UGT: - std::swap(LHS, RHS); - // fallthrough - case ICmpInst::ICMP_ULT: - if (LHS != Index) - return false; - UpperLimit = RHS; - return true; + if (IsNonNegativeAndNotLoopVarying(LHS)) { + Index = RHS; + Length = LHS; + return RANGE_CHECK_BOTH; + } + return RANGE_CHECK_UNKNOWN; } + + llvm_unreachable("default clause returns!"); } -/// Split a condition into something semantically equivalent to (0 <= I < -/// Limit), both comparisons signed and Len loop invariant on L and positive. -/// On success, return true and set Index to I and UpperLimit to Limit. Return -/// false on failure (we may still write to UpperLimit and Index on failure). -/// It does not try to interpret I as a loop index. -/// -static bool SplitRangeCheckCondition(Loop *L, ScalarEvolution &SE, +/// Parses an arbitrary condition into a range check. `Length` is set only if +/// the range check is recognized to be `RANGE_CHECK_UPPER` or stronger. +InductiveRangeCheck::RangeCheckKind +InductiveRangeCheck::parseRangeCheck(Loop *L, ScalarEvolution &SE, Value *Condition, const SCEV *&Index, - Value *&UpperLimit) { - - // TODO: currently this catches some silly cases like comparing "%idx slt 1". - // Our transformations are still correct, but less likely to be profitable in - // those cases. We have to come up with some heuristics that pick out the - // range checks that are more profitable to clone a loop for. This function - // in general can be made more robust. 
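// By way of example, the guard shape parseRangeCheck recognizes (hypothetical
// IR):
//   %nonneg = icmp sge i32 %idx, 0     ; RANGE_CHECK_LOWER
//   %inb    = icmp slt i32 %idx, %len  ; RANGE_CHECK_UPPER, provided %len is
//                                      ; loop-invariant and known non-negative
//   %guard  = and i1 %nonneg, %inb     ; parsed as RANGE_CHECK_BOTH
// A single unsigned compare, "icmp ult i32 %idx, %len", also yields
// RANGE_CHECK_BOTH, since an unsigned upper bound implies 0 <= %idx.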
- + Value *&Length) { using namespace llvm::PatternMatch; Value *A = nullptr; Value *B = nullptr; - ICmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE; - - // In these early checks we assume that the matched UpperLimit is positive. - // We'll verify that fact later, before returning true. if (match(Condition, m_And(m_Value(A), m_Value(B)))) { - Value *IndexV = nullptr; - Value *ExpectedUpperBoundCheck = nullptr; + Value *IndexA = nullptr, *IndexB = nullptr; + Value *LengthA = nullptr, *LengthB = nullptr; + ICmpInst *ICmpA = dyn_cast<ICmpInst>(A), *ICmpB = dyn_cast<ICmpInst>(B); - if (IsLowerBoundCheck(A, IndexV)) - ExpectedUpperBoundCheck = B; - else if (IsLowerBoundCheck(B, IndexV)) - ExpectedUpperBoundCheck = A; - else - return false; + if (!ICmpA || !ICmpB) + return InductiveRangeCheck::RANGE_CHECK_UNKNOWN; - if (!IsUpperBoundCheck(ExpectedUpperBoundCheck, IndexV, UpperLimit)) - return false; + auto RCKindA = parseRangeCheckICmp(L, ICmpA, SE, IndexA, LengthA); + auto RCKindB = parseRangeCheckICmp(L, ICmpB, SE, IndexB, LengthB); - Index = SE.getSCEV(IndexV); + if (RCKindA == InductiveRangeCheck::RANGE_CHECK_UNKNOWN || + RCKindB == InductiveRangeCheck::RANGE_CHECK_UNKNOWN) + return InductiveRangeCheck::RANGE_CHECK_UNKNOWN; - if (isa<SCEVCouldNotCompute>(Index)) - return false; + if (IndexA != IndexB) + return InductiveRangeCheck::RANGE_CHECK_UNKNOWN; - } else if (match(Condition, m_ICmp(Pred, m_Value(A), m_Value(B)))) { - switch (Pred) { - default: - return false; + if (LengthA != nullptr && LengthB != nullptr && LengthA != LengthB) + return InductiveRangeCheck::RANGE_CHECK_UNKNOWN; - case ICmpInst::ICMP_SGT: - std::swap(A, B); - // fall through - case ICmpInst::ICMP_SLT: - UpperLimit = B; - Index = SE.getSCEV(A); - if (isa<SCEVCouldNotCompute>(Index) || !SE.isKnownNonNegative(Index)) - return false; - break; + Index = SE.getSCEV(IndexA); + if (isa<SCEVCouldNotCompute>(Index)) + return InductiveRangeCheck::RANGE_CHECK_UNKNOWN; - case ICmpInst::ICMP_UGT: - std::swap(A, B); - // fall through - case ICmpInst::ICMP_ULT: - UpperLimit = B; - Index = SE.getSCEV(A); - if (isa<SCEVCouldNotCompute>(Index)) - return false; - break; - } - } else { - return false; + Length = LengthA == nullptr ? 
LengthB : LengthA; + + return (InductiveRangeCheck::RangeCheckKind)(RCKindA | RCKindB); } - const SCEV *UpperLimitSCEV = SE.getSCEV(UpperLimit); - if (isa<SCEVCouldNotCompute>(UpperLimitSCEV) || - !SE.isKnownNonNegative(UpperLimitSCEV)) - return false; + if (ICmpInst *ICI = dyn_cast<ICmpInst>(Condition)) { + Value *IndexVal = nullptr; - if (SE.getLoopDisposition(UpperLimitSCEV, L) != - ScalarEvolution::LoopInvariant) { - DEBUG(dbgs() << " in function: " << L->getHeader()->getParent()->getName() - << " "; - dbgs() << " UpperLimit is not loop invariant: " - << UpperLimit->getName() << "\n";); - return false; + auto RCKind = parseRangeCheckICmp(L, ICI, SE, IndexVal, Length); + + if (RCKind == InductiveRangeCheck::RANGE_CHECK_UNKNOWN) + return InductiveRangeCheck::RANGE_CHECK_UNKNOWN; + + Index = SE.getSCEV(IndexVal); + if (isa<SCEVCouldNotCompute>(Index)) + return InductiveRangeCheck::RANGE_CHECK_UNKNOWN; + + return RCKind; } - return true; + return InductiveRangeCheck::RANGE_CHECK_UNKNOWN; } @@ -380,10 +396,15 @@ InductiveRangeCheck::create(InductiveRangeCheck::AllocatorTy &A, BranchInst *BI, Value *Length = nullptr; const SCEV *IndexSCEV = nullptr; - if (!SplitRangeCheckCondition(L, SE, BI->getCondition(), IndexSCEV, Length)) + auto RCKind = InductiveRangeCheck::parseRangeCheck(L, SE, BI->getCondition(), + IndexSCEV, Length); + + if (RCKind == InductiveRangeCheck::RANGE_CHECK_UNKNOWN) return nullptr; - assert(IndexSCEV && Length && "contract with SplitRangeCheckCondition!"); + assert(IndexSCEV && "contract with SplitRangeCheckCondition!"); + assert((!(RCKind & InductiveRangeCheck::RANGE_CHECK_UPPER) || Length) && + "contract with SplitRangeCheckCondition!"); const SCEVAddRecExpr *IndexAddRec = dyn_cast<SCEVAddRecExpr>(IndexSCEV); bool IsAffineIndex = @@ -397,6 +418,7 @@ InductiveRangeCheck::create(InductiveRangeCheck::AllocatorTy &A, BranchInst *BI, IRC->Offset = IndexAddRec->getStart(); IRC->Scale = IndexAddRec->getStepRecurrence(SE); IRC->Branch = BI; + IRC->Kind = RCKind; return IRC; } @@ -685,30 +707,40 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP } } - auto IsInductionVar = [&SE](const SCEVAddRecExpr *AR, bool &IsIncreasing) { - if (!AR->isAffine()) - return false; + auto HasNoSignedWrap = [&](const SCEVAddRecExpr *AR) { + if (AR->getNoWrapFlags(SCEV::FlagNSW)) + return true; IntegerType *Ty = cast<IntegerType>(AR->getType()); IntegerType *WideTy = IntegerType::get(Ty->getContext(), Ty->getBitWidth() * 2); - // Currently we only work with induction variables that have been proved to - // not wrap. This restriction can potentially be lifted in the future. - const SCEVAddRecExpr *ExtendAfterOp = dyn_cast<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy)); - if (!ExtendAfterOp) - return false; + if (ExtendAfterOp) { + const SCEV *ExtendedStart = SE.getSignExtendExpr(AR->getStart(), WideTy); + const SCEV *ExtendedStep = + SE.getSignExtendExpr(AR->getStepRecurrence(SE), WideTy); - const SCEV *ExtendedStart = SE.getSignExtendExpr(AR->getStart(), WideTy); - const SCEV *ExtendedStep = - SE.getSignExtendExpr(AR->getStepRecurrence(SE), WideTy); + bool NoSignedWrap = ExtendAfterOp->getStart() == ExtendedStart && + ExtendAfterOp->getStepRecurrence(SE) == ExtendedStep; + + if (NoSignedWrap) + return true; + } + + // We may have proved this when computing the sign extension above. 
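// (This is the usual SCEV no-wrap argument: if sext(AR) is itself an add
// recurrence whose start and step equal sext(Start) and sext(Step), the narrow
// recurrence cannot have signed-wrapped. Computing that sign extension may
// also have tagged AR itself with FlagNSW, which the re-query below observes.)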
+ return AR->getNoWrapFlags(SCEV::FlagNSW) != SCEV::FlagAnyWrap; + }; + + auto IsInductionVar = [&](const SCEVAddRecExpr *AR, bool &IsIncreasing) { + if (!AR->isAffine()) + return false; - bool NoSignedWrap = ExtendAfterOp->getStart() == ExtendedStart && - ExtendAfterOp->getStepRecurrence(SE) == ExtendedStep; + // Currently we only work with induction variables that have been proved to + // not wrap. This restriction can potentially be lifted in the future. - if (!NoSignedWrap) + if (!HasNoSignedWrap(AR)) return false; if (const SCEVConstant *StepExpr = @@ -791,9 +823,10 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP "loop variant exit count doesn't make sense!"); assert(!L.contains(LatchExit) && "expected an exit block!"); - - Value *IndVarStartV = SCEVExpander(SE, "irce").expandCodeFor( - IndVarStart, IndVarTy, &*Preheader->rbegin()); + const DataLayout &DL = Preheader->getModule()->getDataLayout(); + Value *IndVarStartV = + SCEVExpander(SE, DL, "irce") + .expandCodeFor(IndVarStart, IndVarTy, &*Preheader->rbegin()); IndVarStartV->setName("indvar.start"); LoopStructure Result; @@ -831,12 +864,35 @@ LoopConstrainer::calculateSubRanges() const { const SCEV *End = SE.getSCEV(MainLoopStructure.LoopExitAt); bool Increasing = MainLoopStructure.IndVarIncreasing; + // We compute `Smallest` and `Greatest` such that [Smallest, Greatest) is the // range of values the induction variable takes. - const SCEV *Smallest = - Increasing ? Start : SE.getAddExpr(End, SE.getSCEV(One)); - const SCEV *Greatest = - Increasing ? End : SE.getAddExpr(Start, SE.getSCEV(One)); + + const SCEV *Smallest = nullptr, *Greatest = nullptr; + + if (Increasing) { + Smallest = Start; + Greatest = End; + } else { + // These two computations may sign-overflow. Here is why that is okay: + // + // We know that the induction variable does not sign-overflow on any + // iteration except the last one, and it starts at `Start` and ends at + // `End`, decrementing by one every time. + // + // * if `Smallest` sign-overflows we know `End` is `INT_SMAX`. Since the + // induction variable is decreasing, we know that the smallest value + // the loop body is actually executed with is `INT_SMIN` == `Smallest`. + // + // * if `Greatest` sign-overflows, we know it can only be `INT_SMIN`. In + // that case, `Clamp` will always return `Smallest` and + // [`Result.LowLimit`, `Result.HighLimit`) = [`Smallest`, `Smallest`) + // will be an empty range. Returning an empty range is always safe. + // + + Smallest = SE.getAddExpr(End, SE.getSCEV(One)); + Greatest = SE.getAddExpr(Start, SE.getSCEV(One)); + } auto Clamp = [this, Smallest, Greatest](const SCEV *S) { return SE.getSMaxExpr(Smallest, SE.getSMinExpr(Greatest, S)); @@ -1132,7 +1188,7 @@ bool LoopConstrainer::run() { IntegerType *IVTy = cast<IntegerType>(MainLoopStructure.IndVarNext->getType()); - SCEVExpander Expander(SE, "irce"); + SCEVExpander Expander(SE, F.getParent()->getDataLayout(), "irce"); Instruction *InsertPt = OriginalPreheader->getTerminator(); // It would have been better to make `PreLoop' and `PostLoop' @@ -1293,8 +1349,19 @@ InductiveRangeCheck::computeSafeIterationSpace(ScalarEvolution &SE, const SCEV *M = SE.getMinusSCEV(C, A); const SCEV *Begin = SE.getNegativeSCEV(M); - const SCEV *End = SE.getMinusSCEV(SE.getSCEV(getLength()), M); + const SCEV *UpperLimit = nullptr; + + // We strengthen "0 <= I" to "0 <= I < INT_SMAX" and "I < L" to "0 <= I < L". + // We can potentially do much better here.
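+ // For example, a lower-bound-only check like "if (0 <= i) use(a[i]);" has + // no explicit length, so getLength() below is null and we conservatively + // take the signed maximum of the index type as the upper limit.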
+ if (Value *V = getLength()) { + UpperLimit = SE.getSCEV(V); + } else { + assert(Kind == InductiveRangeCheck::RANGE_CHECK_LOWER && "invariant!"); + unsigned BitWidth = cast<IntegerType>(IndVar->getType())->getBitWidth(); + UpperLimit = SE.getConstant(APInt::getSignedMaxValue(BitWidth)); + } + const SCEV *End = SE.getMinusSCEV(UpperLimit, M); return InductiveRangeCheck::Range(Begin, End); } @@ -1344,12 +1411,18 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) { if (RangeChecks.empty()) return false; - DEBUG(dbgs() << "irce: looking at loop "; L->print(dbgs()); - dbgs() << "irce: loop has " << RangeChecks.size() - << " inductive range checks: \n"; - for (InductiveRangeCheck *IRC : RangeChecks) - IRC->print(dbgs()); - ); + auto PrintRecognizedRangeChecks = [&](raw_ostream &OS) { + OS << "irce: looking at loop "; L->print(OS); + OS << "irce: loop has " << RangeChecks.size() + << " inductive range checks: \n"; + for (InductiveRangeCheck *IRC : RangeChecks) + IRC->print(OS); + }; + + DEBUG(PrintRecognizedRangeChecks(dbgs())); + + if (PrintRangeChecks) + PrintRecognizedRangeChecks(errs()); const char *FailureReason = nullptr; Optional<LoopStructure> MaybeLoopStructure = diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index 8b54abd..83ac915 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -23,6 +23,7 @@ #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LazyValueInfo.h" #include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" @@ -32,7 +33,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" @@ -78,7 +78,6 @@ namespace { /// revectored to the false side of the second if. /// class JumpThreading : public FunctionPass { - const DataLayout *DL; TargetLibraryInfo *TLI; LazyValueInfo *LVI; #ifdef NDEBUG @@ -159,8 +158,6 @@ bool JumpThreading::runOnFunction(Function &F) { return false; DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n"); - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : nullptr; TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); LVI = &getAnalysis<LazyValueInfo>(); @@ -505,6 +502,7 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, assert(Preference == WantInteger && "Compares only produce integers"); PHINode *PN = dyn_cast<PHINode>(Cmp->getOperand(0)); if (PN && PN->getParent() == BB) { + const DataLayout &DL = PN->getModule()->getDataLayout(); // We can do this simplification if any comparisons fold to true or false. // See if any do. for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { @@ -709,7 +707,8 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { // Run constant folding to see if we can reduce the condition to a simple // constant. 
if (Instruction *I = dyn_cast<Instruction>(Condition)) { - Value *SimpleVal = ConstantFoldInstruction(I, DL, TLI); + Value *SimpleVal = + ConstantFoldInstruction(I, BB->getModule()->getDataLayout(), TLI); if (SimpleVal) { I->replaceAllUsesWith(SimpleVal); I->eraseFromParent(); @@ -1521,7 +1520,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, // At this point, the IR is fully up to date and consistent. Do a quick scan // over the new instructions and zap any that are constants or dead. This // frequently happens because of phi translation. - SimplifyInstructionsInBlock(NewBB, DL, TLI); + SimplifyInstructionsInBlock(NewBB, TLI); // Threaded an edge! ++NumThreads; @@ -1586,7 +1585,6 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, BasicBlock::iterator BI = BB->begin(); for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI) ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB); - // Clone the non-phi instructions of BB into PredBB, keeping track of the // mapping and using it to remap operands in the cloned instructions. for (; BI != BB->end(); ++BI) { @@ -1603,7 +1601,8 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, // If this instruction can be simplified after the operands are updated, // just use the simplified value instead. This frequently happens due to // phi translation. - if (Value *IV = SimplifyInstruction(New, DL)) { + if (Value *IV = + SimplifyInstruction(New, BB->getModule()->getDataLayout())) { delete New; ValueMapping[BI] = IV; } else { diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index 14af38b..1333b02 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -38,6 +38,7 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" @@ -52,7 +53,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" @@ -76,21 +76,21 @@ static bool isNotUsedInLoop(Instruction &I, Loop *CurLoop); static bool hoist(Instruction &I, BasicBlock *Preheader); static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, Loop *CurLoop, AliasSetTracker *CurAST ); -static bool isGuaranteedToExecute(Instruction &Inst, DominatorTree *DT, - Loop *CurLoop, LICMSafetyInfo * SafetyInfo); -static bool isSafeToExecuteUnconditionally(Instruction &Inst,DominatorTree *DT, - const DataLayout *DL, Loop *CurLoop, - LICMSafetyInfo * SafetyInfo); +static bool isGuaranteedToExecute(Instruction &Inst, DominatorTree *DT, + Loop *CurLoop, LICMSafetyInfo *SafetyInfo); +static bool isSafeToExecuteUnconditionally(Instruction &Inst, DominatorTree *DT, + Loop *CurLoop, + LICMSafetyInfo *SafetyInfo); static bool pointerInvalidatedByLoop(Value *V, uint64_t Size, const AAMDNodes &AAInfo, AliasSetTracker *CurAST); static Instruction *CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN, LoopInfo *LI); -static bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, - DominatorTree *DT, const DataLayout *DL, - Loop *CurLoop, AliasSetTracker *CurAST, - LICMSafetyInfo * SafetyInfo); +static bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, + DominatorTree *DT, Loop 
*CurLoop, + AliasSetTracker *CurAST, + LICMSafetyInfo *SafetyInfo); namespace { struct LICM : public LoopPass { @@ -130,7 +130,6 @@ namespace { LoopInfo *LI; // Current LoopInfo DominatorTree *DT; // Dominator Tree for the current Loop. - const DataLayout *DL; // DataLayout for constant folding. TargetLibraryInfo *TLI; // TargetLibraryInfo for constant folding. // State that is updated as we process loops. @@ -181,8 +180,6 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { AA = &getAnalysis<AliasAnalysis>(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : nullptr; TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form."); @@ -235,10 +232,10 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { // instructions, we perform another pass to hoist them out of the loop. // if (L->hasDedicatedExits()) - Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, DL, TLI, - CurLoop, CurAST, &SafetyInfo); + Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, CurLoop, + CurAST, &SafetyInfo); if (Preheader) - Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, DL, TLI, + Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, CurLoop, CurAST, &SafetyInfo); // Now that all loop invariants have been removed from the loop, promote any @@ -291,10 +288,9 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { /// first order w.r.t the DominatorTree. This allows us to visit uses before /// definitions, allowing us to sink a loop body in one pass without iteration. /// -bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, - DominatorTree *DT, const DataLayout *DL, - TargetLibraryInfo *TLI, Loop *CurLoop, - AliasSetTracker *CurAST, LICMSafetyInfo * SafetyInfo) { +bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, + DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop, + AliasSetTracker *CurAST, LICMSafetyInfo *SafetyInfo) { // Verify inputs. assert(N != nullptr && AA != nullptr && LI != nullptr && @@ -311,8 +307,8 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, // We are processing blocks in reverse dfo, so process children first. const std::vector<DomTreeNode*> &Children = N->getChildren(); for (unsigned i = 0, e = Children.size(); i != e; ++i) - Changed |= sinkRegion(Children[i], AA, LI, DT, DL, TLI, CurLoop, - CurAST, SafetyInfo); + Changed |= + sinkRegion(Children[i], AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo); // Only need to process the contents of this block if it is not part of a // subloop (which would already have been processed). if (inSubLoop(BB,CurLoop,LI)) return Changed; @@ -336,8 +332,8 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, // outside of the loop. In this case, it doesn't even matter if the // operands of the instruction are loop invariant. // - if (isNotUsedInLoop(I, CurLoop) && - canSinkOrHoistInst(I, AA, DT, DL, CurLoop, CurAST, SafetyInfo)) { + if (isNotUsedInLoop(I, CurLoop) && + canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo)) { ++II; Changed |= sink(I, LI, DT, CurLoop, CurAST); } @@ -350,10 +346,9 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, /// order w.r.t the DominatorTree. This allows us to visit definitions before /// uses, allowing us to hoist a loop body in one pass without iteration. 
/// -bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, - DominatorTree *DT, const DataLayout *DL, - TargetLibraryInfo *TLI, Loop *CurLoop, - AliasSetTracker *CurAST, LICMSafetyInfo *SafetyInfo) { +bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, + DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop, + AliasSetTracker *CurAST, LICMSafetyInfo *SafetyInfo) { // Verify inputs. assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr && CurLoop != nullptr && CurAST != nullptr && @@ -372,7 +367,8 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, // Try constant folding this instruction. If all the operands are // constants, it is technically hoistable, but it would be better to just // fold it. - if (Constant *C = ConstantFoldInstruction(&I, DL, TLI)) { + if (Constant *C = ConstantFoldInstruction( + &I, I.getModule()->getDataLayout(), TLI)) { DEBUG(dbgs() << "LICM folding inst: " << I << " --> " << *C << '\n'); CurAST->copyValue(&I, C); CurAST->deleteValue(&I); @@ -385,16 +381,16 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, // if all of the operands of the instruction are loop invariant and if it // is safe to hoist the instruction. // - if (CurLoop->hasLoopInvariantOperands(&I) && - canSinkOrHoistInst(I, AA, DT, DL, CurLoop, CurAST, SafetyInfo) && - isSafeToExecuteUnconditionally(I, DT, DL, CurLoop, SafetyInfo)) + if (CurLoop->hasLoopInvariantOperands(&I) && + canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo) && + isSafeToExecuteUnconditionally(I, DT, CurLoop, SafetyInfo)) Changed |= hoist(I, CurLoop->getLoopPreheader()); } const std::vector<DomTreeNode*> &Children = N->getChildren(); for (unsigned i = 0, e = Children.size(); i != e; ++i) - Changed |= hoistRegion(Children[i], AA, LI, DT, DL, TLI, CurLoop, - CurAST, SafetyInfo); + Changed |= + hoistRegion(Children[i], AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo); return Changed; } @@ -424,10 +420,9 @@ void llvm::computeLICMSafetyInfo(LICMSafetyInfo * SafetyInfo, Loop * CurLoop) { /// canSinkOrHoistInst - Return true if the hoister and sinker can handle this /// instruction. /// -bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, - DominatorTree *DT, const DataLayout *DL, - Loop *CurLoop, AliasSetTracker *CurAST, - LICMSafetyInfo * SafetyInfo) { +bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, DominatorTree *DT, + Loop *CurLoop, AliasSetTracker *CurAST, + LICMSafetyInfo *SafetyInfo) { // Loads have extra constraints we have to verify before we can hoist them. if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { if (!LI->isUnordered()) @@ -487,7 +482,7 @@ bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, !isa<InsertValueInst>(I)) return false; - return isSafeToExecuteUnconditionally(I, DT, DL, CurLoop, SafetyInfo); + return isSafeToExecuteUnconditionally(I, DT, CurLoop, SafetyInfo); } /// Returns true if a PHINode is a trivially replaceable with an @@ -643,10 +638,10 @@ static bool hoist(Instruction &I, BasicBlock *Preheader) { /// or if it is a trapping instruction and is guaranteed to execute. /// static bool isSafeToExecuteUnconditionally(Instruction &Inst, DominatorTree *DT, - const DataLayout *DL, Loop *CurLoop, - LICMSafetyInfo * SafetyInfo) { + Loop *CurLoop, + LICMSafetyInfo *SafetyInfo) { // If it is not a trapping instruction, it is always safe to hoist. 
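// (Roughly: isSafeToSpeculativelyExecute rejects instructions whose speculative execution could fault, e.g. a division whose divisor may be zero or a load from a possibly non-dereferenceable pointer; such instructions may only be hoisted when they are guaranteed to execute in the loop.)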
- if (isSafeToSpeculativelyExecute(&Inst, DL)) + if (isSafeToSpeculativelyExecute(&Inst)) return true; return isGuaranteedToExecute(Inst, DT, CurLoop, SafetyInfo); diff --git a/lib/Transforms/Scalar/LoadCombine.cpp b/lib/Transforms/Scalar/LoadCombine.cpp index 11e4d76..1f33f72 100644 --- a/lib/Transforms/Scalar/LoadCombine.cpp +++ b/lib/Transforms/Scalar/LoadCombine.cpp @@ -12,17 +12,17 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar.h" - #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" #include "llvm/Analysis/TargetFolder.h" -#include "llvm/Pass.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" -#include "llvm/IR/Instructions.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" @@ -52,13 +52,10 @@ struct LoadPOPPair { class LoadCombine : public BasicBlockPass { LLVMContext *C; - const DataLayout *DL; AliasAnalysis *AA; public: - LoadCombine() - : BasicBlockPass(ID), - C(nullptr), DL(nullptr), AA(nullptr) { + LoadCombine() : BasicBlockPass(ID), C(nullptr), AA(nullptr) { initializeSROAPass(*PassRegistry::getPassRegistry()); } @@ -85,12 +82,6 @@ private: bool LoadCombine::doInitialization(Function &F) { DEBUG(dbgs() << "LoadCombine function: " << F.getName() << "\n"); C = &F.getContext(); - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - if (!DLP) { - DEBUG(dbgs() << " Skipping LoadCombine -- no target data!\n"); - return false; - } - DL = &DLP->getDataLayout(); return true; } @@ -100,9 +91,10 @@ PointerOffsetPair LoadCombine::getPointerOffsetPair(LoadInst &LI) { POP.Offset = 0; while (isa<BitCastInst>(POP.Pointer) || isa<GetElementPtrInst>(POP.Pointer)) { if (auto *GEP = dyn_cast<GetElementPtrInst>(POP.Pointer)) { - unsigned BitWidth = DL->getPointerTypeSizeInBits(GEP->getType()); + auto &DL = LI.getModule()->getDataLayout(); + unsigned BitWidth = DL.getPointerTypeSizeInBits(GEP->getType()); APInt Offset(BitWidth, 0); - if (GEP->accumulateConstantOffset(*DL, Offset)) + if (GEP->accumulateConstantOffset(DL, Offset)) POP.Offset += Offset.getZExtValue(); else // Can't handle GEPs with variable indices. @@ -145,7 +137,8 @@ bool LoadCombine::aggregateLoads(SmallVectorImpl<LoadPOPPair> &Loads) { if (PrevOffset == -1ull) { BaseLoad = L.Load; PrevOffset = L.POP.Offset; - PrevSize = DL->getTypeStoreSize(L.Load->getType()); + PrevSize = L.Load->getModule()->getDataLayout().getTypeStoreSize( + L.Load->getType()); AggregateLoads.push_back(L); continue; } @@ -164,7 +157,8 @@ bool LoadCombine::aggregateLoads(SmallVectorImpl<LoadPOPPair> &Loads) { // FIXME: We may want to handle this case. 
continue; PrevOffset = L.POP.Offset; - PrevSize = DL->getTypeStoreSize(L.Load->getType()); + PrevSize = L.Load->getModule()->getDataLayout().getTypeStoreSize( + L.Load->getType()); AggregateLoads.push_back(L); } if (combineLoads(AggregateLoads)) @@ -215,7 +209,8 @@ bool LoadCombine::combineLoads(SmallVectorImpl<LoadPOPPair> &Loads) { for (const auto &L : Loads) { Builder->SetInsertPoint(L.Load); Value *V = Builder->CreateExtractInteger( - *DL, NewLoad, cast<IntegerType>(L.Load->getType()), + L.Load->getModule()->getDataLayout(), NewLoad, + cast<IntegerType>(L.Load->getType()), L.POP.Offset - Loads[0].POP.Offset, "combine.extract"); L.Load->replaceAllUsesWith(V); } @@ -225,13 +220,13 @@ bool LoadCombine::combineLoads(SmallVectorImpl<LoadPOPPair> &Loads) { } bool LoadCombine::runOnBasicBlock(BasicBlock &BB) { - if (skipOptnoneFunction(BB) || !DL) + if (skipOptnoneFunction(BB)) return false; AA = &getAnalysis<AliasAnalysis>(); - IRBuilder<true, TargetFolder> - TheBuilder(BB.getContext(), TargetFolder(DL)); + IRBuilder<true, TargetFolder> TheBuilder( + BB.getContext(), TargetFolder(BB.getModule()->getDataLayout())); Builder = &TheBuilder; DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> LoadMap; diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 243c624..7bc2917 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -47,6 +47,7 @@ #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" @@ -56,7 +57,6 @@ #include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -130,7 +130,6 @@ namespace { class LoopIdiomRecognize : public LoopPass { Loop *CurLoop; - const DataLayout *DL; DominatorTree *DT; ScalarEvolution *SE; TargetLibraryInfo *TLI; @@ -139,7 +138,10 @@ namespace { static char ID; explicit LoopIdiomRecognize() : LoopPass(ID) { initializeLoopIdiomRecognizePass(*PassRegistry::getPassRegistry()); - DL = nullptr; DT = nullptr; SE = nullptr; TLI = nullptr; TTI = nullptr; + DT = nullptr; + SE = nullptr; + TLI = nullptr; + TTI = nullptr; } bool runOnLoop(Loop *L, LPPassManager &LPM) override; @@ -179,14 +181,6 @@ namespace { AU.addRequired<TargetTransformInfoWrapperPass>(); } - const DataLayout *getDataLayout() { - if (DL) - return DL; - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : nullptr; - return DL; - } - DominatorTree *getDominatorTree() { return DT ? DT : (DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree()); @@ -625,10 +619,6 @@ bool LoopIdiomRecognize::runOnCountableLoop() { if (BECst->getValue()->getValue() == 0) return false; - // We require target data for now. - if (!getDataLayout()) - return false; - // set DT (void)getDominatorTree(); @@ -742,7 +732,8 @@ bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) { Value *StorePtr = SI->getPointerOperand(); // Reject stores that are so large that they overflow an unsigned. 
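// (Concretely, the check below rejects stores whose bit size is not a multiple of 8, and stores of 2^32 bits or more, whose byte count would not fit in an unsigned 32-bit value.)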
- uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType()); + auto &DL = CurLoop->getHeader()->getModule()->getDataLayout(); + uint64_t SizeInBits = DL.getTypeSizeInBits(StoredVal->getType()); if ((SizeInBits & 7) || (SizeInBits >> 32) != 0) return false; @@ -917,7 +908,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, // but it can be turned into memset_pattern if the target supports it. Value *SplatValue = isBytewiseValue(StoredVal); Constant *PatternValue = nullptr; - + auto &DL = CurLoop->getHeader()->getModule()->getDataLayout(); unsigned DestAS = DestPtr->getType()->getPointerAddressSpace(); // If we're allowed to form a memset, and the stored value would be acceptable @@ -928,9 +919,8 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, CurLoop->isLoopInvariant(SplatValue)) { // Keep and use SplatValue. PatternValue = nullptr; - } else if (DestAS == 0 && - TLI->has(LibFunc::memset_pattern16) && - (PatternValue = getMemSetPatternValue(StoredVal, *DL))) { + } else if (DestAS == 0 && TLI->has(LibFunc::memset_pattern16) && + (PatternValue = getMemSetPatternValue(StoredVal, DL))) { // Don't create memset_pattern16s with address spaces. // It looks like we can use PatternValue! SplatValue = nullptr; @@ -945,7 +935,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, // header. This allows us to insert code for it in the preheader. BasicBlock *Preheader = CurLoop->getLoopPreheader(); IRBuilder<> Builder(Preheader->getTerminator()); - SCEVExpander Expander(*SE, "loop-idiom"); + SCEVExpander Expander(*SE, DL, "loop-idiom"); Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS); @@ -1005,7 +995,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, // Otherwise we should form a memset_pattern16. PatternValue is known to be // an constant array of 16-bytes. Plop the value into a mergable global. GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true, - GlobalValue::InternalLinkage, + GlobalValue::PrivateLinkage, PatternValue, ".memset_pattern"); GV->setUnnamedAddr(true); // Ok to merge these. GV->setAlignment(16); @@ -1042,7 +1032,8 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, // header. This allows us to insert code for it in the preheader. BasicBlock *Preheader = CurLoop->getLoopPreheader(); IRBuilder<> Builder(Preheader->getTerminator()); - SCEVExpander Expander(*SE, "loop-idiom"); + const DataLayout &DL = Preheader->getModule()->getDataLayout(); + SCEVExpander Expander(*SE, DL, "loop-idiom"); // Okay, we have a strided store "p[i]" of a loaded value. We can turn // this into a memcpy in the loop preheader now if we want. However, this diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp index 6dc600e..e125026 100644 --- a/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -77,8 +77,6 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { getAnalysisIfAvailable<DominatorTreeWrapperPass>(); DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr; LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - const DataLayout *DL = DLP ? 
&DLP->getDataLayout() : nullptr; const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache( @@ -110,6 +108,7 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { WorklistItem Item = VisitStack.pop_back_val(); BasicBlock *BB = Item.getPointer(); bool IsSubloopHeader = Item.getInt(); + const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); // Simplify instructions in the current basic block. for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { diff --git a/lib/Transforms/Scalar/LoopInterchange.cpp b/lib/Transforms/Scalar/LoopInterchange.cpp new file mode 100644 index 0000000..f7626c5 --- /dev/null +++ b/lib/Transforms/Scalar/LoopInterchange.cpp @@ -0,0 +1,1154 @@ +//===- LoopInterchange.cpp - Loop interchange pass------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass handles the loop interchange transform. +// It interchanges loops to provide more cache-friendly memory access +// patterns. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/DependenceAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopIterator.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" +using namespace llvm; + +#define DEBUG_TYPE "loop-interchange" + +namespace { + +typedef SmallVector<Loop *, 8> LoopVector; + +// TODO: Check if we can use a sparse matrix here. +typedef std::vector<std::vector<char>> CharMatrix; + +// Maximum number of dependencies that can be handled in the dependency matrix. +static const unsigned MaxMemInstrCount = 100; + +// Maximum loop depth supported.
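+// (Nests deeper than this are rejected up front in populateDependencyMatrix.)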
+static const unsigned MaxLoopNestDepth = 10; + +struct LoopInterchange; + +#ifdef DUMP_DEP_MATRICES +void printDepMatrix(CharMatrix &DepMatrix) { + for (auto I = DepMatrix.begin(), E = DepMatrix.end(); I != E; ++I) { + std::vector<char> Vec = *I; + for (auto II = Vec.begin(), EE = Vec.end(); II != EE; ++II) + DEBUG(dbgs() << *II << " "); + DEBUG(dbgs() << "\n"); + } +} +#endif + +bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, Loop *L, + DependenceAnalysis *DA) { + typedef SmallVector<Value *, 16> ValueVector; + ValueVector MemInstr; + + if (Level > MaxLoopNestDepth) { + DEBUG(dbgs() << "Cannot handle loops of depth greater than " + << MaxLoopNestDepth << "\n"); + return false; + } + + // For each block. + for (Loop::block_iterator BB = L->block_begin(), BE = L->block_end(); + BB != BE; ++BB) { + // Scan the BB and collect legal loads and stores. + for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); I != E; + ++I) { + Instruction *Ins = dyn_cast<Instruction>(I); + if (!Ins) + return false; + LoadInst *Ld = dyn_cast<LoadInst>(I); + StoreInst *St = dyn_cast<StoreInst>(I); + if (!St && !Ld) + continue; + if (Ld && !Ld->isSimple()) + return false; + if (St && !St->isSimple()) + return false; + MemInstr.push_back(I); + } + } + + DEBUG(dbgs() << "Found " << MemInstr.size() + << " Loads and Stores to analyze\n"); + + ValueVector::iterator I, IE, J, JE; + + for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) { + for (J = I, JE = MemInstr.end(); J != JE; ++J) { + std::vector<char> Dep; + Instruction *Src = dyn_cast<Instruction>(*I); + Instruction *Des = dyn_cast<Instruction>(*J); + if (Src == Des) + continue; + if (isa<LoadInst>(Src) && isa<LoadInst>(Des)) + continue; + if (auto D = DA->depends(Src, Des, true)) { + DEBUG(dbgs() << "Found Dependency between Src=" << *Src << " Des=" << *Des + << "\n"); + if (D->isFlow()) { + // TODO: Handle flow dependence. Check if it is sufficient to populate + // the Dependence Matrix with the direction reversed. + DEBUG(dbgs() << "Flow dependence not handled\n"); + return false; + } + if (D->isAnti()) { + DEBUG(dbgs() << "Found anti dependence\n"); + unsigned Levels = D->getLevels(); + char Direction; + for (unsigned II = 1; II <= Levels; ++II) { + const SCEV *Distance = D->getDistance(II); + const SCEVConstant *SCEVConst = + dyn_cast_or_null<SCEVConstant>(Distance); + if (SCEVConst) { + const ConstantInt *CI = SCEVConst->getValue(); + if (CI->isNegative()) + Direction = '<'; + else if (CI->isZero()) + Direction = '='; + else + Direction = '>'; + Dep.push_back(Direction); + } else if (D->isScalar(II)) { + Direction = 'S'; + Dep.push_back(Direction); + } else { + unsigned Dir = D->getDirection(II); + if (Dir == Dependence::DVEntry::LT || + Dir == Dependence::DVEntry::LE) + Direction = '<'; + else if (Dir == Dependence::DVEntry::GT || + Dir == Dependence::DVEntry::GE) + Direction = '>'; + else if (Dir == Dependence::DVEntry::EQ) + Direction = '='; + else + Direction = '*'; + Dep.push_back(Direction); + } + } + while (Dep.size() != Level) { + Dep.push_back('I'); + } + + DepMatrix.push_back(Dep); + if (DepMatrix.size() > MaxMemInstrCount) { + DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount + << " dependencies inside loop\n"); + return false; + } + } + } + } + } + + // We don't have a DepMatrix to check legality; return false. + if (DepMatrix.size() == 0) + return false; + return true; +} + +// A loop is moved from index 'from' to an index 'to'. Update the Dependence +// matrix by exchanging the two columns.
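+// For example, exchanging columns 0 and 1 turns the row ['<', '='] into +// ['=', '<'].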
+void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx, + unsigned ToIndx) { + unsigned numRows = DepMatrix.size(); + for (unsigned i = 0; i < numRows; ++i) { + char TmpVal = DepMatrix[i][ToIndx]; + DepMatrix[i][ToIndx] = DepMatrix[i][FromIndx]; + DepMatrix[i][FromIndx] = TmpVal; + } +} + +// Checks if the outermost non-'=', non-'S', non-'I' dependence in the given +// row of the dependence matrix is '>'. +bool isOuterMostDepPositive(CharMatrix &DepMatrix, unsigned Row, + unsigned Column) { + for (unsigned i = 0; i <= Column; ++i) { + if (DepMatrix[Row][i] == '<') + return false; + if (DepMatrix[Row][i] == '>') + return true; + } + // All dependencies were '=', 'S', or 'I'. + return false; +} + +// Checks if no dependence exists in Row of the dependence matrix before Column. +bool containsNoDependence(CharMatrix &DepMatrix, unsigned Row, + unsigned Column) { + for (unsigned i = 0; i < Column; ++i) { + if (DepMatrix[Row][i] != '=' && DepMatrix[Row][i] != 'S' && + DepMatrix[Row][i] != 'I') + return false; + } + return true; +} + +bool validDepInterchange(CharMatrix &DepMatrix, unsigned Row, + unsigned OuterLoopId, char InnerDep, char OuterDep) { + + if (isOuterMostDepPositive(DepMatrix, Row, OuterLoopId)) + return false; + + if (InnerDep == OuterDep) + return true; + + // It is legal to interchange if and only if after interchange no row has a + // '>' direction as the leftmost non-'='. + + if (InnerDep == '=' || InnerDep == 'S' || InnerDep == 'I') + return true; + + if (InnerDep == '<') + return true; + + if (InnerDep == '>') { + // If OuterLoopId represents the outermost loop, interchanging would make + // '>' the first dependence direction. + if (OuterLoopId == 0) + return false; + + // If all dependencies before OuterLoopId are '=', 'S', or 'I', then + // interchanging would make '>' this row's outermost non-'=' dependence. + if (!containsNoDependence(DepMatrix, Row, OuterLoopId)) + return true; + } + + return false; +} + +// Checks if it is legal to interchange 2 loops. +// [Theorem] A permutation of the loops in a perfect nest is legal if and only if +// the direction matrix, after the same permutation is applied to its columns, +// has no ">" direction as the leftmost non-"=" direction in any row. +bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, unsigned InnerLoopId, + unsigned OuterLoopId) { + + unsigned NumRows = DepMatrix.size(); + // For each row check if it is valid to interchange. + for (unsigned Row = 0; Row < NumRows; ++Row) { + char InnerDep = DepMatrix[Row][InnerLoopId]; + char OuterDep = DepMatrix[Row][OuterLoopId]; + if (InnerDep == '*' || OuterDep == '*') + return false; + else if (!validDepInterchange(DepMatrix, Row, OuterLoopId, InnerDep, + OuterDep)) + return false; + } + return true; +} + +static void populateWorklist(Loop &L, SmallVector<LoopVector, 8> &V) { + + DEBUG(dbgs() << "populateWorklist called\n"); + LoopVector LoopList; + Loop *CurrentLoop = &L; + std::vector<Loop *> vec = CurrentLoop->getSubLoopsVector(); + while (vec.size() != 0) { + // The current loop has multiple subloops in it, hence it is not tightly + // nested. + // Discard all the loops collected above it.
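+ // e.g. in + // for(i) { for(j) { ... } for(k) { ... } } + // the loop over i has two subloops, so the nest rooted at it is dropped.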
+ if (vec.size() != 1) { + LoopList.clear(); + return; + } + LoopList.push_back(CurrentLoop); + CurrentLoop = *(vec.begin()); + vec = CurrentLoop->getSubLoopsVector(); + } + LoopList.push_back(CurrentLoop); + V.push_back(LoopList); +} + +static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) { + PHINode *InnerIndexVar = L->getCanonicalInductionVariable(); + if (InnerIndexVar) + return InnerIndexVar; + if (L->getLoopLatch() == nullptr || L->getLoopPredecessor() == nullptr) + return nullptr; + for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { + PHINode *PhiVar = cast<PHINode>(I); + Type *PhiTy = PhiVar->getType(); + if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() && + !PhiTy->isPointerTy()) + return nullptr; + const SCEVAddRecExpr *AddRec = + dyn_cast<SCEVAddRecExpr>(SE->getSCEV(PhiVar)); + if (!AddRec || !AddRec->isAffine()) + continue; + const SCEV *Step = AddRec->getStepRecurrence(*SE); + const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); + if (!C) + continue; + // Found the induction variable. + // FIXME: Handle loops with more than one induction variable. Note that, + // currently, legality makes sure we have only one induction variable. + return PhiVar; + } + return nullptr; +} + +/// LoopInterchangeLegality checks if it is legal to interchange the loop. +class LoopInterchangeLegality { +public: + LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE, + LoopInterchange *Pass) + : OuterLoop(Outer), InnerLoop(Inner), SE(SE), CurrentPass(Pass) {} + + /// Check if the loops can be interchanged. + bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId, + CharMatrix &DepMatrix); + /// Check if the loop structure is understood. We do not handle triangular + /// loops for now. + bool isLoopStructureUnderstood(PHINode *InnerInductionVar); + + bool currentLimitations(); + +private: + bool tightlyNested(Loop *Outer, Loop *Inner); + + Loop *OuterLoop; + Loop *InnerLoop; + + /// SCEV analysis. + ScalarEvolution *SE; + LoopInterchange *CurrentPass; +}; + +/// LoopInterchangeProfitability checks if it is profitable to interchange the +/// loop. +class LoopInterchangeProfitability { +public: + LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE) + : OuterLoop(Outer), InnerLoop(Inner), SE(SE) {} + + /// Check if the loop interchange is profitable. + bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId, + CharMatrix &DepMatrix); + +private: + int getInstrOrderCost(); + + Loop *OuterLoop; + Loop *InnerLoop; + + /// SCEV analysis. + ScalarEvolution *SE; +}; + +/// LoopInterchangeTransform interchanges the loops. +class LoopInterchangeTransform { +public: + LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE, + LoopInfo *LI, DominatorTree *DT, + LoopInterchange *Pass, BasicBlock *LoopNestExit) + : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), + LoopExit(LoopNestExit) {} + + /// Interchange OuterLoop and InnerLoop. + bool transform(); + void restructureLoops(Loop *InnerLoop, Loop *OuterLoop); + void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop); + +private: + void splitInnerLoopLatch(Instruction *); + void splitOuterLoopLatch(); + void splitInnerLoopHeader(); + bool adjustLoopLinks(); + void adjustLoopPreheaders(); + void adjustOuterLoopPreheader(); + void adjustInnerLoopPreheader(); + bool adjustLoopBranches(); + + Loop *OuterLoop; + Loop *InnerLoop; + + /// SCEV analysis.
+ ScalarEvolution *SE; + LoopInfo *LI; + DominatorTree *DT; + BasicBlock *LoopExit; +}; + +// Main LoopInterchange Pass +struct LoopInterchange : public FunctionPass { + static char ID; + ScalarEvolution *SE; + LoopInfo *LI; + DependenceAnalysis *DA; + DominatorTree *DT; + LoopInterchange() + : FunctionPass(ID), SE(nullptr), LI(nullptr), DA(nullptr), DT(nullptr) { + initializeLoopInterchangePass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<ScalarEvolution>(); + AU.addRequired<AliasAnalysis>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addRequired<DependenceAnalysis>(); + AU.addRequiredID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + } + + bool runOnFunction(Function &F) override { + SE = &getAnalysis<ScalarEvolution>(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + DA = &getAnalysis<DependenceAnalysis>(); + auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + DT = DTWP ? &DTWP->getDomTree() : nullptr; + // Build up a worklist of loop nests to analyze. + SmallVector<LoopVector, 8> Worklist; + + for (Loop *L : *LI) + populateWorklist(*L, Worklist); + + DEBUG(dbgs() << "Worklist size = " << Worklist.size() << "\n"); + bool Changed = false; + while (!Worklist.empty()) { + LoopVector LoopList = Worklist.pop_back_val(); + Changed |= processLoopList(LoopList); + } + return Changed; + } + + bool isComputableLoopNest(LoopVector LoopList) { + for (auto I = LoopList.begin(), E = LoopList.end(); I != E; ++I) { + Loop *L = *I; + const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L); + if (ExitCountOuter == SE->getCouldNotCompute()) { + DEBUG(dbgs() << "Couldn't compute backedge count\n"); + return false; + } + if (L->getNumBackEdges() != 1) { + DEBUG(dbgs() << "NumBackEdges is not equal to 1\n"); + return false; + } + if (!L->getExitingBlock()) { + DEBUG(dbgs() << "Loop doesn't have a unique exit block\n"); + return false; + } + } + return true; + } + + unsigned selectLoopForInterchange(LoopVector LoopList) { + // TODO: Add a better heuristic to select the loop to be interchanged based + // on the dependence matrix. Currently we select the innermost loop. + return LoopList.size() - 1; + } + + bool processLoopList(LoopVector LoopList) { + bool Changed = false; + bool containsLCSSAPHI = false; + CharMatrix DependencyMatrix; + if (LoopList.size() < 2) { + DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n"); + return false; + } + if (!isComputableLoopNest(LoopList)) { + DEBUG(dbgs() << "Not valid loop candidate for interchange\n"); + return false; + } + Loop *OuterMostLoop = *(LoopList.begin()); + + DEBUG(dbgs() << "Processing LoopList of size = " << LoopList.size() + << "\n"); + + if (!populateDependencyMatrix(DependencyMatrix, LoopList.size(), + OuterMostLoop, DA)) { + DEBUG(dbgs() << "Populating Dependency matrix failed\n"); + return false; + } +#ifdef DUMP_DEP_MATRICES + DEBUG(dbgs() << "Dependence before interchange\n"); + printDepMatrix(DependencyMatrix); +#endif + + BasicBlock *OuterMostLoopLatch = OuterMostLoop->getLoopLatch(); + BranchInst *OuterMostLoopLatchBI = + dyn_cast<BranchInst>(OuterMostLoopLatch->getTerminator()); + if (!OuterMostLoopLatchBI) + return false; + + // Since we currently do not handle LCSSA PHIs, any failure in the loop + // condition will now branch to LoopNestExit. + // TODO: This should be removed once we handle LCSSA PHI nodes. + + // Get the outermost loop exit.
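+ // (The latch terminator branches either back to the loop header or out of + // the nest; the code below picks the non-header successor as the exit.)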
+ BasicBlock *LoopNestExit; + if (OuterMostLoopLatchBI->getSuccessor(0) == OuterMostLoop->getHeader()) + LoopNestExit = OuterMostLoopLatchBI->getSuccessor(1); + else + LoopNestExit = OuterMostLoopLatchBI->getSuccessor(0); + + for (auto I = LoopList.begin(), E = LoopList.end(); I != E; ++I) { + Loop *L = *I; + BasicBlock *Latch = L->getLoopLatch(); + BasicBlock *Header = L->getHeader(); + if (Latch && Latch != Header && isa<PHINode>(Latch->begin())) { + containsLCSSAPHI = true; + break; + } + } + + // TODO: Handle LCSSA PHIs. Currently LCSSA PHIs are not handled. Handle + // them by splitting the loop latch and adjusting loop links accordingly. + if (containsLCSSAPHI) + return false; + + unsigned SelectedLoopId = selectLoopForInterchange(LoopList); + // Move the selected loop outwards to the best possible position. + for (unsigned i = SelectedLoopId; i > 0; i--) { + bool Interchanged = + processLoop(LoopList, i, i - 1, LoopNestExit, DependencyMatrix); + if (!Interchanged) + return Changed; + // Loops were interchanged; reflect the same in LoopList. + std::swap(LoopList[i - 1], LoopList[i]); + + // Update the DependencyMatrix. + interChangeDependencies(DependencyMatrix, i, i - 1); + +#ifdef DUMP_DEP_MATRICES + DEBUG(dbgs() << "Dependence after interchange\n"); + printDepMatrix(DependencyMatrix); +#endif + Changed |= Interchanged; + } + return Changed; + } + + bool processLoop(LoopVector LoopList, unsigned InnerLoopId, + unsigned OuterLoopId, BasicBlock *LoopNestExit, + std::vector<std::vector<char>> &DependencyMatrix) { + + DEBUG(dbgs() << "Processing Inner Loop Id = " << InnerLoopId + << " and OuterLoopId = " << OuterLoopId << "\n"); + Loop *InnerLoop = LoopList[InnerLoopId]; + Loop *OuterLoop = LoopList[OuterLoopId]; + + LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, this); + if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) { + DEBUG(dbgs() << "Not interchanging Loops. Cannot prove legality\n"); + return false; + } + DEBUG(dbgs() << "Loops are legal to interchange\n"); + LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE); + if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) { + DEBUG(dbgs() << "Interchanging Loops not profitable\n"); + return false; + } + + LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, this, + LoopNestExit); + LIT.transform(); + DEBUG(dbgs() << "Loops interchanged\n"); + return true; + } +}; + +} // end of namespace + +static bool containsUnsafeInstructions(BasicBlock *BB) { + for (auto I = BB->begin(), E = BB->end(); I != E; ++I) { + if (I->mayHaveSideEffects() || I->mayReadFromMemory()) + return true; + } + return false; +} + +bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) { + BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); + BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); + BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch(); + + DEBUG(dbgs() << "Checking if Loops are Tightly Nested\n"); + + // A perfectly nested loop will not have any branch in between the outer and + // inner block, i.e. the outer header will branch to either the inner loop + // preheader or the outer loop latch.
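+ // e.g. + // for(i) { for(j) { body; } } + // is tightly nested: only loop-control code runs between the two loops.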
+ BranchInst *outerLoopHeaderBI = + dyn_cast<BranchInst>(OuterLoopHeader->getTerminator()); + if (!outerLoopHeaderBI) + return false; + unsigned num = outerLoopHeaderBI->getNumSuccessors(); + for (unsigned i = 0; i < num; i++) { + if (outerLoopHeaderBI->getSuccessor(i) != InnerLoopPreHeader && + outerLoopHeaderBI->getSuccessor(i) != OuterLoopLatch) + return false; + } + + DEBUG(dbgs() << "Checking instructions in loop header and loop latch\n"); + // We do not have any basic block in between; now make sure the outer header + // and outer loop latch don't contain any unsafe instructions. + if (containsUnsafeInstructions(OuterLoopHeader) || + containsUnsafeInstructions(OuterLoopLatch)) + return false; + + DEBUG(dbgs() << "Loops are perfectly nested\n"); + // We have a perfect loop nest. + return true; +} + +static unsigned getPHICount(BasicBlock *BB) { + unsigned PhiCount = 0; + for (auto I = BB->begin(); isa<PHINode>(I); ++I) + PhiCount++; + return PhiCount; +} + +bool LoopInterchangeLegality::isLoopStructureUnderstood( + PHINode *InnerInduction) { + + unsigned Num = InnerInduction->getNumOperands(); + BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader(); + for (unsigned i = 0; i < Num; ++i) { + Value *Val = InnerInduction->getOperand(i); + if (isa<Constant>(Val)) + continue; + Instruction *I = dyn_cast<Instruction>(Val); + if (!I) + return false; + // TODO: Handle triangular loops. + // e.g. for(int i=0;i<N;i++) + // for(int j=i;j<N;j++) + unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i); + if (InnerInduction->getIncomingBlock(IncomBlockIndx) == + InnerLoopPreheader && + !OuterLoop->isLoopInvariant(I)) { + return false; + } + } + return true; +} + +// This function indicates the current limitations in the transform as a result +// of which we do not proceed. +bool LoopInterchangeLegality::currentLimitations() { + + BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); + BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); + BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); + BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); + BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch(); + + PHINode *InnerInductionVar; + PHINode *OuterInductionVar; + + // We currently handle only 1 induction variable inside the loop. We also do + // not handle reductions as of now. + if (getPHICount(InnerLoopHeader) > 1) + return true; + + if (getPHICount(OuterLoopHeader) > 1) + return true; + + InnerInductionVar = getInductionVariable(InnerLoop, SE); + OuterInductionVar = getInductionVariable(OuterLoop, SE); + + if (!OuterInductionVar || !InnerInductionVar) { + DEBUG(dbgs() << "Induction variable not found\n"); + return true; + } + + // TODO: Triangular loops are not handled for now. + if (!isLoopStructureUnderstood(InnerInductionVar)) { + DEBUG(dbgs() << "Loop structure not understood by pass\n"); + return true; + } + + // TODO: Loops with LCSSA PHIs are currently not handled. + if (isa<PHINode>(OuterLoopLatch->begin())) { + DEBUG(dbgs() << "Found an LCSSA PHI in outer loop latch\n"); + return true; + } + if (InnerLoopLatch != InnerLoopHeader && + isa<PHINode>(InnerLoopLatch->begin())) { + DEBUG(dbgs() << "Found an LCSSA PHI in inner loop latch\n"); + return true; + } + + // TODO: Current limitation: Since we split the inner loop latch at the point + // where the induction variable is incremented (induction.next), we cannot + // have more than 1 user of induction.next, since that would result in broken + // code after the split. + // e.g.
+ // for(i=0;i<N;i++) { + // for(j = 0;j<M;j++) { + // A[j+1][i+2] = A[j][i]+k; + // } + // } + bool FoundInduction = false; + Instruction *InnerIndexVarInc = nullptr; + if (InnerInductionVar->getIncomingBlock(0) == InnerLoopPreHeader) + InnerIndexVarInc = + dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(1)); + else + InnerIndexVarInc = + dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(0)); + + if (!InnerIndexVarInc) + return true; + + // Since we split the inner loop latch on this induction variable, make sure + // we do not have any instructions between the induction variable and the + // branch instruction. + + for (auto I = InnerLoopLatch->rbegin(), E = InnerLoopLatch->rend(); + I != E && !FoundInduction; ++I) { + if (isa<BranchInst>(*I) || isa<CmpInst>(*I) || isa<TruncInst>(*I)) + continue; + const Instruction &Ins = *I; + // We found an instruction. If it is not the induction variable increment, + // it is not safe to split this loop latch. + if (!Ins.isIdenticalTo(InnerIndexVarInc)) + return true; + else + FoundInduction = true; + } + // The loop latch ended and we didn't find the induction variable; return as + // a current limitation. + if (!FoundInduction) + return true; + + return false; +} + +bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, + unsigned OuterLoopId, + CharMatrix &DepMatrix) { + + if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) { + DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId + << " and OuterLoopId = " << OuterLoopId + << " due to dependence\n"); + return false; + } + + // Create unique preheaders if we do not already have them. + BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader(); + BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); + + // Create a unique outer preheader if: + // 1) The OuterLoop preheader is not present. + // 2) The OuterLoop preheader is the same as the OuterLoop header. + // 3) The OuterLoop preheader is the same as the header of the previous loop. + // 4) The OuterLoop preheader is the entry node. + if (!OuterLoopPreHeader || OuterLoopPreHeader == OuterLoop->getHeader() || + isa<PHINode>(OuterLoopPreHeader->begin()) || + !OuterLoopPreHeader->getUniquePredecessor()) { + OuterLoopPreHeader = InsertPreheaderForLoop(OuterLoop, CurrentPass); + } + + if (!InnerLoopPreHeader || InnerLoopPreHeader == InnerLoop->getHeader() || + InnerLoopPreHeader == OuterLoop->getHeader()) { + InnerLoopPreHeader = InsertPreheaderForLoop(InnerLoop, CurrentPass); + } + + // Check if the loops are tightly nested. + if (!tightlyNested(OuterLoop, InnerLoop)) { + DEBUG(dbgs() << "Loops not tightly nested\n"); + return false; + } + + // TODO: The loops could not be interchanged due to current limitations in + // the transform module.
+ if (currentLimitations()) { + DEBUG(dbgs() << "Not legal because of current transform limitation\n"); + return false; + } + + return true; +} + +int LoopInterchangeProfitability::getInstrOrderCost() { + unsigned GoodOrder, BadOrder; + BadOrder = GoodOrder = 0; + for (auto BI = InnerLoop->block_begin(), BE = InnerLoop->block_end(); + BI != BE; ++BI) { + for (auto I = (*BI)->begin(), E = (*BI)->end(); I != E; ++I) { + const Instruction &Ins = *I; + if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) { + unsigned NumOp = GEP->getNumOperands(); + bool FoundInnerInduction = false; + bool FoundOuterInduction = false; + for (unsigned i = 0; i < NumOp; ++i) { + const SCEV *OperandVal = SE->getSCEV(GEP->getOperand(i)); + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal); + if (!AR) + continue; + + // If we find the inner induction after an outer induction e.g. + // for(int i=0;i<N;i++) + // for(int j=0;j<N;j++) + // A[i][j] = A[i-1][j-1]+k; + // then it is a good order. + if (AR->getLoop() == InnerLoop) { + // We found an InnerLoop induction after OuterLoop induction. It is + // a good order. + FoundInnerInduction = true; + if (FoundOuterInduction) { + GoodOrder++; + break; + } + } + // If we find the outer induction after an inner induction e.g. + // for(int i=0;i<N;i++) + // for(int j=0;j<N;j++) + // A[j][i] = A[j-1][i-1]+k; + // then it is a bad order. + if (AR->getLoop() == OuterLoop) { + // We found an OuterLoop induction after InnerLoop induction. It is + // a bad order. + FoundOuterInduction = true; + if (FoundInnerInduction) { + BadOrder++; + break; + } + } + } + } + } + } + return GoodOrder - BadOrder; +} + +static bool isProfitableForVectorization(unsigned InnerLoopId, + unsigned OuterLoopId, + CharMatrix &DepMatrix) { + // TODO: Improve this heuristic to catch more cases. + // If the inner loop is loop independent or doesn't carry any dependence, it + // is profitable to move it to the outer position. + unsigned Row = DepMatrix.size(); + for (unsigned i = 0; i < Row; ++i) { + if (DepMatrix[i][InnerLoopId] != 'S' && DepMatrix[i][InnerLoopId] != 'I') + return false; + // TODO: We need to improve this heuristic. + if (DepMatrix[i][OuterLoopId] != '=') + return false; + } + // If the outer loop carries a dependence and the inner loop is loop + // independent, it is profitable to interchange to enable parallelism. + return true; +} + +bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId, + unsigned OuterLoopId, + CharMatrix &DepMatrix) { + + // TODO: Add better profitability checks. + // e.g. + // 1) Construct a dependency matrix and move the one with no loop-carried dep + // inside to enable vectorization. + + // This is a rough cost estimation algorithm. It counts the good and bad + // orderings of the induction variables in the instructions and allows + // reordering if the number of bad orderings is more than the number of good + // ones. + int Cost = 0; + Cost += getInstrOrderCost(); + DEBUG(dbgs() << "Cost = " << Cost << "\n"); + if (Cost < 0) + return true; + + // It is not profitable as per the current cache profitability model. But + // check if we can move this loop outward to improve parallelism.
+ bool ImprovesPar = + isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix); + return ImprovesPar; +} + +void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop, + Loop *InnerLoop) { + for (Loop::iterator I = OuterLoop->begin(), E = OuterLoop->end(); I != E; + ++I) { + if (*I == InnerLoop) { + OuterLoop->removeChildLoop(I); + return; + } + } + assert(false && "Couldn't find loop"); +} + +void LoopInterchangeTransform::restructureLoops(Loop *InnerLoop, + Loop *OuterLoop) { + Loop *OuterLoopParent = OuterLoop->getParentLoop(); + if (OuterLoopParent) { + // Remove the loop from its parent loop. + removeChildLoop(OuterLoopParent, OuterLoop); + removeChildLoop(OuterLoop, InnerLoop); + OuterLoopParent->addChildLoop(InnerLoop); + } else { + removeChildLoop(OuterLoop, InnerLoop); + LI->changeTopLevelLoop(OuterLoop, InnerLoop); + } + + // Move the remaining subloops one at a time; removeChildLoop invalidates + // iterators into the subloop list, so a range sweep is not safe here. + while (!InnerLoop->empty()) + OuterLoop->addChildLoop(InnerLoop->removeChildLoop(InnerLoop->begin())); + + InnerLoop->addChildLoop(OuterLoop); +} + +bool LoopInterchangeTransform::transform() { + + DEBUG(dbgs() << "transform\n"); + bool Transformed = false; + Instruction *InnerIndexVar; + + if (InnerLoop->getSubLoops().size() == 0) { + BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); + DEBUG(dbgs() << "Calling Split Inner Loop\n"); + PHINode *InductionPHI = getInductionVariable(InnerLoop, SE); + if (!InductionPHI) { + DEBUG(dbgs() << "Failed to find the point to split loop latch\n"); + return false; + } + + if (InductionPHI->getIncomingBlock(0) == InnerLoopPreHeader) + InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(1)); + else + InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0)); + + // + // Split at the place where the induction variable is + // incremented/decremented. + // TODO: This splitting logic may not always work. Fix this. + splitInnerLoopLatch(InnerIndexVar); + DEBUG(dbgs() << "splitInnerLoopLatch Done\n"); + + // Split the inner loop's PHI nodes out into a separate basic block. + splitInnerLoopHeader(); + DEBUG(dbgs() << "splitInnerLoopHeader Done\n"); + } + + Transformed |= adjustLoopLinks(); + if (!Transformed) { + DEBUG(dbgs() << "adjustLoopLinks Failed\n"); + return false; + } + + restructureLoops(InnerLoop, OuterLoop); + return true; +} + +void LoopInterchangeTransform::splitInnerLoopLatch(Instruction *Inc) { + BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); + BasicBlock *InnerLoopLatchPred = InnerLoopLatch; + InnerLoopLatch = SplitBlock(InnerLoopLatchPred, Inc, DT, LI); +} + +void LoopInterchangeTransform::splitOuterLoopLatch() { + BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch(); + BasicBlock *OuterLatchLcssaPhiBlock = OuterLoopLatch; + OuterLoopLatch = SplitBlock(OuterLatchLcssaPhiBlock, + OuterLoopLatch->getFirstNonPHI(), DT, LI); +} + +void LoopInterchangeTransform::splitInnerLoopHeader() { + + // Split the inner loop header out.
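+ // (SplitBlock at getFirstNonPHI leaves only the PHI nodes and an + // unconditional branch in the original header; the remaining instructions + // move to the newly created successor block.)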
+  BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
+  SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI);
+
+  DEBUG(dbgs() << "Output of splitInnerLoopHeader InnerLoopHeaderSucc & "
+                  "InnerLoopHeader\n");
+}
+
+/// \brief Move all instructions except the terminator from FromBB right
+/// before InsertBefore.
+static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
+  auto &ToList = InsertBefore->getParent()->getInstList();
+  auto &FromList = FromBB->getInstList();
+
+  ToList.splice(InsertBefore, FromList, FromList.begin(),
+                FromBB->getTerminator());
+}
+
+void LoopInterchangeTransform::adjustOuterLoopPreheader() {
+  BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
+  BasicBlock *InnerPreHeader = InnerLoop->getLoopPreheader();
+
+  moveBBContents(OuterLoopPreHeader, InnerPreHeader->getTerminator());
+}
+
+void LoopInterchangeTransform::adjustInnerLoopPreheader() {
+  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+  BasicBlock *OuterHeader = OuterLoop->getHeader();
+
+  moveBBContents(InnerLoopPreHeader, OuterHeader->getTerminator());
+}
+
+bool LoopInterchangeTransform::adjustLoopBranches() {
+
+  DEBUG(dbgs() << "adjustLoopBranches called\n");
+  // Gather the blocks and branches that need to be adjusted.
+  BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
+  BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+  BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
+  BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
+  BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
+  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+  BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
+  BasicBlock *InnerLoopLatchPredecessor =
+      InnerLoopLatch->getUniquePredecessor();
+  BasicBlock *InnerLoopLatchSuccessor;
+  BasicBlock *OuterLoopLatchSuccessor;
+
+  BranchInst *OuterLoopLatchBI =
+      dyn_cast<BranchInst>(OuterLoopLatch->getTerminator());
+  BranchInst *InnerLoopLatchBI =
+      dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
+  BranchInst *OuterLoopHeaderBI =
+      dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
+  BranchInst *InnerLoopHeaderBI =
+      dyn_cast<BranchInst>(InnerLoopHeader->getTerminator());
+
+  if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
+      !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
+      !InnerLoopHeaderBI)
+    return false;
+
+  BranchInst *InnerLoopLatchPredecessorBI =
+      dyn_cast<BranchInst>(InnerLoopLatchPredecessor->getTerminator());
+  BranchInst *OuterLoopPredecessorBI =
+      dyn_cast<BranchInst>(OuterLoopPredecessor->getTerminator());
+
+  if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
+    return false;
+  BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
+  if (!InnerLoopHeaderSuccessor)
+    return false;
+
+  // Adjust loop preheaders and headers.
+
+  unsigned NumSucc = OuterLoopPredecessorBI->getNumSuccessors();
+  for (unsigned i = 0; i < NumSucc; ++i) {
+    if (OuterLoopPredecessorBI->getSuccessor(i) == OuterLoopPreHeader)
+      OuterLoopPredecessorBI->setSuccessor(i, InnerLoopPreHeader);
+  }
+
+  NumSucc = OuterLoopHeaderBI->getNumSuccessors();
+  for (unsigned i = 0; i < NumSucc; ++i) {
+    if (OuterLoopHeaderBI->getSuccessor(i) == OuterLoopLatch)
+      OuterLoopHeaderBI->setSuccessor(i, LoopExit);
+    else if (OuterLoopHeaderBI->getSuccessor(i) == InnerLoopPreHeader)
+      OuterLoopHeaderBI->setSuccessor(i, InnerLoopHeaderSuccessor);
+  }
+
+  BranchInst::Create(OuterLoopPreHeader, InnerLoopHeaderBI);
+  InnerLoopHeaderBI->eraseFromParent();
+
+  // -------------Adjust loop latches-----------
+  if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)
+    InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1);
+  else
+    InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);
+
+  NumSucc = InnerLoopLatchPredecessorBI->getNumSuccessors();
+  for (unsigned i = 0; i < NumSucc; ++i) {
+    if (InnerLoopLatchPredecessorBI->getSuccessor(i) == InnerLoopLatch)
+      InnerLoopLatchPredecessorBI->setSuccessor(i, InnerLoopLatchSuccessor);
+  }
+
+  if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
+    OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
+  else
+    OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);
+
+  if (InnerLoopLatchBI->getSuccessor(1) == InnerLoopLatchSuccessor)
+    InnerLoopLatchBI->setSuccessor(1, OuterLoopLatchSuccessor);
+  else
+    InnerLoopLatchBI->setSuccessor(0, OuterLoopLatchSuccessor);
+
+  if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopLatchSuccessor)
+    OuterLoopLatchBI->setSuccessor(0, InnerLoopLatch);
+  else
+    OuterLoopLatchBI->setSuccessor(1, InnerLoopLatch);
+
+  return true;
+}
+
+void LoopInterchangeTransform::adjustLoopPreheaders() {
+
+  // We have interchanged the preheaders so we need to interchange the data in
+  // the preheaders as well, because the content of the inner preheader was
+  // previously executed inside the outer loop.
+  BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
+  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+  BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+  BranchInst *InnerTermBI =
+      cast<BranchInst>(InnerLoopPreHeader->getTerminator());
+
+  BasicBlock *HeaderSplit =
+      SplitBlock(OuterLoopHeader, OuterLoopHeader->getTerminator(), DT, LI);
+  Instruction *InsPoint = HeaderSplit->getFirstNonPHI();
+  // These instructions should now be executed inside the loop, so move them
+  // into a new block after the outer header.
+  moveBBContents(InnerLoopPreHeader, InsPoint);
+  // These instructions were not executed inside the loop before, so move them
+  // to the old inner loop preheader.
+  moveBBContents(OuterLoopPreHeader, InnerTermBI);
+}
+
+bool LoopInterchangeTransform::adjustLoopLinks() {
+
+  // Adjust all branches in the inner and outer loop.
+ bool Changed = adjustLoopBranches(); + if (Changed) + adjustLoopPreheaders(); + return Changed; +} + +char LoopInterchange::ID = 0; +INITIALIZE_PASS_BEGIN(LoopInterchange, "loop-interchange", + "Interchanges loops for cache reuse", false, false) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(DependenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) + +INITIALIZE_PASS_END(LoopInterchange, "loop-interchange", + "Interchanges loops for cache reuse", false, false) + +Pass *llvm::createLoopInterchangePass() { return new LoopInterchange(); } diff --git a/lib/Transforms/Scalar/LoopRerollPass.cpp b/lib/Transforms/Scalar/LoopRerollPass.cpp index fdf7e3b..ed103e6 100644 --- a/lib/Transforms/Scalar/LoopRerollPass.cpp +++ b/lib/Transforms/Scalar/LoopRerollPass.cpp @@ -23,6 +23,7 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" @@ -30,7 +31,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" @@ -160,7 +160,6 @@ namespace { AliasAnalysis *AA; LoopInfo *LI; ScalarEvolution *SE; - const DataLayout *DL; TargetLibraryInfo *TLI; DominatorTree *DT; @@ -367,10 +366,8 @@ namespace { struct DAGRootTracker { DAGRootTracker(LoopReroll *Parent, Loop *L, Instruction *IV, ScalarEvolution *SE, AliasAnalysis *AA, - TargetLibraryInfo *TLI, const DataLayout *DL) - : Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI), - DL(DL), IV(IV) { - } + TargetLibraryInfo *TLI) + : Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI), IV(IV) {} /// Stage 1: Find all the DAG roots for the induction variable. bool findRoots(); @@ -416,7 +413,6 @@ namespace { ScalarEvolution *SE; AliasAnalysis *AA; TargetLibraryInfo *TLI; - const DataLayout *DL; // The loop induction variable. Instruction *IV; @@ -1131,7 +1127,7 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) { // needed because otherwise isSafeToSpeculativelyExecute returns // false on PHI nodes. if (!isa<PHINode>(I) && !isSimpleLoadStore(I) && - !isSafeToSpeculativelyExecute(I, DL)) + !isSafeToSpeculativelyExecute(I)) // Intervening instructions cause side effects. FutureSideEffects = true; } @@ -1161,11 +1157,10 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) { // side effects, and this instruction might also, then we can't reorder // them, and this matching fails. As an exception, we allow the alias // set tracker to handle regular (simple) load/store dependencies. - if (FutureSideEffects && - ((!isSimpleLoadStore(BaseInst) && - !isSafeToSpeculativelyExecute(BaseInst, DL)) || - (!isSimpleLoadStore(RootInst) && - !isSafeToSpeculativelyExecute(RootInst, DL)))) { + if (FutureSideEffects && ((!isSimpleLoadStore(BaseInst) && + !isSafeToSpeculativelyExecute(BaseInst)) || + (!isSimpleLoadStore(RootInst) && + !isSafeToSpeculativelyExecute(RootInst)))) { DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst << " vs. 
" << *RootInst << " (side effects prevent reordering)\n"); @@ -1272,6 +1267,7 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) { ++J; } + const DataLayout &DL = Header->getModule()->getDataLayout(); // We need to create a new induction variable for each different BaseInst. for (auto &DRS : RootSets) { @@ -1284,7 +1280,7 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) { SE->getConstant(RealIVSCEV->getType(), 1), L, SCEV::FlagAnyWrap)); { // Limit the lifetime of SCEVExpander. - SCEVExpander Expander(*SE, "reroll"); + SCEVExpander Expander(*SE, DL, "reroll"); Value *NewIV = Expander.expandCodeFor(H, IV->getType(), Header->begin()); for (auto &KV : Uses) { @@ -1324,7 +1320,7 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) { } } - SimplifyInstructionsInBlock(Header, DL, TLI); + SimplifyInstructionsInBlock(Header, TLI); DeleteDeadPHIs(Header, TLI); } @@ -1448,7 +1444,7 @@ void LoopReroll::ReductionTracker::replaceSelected() { bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, const SCEV *IterCount, ReductionTracker &Reductions) { - DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI, DL); + DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI); if (!DAGRoots.findRoots()) return false; @@ -1477,8 +1473,6 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) { LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); SE = &getAnalysis<ScalarEvolution>(); TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : nullptr; DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); BasicBlock *Header = L->getHeader(); diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp index 4d12349..a675e12 100644 --- a/lib/Transforms/Scalar/LoopRotation.cpp +++ b/lib/Transforms/Scalar/LoopRotation.cpp @@ -24,8 +24,10 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" @@ -412,6 +414,8 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { for (; PHINode *PN = dyn_cast<PHINode>(I); ++I) ValueMap[PN] = PN->getIncomingValueForBlock(OrigPreheader); + const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); + // For the rest of the instructions, either hoist to the OrigPreheader if // possible or create a clone in the OldPreHeader if not. TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator(); @@ -442,8 +446,8 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // With the operands remapped, see if the instruction constant folds or is // otherwise simplifyable. This commonly occurs because the entry from PHI // nodes allows icmps and other instructions to fold. - // FIXME: Provide DL, TLI, DT, AC to SimplifyInstruction. - Value *V = SimplifyInstruction(C); + // FIXME: Provide TLI, DT, AC to SimplifyInstruction. + Value *V = SimplifyInstruction(C, DL); if (V && LI->replacementPreservesLCSSAForm(C, V)) { // If so, then delete the temporary instruction and stick the folded value // in the map. 
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 318065e..8445d5f 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -68,6 +68,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -3825,7 +3826,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { if (C->getValue()->isNegative() != (NewF.BaseOffset < 0) && (C->getValue()->getValue().abs() * APInt(BitWidth, F.Scale)) - .ule(abs64(NewF.BaseOffset))) + .ule(std::abs(NewF.BaseOffset))) continue; // OK, looks good. @@ -3856,7 +3857,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { J != JE; ++J) if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*J)) if ((C->getValue()->getValue() + NewF.BaseOffset).abs().slt( - abs64(NewF.BaseOffset)) && + std::abs(NewF.BaseOffset)) && (C->getValue()->getValue() + NewF.BaseOffset).countTrailingZeros() >= countTrailingZeros<uint64_t>(NewF.BaseOffset)) @@ -4823,7 +4824,8 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, // we can remove them after we are done working. SmallVector<WeakVH, 16> DeadInsts; - SCEVExpander Rewriter(SE, "lsr"); + SCEVExpander Rewriter(SE, L->getHeader()->getModule()->getDataLayout(), + "lsr"); #ifndef NDEBUG Rewriter.setDebugType(DEBUG_TYPE); #endif @@ -5093,7 +5095,8 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) { Changed |= DeleteDeadPHIs(L->getHeader()); if (EnablePhiElim && L->isLoopSimplifyForm()) { SmallVector<WeakVH, 16> DeadInsts; - SCEVExpander Rewriter(getAnalysis<ScalarEvolution>(), "lsr"); + const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); + SCEVExpander Rewriter(getAnalysis<ScalarEvolution>(), DL, "lsr"); #ifndef NDEBUG Rewriter.setDebugType(DEBUG_TYPE); #endif diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index 924be16..600cbde 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" @@ -23,14 +24,13 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/InstVisitor.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Metadata.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/UnrollLoop.h" -#include "llvm/IR/InstVisitor.h" -#include "llvm/Analysis/InstructionSimplify.h" #include <climits> using namespace llvm; @@ -259,6 +259,7 @@ static bool isLoadFromConstantInitializer(Value *V) { return false; } +namespace { struct FindConstantPointers { bool LoadCanBeConstantFolded; bool IndexIsConstant; @@ -356,11 +357,12 @@ class UnrollAnalyzer : public InstVisitor<UnrollAnalyzer, bool> { if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS)) RHS = SimpleRHS; Value *SimpleV = nullptr; + const DataLayout &DL = I.getModule()->getDataLayout(); if (auto FI = dyn_cast<FPMathOperator>(&I)) SimpleV = - SimplifyFPBinOp(I.getOpcode(), 
LHS, RHS, FI->getFastMathFlags());
+          SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL);
     else
-      SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS);
+      SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL);
 
     if (SimpleV && CountedInstructions.insert(&I).second)
       NumberOfOptimizedInstructions += TTI.getUserCost(&I);
@@ -540,6 +542,7 @@ public:
     return NumberOfOptimizedInstructions;
   }
 };
+} // namespace
 
 // Complete loop unrolling can make some loads constant, and we need to know if
 // that would expose any further optimization opportunities.
@@ -619,6 +622,11 @@ static bool HasUnrollDisablePragma(const Loop *L) {
   return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.disable");
 }
 
+// Returns true if the loop has a runtime unroll(disable) pragma.
+static bool HasRuntimeUnrollDisablePragma(const Loop *L) {
+  return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.runtime.disable");
+}
+
 // If loop has an unroll_count pragma return the (necessarily
 // positive) value from the pragma. Otherwise return 0.
 static unsigned UnrollCountPragmaValue(const Loop *L) {
@@ -807,6 +815,9 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
   // Reduce count based on the type of unrolling and the threshold values.
   unsigned OriginalCount = Count;
   bool AllowRuntime = UserRuntime ? CurrentRuntime : UP.Runtime;
+  if (HasRuntimeUnrollDisablePragma(L)) {
+    AllowRuntime = false;
+  }
   if (Unrolling == Partial) {
     bool AllowPartial = UserAllowPartial ? CurrentAllowPartial : UP.Partial;
     if (!AllowPartial && !CountSetExplicitly) {
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
index 987dc96..988d2af 100644
--- a/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -42,6 +42,7 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
@@ -1082,6 +1083,7 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
 /// pass.
 ///
 void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
+  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
   while (!Worklist.empty()) {
     Instruction *I = Worklist.back();
     Worklist.pop_back();
@@ -1104,7 +1106,7 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
     // See if instruction simplification can hack this up. This is common for
     // things like "select false, X, Y" after unswitching made the condition be
     // 'false'. TODO: update the domtree properly so we can pass it here.
- if (Value *V = SimplifyInstruction(I)) + if (Value *V = SimplifyInstruction(I, DL)) if (LI->replacementPreservesLCSSAForm(I, V)) { ReplaceUsesOfWith(I, V, Worklist, L, LPM); continue; diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 006b885..2b5a078 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -18,6 +18,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" @@ -28,7 +29,6 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" #include <list> using namespace llvm; @@ -41,7 +41,8 @@ STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy"); STATISTIC(NumCpyToSet, "Number of memcpys converted to memset"); static int64_t GetOffsetFromIndex(const GEPOperator *GEP, unsigned Idx, - bool &VariableIdxFound, const DataLayout &TD){ + bool &VariableIdxFound, + const DataLayout &DL) { // Skip over the first indices. gep_type_iterator GTI = gep_type_begin(GEP); for (unsigned i = 1; i != Idx; ++i, ++GTI) @@ -57,13 +58,13 @@ static int64_t GetOffsetFromIndex(const GEPOperator *GEP, unsigned Idx, // Handle struct indices, which add their field offset to the pointer. if (StructType *STy = dyn_cast<StructType>(*GTI)) { - Offset += TD.getStructLayout(STy)->getElementOffset(OpC->getZExtValue()); + Offset += DL.getStructLayout(STy)->getElementOffset(OpC->getZExtValue()); continue; } // Otherwise, we have a sequential type like an array or vector. Multiply // the index by the ElementSize. - uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType()); + uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType()); Offset += Size*OpC->getSExtValue(); } @@ -74,7 +75,7 @@ static int64_t GetOffsetFromIndex(const GEPOperator *GEP, unsigned Idx, /// constant offset, and return that constant offset. For example, Ptr1 might /// be &A[42], and Ptr2 might be &A[40]. In this case offset would be -8. static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, - const DataLayout &TD) { + const DataLayout &DL) { Ptr1 = Ptr1->stripPointerCasts(); Ptr2 = Ptr2->stripPointerCasts(); @@ -92,12 +93,12 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, // If one pointer is a GEP and the other isn't, then see if the GEP is a // constant offset from the base, as in "P" and "gep P, 1". 
if (GEP1 && !GEP2 && GEP1->getOperand(0)->stripPointerCasts() == Ptr2) { - Offset = -GetOffsetFromIndex(GEP1, 1, VariableIdxFound, TD); + Offset = -GetOffsetFromIndex(GEP1, 1, VariableIdxFound, DL); return !VariableIdxFound; } if (GEP2 && !GEP1 && GEP2->getOperand(0)->stripPointerCasts() == Ptr1) { - Offset = GetOffsetFromIndex(GEP2, 1, VariableIdxFound, TD); + Offset = GetOffsetFromIndex(GEP2, 1, VariableIdxFound, DL); return !VariableIdxFound; } @@ -115,8 +116,8 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, if (GEP1->getOperand(Idx) != GEP2->getOperand(Idx)) break; - int64_t Offset1 = GetOffsetFromIndex(GEP1, Idx, VariableIdxFound, TD); - int64_t Offset2 = GetOffsetFromIndex(GEP2, Idx, VariableIdxFound, TD); + int64_t Offset1 = GetOffsetFromIndex(GEP1, Idx, VariableIdxFound, DL); + int64_t Offset2 = GetOffsetFromIndex(GEP2, Idx, VariableIdxFound, DL); if (VariableIdxFound) return false; Offset = Offset2-Offset1; @@ -150,12 +151,11 @@ struct MemsetRange { /// TheStores - The actual stores that make up this range. SmallVector<Instruction*, 16> TheStores; - bool isProfitableToUseMemset(const DataLayout &TD) const; - + bool isProfitableToUseMemset(const DataLayout &DL) const; }; } // end anon namespace -bool MemsetRange::isProfitableToUseMemset(const DataLayout &TD) const { +bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const { // If we found more than 4 stores to merge or 16 bytes, use memset. if (TheStores.size() >= 4 || End-Start >= 16) return true; @@ -183,7 +183,7 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &TD) const { // size. If so, check to see whether we will end up actually reducing the // number of stores used. unsigned Bytes = unsigned(End-Start); - unsigned MaxIntSize = TD.getLargestLegalIntTypeSize(); + unsigned MaxIntSize = DL.getLargestLegalIntTypeSize(); if (MaxIntSize == 0) MaxIntSize = 1; unsigned NumPointerStores = Bytes / MaxIntSize; @@ -314,14 +314,12 @@ namespace { class MemCpyOpt : public FunctionPass { MemoryDependenceAnalysis *MD; TargetLibraryInfo *TLI; - const DataLayout *DL; public: static char ID; // Pass identification, replacement for typeid MemCpyOpt() : FunctionPass(ID) { initializeMemCpyOptPass(*PassRegistry::getPassRegistry()); MD = nullptr; TLI = nullptr; - DL = nullptr; } bool runOnFunction(Function &F) override; @@ -377,13 +375,13 @@ INITIALIZE_PASS_END(MemCpyOpt, "memcpyopt", "MemCpy Optimization", /// attempts to merge them together into a memcpy/memset. Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, Value *StartPtr, Value *ByteVal) { - if (!DL) return nullptr; + const DataLayout &DL = StartInst->getModule()->getDataLayout(); // Okay, so we now have a single store that can be splatable. Scan to find // all subsequent stores of the same value to offset from the same pointer. // Join these together into ranges, so we can decide whether contiguous blocks // are stored. - MemsetRanges Ranges(*DL); + MemsetRanges Ranges(DL); BasicBlock::iterator BI = StartInst; for (++BI; !isa<TerminatorInst>(BI); ++BI) { @@ -406,8 +404,8 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, // Check to see if this store is to a constant offset from the start ptr. 
int64_t Offset; - if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(), - Offset, *DL)) + if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(), Offset, + DL)) break; Ranges.addStore(Offset, NextStore); @@ -420,7 +418,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, // Check to see if this store is to a constant offset from the start ptr. int64_t Offset; - if (!IsPointerOffset(StartPtr, MSI->getDest(), Offset, *DL)) + if (!IsPointerOffset(StartPtr, MSI->getDest(), Offset, DL)) break; Ranges.addMemSet(Offset, MSI); @@ -452,7 +450,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, if (Range.TheStores.size() == 1) continue; // If it is profitable to lower this range to memset, do so now. - if (!Range.isProfitableToUseMemset(*DL)) + if (!Range.isProfitableToUseMemset(DL)) continue; // Otherwise, we do want to transform this! Create a new memset. @@ -464,7 +462,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, if (Alignment == 0) { Type *EltType = cast<PointerType>(StartPtr->getType())->getElementType(); - Alignment = DL->getABITypeAlignment(EltType); + Alignment = DL.getABITypeAlignment(EltType); } AMemSet = @@ -494,8 +492,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (!SI->isSimple()) return false; - - if (!DL) return false; + const DataLayout &DL = SI->getModule()->getDataLayout(); // Detect cases where we're performing call slot forwarding, but // happen to be using a load-store pair to implement it, rather than @@ -525,16 +522,16 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (C) { unsigned storeAlign = SI->getAlignment(); if (!storeAlign) - storeAlign = DL->getABITypeAlignment(SI->getOperand(0)->getType()); + storeAlign = DL.getABITypeAlignment(SI->getOperand(0)->getType()); unsigned loadAlign = LI->getAlignment(); if (!loadAlign) - loadAlign = DL->getABITypeAlignment(LI->getType()); + loadAlign = DL.getABITypeAlignment(LI->getType()); - bool changed = performCallSlotOptzn(LI, - SI->getPointerOperand()->stripPointerCasts(), - LI->getPointerOperand()->stripPointerCasts(), - DL->getTypeStoreSize(SI->getOperand(0)->getType()), - std::min(storeAlign, loadAlign), C); + bool changed = performCallSlotOptzn( + LI, SI->getPointerOperand()->stripPointerCasts(), + LI->getPointerOperand()->stripPointerCasts(), + DL.getTypeStoreSize(SI->getOperand(0)->getType()), + std::min(storeAlign, loadAlign), C); if (changed) { MD->removeInstruction(SI); SI->eraseFromParent(); @@ -606,15 +603,13 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, if (!srcAlloca) return false; - // Check that all of src is copied to dest. 
- if (!DL) return false; - ConstantInt *srcArraySize = dyn_cast<ConstantInt>(srcAlloca->getArraySize()); if (!srcArraySize) return false; - uint64_t srcSize = DL->getTypeAllocSize(srcAlloca->getAllocatedType()) * - srcArraySize->getZExtValue(); + const DataLayout &DL = cpy->getModule()->getDataLayout(); + uint64_t srcSize = DL.getTypeAllocSize(srcAlloca->getAllocatedType()) * + srcArraySize->getZExtValue(); if (cpyLen < srcSize) return false; @@ -628,8 +623,8 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, if (!destArraySize) return false; - uint64_t destSize = DL->getTypeAllocSize(A->getAllocatedType()) * - destArraySize->getZExtValue(); + uint64_t destSize = DL.getTypeAllocSize(A->getAllocatedType()) * + destArraySize->getZExtValue(); if (destSize < srcSize) return false; @@ -648,7 +643,7 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, return false; } - uint64_t destSize = DL->getTypeAllocSize(StructTy); + uint64_t destSize = DL.getTypeAllocSize(StructTy); if (destSize < srcSize) return false; } @@ -659,7 +654,7 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, // Check that dest points to memory that is at least as aligned as src. unsigned srcAlign = srcAlloca->getAlignment(); if (!srcAlign) - srcAlign = DL->getABITypeAlignment(srcAlloca->getAllocatedType()); + srcAlign = DL.getABITypeAlignment(srcAlloca->getAllocatedType()); bool isDestSufficientlyAligned = srcAlign <= cpyAlign; // If dest is not aligned enough and we can't increase its alignment then // bail out. @@ -959,12 +954,11 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) { /// processByValArgument - This is called on every byval argument in call sites. bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { - if (!DL) return false; - + const DataLayout &DL = CS.getCaller()->getParent()->getDataLayout(); // Find out what feeds this byval argument. Value *ByValArg = CS.getArgument(ArgNo); Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType(); - uint64_t ByValSize = DL->getTypeAllocSize(ByValTy); + uint64_t ByValSize = DL.getTypeAllocSize(ByValTy); MemDepResult DepInfo = MD->getPointerDependencyFrom(AliasAnalysis::Location(ByValArg, ByValSize), true, CS.getInstruction(), @@ -997,8 +991,8 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { *CS->getParent()->getParent()); DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); if (MDep->getAlignment() < ByValAlign && - getOrEnforceKnownAlignment(MDep->getSource(), ByValAlign, DL, &AC, - CS.getInstruction(), &DT) < ByValAlign) + getOrEnforceKnownAlignment(MDep->getSource(), ByValAlign, DL, + CS.getInstruction(), &AC, &DT) < ByValAlign) return false; // Verify that the copied-from memory doesn't change in between the memcpy and @@ -1077,8 +1071,6 @@ bool MemCpyOpt::runOnFunction(Function &F) { bool MadeChange = false; MD = &getAnalysis<MemoryDependenceAnalysis>(); - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? 
&DLP->getDataLayout() : nullptr; TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); // If we don't have at least memset and memcpy, there is little point of doing diff --git a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp index 8fad63f..73f4296 100644 --- a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp +++ b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp @@ -81,12 +81,13 @@ #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include <vector> diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp index 98016b4..307cc73 100644 --- a/lib/Transforms/Scalar/Reassociate.cpp +++ b/lib/Transforms/Scalar/Reassociate.cpp @@ -321,10 +321,8 @@ unsigned Reassociate::getRank(Value *V) { // If this is a not or neg instruction, do not count it for rank. This // assures us that X and ~X will have the same rank. - Type *Ty = V->getType(); - if ((!Ty->isIntegerTy() && !Ty->isFloatingPointTy()) || - (!BinaryOperator::isNot(I) && !BinaryOperator::isNeg(I) && - !BinaryOperator::isFNeg(I))) + if (!BinaryOperator::isNot(I) && !BinaryOperator::isNeg(I) && + !BinaryOperator::isFNeg(I)) ++Rank; DEBUG(dbgs() << "Calculated Rank[" << V->getName() << "] = " << Rank << "\n"); @@ -351,7 +349,7 @@ void Reassociate::canonicalizeOperands(Instruction *I) { static BinaryOperator *CreateAdd(Value *S1, Value *S2, const Twine &Name, Instruction *InsertBefore, Value *FlagsOp) { - if (S1->getType()->isIntegerTy()) + if (S1->getType()->isIntOrIntVectorTy()) return BinaryOperator::CreateAdd(S1, S2, Name, InsertBefore); else { BinaryOperator *Res = @@ -363,7 +361,7 @@ static BinaryOperator *CreateAdd(Value *S1, Value *S2, const Twine &Name, static BinaryOperator *CreateMul(Value *S1, Value *S2, const Twine &Name, Instruction *InsertBefore, Value *FlagsOp) { - if (S1->getType()->isIntegerTy()) + if (S1->getType()->isIntOrIntVectorTy()) return BinaryOperator::CreateMul(S1, S2, Name, InsertBefore); else { BinaryOperator *Res = @@ -375,7 +373,7 @@ static BinaryOperator *CreateMul(Value *S1, Value *S2, const Twine &Name, static BinaryOperator *CreateNeg(Value *S1, const Twine &Name, Instruction *InsertBefore, Value *FlagsOp) { - if (S1->getType()->isIntegerTy()) + if (S1->getType()->isIntOrIntVectorTy()) return BinaryOperator::CreateNeg(S1, Name, InsertBefore); else { BinaryOperator *Res = BinaryOperator::CreateFNeg(S1, Name, InsertBefore); @@ -388,8 +386,8 @@ static BinaryOperator *CreateNeg(Value *S1, const Twine &Name, /// static BinaryOperator *LowerNegateToMultiply(Instruction *Neg) { Type *Ty = Neg->getType(); - Constant *NegOne = Ty->isIntegerTy() ? ConstantInt::getAllOnesValue(Ty) - : ConstantFP::get(Ty, -1.0); + Constant *NegOne = Ty->isIntOrIntVectorTy() ? + ConstantInt::getAllOnesValue(Ty) : ConstantFP::get(Ty, -1.0); BinaryOperator *Res = CreateMul(Neg->getOperand(1), NegOne, "", Neg, Neg); Neg->setOperand(1, Constant::getNullValue(Ty)); // Drop use of op. 
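
The Reassociate hunks above and below widen the pass's type checks from isIntegerTy() to isIntOrIntVectorTy(), so rewrites such as LowerNegateToMultiply now fire on vectors as well. A rough IRBuilder sketch of that rewrite under the widened check (the helper name is ours, not the pass's):

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    // Turn "0 - X" into "X * -1". getAllOnesValue yields plain -1 for i32 and
    // a <-1, -1, ...> splat for vector-of-integer types, which is all the
    // widened isIntOrIntVectorTy() check needs.
    static Value *negateViaMultiply(IRBuilder<> &Builder, Value *X) {
      Constant *NegOne = Constant::getAllOnesValue(X->getType());
      return Builder.CreateMul(X, NegOne);
    }

For a <4 x i32> input this produces "mul <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>".
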
@@ -872,7 +870,7 @@ void Reassociate::RewriteExprTree(BinaryOperator *I, Constant *Undef = UndefValue::get(I->getType()); NewOp = BinaryOperator::Create(Instruction::BinaryOps(Opcode), Undef, Undef, "", I); - if (NewOp->getType()->isFloatingPointTy()) + if (NewOp->getType()->isFPOrFPVectorTy()) NewOp->setFastMathFlags(I->getFastMathFlags()); } else { NewOp = NodesToRewrite.pop_back_val(); @@ -1520,8 +1518,8 @@ Value *Reassociate::OptimizeAdd(Instruction *I, // Insert a new multiply. Type *Ty = TheOp->getType(); - Constant *C = Ty->isIntegerTy() ? ConstantInt::get(Ty, NumFound) - : ConstantFP::get(Ty, NumFound); + Constant *C = Ty->isIntOrIntVectorTy() ? + ConstantInt::get(Ty, NumFound) : ConstantFP::get(Ty, NumFound); Instruction *Mul = CreateMul(TheOp, C, "factor", I, I); // Now that we have inserted a multiply, optimize it. This allows us to @@ -1661,7 +1659,7 @@ Value *Reassociate::OptimizeAdd(Instruction *I, // from an expression will drop a use of maxocc, and this can cause // RemoveFactorFromExpression on successive values to behave differently. Instruction *DummyInst = - I->getType()->isIntegerTy() + I->getType()->isIntOrIntVectorTy() ? BinaryOperator::CreateAdd(MaxOccVal, MaxOccVal) : BinaryOperator::CreateFAdd(MaxOccVal, MaxOccVal); @@ -1792,7 +1790,7 @@ static Value *buildMultiplyTree(IRBuilder<> &Builder, Value *LHS = Ops.pop_back_val(); do { - if (LHS->getType()->isIntegerTy()) + if (LHS->getType()->isIntOrIntVectorTy()) LHS = Builder.CreateMul(LHS, Ops.pop_back_val()); else LHS = Builder.CreateFMul(LHS, Ops.pop_back_val()); @@ -2090,8 +2088,9 @@ void Reassociate::OptimizeInst(Instruction *I) { if (I->isCommutative()) canonicalizeOperands(I); - // Don't optimize vector instructions. - if (I->getType()->isVectorTy()) + // TODO: We should optimize vector Xor instructions, but they are + // currently unsupported. + if (I->getType()->isVectorTy() && I->getOpcode() == Instruction::Xor) return; // Don't optimize floating point instructions that don't have unsafe algebra. @@ -2170,9 +2169,6 @@ void Reassociate::OptimizeInst(Instruction *I) { } void Reassociate::ReassociateExpression(BinaryOperator *I) { - assert(!I->getType()->isVectorTy() && - "Reassociation of vector instructions is not supported."); - // First, walk the expression tree, linearizing the tree, collecting the // operand information. SmallVector<RepeatedValue, 8> Tree; diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index ca9ab54..f5d21ff 100644 --- a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -548,9 +548,6 @@ public: } PhiState(Value *b) : status(Base), base(b) {} PhiState() : status(Unknown), base(nullptr) {} - PhiState(const PhiState &other) : status(other.status), base(other.base) { - assert(status != Base || base); - } Status getStatus() const { return status; } Value *getBase() const { return base; } @@ -684,12 +681,19 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache, states[def] = PhiState(); // Recursively fill in all phis & selects reachable from the initial one // for which we don't already know a definite base value for - // PERF: Yes, this is as horribly inefficient as it looks. + // TODO: This should be rewritten with a worklist bool done = false; while (!done) { done = true; + // Since we're adding elements to 'states' as we run, we can't keep + // iterators into the set. 
+    SmallVector<Value*, 16> Keys;
+    Keys.reserve(states.size());
     for (auto Pair : states) {
-      Value *v = Pair.first;
+      Value *V = Pair.first;
+      Keys.push_back(V);
+    }
+    for (Value *v : Keys) {
       assert(!isKnownBaseResult(v) && "why did it get added?");
       if (PHINode *phi = dyn_cast<PHINode>(v)) {
         assert(phi->getNumIncomingValues() > 0 &&
@@ -730,10 +734,12 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache,
   // have reached conflict state. The current version seems too conservative.
 
   bool progress = true;
-  size_t oldSize = 0;
   while (progress) {
-    oldSize = states.size();
+#ifndef NDEBUG
+    size_t oldSize = states.size();
+#endif
     progress = false;
+    // We're only changing values in this loop, thus safe to keep iterators
     for (auto Pair : states) {
       MeetPhiStates calculateMeet(states);
       Value *v = Pair.first;
@@ -768,46 +774,58 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache,
   }
 
   // Insert Phis for all conflicts
+  // We want to keep naming deterministic in the loop that follows, so
+  // sort the keys before iteration. This is useful in allowing us to
+  // write stable tests. Note that there is no invalidation issue here.
+  SmallVector<Value*, 16> Keys;
+  Keys.reserve(states.size());
   for (auto Pair : states) {
-    Instruction *v = cast<Instruction>(Pair.first);
-    PhiState state = Pair.second;
+    Value *V = Pair.first;
+    Keys.push_back(V);
+  }
+  std::sort(Keys.begin(), Keys.end(), order_by_name);
+  // TODO: adjust naming patterns to avoid this order of iteration dependency
+  for (Value *V : Keys) {
+    Instruction *v = cast<Instruction>(V);
+    PhiState state = states[V];
     assert(!isKnownBaseResult(v) && "why did it get added?");
     assert(!state.isUnknown() && "Optimistic algorithm didn't complete!");
-    if (state.isConflict()) {
-      if (isa<PHINode>(v)) {
-        int num_preds =
-            std::distance(pred_begin(v->getParent()), pred_end(v->getParent()));
-        assert(num_preds > 0 && "how did we reach here");
-        PHINode *phi = PHINode::Create(v->getType(), num_preds, "base_phi", v);
-        NewInsertedDefs.insert(phi);
-        // Add metadata marking this as a base value
-        auto *const_1 = ConstantInt::get(
-            Type::getInt32Ty(
-                v->getParent()->getParent()->getParent()->getContext()),
-            1);
-        auto MDConst = ConstantAsMetadata::get(const_1);
-        MDNode *md = MDNode::get(
-            v->getParent()->getParent()->getParent()->getContext(), MDConst);
-        phi->setMetadata("is_base_value", md);
-        states[v] = PhiState(PhiState::Conflict, phi);
-      } else if (SelectInst *sel = dyn_cast<SelectInst>(v)) {
-        // The undef will be replaced later
-        UndefValue *undef = UndefValue::get(sel->getType());
-        SelectInst *basesel = SelectInst::Create(sel->getCondition(), undef,
-                                                 undef, "base_select", sel);
-        NewInsertedDefs.insert(basesel);
-        // Add metadata marking this as a base value
-        auto *const_1 = ConstantInt::get(
-            Type::getInt32Ty(
-                v->getParent()->getParent()->getParent()->getContext()),
-            1);
-        auto MDConst = ConstantAsMetadata::get(const_1);
-        MDNode *md = MDNode::get(
-            v->getParent()->getParent()->getParent()->getContext(), MDConst);
-        basesel->setMetadata("is_base_value", md);
-        states[v] = PhiState(PhiState::Conflict, basesel);
-      } else
-        llvm_unreachable("unknown conflict type");
+    if (!state.isConflict())
+      continue;
+
+    if (isa<PHINode>(v)) {
+      int num_preds =
+          std::distance(pred_begin(v->getParent()), pred_end(v->getParent()));
+      assert(num_preds > 0 && "how did we reach here");
+      PHINode *phi = PHINode::Create(v->getType(), num_preds, "base_phi", v);
+      NewInsertedDefs.insert(phi);
+      // Add metadata marking this as a base
value + auto *const_1 = ConstantInt::get( + Type::getInt32Ty( + v->getParent()->getParent()->getParent()->getContext()), + 1); + auto MDConst = ConstantAsMetadata::get(const_1); + MDNode *md = MDNode::get( + v->getParent()->getParent()->getParent()->getContext(), MDConst); + phi->setMetadata("is_base_value", md); + states[v] = PhiState(PhiState::Conflict, phi); + } else { + SelectInst *sel = cast<SelectInst>(v); + // The undef will be replaced later + UndefValue *undef = UndefValue::get(sel->getType()); + SelectInst *basesel = SelectInst::Create(sel->getCondition(), undef, + undef, "base_select", sel); + NewInsertedDefs.insert(basesel); + // Add metadata marking this as a base value + auto *const_1 = ConstantInt::get( + Type::getInt32Ty( + v->getParent()->getParent()->getParent()->getContext()), + 1); + auto MDConst = ConstantAsMetadata::get(const_1); + MDNode *md = MDNode::get( + v->getParent()->getParent()->getParent()->getContext(), MDConst); + basesel->setMetadata("is_base_value", md); + states[v] = PhiState(PhiState::Conflict, basesel); } } @@ -818,97 +836,98 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache, assert(!isKnownBaseResult(v) && "why did it get added?"); assert(!state.isUnknown() && "Optimistic algorithm didn't complete!"); - if (state.isConflict()) { - if (PHINode *basephi = dyn_cast<PHINode>(state.getBase())) { - PHINode *phi = cast<PHINode>(v); - unsigned NumPHIValues = phi->getNumIncomingValues(); - for (unsigned i = 0; i < NumPHIValues; i++) { - Value *InVal = phi->getIncomingValue(i); - BasicBlock *InBB = phi->getIncomingBlock(i); - - // If we've already seen InBB, add the same incoming value - // we added for it earlier. The IR verifier requires phi - // nodes with multiple entries from the same basic block - // to have the same incoming value for each of those - // entries. If we don't do this check here and basephi - // has a different type than base, we'll end up adding two - // bitcasts (and hence two distinct values) as incoming - // values for the same basic block. - - int blockIndex = basephi->getBasicBlockIndex(InBB); - if (blockIndex != -1) { - Value *oldBase = basephi->getIncomingValue(blockIndex); - basephi->addIncoming(oldBase, InBB); + if (!state.isConflict()) + continue; + + if (PHINode *basephi = dyn_cast<PHINode>(state.getBase())) { + PHINode *phi = cast<PHINode>(v); + unsigned NumPHIValues = phi->getNumIncomingValues(); + for (unsigned i = 0; i < NumPHIValues; i++) { + Value *InVal = phi->getIncomingValue(i); + BasicBlock *InBB = phi->getIncomingBlock(i); + + // If we've already seen InBB, add the same incoming value + // we added for it earlier. The IR verifier requires phi + // nodes with multiple entries from the same basic block + // to have the same incoming value for each of those + // entries. If we don't do this check here and basephi + // has a different type than base, we'll end up adding two + // bitcasts (and hence two distinct values) as incoming + // values for the same basic block. + + int blockIndex = basephi->getBasicBlockIndex(InBB); + if (blockIndex != -1) { + Value *oldBase = basephi->getIncomingValue(blockIndex); + basephi->addIncoming(oldBase, InBB); #ifndef NDEBUG - Value *base = findBaseOrBDV(InVal, cache); - if (!isKnownBaseResult(base)) { - // Either conflict or base. - assert(states.count(base)); - base = states[base].getBase(); - assert(base != nullptr && "unknown PhiState!"); - assert(NewInsertedDefs.count(base) && - "should have already added this in a prev. 
iteration!"); - } - - // In essense this assert states: the only way two - // values incoming from the same basic block may be - // different is by being different bitcasts of the same - // value. A cleanup that remains TODO is changing - // findBaseOrBDV to return an llvm::Value of the correct - // type (and still remain pure). This will remove the - // need to add bitcasts. - assert(base->stripPointerCasts() == oldBase->stripPointerCasts() && - "sanity -- findBaseOrBDV should be pure!"); -#endif - continue; - } - - // Find either the defining value for the PHI or the normal base for - // a non-phi node Value *base = findBaseOrBDV(InVal, cache); if (!isKnownBaseResult(base)) { // Either conflict or base. assert(states.count(base)); base = states[base].getBase(); assert(base != nullptr && "unknown PhiState!"); + assert(NewInsertedDefs.count(base) && + "should have already added this in a prev. iteration!"); } - assert(base && "can't be null"); - // Must use original input BB since base may not be Instruction - // The cast is needed since base traversal may strip away bitcasts - if (base->getType() != basephi->getType()) { - base = new BitCastInst(base, basephi->getType(), "cast", - InBB->getTerminator()); - NewInsertedDefs.insert(base); - } - basephi->addIncoming(base, InBB); + + // In essense this assert states: the only way two + // values incoming from the same basic block may be + // different is by being different bitcasts of the same + // value. A cleanup that remains TODO is changing + // findBaseOrBDV to return an llvm::Value of the correct + // type (and still remain pure). This will remove the + // need to add bitcasts. + assert(base->stripPointerCasts() == oldBase->stripPointerCasts() && + "sanity -- findBaseOrBDV should be pure!"); +#endif + continue; } - assert(basephi->getNumIncomingValues() == NumPHIValues); - } else if (SelectInst *basesel = dyn_cast<SelectInst>(state.getBase())) { - SelectInst *sel = cast<SelectInst>(v); - // Operand 1 & 2 are true, false path respectively. TODO: refactor to - // something more safe and less hacky. - for (int i = 1; i <= 2; i++) { - Value *InVal = sel->getOperand(i); - // Find either the defining value for the PHI or the normal base for - // a non-phi node - Value *base = findBaseOrBDV(InVal, cache); - if (!isKnownBaseResult(base)) { - // Either conflict or base. - assert(states.count(base)); - base = states[base].getBase(); - assert(base != nullptr && "unknown PhiState!"); - } - assert(base && "can't be null"); - // Must use original input BB since base may not be Instruction - // The cast is needed since base traversal may strip away bitcasts - if (base->getType() != basesel->getType()) { - base = new BitCastInst(base, basesel->getType(), "cast", basesel); - NewInsertedDefs.insert(base); - } - basesel->setOperand(i, base); + + // Find either the defining value for the PHI or the normal base for + // a non-phi node + Value *base = findBaseOrBDV(InVal, cache); + if (!isKnownBaseResult(base)) { + // Either conflict or base. 
+ assert(states.count(base)); + base = states[base].getBase(); + assert(base != nullptr && "unknown PhiState!"); } - } else - llvm_unreachable("unexpected conflict type"); + assert(base && "can't be null"); + // Must use original input BB since base may not be Instruction + // The cast is needed since base traversal may strip away bitcasts + if (base->getType() != basephi->getType()) { + base = new BitCastInst(base, basephi->getType(), "cast", + InBB->getTerminator()); + NewInsertedDefs.insert(base); + } + basephi->addIncoming(base, InBB); + } + assert(basephi->getNumIncomingValues() == NumPHIValues); + } else { + SelectInst *basesel = cast<SelectInst>(state.getBase()); + SelectInst *sel = cast<SelectInst>(v); + // Operand 1 & 2 are true, false path respectively. TODO: refactor to + // something more safe and less hacky. + for (int i = 1; i <= 2; i++) { + Value *InVal = sel->getOperand(i); + // Find either the defining value for the PHI or the normal base for + // a non-phi node + Value *base = findBaseOrBDV(InVal, cache); + if (!isKnownBaseResult(base)) { + // Either conflict or base. + assert(states.count(base)); + base = states[base].getBase(); + assert(base != nullptr && "unknown PhiState!"); + } + assert(base && "can't be null"); + // Must use original input BB since base may not be Instruction + // The cast is needed since base traversal may strip away bitcasts + if (base->getType() != basesel->getType()) { + base = new BitCastInst(base, basesel->getType(), "cast", basesel); + NewInsertedDefs.insert(base); + } + basesel->setOperand(i, base); + } } } @@ -964,7 +983,13 @@ static void findBasePointers(const StatepointLiveSetTy &live, DenseMap<llvm::Value *, llvm::Value *> &PointerToBase, DominatorTree *DT, DefiningValueMapTy &DVCache, DenseSet<llvm::Value *> &NewInsertedDefs) { - for (Value *ptr : live) { + // For the naming of values inserted to be deterministic - which makes for + // much cleaner and more stable tests - we need to assign an order to the + // live values. DenseSets do not provide a deterministic order across runs. + SmallVector<Value*, 64> Temp; + Temp.insert(Temp.end(), live.begin(), live.end()); + std::sort(Temp.begin(), Temp.end(), order_by_name); + for (Value *ptr : Temp) { Value *base = findBasePointer(ptr, DVCache, NewInsertedDefs); assert(base && "failed to find base pointer"); PointerToBase[ptr] = base; @@ -993,10 +1018,19 @@ static void findBasePointers(DominatorTree &DT, DefiningValueMapTy &DVCache, findBasePointers(result.liveset, PointerToBase, &DT, DVCache, NewInsertedDefs); if (PrintBasePointers) { + // Note: Need to print these in a stable order since this is checked in + // some tests. errs() << "Base Pairs (w/o Relocation):\n"; + SmallVector<Value*, 64> Temp; + Temp.reserve(PointerToBase.size()); for (auto Pair : PointerToBase) { - errs() << " derived %" << Pair.first->getName() << " base %" - << Pair.second->getName() << "\n"; + Temp.push_back(Pair.first); + } + std::sort(Temp.begin(), Temp.end(), order_by_name); + for (Value *Ptr : Temp) { + Value *Base = PointerToBase[Ptr]; + errs() << " derived %" << Ptr->getName() << " base %" + << Base->getName() << "\n"; } } @@ -1131,11 +1165,11 @@ static AttributeSet legalizeCallAttributes(AttributeSet AS) { /// statepointToken - statepoint instruction to which relocates should be /// bound. /// Builder - Llvm IR builder to be used to construct new calls. 
-void CreateGCRelocates(ArrayRef<llvm::Value *> liveVariables,
-                       const int liveStart,
-                       ArrayRef<llvm::Value *> basePtrs,
-                       Instruction *statepointToken, IRBuilder<> Builder) {
-
+static void CreateGCRelocates(ArrayRef<llvm::Value *> liveVariables,
+                              const int liveStart,
+                              ArrayRef<llvm::Value *> basePtrs,
+                              Instruction *statepointToken,
+                              IRBuilder<> Builder) {
   SmallVector<Instruction *, 64> NewDefs;
   NewDefs.reserve(liveVariables.size());
 
@@ -1559,8 +1593,18 @@ static void relocationViaAlloca(
     // store must be inserted after load, otherwise store will be in alloca's
     // use list and an extra load will be inserted before it
     StoreInst *store = new StoreInst(def, alloca);
-    if (isa<Instruction>(def)) {
-      store->insertAfter(cast<Instruction>(def));
+    if (Instruction *inst = dyn_cast<Instruction>(def)) {
+      if (InvokeInst *invoke = dyn_cast<InvokeInst>(inst)) {
+        // InvokeInst is a TerminatorInst, so the store needs to be inserted
+        // into its normal destination block.
+        BasicBlock *normalDest = invoke->getNormalDest();
+        store->insertBefore(normalDest->getFirstNonPHI());
+      } else {
+        assert(!inst->isTerminator() &&
+               "The only TerminatorInst that can produce a value is "
+               "InvokeInst which is handled above.");
+        store->insertAfter(inst);
+      }
     } else {
       assert((isa<Argument>(def) || isa<GlobalVariable>(def) ||
               (isa<Constant>(def) && cast<Constant>(def)->isNullValue())) &&
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index 05b9608..875a007 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -25,6 +25,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
@@ -35,7 +36,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include <algorithm>
@@ -154,7 +154,7 @@ namespace {
 /// Constant Propagation.
 ///
 class SCCPSolver : public InstVisitor<SCCPSolver> {
-  const DataLayout *DL;
+  const DataLayout &DL;
   const TargetLibraryInfo *TLI;
   SmallPtrSet<BasicBlock*, 8> BBExecutable; // The BBs that are executable.
   DenseMap<Value*, LatticeVal> ValueState;  // The state each value is in.
@@ -206,8 +206,8 @@ class SCCPSolver : public InstVisitor<SCCPSolver> {
   typedef std::pair<BasicBlock*, BasicBlock*> Edge;
   DenseSet<Edge> KnownFeasibleEdges;
 public:
-  SCCPSolver(const DataLayout *DL, const TargetLibraryInfo *tli)
-    : DL(DL), TLI(tli) {}
+  SCCPSolver(const DataLayout &DL, const TargetLibraryInfo *tli)
+      : DL(DL), TLI(tli) {}
 
   /// MarkBlockExecutable - This method can be used by clients to mark all of
   /// the blocks that are known to be intrinsically live in the processed unit.
@@ -1561,8 +1561,7 @@ bool SCCP::runOnFunction(Function &F) {
     return false;
 
   DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n");
-  const DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
-  const DataLayout *DL = DLP ?
&DLP->getDataLayout() : nullptr; + const DataLayout &DL = F.getParent()->getDataLayout(); const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); SCCPSolver Solver(DL, TLI); @@ -1691,8 +1690,7 @@ static bool AddressIsTaken(const GlobalValue *GV) { } bool IPSCCP::runOnModule(Module &M) { - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr; + const DataLayout &DL = M.getDataLayout(); const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); SCCPSolver Solver(DL, TLI); diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index f69c750..06b000f 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -247,7 +247,7 @@ public: /// hold. void insert(ArrayRef<Slice> NewSlices) { int OldSize = Slices.size(); - std::move(NewSlices.begin(), NewSlices.end(), std::back_inserter(Slices)); + Slices.append(NewSlices.begin(), NewSlices.end()); auto SliceI = Slices.begin() + OldSize; std::sort(SliceI, Slices.end()); std::inplace_merge(Slices.begin(), SliceI, Slices.end()); @@ -701,6 +701,7 @@ private: // by writing out the code here where we have tho underlying allocation // size readily available. APInt GEPOffset = Offset; + const DataLayout &DL = GEPI.getModule()->getDataLayout(); for (gep_type_iterator GTI = gep_type_begin(GEPI), GTE = gep_type_end(GEPI); GTI != GTE; ++GTI) { @@ -750,6 +751,7 @@ private: if (!IsOffsetKnown) return PI.setAborted(&LI); + const DataLayout &DL = LI.getModule()->getDataLayout(); uint64_t Size = DL.getTypeStoreSize(LI.getType()); return handleLoadOrStore(LI.getType(), LI, Offset, Size, LI.isVolatile()); } @@ -761,6 +763,7 @@ private: if (!IsOffsetKnown) return PI.setAborted(&SI); + const DataLayout &DL = SI.getModule()->getDataLayout(); uint64_t Size = DL.getTypeStoreSize(ValOp->getType()); // If this memory access can be shown to *statically* extend outside the @@ -898,6 +901,7 @@ private: SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses; Visited.insert(Root); Uses.push_back(std::make_pair(cast<Instruction>(*U), Root)); + const DataLayout &DL = Root->getModule()->getDataLayout(); // If there are no loads or stores, the access is dead. We mark that as // a size zero access. Size = 0; @@ -1194,7 +1198,6 @@ class SROA : public FunctionPass { const bool RequiresDomTree; LLVMContext *C; - const DataLayout *DL; DominatorTree *DT; AssumptionCache *AC; @@ -1243,7 +1246,7 @@ class SROA : public FunctionPass { public: SROA(bool RequiresDomTree = true) : FunctionPass(ID), RequiresDomTree(RequiresDomTree), C(nullptr), - DL(nullptr), DT(nullptr) { + DT(nullptr) { initializeSROAPass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override; @@ -1349,7 +1352,7 @@ static Type *findCommonType(AllocaSlices::const_iterator B, /// /// FIXME: This should be hoisted into a generic utility, likely in /// Transforms/Util/Local.h -static bool isSafePHIToSpeculate(PHINode &PN, const DataLayout *DL = nullptr) { +static bool isSafePHIToSpeculate(PHINode &PN) { // For now, we can only do this promotion if the load is in the same block // as the PHI, and if there are no stores between the phi and load. // TODO: Allow recursive phi users. 
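
The SROA hunks around this point drop the optional DataLayout parameter from isSafePHIToSpeculate and isSafeSelectToSpeculate; the speculation these checks guard is itself unchanged. For context, a simplified sketch of the select case (the helper name is ours; the real pass also handles details such as alignment, which this omits), valid only once both pointers are proven safe to load:

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // Rewrite "load (select c, p, q)" into "select c, (load p), (load q)",
    // which is only sound when both loads are known unconditionally safe.
    static void speculateLoadOverSelect(SelectInst &SI, LoadInst &LI) {
      IRBuilder<> Builder(&LI);
      LoadInst *TL = Builder.CreateLoad(SI.getTrueValue(), LI.getName() + ".t");
      LoadInst *FL = Builder.CreateLoad(SI.getFalseValue(), LI.getName() + ".f");
      Value *V = Builder.CreateSelect(SI.getCondition(), TL, FL, LI.getName());
      LI.replaceAllUsesWith(V);
      LI.eraseFromParent();
    }
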
@@ -1381,6 +1384,8 @@ static bool isSafePHIToSpeculate(PHINode &PN, const DataLayout *DL = nullptr) { if (!HaveLoad) return false; + const DataLayout &DL = PN.getModule()->getDataLayout(); + // We can only transform this if it is safe to push the loads into the // predecessor blocks. The only thing to watch out for is that we can't put // a possibly trapping load in the predecessor if it is a critical edge. @@ -1403,7 +1408,7 @@ static bool isSafePHIToSpeculate(PHINode &PN, const DataLayout *DL = nullptr) { // is already a load in the block, then we can move the load to the pred // block. if (InVal->isDereferenceablePointer(DL) || - isSafeToLoadUnconditionally(InVal, TI, MaxAlign, DL)) + isSafeToLoadUnconditionally(InVal, TI, MaxAlign)) continue; return false; @@ -1468,10 +1473,10 @@ static void speculatePHINodeLoads(PHINode &PN) { /// /// We can do this to a select if its only uses are loads and if the operand /// to the select can be loaded unconditionally. -static bool isSafeSelectToSpeculate(SelectInst &SI, - const DataLayout *DL = nullptr) { +static bool isSafeSelectToSpeculate(SelectInst &SI) { Value *TValue = SI.getTrueValue(); Value *FValue = SI.getFalseValue(); + const DataLayout &DL = SI.getModule()->getDataLayout(); bool TDerefable = TValue->isDereferenceablePointer(DL); bool FDerefable = FValue->isDereferenceablePointer(DL); @@ -1484,10 +1489,10 @@ static bool isSafeSelectToSpeculate(SelectInst &SI, // absolutely (e.g. allocas) or at this point because we can see other // accesses to it. if (!TDerefable && - !isSafeToLoadUnconditionally(TValue, LI, LI->getAlignment(), DL)) + !isSafeToLoadUnconditionally(TValue, LI, LI->getAlignment())) return false; if (!FDerefable && - !isSafeToLoadUnconditionally(FValue, LI, LI->getAlignment(), DL)) + !isSafeToLoadUnconditionally(FValue, LI, LI->getAlignment())) return false; } @@ -3699,6 +3704,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { // them to the alloca slices. 
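The splitting loop that follows does, on the IR slices, what this hand-written sketch does for a 64-bit load pre-split at byte offset 4 (sizes and little-endian recombination assumed purely for illustration):

    #include <cstdint>
    #include <cstring>
    uint64_t presplitLoad(const unsigned char *Base) {
      uint32_t Lo, Hi;
      std::memcpy(&Lo, Base, sizeof(Lo));     // part load at offset 0
      std::memcpy(&Hi, Base + 4, sizeof(Hi)); // part load at offset 4
      return (uint64_t(Hi) << 32) | Lo;       // recombine the split parts
    }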
SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap; std::vector<LoadInst *> SplitLoads; + const DataLayout &DL = AI.getModule()->getDataLayout(); for (LoadInst *LI : Loads) { SplitLoads.clear(); @@ -3724,10 +3730,10 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8); auto *PartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace()); LoadInst *PLoad = IRB.CreateAlignedLoad( - getAdjustedPtr(IRB, *DL, BasePtr, - APInt(DL->getPointerSizeInBits(), PartOffset), + getAdjustedPtr(IRB, DL, BasePtr, + APInt(DL.getPointerSizeInBits(), PartOffset), PartPtrTy, BasePtr->getName() + "."), - getAdjustedAlignment(LI, PartOffset, *DL), /*IsVolatile*/ false, + getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false, LI->getName()); // Append this load onto the list of split loads so we can find it later @@ -3777,10 +3783,10 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { PLoad->getType()->getPointerTo(SI->getPointerAddressSpace()); StoreInst *PStore = IRB.CreateAlignedStore( - PLoad, getAdjustedPtr(IRB, *DL, StoreBasePtr, - APInt(DL->getPointerSizeInBits(), PartOffset), + PLoad, getAdjustedPtr(IRB, DL, StoreBasePtr, + APInt(DL.getPointerSizeInBits(), PartOffset), PartPtrTy, StoreBasePtr->getName() + "."), - getAdjustedAlignment(SI, PartOffset, *DL), /*IsVolatile*/ false); + getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false); (void)PStore; DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n"); } @@ -3857,20 +3863,20 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { } else { IRB.SetInsertPoint(BasicBlock::iterator(LI)); PLoad = IRB.CreateAlignedLoad( - getAdjustedPtr(IRB, *DL, LoadBasePtr, - APInt(DL->getPointerSizeInBits(), PartOffset), + getAdjustedPtr(IRB, DL, LoadBasePtr, + APInt(DL.getPointerSizeInBits(), PartOffset), PartPtrTy, LoadBasePtr->getName() + "."), - getAdjustedAlignment(LI, PartOffset, *DL), /*IsVolatile*/ false, + getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false, LI->getName()); } // And store this partition. IRB.SetInsertPoint(BasicBlock::iterator(SI)); StoreInst *PStore = IRB.CreateAlignedStore( - PLoad, getAdjustedPtr(IRB, *DL, StoreBasePtr, - APInt(DL->getPointerSizeInBits(), PartOffset), + PLoad, getAdjustedPtr(IRB, DL, StoreBasePtr, + APInt(DL.getPointerSizeInBits(), PartOffset), PartPtrTy, StoreBasePtr->getName() + "."), - getAdjustedAlignment(SI, PartOffset, *DL), /*IsVolatile*/ false); + getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false); // Now build a new slice for the alloca. NewSlices.push_back( @@ -3970,25 +3976,26 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, // won't always succeed, in which case we fall back to a legal integer type // or an i8 array of an appropriate size. 
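The fallback chain for choosing the slice type can be summarized by this hypothetical model (pickSliceTy and its flags are illustrative stand-ins, not SROA's real interfaces):

    enum class SliceTyKind { CommonUse, TypePartition, LegalInt, I8Array };
    SliceTyKind pickSliceTy(bool HasCommonUseTy, bool HasTypePartition,
                            bool SizeIsLegalInt) {
      if (HasCommonUseTy)
        return SliceTyKind::CommonUse;     // every access agrees on one type
      if (HasTypePartition)
        return SliceTyKind::TypePartition; // a sub-type of the alloca fits
      if (SizeIsLegalInt)
        return SliceTyKind::LegalInt;      // iN with N = 8 * partition size
      return SliceTyKind::I8Array;         // [size x i8] as the last resort
    }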
Type *SliceTy = nullptr; + const DataLayout &DL = AI.getModule()->getDataLayout(); if (Type *CommonUseTy = findCommonType(P.begin(), P.end(), P.endOffset())) - if (DL->getTypeAllocSize(CommonUseTy) >= P.size()) + if (DL.getTypeAllocSize(CommonUseTy) >= P.size()) SliceTy = CommonUseTy; if (!SliceTy) - if (Type *TypePartitionTy = getTypePartition(*DL, AI.getAllocatedType(), + if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(), P.beginOffset(), P.size())) SliceTy = TypePartitionTy; if ((!SliceTy || (SliceTy->isArrayTy() && SliceTy->getArrayElementType()->isIntegerTy())) && - DL->isLegalInteger(P.size() * 8)) + DL.isLegalInteger(P.size() * 8)) SliceTy = Type::getIntNTy(*C, P.size() * 8); if (!SliceTy) SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size()); - assert(DL->getTypeAllocSize(SliceTy) >= P.size()); + assert(DL.getTypeAllocSize(SliceTy) >= P.size()); - bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, *DL); + bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL); VectorType *VecTy = - IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, *DL); + IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL); if (VecTy) SliceTy = VecTy; @@ -4010,12 +4017,12 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, // The minimum alignment which users can rely on when the explicit // alignment is omitted or zero is that required by the ABI for this // type. - Alignment = DL->getABITypeAlignment(AI.getAllocatedType()); + Alignment = DL.getABITypeAlignment(AI.getAllocatedType()); } Alignment = MinAlign(Alignment, P.beginOffset()); // If we will get at least this much alignment from the type alone, leave // the alloca's alignment unconstrained. - if (Alignment <= DL->getABITypeAlignment(SliceTy)) + if (Alignment <= DL.getABITypeAlignment(SliceTy)) Alignment = 0; NewAI = new AllocaInst( SliceTy, nullptr, Alignment, @@ -4035,7 +4042,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, SmallPtrSet<PHINode *, 8> PHIUsers; SmallPtrSet<SelectInst *, 8> SelectUsers; - AllocaSliceRewriter Rewriter(*DL, AS, *this, AI, *NewAI, P.beginOffset(), + AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(), P.endOffset(), IsIntegerPromotable, VecTy, PHIUsers, SelectUsers); bool Promotable = true; @@ -4057,7 +4064,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, for (SmallPtrSetImpl<PHINode *>::iterator I = PHIUsers.begin(), E = PHIUsers.end(); I != E; ++I) - if (!isSafePHIToSpeculate(**I, DL)) { + if (!isSafePHIToSpeculate(**I)) { Promotable = false; PHIUsers.clear(); SelectUsers.clear(); @@ -4066,7 +4073,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, for (SmallPtrSetImpl<SelectInst *>::iterator I = SelectUsers.begin(), E = SelectUsers.end(); I != E; ++I) - if (!isSafeSelectToSpeculate(**I, DL)) { + if (!isSafeSelectToSpeculate(**I)) { Promotable = false; PHIUsers.clear(); SelectUsers.clear(); @@ -4110,6 +4117,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { unsigned NumPartitions = 0; bool Changed = false; + const DataLayout &DL = AI.getModule()->getDataLayout(); // First try to pre-split loads and stores. Changed |= presplitLoadsAndStores(AI, AS); @@ -4127,7 +4135,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { // confident that the above handling of splittable loads and stores is // completely sufficient before we forcibly disable the remaining handling. 
if (S.beginOffset() == 0 && - S.endOffset() >= DL->getTypeAllocSize(AI.getAllocatedType())) + S.endOffset() >= DL.getTypeAllocSize(AI.getAllocatedType())) continue; if (isa<LoadInst>(S.getUse()->getUser()) || isa<StoreInst>(S.getUse()->getUser())) { @@ -4155,7 +4163,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { Changed = true; if (NewAI != &AI) { uint64_t SizeOfByte = 8; - uint64_t AllocaSize = DL->getTypeSizeInBits(NewAI->getAllocatedType()); + uint64_t AllocaSize = DL.getTypeSizeInBits(NewAI->getAllocatedType()); // Don't include any padding. uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte); Pieces.push_back(Piece(NewAI, P.beginOffset() * SizeOfByte, Size)); @@ -4236,21 +4244,22 @@ bool SROA::runOnAlloca(AllocaInst &AI) { AI.eraseFromParent(); return true; } + const DataLayout &DL = AI.getModule()->getDataLayout(); // Skip alloca forms that this analysis can't handle. if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() || - DL->getTypeAllocSize(AI.getAllocatedType()) == 0) + DL.getTypeAllocSize(AI.getAllocatedType()) == 0) return false; bool Changed = false; // First, split any FCA loads and stores touching this alloca to promote // better splitting and promotion opportunities. - AggLoadStoreRewriter AggRewriter(*DL); + AggLoadStoreRewriter AggRewriter(DL); Changed |= AggRewriter.rewrite(AI); // Build the slices using a recursive instruction-visiting builder. - AllocaSlices AS(*DL, AI); + AllocaSlices AS(DL, AI); DEBUG(AS.print(dbgs())); if (AS.isEscaped()) return Changed; @@ -4423,12 +4432,6 @@ bool SROA::runOnFunction(Function &F) { DEBUG(dbgs() << "SROA function: " << F.getName() << "\n"); C = &F.getContext(); - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - if (!DLP) { - DEBUG(dbgs() << " Skipping SROA -- no target data!\n"); - return false; - } - DL = &DLP->getDataLayout(); DominatorTreeWrapperPass *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); DT = DTWP ? &DTWP->getDomTree() : nullptr; diff --git a/lib/Transforms/Scalar/SampleProfile.cpp b/lib/Transforms/Scalar/SampleProfile.cpp index c7232a9..3e7cf04 100644 --- a/lib/Transforms/Scalar/SampleProfile.cpp +++ b/lib/Transforms/Scalar/SampleProfile.cpp @@ -217,6 +217,9 @@ void SampleProfileLoader::printBlockWeight(raw_ostream &OS, BasicBlock *BB) { /// \returns The profiled weight of I. unsigned SampleProfileLoader::getInstWeight(Instruction &Inst) { DebugLoc DLoc = Inst.getDebugLoc(); + if (DLoc.isUnknown()) + return 0; + unsigned Lineno = DLoc.getLine(); if (Lineno < HeaderLineno) return 0; diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index 621633b..6cc8411 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -48,6 +48,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeLoopDeletionPass(Registry); initializeLoopAccessAnalysisPass(Registry); initializeLoopInstSimplifyPass(Registry); + initializeLoopInterchangePass(Registry); initializeLoopRotatePass(Registry); initializeLoopStrengthReducePass(Registry); initializeLoopRerollPass(Registry); @@ -209,7 +210,6 @@ void LLVMAddDemoteMemoryToRegisterPass(LLVMPassManagerRef PM) { void LLVMAddVerifierPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createVerifierPass()); - // FIXME: should this also add createDebugInfoVerifierPass()? 
} void LLVMAddCorrelatedValuePropagationPass(LLVMPassManagerRef PM) { diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp index 5c49a55..acd8585 100644 --- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -89,7 +89,6 @@ namespace { private: bool HasDomTree; - const DataLayout *DL; /// DeadInsts - Keep track of instructions we have made dead, so that /// we can remove them after we are done working. @@ -159,9 +158,10 @@ namespace { void isSafeMemAccess(uint64_t Offset, uint64_t MemSize, Type *MemOpType, bool isStore, AllocaInfo &Info, Instruction *TheAccess, bool AllowWholeAccess); - bool TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size); - uint64_t FindElementAndOffset(Type *&T, uint64_t &Offset, - Type *&IdxTy); + bool TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size, + const DataLayout &DL); + uint64_t FindElementAndOffset(Type *&T, uint64_t &Offset, Type *&IdxTy, + const DataLayout &DL); void DoScalarReplacement(AllocaInst *AI, std::vector<AllocaInst*> &WorkList); @@ -699,9 +699,9 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, // If the source and destination are both to the same alloca, then this is // a noop copy-to-self, just delete it. Otherwise, emit a load and store // as appropriate. - AllocaInst *OrigAI = cast<AllocaInst>(GetUnderlyingObject(Ptr, &DL, 0)); + AllocaInst *OrigAI = cast<AllocaInst>(GetUnderlyingObject(Ptr, DL, 0)); - if (GetUnderlyingObject(MTI->getSource(), &DL, 0) != OrigAI) { + if (GetUnderlyingObject(MTI->getSource(), DL, 0) != OrigAI) { // Dest must be OrigAI, change this to be a load from the original // pointer (bitcasted), then a store to our new alloca. assert(MTI->getRawDest() == Ptr && "Neither use is of pointer?"); @@ -717,7 +717,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, LoadInst *SrcVal = Builder.CreateLoad(SrcPtr, "srcval"); SrcVal->setAlignment(MTI->getAlignment()); Builder.CreateStore(SrcVal, NewAI); - } else if (GetUnderlyingObject(MTI->getDest(), &DL, 0) != OrigAI) { + } else if (GetUnderlyingObject(MTI->getDest(), DL, 0) != OrigAI) { // Src must be OrigAI, change this to be a load from NewAI then a store // through the original dest pointer (bitcasted). assert(MTI->getRawSource() == Ptr && "Neither use is of pointer?"); @@ -1032,17 +1032,8 @@ bool SROA::runOnFunction(Function &F) { if (skipOptnoneFunction(F)) return false; - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : nullptr; - bool Changed = performPromotion(F); - // FIXME: ScalarRepl currently depends on DataLayout more than it - // theoretically needs to. It should be refactored in order to support - // target-independent IR. Until this is done, just skip the actual - // scalar-replacement portion of this pass. - if (!DL) return Changed; - while (1) { bool LocalChange = performScalarRepl(F); if (!LocalChange) break; // No need to repromote if no scalarrepl @@ -1148,7 +1139,8 @@ public: /// /// We can do this to a select if its only uses are loads and if the operand to /// the select can be loaded unconditionally. 
-static bool isSafeSelectToSpeculate(SelectInst *SI, const DataLayout *DL) { +static bool isSafeSelectToSpeculate(SelectInst *SI) { + const DataLayout &DL = SI->getModule()->getDataLayout(); bool TDerefable = SI->getTrueValue()->isDereferenceablePointer(DL); bool FDerefable = SI->getFalseValue()->isDereferenceablePointer(DL); @@ -1158,11 +1150,13 @@ static bool isSafeSelectToSpeculate(SelectInst *SI, const DataLayout *DL) { // Both operands to the select need to be dereferenceable, either absolutely // (e.g. allocas) or at this point because we can see other accesses to it. - if (!TDerefable && !isSafeToLoadUnconditionally(SI->getTrueValue(), LI, - LI->getAlignment(), DL)) + if (!TDerefable && + !isSafeToLoadUnconditionally(SI->getTrueValue(), LI, + LI->getAlignment())) return false; - if (!FDerefable && !isSafeToLoadUnconditionally(SI->getFalseValue(), LI, - LI->getAlignment(), DL)) + if (!FDerefable && + !isSafeToLoadUnconditionally(SI->getFalseValue(), LI, + LI->getAlignment())) return false; } @@ -1185,7 +1179,7 @@ static bool isSafeSelectToSpeculate(SelectInst *SI, const DataLayout *DL) { /// /// We can do this to a select if its only uses are loads and if the operand to /// the select can be loaded unconditionally. -static bool isSafePHIToSpeculate(PHINode *PN, const DataLayout *DL) { +static bool isSafePHIToSpeculate(PHINode *PN) { // For now, we can only do this promotion if the load is in the same block as // the PHI, and if there are no stores between the phi and load. // TODO: Allow recursive phi users. @@ -1209,6 +1203,8 @@ static bool isSafePHIToSpeculate(PHINode *PN, const DataLayout *DL) { MaxAlign = std::max(MaxAlign, LI->getAlignment()); } + const DataLayout &DL = PN->getModule()->getDataLayout(); + // Okay, we know that we have one or more loads in the same block as the PHI. // We can transform this if it is safe to push the loads into the predecessor // blocks. The only thing to watch out for is that we can't put a possibly @@ -1234,7 +1230,7 @@ static bool isSafePHIToSpeculate(PHINode *PN, const DataLayout *DL) { // If this pointer is always safe to load, or if we can prove that there is // already a load in the block, then we can move the load to the pred block. if (InVal->isDereferenceablePointer(DL) || - isSafeToLoadUnconditionally(InVal, Pred->getTerminator(), MaxAlign, DL) + isSafeToLoadUnconditionally(InVal, Pred->getTerminator(), MaxAlign)) continue; return false; @@ -1248,7 +1244,7 @@ /// direct (non-volatile) loads and stores to it. If the alloca is close but /// not quite there, this will transform the code to allow promotion. As such, /// it is a non-pure predicate. -static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *DL) { +static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout &DL) { SetVector<Instruction*, SmallVector<Instruction*, 4>, SmallPtrSet<Instruction*, 4> > InstsToRewrite; for (User *U : AI->users()) { @@ -1279,7 +1275,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *DL) { // If it is safe to turn "load (select c, AI, ptr)" into a select of two // loads, then we can transform this by rewriting the select.
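For instance, in source-level terms (a hand-written analogue; note that both loads become unconditional after the rewrite, which is exactly why both operands must be dereferenceable or otherwise safe to load):

    int loadOfSelect(bool C, int *P, int *Q) {
      return *(C ? P : Q); // load (select c, AI, ptr)
    }
    int selectOfLoads(bool C, int *P, int *Q) {
      int VP = *P;        // executed unconditionally after the rewrite
      int VQ = *Q;        // likewise
      return C ? VP : VQ; // select over the loaded values
    }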
- if (!isSafeSelectToSpeculate(SI, DL)) + if (!isSafeSelectToSpeculate(SI)) return false; InstsToRewrite.insert(SI); @@ -1294,7 +1290,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *DL) { // If it is safe to turn "load (phi [AI, ptr, ...])" into a PHI of loads // in the pred blocks, then we can transform this by rewriting the PHI. - if (!isSafePHIToSpeculate(PN, DL)) + if (!isSafePHIToSpeculate(PN)) return false; InstsToRewrite.insert(PN); @@ -1416,6 +1412,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *DL) { bool SROA::performPromotion(Function &F) { std::vector<AllocaInst*> Allocas; + const DataLayout &DL = F.getParent()->getDataLayout(); DominatorTree *DT = nullptr; if (HasDomTree) DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); @@ -1479,6 +1476,7 @@ bool SROA::ShouldAttemptScalarRepl(AllocaInst *AI) { // bool SROA::performScalarRepl(Function &F) { std::vector<AllocaInst*> WorkList; + const DataLayout &DL = F.getParent()->getDataLayout(); // Scan the entry basic block, adding allocas to the worklist. BasicBlock &BB = F.getEntryBlock(); @@ -1508,7 +1506,7 @@ bool SROA::performScalarRepl(Function &F) { // transform the allocation instruction if it is an array allocation // (allocations OF arrays are ok though), and an allocation of a scalar // value cannot be decomposed at all. - uint64_t AllocaSize = DL->getTypeAllocSize(AI->getAllocatedType()); + uint64_t AllocaSize = DL.getTypeAllocSize(AI->getAllocatedType()); // Do not promote [0 x %struct]. if (AllocaSize == 0) continue; @@ -1531,8 +1529,9 @@ bool SROA::performScalarRepl(Function &F) { // promoted itself. If so, we don't want to transform it needlessly. Note // that we can't just check based on the type: the alloca may be of an i32 // but that has pointer arithmetic to set byte 3 of it or something. - if (AllocaInst *NewAI = ConvertToScalarInfo( - (unsigned)AllocaSize, *DL, ScalarLoadThreshold).TryConvert(AI)) { + if (AllocaInst *NewAI = + ConvertToScalarInfo((unsigned)AllocaSize, DL, ScalarLoadThreshold) + .TryConvert(AI)) { NewAI->takeName(AI); AI->eraseFromParent(); ++NumConverted; @@ -1610,6 +1609,7 @@ void SROA::DeleteDeadInstructions() { /// referenced by this instruction. 
void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset, AllocaInfo &Info) { + const DataLayout &DL = I->getModule()->getDataLayout(); for (Use &U : I->uses()) { Instruction *User = cast<Instruction>(U.getUser()); @@ -1632,8 +1632,8 @@ void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset, if (!LI->isSimple()) return MarkUnsafe(Info, User); Type *LIType = LI->getType(); - isSafeMemAccess(Offset, DL->getTypeAllocSize(LIType), - LIType, false, Info, LI, true /*AllowWholeAccess*/); + isSafeMemAccess(Offset, DL.getTypeAllocSize(LIType), LIType, false, Info, + LI, true /*AllowWholeAccess*/); Info.hasALoadOrStore = true; } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) { @@ -1642,8 +1642,8 @@ void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset, return MarkUnsafe(Info, User); Type *SIType = SI->getOperand(0)->getType(); - isSafeMemAccess(Offset, DL->getTypeAllocSize(SIType), - SIType, true, Info, SI, true /*AllowWholeAccess*/); + isSafeMemAccess(Offset, DL.getTypeAllocSize(SIType), SIType, true, Info, + SI, true /*AllowWholeAccess*/); Info.hasALoadOrStore = true; } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) { if (II->getIntrinsicID() != Intrinsic::lifetime_start && @@ -1675,6 +1675,7 @@ void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset, if (!Info.CheckedPHIs.insert(PN).second) return; + const DataLayout &DL = I->getModule()->getDataLayout(); for (User *U : I->users()) { Instruction *UI = cast<Instruction>(U); @@ -1691,8 +1692,8 @@ void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset, if (!LI->isSimple()) return MarkUnsafe(Info, UI); Type *LIType = LI->getType(); - isSafeMemAccess(Offset, DL->getTypeAllocSize(LIType), - LIType, false, Info, LI, false /*AllowWholeAccess*/); + isSafeMemAccess(Offset, DL.getTypeAllocSize(LIType), LIType, false, Info, + LI, false /*AllowWholeAccess*/); Info.hasALoadOrStore = true; } else if (StoreInst *SI = dyn_cast<StoreInst>(UI)) { @@ -1701,8 +1702,8 @@ void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset, return MarkUnsafe(Info, UI); Type *SIType = SI->getOperand(0)->getType(); - isSafeMemAccess(Offset, DL->getTypeAllocSize(SIType), - SIType, true, Info, SI, false /*AllowWholeAccess*/); + isSafeMemAccess(Offset, DL.getTypeAllocSize(SIType), SIType, true, Info, + SI, false /*AllowWholeAccess*/); Info.hasALoadOrStore = true; } else if (isa<PHINode>(UI) || isa<SelectInst>(UI)) { isSafePHISelectUseForScalarRepl(UI, Offset, Info); @@ -1746,9 +1747,11 @@ void SROA::isSafeGEP(GetElementPtrInst *GEPI, // constant part of the offset. if (NonConstant) Indices.pop_back(); - Offset += DL->getIndexedOffset(GEPI->getPointerOperandType(), Indices); - if (!TypeHasComponent(Info.AI->getAllocatedType(), Offset, - NonConstantIdxSize)) + + const DataLayout &DL = GEPI->getModule()->getDataLayout(); + Offset += DL.getIndexedOffset(GEPI->getPointerOperandType(), Indices); + if (!TypeHasComponent(Info.AI->getAllocatedType(), Offset, NonConstantIdxSize, + DL)) MarkUnsafe(Info, GEPI); } @@ -1803,9 +1806,10 @@ void SROA::isSafeMemAccess(uint64_t Offset, uint64_t MemSize, Type *MemOpType, bool isStore, AllocaInfo &Info, Instruction *TheAccess, bool AllowWholeAccess) { + const DataLayout &DL = TheAccess->getModule()->getDataLayout(); // Check if this is a load/store of the entire alloca. 
if (Offset == 0 && AllowWholeAccess && - MemSize == DL->getTypeAllocSize(Info.AI->getAllocatedType())) { + MemSize == DL.getTypeAllocSize(Info.AI->getAllocatedType())) { // This can be safe for MemIntrinsics (where MemOpType is 0) and integer // loads/stores (which are essentially the same as the MemIntrinsics with // regard to copying padding between elements). But, if an alloca is @@ -1828,7 +1832,7 @@ void SROA::isSafeMemAccess(uint64_t Offset, uint64_t MemSize, } // Check if the offset/size correspond to a component within the alloca type. Type *T = Info.AI->getAllocatedType(); - if (TypeHasComponent(T, Offset, MemSize)) { + if (TypeHasComponent(T, Offset, MemSize, DL)) { Info.hasSubelementAccess = true; return; } @@ -1838,24 +1842,25 @@ void SROA::isSafeMemAccess(uint64_t Offset, uint64_t MemSize, /// TypeHasComponent - Return true if T has a component type with the /// specified offset and size. If Size is zero, do not check the size. -bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size) { +bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size, + const DataLayout &DL) { Type *EltTy; uint64_t EltSize; if (StructType *ST = dyn_cast<StructType>(T)) { - const StructLayout *Layout = DL->getStructLayout(ST); + const StructLayout *Layout = DL.getStructLayout(ST); unsigned EltIdx = Layout->getElementContainingOffset(Offset); EltTy = ST->getContainedType(EltIdx); - EltSize = DL->getTypeAllocSize(EltTy); + EltSize = DL.getTypeAllocSize(EltTy); Offset -= Layout->getElementOffset(EltIdx); } else if (ArrayType *AT = dyn_cast<ArrayType>(T)) { EltTy = AT->getElementType(); - EltSize = DL->getTypeAllocSize(EltTy); + EltSize = DL.getTypeAllocSize(EltTy); if (Offset >= AT->getNumElements() * EltSize) return false; Offset %= EltSize; } else if (VectorType *VT = dyn_cast<VectorType>(T)) { EltTy = VT->getElementType(); - EltSize = DL->getTypeAllocSize(EltTy); + EltSize = DL.getTypeAllocSize(EltTy); if (Offset >= VT->getNumElements() * EltSize) return false; Offset %= EltSize; @@ -1867,7 +1872,7 @@ bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size) { // Check if the component spans multiple elements. if (Offset + Size > EltSize) return false; - return TypeHasComponent(EltTy, Offset, Size); + return TypeHasComponent(EltTy, Offset, Size, DL); } /// RewriteForScalarRepl - Alloca AI is being split into NewElts, so rewrite @@ -1876,6 +1881,7 @@ bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size) { /// instruction. void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, SmallVectorImpl<AllocaInst *> &NewElts) { + const DataLayout &DL = I->getModule()->getDataLayout(); for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E;) { Use &TheUse = *UI++; Instruction *User = cast<Instruction>(TheUse.getUser()); @@ -1893,8 +1899,7 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(User)) { ConstantInt *Length = dyn_cast<ConstantInt>(MI->getLength()); uint64_t MemSize = Length->getZExtValue(); - if (Offset == 0 && - MemSize == DL->getTypeAllocSize(AI->getAllocatedType())) + if (Offset == 0 && MemSize == DL.getTypeAllocSize(AI->getAllocatedType())) RewriteMemIntrinUserOfAlloca(MI, I, AI, NewElts); // Otherwise the intrinsic can only touch a single element and the // address operand will be updated, so nothing else needs to be done. 
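A worked instance of the TypeHasComponent helper shown above, flattened into a non-recursive check over assumed element offsets (an illustrative model only; the real function recurses through nested aggregates):

    #include <cstdint>
    // For T = { i32, [2 x i16] } the leaf elements sit at offsets 0, 4 and 6.
    // (Offset = 6, Size = 2) fits the trailing i16 and succeeds;
    // (Offset = 2, Size = 4) straddles the leading i32 and fails.
    bool queryFitsComponent(uint64_t Offset, uint64_t Size) {
      const struct { uint64_t Off, Sz; } Elts[] = {{0, 4}, {4, 2}, {6, 2}};
      for (const auto &E : Elts)
        if (Offset >= E.Off && Offset + Size <= E.Off + E.Sz)
          return true; // the access fits inside one leaf element
      return false;    // the access spans an element boundary
    }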
@@ -1930,8 +1935,8 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, LI->replaceAllUsesWith(Insert); DeadInsts.push_back(LI); } else if (LIType->isIntegerTy() && - DL->getTypeAllocSize(LIType) == - DL->getTypeAllocSize(AI->getAllocatedType())) { + DL.getTypeAllocSize(LIType) == + DL.getTypeAllocSize(AI->getAllocatedType())) { // If this is a load of the entire alloca to an integer, rewrite it. RewriteLoadUserOfWholeAlloca(LI, AI, NewElts); } @@ -1957,8 +1962,8 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, } DeadInsts.push_back(SI); } else if (SIType->isIntegerTy() && - DL->getTypeAllocSize(SIType) == - DL->getTypeAllocSize(AI->getAllocatedType())) { + DL.getTypeAllocSize(SIType) == + DL.getTypeAllocSize(AI->getAllocatedType())) { // If this is a store of the entire alloca from an integer, rewrite it. RewriteStoreUserOfWholeAlloca(SI, AI, NewElts); } @@ -2001,7 +2006,8 @@ void SROA::RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset, Type *T = AI->getAllocatedType(); uint64_t EltOffset = 0; Type *IdxTy; - uint64_t Idx = FindElementAndOffset(T, EltOffset, IdxTy); + uint64_t Idx = FindElementAndOffset(T, EltOffset, IdxTy, + BC->getModule()->getDataLayout()); Instruction *Val = NewElts[Idx]; if (Val->getType() != BC->getDestTy()) { Val = new BitCastInst(Val, BC->getDestTy(), "", BC); @@ -2016,11 +2022,12 @@ void SROA::RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset, /// Sets T to the type of the element and Offset to the offset within that /// element. IdxTy is set to the type of the index result to be used in a /// GEP instruction. -uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset, - Type *&IdxTy) { +uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset, Type *&IdxTy, + const DataLayout &DL) { uint64_t Idx = 0; + if (StructType *ST = dyn_cast<StructType>(T)) { - const StructLayout *Layout = DL->getStructLayout(ST); + const StructLayout *Layout = DL.getStructLayout(ST); Idx = Layout->getElementContainingOffset(Offset); T = ST->getContainedType(Idx); Offset -= Layout->getElementOffset(Idx); @@ -2028,7 +2035,7 @@ uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset, return Idx; } else if (ArrayType *AT = dyn_cast<ArrayType>(T)) { T = AT->getElementType(); - uint64_t EltSize = DL->getTypeAllocSize(T); + uint64_t EltSize = DL.getTypeAllocSize(T); Idx = Offset / EltSize; Offset -= Idx * EltSize; IdxTy = Type::getInt64Ty(T->getContext()); @@ -2036,7 +2043,7 @@ uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset, } VectorType *VT = cast<VectorType>(T); T = VT->getElementType(); - uint64_t EltSize = DL->getTypeAllocSize(T); + uint64_t EltSize = DL.getTypeAllocSize(T); Idx = Offset / EltSize; Offset -= Idx * EltSize; IdxTy = Type::getInt64Ty(T->getContext()); @@ -2049,6 +2056,7 @@ uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset, void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, SmallVectorImpl<AllocaInst *> &NewElts) { uint64_t OldOffset = Offset; + const DataLayout &DL = GEPI->getModule()->getDataLayout(); SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end()); // If the GEP was dynamic then it must have been a dynamic vector lookup. 
// In this case, it must be the last GEP operand which is dynamic so keep that @@ -2057,19 +2065,19 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, Value* NonConstantIdx = nullptr; if (!GEPI->hasAllConstantIndices()) NonConstantIdx = Indices.pop_back_val(); - Offset += DL->getIndexedOffset(GEPI->getPointerOperandType(), Indices); + Offset += DL.getIndexedOffset(GEPI->getPointerOperandType(), Indices); RewriteForScalarRepl(GEPI, AI, Offset, NewElts); Type *T = AI->getAllocatedType(); Type *IdxTy; - uint64_t OldIdx = FindElementAndOffset(T, OldOffset, IdxTy); + uint64_t OldIdx = FindElementAndOffset(T, OldOffset, IdxTy, DL); if (GEPI->getOperand(0) == AI) OldIdx = ~0ULL; // Force the GEP to be rewritten. T = AI->getAllocatedType(); uint64_t EltOffset = Offset; - uint64_t Idx = FindElementAndOffset(T, EltOffset, IdxTy); + uint64_t Idx = FindElementAndOffset(T, EltOffset, IdxTy, DL); // If this GEP does not move the pointer across elements of the alloca // being split, then it does not need to be rewritten. @@ -2080,7 +2088,7 @@ SmallVector<Value*, 8> NewArgs; NewArgs.push_back(Constant::getNullValue(i32Ty)); while (EltOffset != 0) { - uint64_t EltIdx = FindElementAndOffset(T, EltOffset, IdxTy); + uint64_t EltIdx = FindElementAndOffset(T, EltOffset, IdxTy, DL); NewArgs.push_back(ConstantInt::get(IdxTy, EltIdx)); } if (NonConstantIdx) { @@ -2114,9 +2122,10 @@ void SROA::RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI, // Put matching lifetime markers on everything from Offset up to // Offset+OldSize. Type *AIType = AI->getAllocatedType(); + const DataLayout &DL = II->getModule()->getDataLayout(); uint64_t NewOffset = Offset; Type *IdxTy; - uint64_t Idx = FindElementAndOffset(AIType, NewOffset, IdxTy); + uint64_t Idx = FindElementAndOffset(AIType, NewOffset, IdxTy, DL); IRBuilder<> Builder(II); uint64_t Size = OldSize->getLimitedValue(); @@ -2129,7 +2138,7 @@ V = Builder.CreateGEP(V, Builder.getInt64(NewOffset)); IdxTy = NewElts[Idx]->getAllocatedType(); - uint64_t EltSize = DL->getTypeAllocSize(IdxTy) - NewOffset; + uint64_t EltSize = DL.getTypeAllocSize(IdxTy) - NewOffset; if (EltSize > Size) { EltSize = Size; Size = 0; @@ -2145,7 +2154,7 @@ for (; Idx != NewElts.size() && Size; ++Idx) { IdxTy = NewElts[Idx]->getAllocatedType(); - uint64_t EltSize = DL->getTypeAllocSize(IdxTy); + uint64_t EltSize = DL.getTypeAllocSize(IdxTy); if (EltSize > Size) { EltSize = Size; Size = 0; @@ -2221,6 +2230,7 @@ SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, bool SROADest = MI->getRawDest() == Inst; Constant *Zero = Constant::getNullValue(Type::getInt32Ty(MI->getContext())); + const DataLayout &DL = MI->getModule()->getDataLayout(); for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { // If this is a memcpy/memmove, emit a GEP of the other element address.
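For a memcpy that covers a split { i32, i32 } alloca, the per-element rewrite performed here amounts to this hand-written sketch (field offsets assumed):

    #include <cstdint>
    #include <cstring>
    struct Pair { uint32_t A, B; };
    void copySplit(Pair &Dst, const Pair &Src) {
      std::memcpy(&Dst.A, &Src.A, sizeof(Dst.A)); // element 0, offset 0
      std::memcpy(&Dst.B, &Src.B, sizeof(Dst.B)); // element 1, offset 4
    }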
@@ -2237,10 +2247,10 @@ SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, PointerType *OtherPtrTy = cast<PointerType>(OtherPtr->getType()); Type *OtherTy = OtherPtrTy->getElementType(); if (StructType *ST = dyn_cast<StructType>(OtherTy)) { - EltOffset = DL->getStructLayout(ST)->getElementOffset(i); + EltOffset = DL.getStructLayout(ST)->getElementOffset(i); } else { Type *EltTy = cast<SequentialType>(OtherTy)->getElementType(); - EltOffset = DL->getTypeAllocSize(EltTy)*i; + EltOffset = DL.getTypeAllocSize(EltTy) * i; } // The alignment of the other pointer is the guaranteed alignment of the @@ -2281,7 +2291,7 @@ SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, Type *ValTy = EltTy->getScalarType(); // Construct an integer with the right value. - unsigned EltSize = DL->getTypeSizeInBits(ValTy); + unsigned EltSize = DL.getTypeSizeInBits(ValTy); APInt OneVal(EltSize, CI->getZExtValue()); APInt TotalVal(OneVal); // Set each byte. @@ -2311,7 +2321,7 @@ SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, // this element. } - unsigned EltSize = DL->getTypeAllocSize(EltTy); + unsigned EltSize = DL.getTypeAllocSize(EltTy); if (!EltSize) continue; @@ -2345,12 +2355,13 @@ SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, // and store the element value to the individual alloca. Value *SrcVal = SI->getOperand(0); Type *AllocaEltTy = AI->getAllocatedType(); - uint64_t AllocaSizeBits = DL->getTypeAllocSizeInBits(AllocaEltTy); + const DataLayout &DL = SI->getModule()->getDataLayout(); + uint64_t AllocaSizeBits = DL.getTypeAllocSizeInBits(AllocaEltTy); IRBuilder<> Builder(SI); // Handle tail padding by extending the operand - if (DL->getTypeSizeInBits(SrcVal->getType()) != AllocaSizeBits) + if (DL.getTypeSizeInBits(SrcVal->getType()) != AllocaSizeBits) SrcVal = Builder.CreateZExt(SrcVal, IntegerType::get(SI->getContext(), AllocaSizeBits)); @@ -2360,15 +2371,15 @@ SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, // There are two forms here: AI could be an array or struct. Both cases // have different ways to compute the element offset. if (StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) { - const StructLayout *Layout = DL->getStructLayout(EltSTy); + const StructLayout *Layout = DL.getStructLayout(EltSTy); for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { // Get the number of bits to shift SrcVal to get the value. Type *FieldTy = EltSTy->getElementType(i); uint64_t Shift = Layout->getElementOffsetInBits(i); - if (DL->isBigEndian()) - Shift = AllocaSizeBits-Shift-DL->getTypeAllocSizeInBits(FieldTy); + if (DL.isBigEndian()) + Shift = AllocaSizeBits - Shift - DL.getTypeAllocSizeInBits(FieldTy); Value *EltVal = SrcVal; if (Shift) { @@ -2377,7 +2388,7 @@ SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, } // Truncate down to an integer of the right size. - uint64_t FieldSizeBits = DL->getTypeSizeInBits(FieldTy); + uint64_t FieldSizeBits = DL.getTypeSizeInBits(FieldTy); // Ignore zero sized fields like {}, they obviously contain no data. 
if (FieldSizeBits == 0) continue; @@ -2402,12 +2413,12 @@ SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, } else { ArrayType *ATy = cast<ArrayType>(AllocaEltTy); Type *ArrayEltTy = ATy->getElementType(); - uint64_t ElementOffset = DL->getTypeAllocSizeInBits(ArrayEltTy); - uint64_t ElementSizeBits = DL->getTypeSizeInBits(ArrayEltTy); + uint64_t ElementOffset = DL.getTypeAllocSizeInBits(ArrayEltTy); + uint64_t ElementSizeBits = DL.getTypeSizeInBits(ArrayEltTy); uint64_t Shift; - if (DL->isBigEndian()) + if (DL.isBigEndian()) Shift = AllocaSizeBits-ElementOffset; else Shift = 0; @@ -2441,7 +2452,7 @@ SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, } new StoreInst(EltVal, DestField, SI); - if (DL->isBigEndian()) + if (DL.isBigEndian()) Shift -= ElementOffset; else Shift += ElementOffset; @@ -2459,7 +2470,8 @@ SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, // Extract each element out of the NewElts according to its structure offset // and form the result value. Type *AllocaEltTy = AI->getAllocatedType(); - uint64_t AllocaSizeBits = DL->getTypeAllocSizeInBits(AllocaEltTy); + const DataLayout &DL = LI->getModule()->getDataLayout(); + uint64_t AllocaSizeBits = DL.getTypeAllocSizeInBits(AllocaEltTy); DEBUG(dbgs() << "PROMOTING LOAD OF WHOLE ALLOCA: " << *AI << '\n' << *LI << '\n'); @@ -2469,10 +2481,10 @@ SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, const StructLayout *Layout = nullptr; uint64_t ArrayEltBitOffset = 0; if (StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) { - Layout = DL->getStructLayout(EltSTy); + Layout = DL.getStructLayout(EltSTy); } else { Type *ArrayEltTy = cast<ArrayType>(AllocaEltTy)->getElementType(); - ArrayEltBitOffset = DL->getTypeAllocSizeInBits(ArrayEltTy); + ArrayEltBitOffset = DL.getTypeAllocSizeInBits(ArrayEltTy); } Value *ResultVal = @@ -2484,7 +2496,7 @@ SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, Value *SrcField = NewElts[i]; Type *FieldTy = cast<PointerType>(SrcField->getType())->getElementType(); - uint64_t FieldSizeBits = DL->getTypeSizeInBits(FieldTy); + uint64_t FieldSizeBits = DL.getTypeSizeInBits(FieldTy); // Ignore zero sized fields like {}, they obviously contain no data. if (FieldSizeBits == 0) continue; @@ -2515,7 +2527,7 @@ SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, else // Array case. Shift = i*ArrayEltBitOffset; - if (DL->isBigEndian()) + if (DL.isBigEndian()) Shift = AllocaSizeBits-Shift-FieldIntTy->getBitWidth(); if (Shift) { @@ -2532,7 +2544,7 @@ SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, } // Handle tail padding by truncating the result - if (DL->getTypeSizeInBits(LI->getType()) != AllocaSizeBits) + if (DL.getTypeSizeInBits(LI->getType()) != AllocaSizeBits) ResultVal = new TruncInst(ResultVal, LI->getType(), "", LI); LI->replaceAllUsesWith(ResultVal); @@ -2589,13 +2601,15 @@ bool SROA::isSafeAllocaToScalarRepl(AllocaInst *AI) { return false; } + const DataLayout &DL = AI->getModule()->getDataLayout(); + // Okay, we know all the users are promotable. If the aggregate is a memcpy // source and destination, we have to be careful. In particular, the memcpy // could be moving around elements that live in structure padding of the LLVM // types, but may actually be used. In these cases, we refuse to promote the // struct. 
if (Info.isMemCpySrc && Info.isMemCpyDst && - HasPadding(AI->getAllocatedType(), *DL)) + HasPadding(AI->getAllocatedType(), DL)) return false; // If the alloca never has an access to just *part* of it, but is accessed diff --git a/lib/Transforms/Scalar/Scalarizer.cpp b/lib/Transforms/Scalar/Scalarizer.cpp index 6036c09..a457cba 100644 --- a/lib/Transforms/Scalar/Scalarizer.cpp +++ b/lib/Transforms/Scalar/Scalarizer.cpp @@ -165,7 +165,7 @@ private: void gather(Instruction *, const ValueVector &); bool canTransferMetadata(unsigned Kind); void transferMetadata(Instruction *, const ValueVector &); - bool getVectorLayout(Type *, unsigned, VectorLayout &); + bool getVectorLayout(Type *, unsigned, VectorLayout &, const DataLayout &); bool finish(); template<typename T> bool splitBinary(Instruction &, const T &); @@ -173,7 +173,6 @@ private: ScatterMap Scattered; GatherList Gathered; unsigned ParallelLoopAccessMDKind; - const DataLayout *DL; bool ScalarizeLoadStore; }; @@ -248,8 +247,6 @@ bool Scalarizer::doInitialization(Module &M) { } bool Scalarizer::runOnFunction(Function &F) { - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : nullptr; for (Function::iterator BBI = F.begin(), BBE = F.end(); BBI != BBE; ++BBI) { BasicBlock *BB = BBI; for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;) { @@ -345,10 +342,7 @@ void Scalarizer::transferMetadata(Instruction *Op, const ValueVector &CV) { // Try to fill in Layout from Ty, returning true on success. Alignment is // the alignment of the vector, or 0 if the ABI default should be used. bool Scalarizer::getVectorLayout(Type *Ty, unsigned Alignment, - VectorLayout &Layout) { - if (!DL) - return false; - + VectorLayout &Layout, const DataLayout &DL) { // Make sure we're dealing with a vector. Layout.VecTy = dyn_cast<VectorType>(Ty); if (!Layout.VecTy) @@ -356,15 +350,15 @@ bool Scalarizer::getVectorLayout(Type *Ty, unsigned Alignment, // Check that we're dealing with full-byte elements. 
Layout.ElemTy = Layout.VecTy->getElementType(); - if (DL->getTypeSizeInBits(Layout.ElemTy) != - DL->getTypeStoreSizeInBits(Layout.ElemTy) + if (DL.getTypeSizeInBits(Layout.ElemTy) != + DL.getTypeStoreSizeInBits(Layout.ElemTy)) return false; if (Alignment) Layout.VecAlign = Alignment; else - Layout.VecAlign = DL->getABITypeAlignment(Layout.VecTy); - Layout.ElemSize = DL->getTypeStoreSize(Layout.ElemTy); + Layout.VecAlign = DL.getABITypeAlignment(Layout.VecTy); + Layout.ElemSize = DL.getTypeStoreSize(Layout.ElemTy); return true; } @@ -456,7 +450,7 @@ bool Scalarizer::visitGetElementPtrInst(GetElementPtrInst &GEPI) { Indices.resize(NumIndices); for (unsigned J = 0; J < NumIndices; ++J) Indices[J] = Ops[J][I]; - Res[I] = Builder.CreateGEP(Base[I], Indices, + Res[I] = Builder.CreateGEP(GEPI.getSourceElementType(), Base[I], Indices, GEPI.getName() + ".i" + Twine(I)); if (GEPI.isInBounds()) if (GetElementPtrInst *NewGEPI = dyn_cast<GetElementPtrInst>(Res[I])) @@ -595,7 +589,8 @@ bool Scalarizer::visitLoadInst(LoadInst &LI) { return false; VectorLayout Layout; - if (!getVectorLayout(LI.getType(), LI.getAlignment(), Layout)) + if (!getVectorLayout(LI.getType(), LI.getAlignment(), Layout, + LI.getModule()->getDataLayout())) return false; unsigned NumElems = Layout.VecTy->getNumElements(); @@ -619,7 +614,8 @@ bool Scalarizer::visitStoreInst(StoreInst &SI) { VectorLayout Layout; Value *FullValue = SI.getValueOperand(); - if (!getVectorLayout(FullValue->getType(), SI.getAlignment(), Layout)) + if (!getVectorLayout(FullValue->getType(), SI.getAlignment(), Layout, + SI.getModule()->getDataLayout())) return false; unsigned NumElems = Layout.VecTy->getNumElements(); diff --git a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index bffe8df..1a04d74 100644 --- a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -199,18 +199,15 @@ class ConstantOffsetExtractor { /// new index representing the remainder (equal to the original index minus /// the constant offset), or nullptr if we cannot extract a constant offset. /// \p Idx The given GEP index /// \p GEP The given GEP - /// \p DL The datalayout of the module - static Value *Extract(Value *Idx, const DataLayout *DL, - GetElementPtrInst *GEP); + static Value *Extract(Value *Idx, GetElementPtrInst *GEP); /// Looks for a constant offset from the given GEP index without extracting /// it. It returns the numeric value of the extracted constant offset (0 on /// failure). The meanings of the arguments are the same as in Extract. - static int64_t Find(Value *Idx, const DataLayout *DL, GetElementPtrInst *GEP); + static int64_t Find(Value *Idx, GetElementPtrInst *GEP); private: - ConstantOffsetExtractor(const DataLayout *Layout, Instruction *InsertionPt) - : DL(Layout), IP(InsertionPt) {} + ConstantOffsetExtractor(Instruction *InsertionPt) : IP(InsertionPt) {} /// Searches the expression that computes V for a non-zero constant C s.t. /// V can be reassociated into the form V' + C. If the searching is /// successful, returns C and updates UserChain as a def-use chain from C to V; @@ -294,8 +291,6 @@ class ConstantOffsetExtractor { /// A data structure used in rebuildWithoutConstOffset. Contains all /// sext/zext instructions along UserChain. SmallVector<CastInst *, 16> ExtInsts; - /// The data layout of the module. Used in ComputeKnownBits. - const DataLayout *DL; Instruction *IP; /// Insertion position of cloned instructions.
}; @@ -312,19 +307,10 @@ class SeparateConstOffsetFromGEP : public FunctionPass { } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<DataLayoutPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); AU.setPreservesCFG(); } - bool doInitialization(Module &M) override { - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - if (DLP == nullptr) - report_fatal_error("data layout missing"); - DL = &DLP->getDataLayout(); - return false; - } - bool runOnFunction(Function &F) override; private: @@ -372,7 +358,6 @@ class SeparateConstOffsetFromGEP : public FunctionPass { /// Verified in @i32_add in split-gep.ll bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP); - const DataLayout *DL; const TargetMachine *TM; /// Whether to lower a GEP with multiple indices into arithmetic operations or /// multiple GEPs with a single index. @@ -386,7 +371,6 @@ INITIALIZE_PASS_BEGIN( "Split GEPs to a variadic base and a constant offset for better CSE", false, false) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(DataLayoutPass) INITIALIZE_PASS_END( SeparateConstOffsetFromGEP, "separate-const-offset-from-gep", "Split GEPs to a variadic base and a constant offset for better CSE", false, @@ -647,9 +631,8 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) { return BO; } -Value *ConstantOffsetExtractor::Extract(Value *Idx, const DataLayout *DL, - GetElementPtrInst *GEP) { - ConstantOffsetExtractor Extractor(DL, GEP); +Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP) { + ConstantOffsetExtractor Extractor(GEP); // Find a non-zero constant offset first. APInt ConstantOffset = Extractor.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false, @@ -660,10 +643,9 @@ Value *ConstantOffsetExtractor::Extract(Value *Idx, const DataLayout *DL, return Extractor.rebuildWithoutConstOffset(); } -int64_t ConstantOffsetExtractor::Find(Value *Idx, const DataLayout *DL, - GetElementPtrInst *GEP) { +int64_t ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP) { // If Idx is an index of an inbound GEP, Idx is guaranteed to be non-negative. 
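In source-level terms, the contract of Extract and Find is sketched below (an assumed example, not code from this patch): the compound index in &A[B + 5] is split into the variadic part B and the constant 5, so GEPs that differ only in the constant can share the &A[B] computation.

    #include <cstddef>
    float *splitIndex(float *A, ptrdiff_t B) {
      float *Variadic = A + B; // the new GEP with the constant stripped
      return Variadic + 5;     // the constant offset re-applied afterwards
    }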
- return ConstantOffsetExtractor(DL, GEP) + return ConstantOffsetExtractor(GEP) .find(Idx, /* SignExtended */ false, /* ZeroExtended */ false, GEP->isInBounds()) .getSExtValue(); @@ -674,6 +656,7 @@ void ConstantOffsetExtractor::ComputeKnownBits(Value *V, APInt &KnownOne, IntegerType *IT = cast<IntegerType>(V->getType()); KnownOne = APInt(IT->getBitWidth(), 0); KnownZero = APInt(IT->getBitWidth(), 0); + const DataLayout &DL = IP->getModule()->getDataLayout(); llvm::computeKnownBits(V, KnownZero, KnownOne, DL, 0); } @@ -689,7 +672,8 @@ bool ConstantOffsetExtractor::NoCommonBits(Value *LHS, Value *RHS) const { bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToPointerSize( GetElementPtrInst *GEP) { bool Changed = false; - Type *IntPtrTy = DL->getIntPtrType(GEP->getType()); + const DataLayout &DL = GEP->getModule()->getDataLayout(); + Type *IntPtrTy = DL.getIntPtrType(GEP->getType()); gep_type_iterator GTI = gep_type_begin(*GEP); for (User::op_iterator I = GEP->op_begin() + 1, E = GEP->op_end(); I != E; ++I, ++GTI) { @@ -710,18 +694,19 @@ SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP, NeedsExtraction = false; int64_t AccumulativeByteOffset = 0; gep_type_iterator GTI = gep_type_begin(*GEP); + const DataLayout &DL = GEP->getModule()->getDataLayout(); for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) { if (isa<SequentialType>(*GTI)) { // Tries to extract a constant offset from this GEP index. int64_t ConstantOffset = - ConstantOffsetExtractor::Find(GEP->getOperand(I), DL, GEP); + ConstantOffsetExtractor::Find(GEP->getOperand(I), GEP); if (ConstantOffset != 0) { NeedsExtraction = true; // A GEP may have multiple indices. We accumulate the extracted // constant offset to a byte offset, and later offset the remainder of // the original GEP with this byte offset. AccumulativeByteOffset += - ConstantOffset * DL->getTypeAllocSize(GTI.getIndexedType()); + ConstantOffset * DL.getTypeAllocSize(GTI.getIndexedType()); } } else if (LowerGEP) { StructType *StTy = cast<StructType>(*GTI); @@ -730,7 +715,7 @@ SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP, if (Field != 0) { NeedsExtraction = true; AccumulativeByteOffset += - DL->getStructLayout(StTy)->getElementOffset(Field); + DL.getStructLayout(StTy)->getElementOffset(Field); } } } @@ -740,7 +725,8 @@ SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP, void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs( GetElementPtrInst *Variadic, int64_t AccumulativeByteOffset) { IRBuilder<> Builder(Variadic); - Type *IntPtrTy = DL->getIntPtrType(Variadic->getType()); + const DataLayout &DL = Variadic->getModule()->getDataLayout(); + Type *IntPtrTy = DL.getIntPtrType(Variadic->getType()); Type *I8PtrTy = Builder.getInt8PtrTy(Variadic->getType()->getPointerAddressSpace()); @@ -760,7 +746,7 @@ void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs( continue; APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(), - DL->getTypeAllocSize(GTI.getIndexedType())); + DL.getTypeAllocSize(GTI.getIndexedType())); // Scale the index by element size. 
if (ElementSize != 1) { if (ElementSize.isPowerOf2()) { @@ -791,7 +777,8 @@ void SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic, int64_t AccumulativeByteOffset) { IRBuilder<> Builder(Variadic); - Type *IntPtrTy = DL->getIntPtrType(Variadic->getType()); + const DataLayout &DL = Variadic->getModule()->getDataLayout(); + Type *IntPtrTy = DL.getIntPtrType(Variadic->getType()); Value *ResultPtr = Builder.CreatePtrToInt(Variadic->getOperand(0), IntPtrTy); gep_type_iterator GTI = gep_type_begin(*Variadic); @@ -807,7 +794,7 @@ SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic, continue; APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(), - DL->getTypeAllocSize(GTI.getIndexedType())); + DL.getTypeAllocSize(GTI.getIndexedType())); // Scale the index by element size. if (ElementSize != 1) { if (ElementSize.isPowerOf2()) { @@ -880,8 +867,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { if (isa<SequentialType>(*GTI)) { // Splits this GEP index into a variadic part and a constant offset, and // uses the variadic part as the new index. - Value *NewIdx = - ConstantOffsetExtractor::Extract(GEP->getOperand(I), DL, GEP); + Value *NewIdx = ConstantOffsetExtractor::Extract(GEP->getOperand(I), GEP); if (NewIdx != nullptr) { GEP->setOperand(I, NewIdx); } @@ -958,15 +944,17 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { // Per the ANSI C standard, signed / unsigned = unsigned and signed % unsigned = // unsigned. Therefore, we cast ElementTypeSizeOfGEP to signed because it is // used with unsigned integers later. + const DataLayout &DL = GEP->getModule()->getDataLayout(); int64_t ElementTypeSizeOfGEP = static_cast<int64_t>( - DL->getTypeAllocSize(GEP->getType()->getElementType())); - Type *IntPtrTy = DL->getIntPtrType(GEP->getType()); + DL.getTypeAllocSize(GEP->getType()->getElementType())); + Type *IntPtrTy = DL.getIntPtrType(GEP->getType()); if (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) { // Very likely. As long as %gep is naturally aligned, the byte offset we // extracted should be a multiple of sizeof(*%gep). int64_t Index = AccumulativeByteOffset / ElementTypeSizeOfGEP; - NewGEP = GetElementPtrInst::Create( - NewGEP, ConstantInt::get(IntPtrTy, Index, true), GEP->getName(), GEP); + NewGEP = GetElementPtrInst::Create(GEP->getResultElementType(), NewGEP, + ConstantInt::get(IntPtrTy, Index, true), + GEP->getName(), GEP); } else { // Unlikely but possible. For example, // #pragma pack(1) @@ -986,8 +974,9 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { GEP->getPointerAddressSpace()); NewGEP = new BitCastInst(NewGEP, I8PtrTy, "", GEP); NewGEP = GetElementPtrInst::Create( - NewGEP, ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true), - "uglygep", GEP); + Type::getInt8Ty(GEP->getContext()), NewGEP, + ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true), "uglygep", + GEP); if (GEP->getType() != I8PtrTy) NewGEP = new BitCastInst(NewGEP, GEP->getType(), GEP->getName(), GEP); } diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp index fb8fe38..8566cd9 100644 --- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -127,7 +127,7 @@ static bool mergeEmptyReturnBlocks(Function &F) { /// iterativelySimplifyCFG - Call SimplifyCFG on all the blocks in the function, /// iterating until no more changes are made.
static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, - const DataLayout *DL, AssumptionCache *AC, + AssumptionCache *AC, unsigned BonusInstThreshold) { bool Changed = false; bool LocalChange = true; @@ -137,7 +137,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, // Loop over all of the basic blocks and remove them if they are unneeded... // for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) { - if (SimplifyCFG(BBIt++, TTI, BonusInstThreshold, DL, AC)) { + if (SimplifyCFG(BBIt++, TTI, BonusInstThreshold, AC)) { LocalChange = true; ++NumSimpl; } @@ -148,11 +148,10 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, } static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI, - const DataLayout *DL, AssumptionCache *AC, - int BonusInstThreshold) { + AssumptionCache *AC, int BonusInstThreshold) { bool EverChanged = removeUnreachableBlocks(F); EverChanged |= mergeEmptyReturnBlocks(F); - EverChanged |= iterativelySimplifyCFG(F, TTI, DL, AC, BonusInstThreshold); + EverChanged |= iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold); // If neither pass changed anything, we're done. if (!EverChanged) return false; @@ -166,7 +165,7 @@ static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI, return true; do { - EverChanged = iterativelySimplifyCFG(F, TTI, DL, AC, BonusInstThreshold); + EverChanged = iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold); EverChanged |= removeUnreachableBlocks(F); } while (EverChanged); @@ -181,11 +180,10 @@ SimplifyCFGPass::SimplifyCFGPass(int BonusInstThreshold) PreservedAnalyses SimplifyCFGPass::run(Function &F, AnalysisManager<Function> *AM) { - auto *DL = F.getParent()->getDataLayout(); auto &TTI = AM->getResult<TargetIRAnalysis>(F); auto &AC = AM->getResult<AssumptionAnalysis>(F); - if (!simplifyFunctionCFG(F, TTI, DL, &AC, BonusInstThreshold)) + if (!simplifyFunctionCFG(F, TTI, &AC, BonusInstThreshold)) return PreservedAnalyses::none(); return PreservedAnalyses::all(); @@ -207,9 +205,7 @@ struct CFGSimplifyPass : public FunctionPass { &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr; - return simplifyFunctionCFG(F, TTI, DL, AC, BonusInstThreshold); + return simplifyFunctionCFG(F, TTI, AC, BonusInstThreshold); } void getAnalysisUsage(AnalysisUsage &AU) const override { diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp index d0ee0a6..b169d56 100644 --- a/lib/Transforms/Scalar/Sink.cpp +++ b/lib/Transforms/Scalar/Sink.cpp @@ -21,6 +21,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -35,7 +36,6 @@ namespace { DominatorTree *DT; LoopInfo *LI; AliasAnalysis *AA; - const DataLayout *DL; public: static char ID; // Pass identification @@ -100,8 +100,6 @@ bool Sinking::runOnFunction(Function &F) { DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); AA = &getAnalysis<AliasAnalysis>(); - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? 
 
   bool MadeChange, EverMadeChange = false;
@@ -196,7 +194,7 @@ bool Sinking::IsAcceptableTarget(Instruction *Inst,
   if (SuccToSinkTo->getUniquePredecessor() != Inst->getParent()) {
     // We cannot sink a load across a critical edge - there may be stores in
     // other code paths.
-    if (!isSafeToSpeculativelyExecute(Inst, DL))
+    if (!isSafeToSpeculativelyExecute(Inst))
       return false;
 
     // We don't want to sink across a critical edge if we don't dominate the
diff --git a/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
index 4edc86c..e71031c 100644
--- a/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -15,19 +15,30 @@
 //
 // There are many optimizations we can perform in the domain of SLSR. This file
 // for now contains only an initial step. Specifically, we look for strength
-// reduction candidate in the form of
+// reduction candidates in two forms:
 //
-// (B + i) * S
+// Form 1: (B + i) * S
+// Form 2: &B[i * S]
 //
-// where B and S are integer constants or variables, and i is a constant
-// integer. If we found two such candidates
+// where S is an integer variable, and i is a constant integer. If we find two
+// candidates
 //
-// S1: X = (B + i) * S S2: Y = (B + i') * S
+// S1: X = (B + i) * S
+// S2: Y = (B + i') * S
+//
+// or
+//
+// S1: X = &B[i * S]
+// S2: Y = &B[i' * S]
 //
 // and S1 dominates S2, we call S1 a basis of S2, and can replace S2 with
 //
 // Y = X + (i' - i) * S
 //
+// or
+//
+// Y = &X[(i' - i) * S]
+//
 // where (i' - i) * S is folded to the extent possible. When S2 has multiple
 // bases, we pick the one that is closest to S2, or S2's "immediate" basis.
 //
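To make the two candidate forms concrete before the implementation below, a small C-level sketch; B, S, i, X, and Y mirror the names in the header comment above, while the functions themselves are invented for illustration:

    // Form 1: two multiplies sharing base B and stride S.
    long form1(long B, long S) {
      long X = (B + 1) * S;      // S1: (B + i) * S with i = 1
      long Y = (B + 3) * S;      // S2: (B + i') * S with i' = 3
      // With S1 dominating S2, SLSR may rewrite S2 as Y = X + 2 * S.
      return X ^ Y;
    }

    // Form 2: two address computations sharing base B and stride S.
    int form2(int *B, long S, long i) {
      int X = B[i * S];          // S1: the address &B[i * S]
      int Y = B[(i + 2) * S];    // S2: &B[i' * S] with i' = i + 2
      // With S1 dominating S2, the second address may be rewritten as
      // S1's address plus (i' - i) * S elements, i.e. &X[2 * S] in the
      // header comment's notation.
      return X + Y;
    }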
@@ -35,8 +46,6 @@
 //
 // - Handle candidates in the form of B + i * S
 //
-// - Handle candidates in the form of pointer arithmetics. e.g., B[i * S]
-//
 // - Floating point arithmetics when fast math is enabled.
 //
 // - SLSR may decrease ILP at the architecture level. Targets that are very
@@ -45,6 +54,10 @@
 #include <vector>
 
 #include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Module.h"
@@ -58,14 +71,30 @@ using namespace PatternMatch;
 
 namespace {
 class StraightLineStrengthReduce : public FunctionPass {
-  public:
+public:
   // SLSR candidate. Such a candidate must be in the form of
   // (Base + Index) * Stride
+  // or
+  // Base[..][Index * Stride][..]
   struct Candidate : public ilist_node<Candidate> {
-    Candidate(Value *B = nullptr, ConstantInt *Idx = nullptr,
-              Value *S = nullptr, Instruction *I = nullptr)
-        : Base(B), Index(Idx), Stride(S), Ins(I), Basis(nullptr) {}
-    Value *Base;
+    enum Kind {
+      Invalid, // reserved for the default constructor
+      Mul,     // (B + i) * S
+      GEP,     // &B[..][i * S][..]
+    };
+
+    Candidate()
+        : CandidateKind(Invalid), Base(nullptr), Index(nullptr),
+          Stride(nullptr), Ins(nullptr), Basis(nullptr) {}
+    Candidate(Kind CT, const SCEV *B, ConstantInt *Idx, Value *S,
+              Instruction *I)
+        : CandidateKind(CT), Base(B), Index(Idx), Stride(S), Ins(I),
+          Basis(nullptr) {}
+    Kind CandidateKind;
+    const SCEV *Base;
+    // Note that Index and Stride of a GEP candidate may not have the same
+    // integer type. In that case, during rewriting, Stride will be
+    // sign-extended or truncated to Index's type.
     ConstantInt *Index;
     Value *Stride;
     // The instruction this candidate corresponds to. It helps us to rewrite a
@@ -90,33 +119,70 @@ class StraightLineStrengthReduce : public FunctionPass {
 
   static char ID;
 
-  StraightLineStrengthReduce() : FunctionPass(ID), DT(nullptr) {
+  StraightLineStrengthReduce()
+      : FunctionPass(ID), DL(nullptr), DT(nullptr), TTI(nullptr) {
     initializeStraightLineStrengthReducePass(*PassRegistry::getPassRegistry());
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<ScalarEvolution>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
     // We do not modify the shape of the CFG.
     AU.setPreservesCFG();
   }
 
+  bool doInitialization(Module &M) override {
+    DL = &M.getDataLayout();
+    return false;
+  }
+
   bool runOnFunction(Function &F) override;
 
- private:
+private:
   // Returns true if Basis is a basis for C, i.e., Basis dominates C and they
   // share the same base and stride.
   bool isBasisFor(const Candidate &Basis, const Candidate &C);
   // Checks whether I is in a candidate form. If so, adds all the matching forms
   // to Candidates, and tries to find the immediate basis for each of them.
   void allocateCandidateAndFindBasis(Instruction *I);
-  // Given that I is in the form of "(B + Idx) * S", adds this form to
-  // Candidates, and finds its immediate basis.
-  void allocateCandidateAndFindBasis(Value *B, ConstantInt *Idx, Value *S,
+  // Allocate candidates and find bases for Mul instructions.
+  void allocateCandidateAndFindBasisForMul(Instruction *I);
+  // Splits LHS into Base + Index and, if it succeeds, calls
+  // allocateCandidateAndFindBasis.
+  void allocateCandidateAndFindBasisForMul(Value *LHS, Value *RHS,
+                                           Instruction *I);
+  // Allocate candidates and find bases for GetElementPtr instructions.
+  void allocateCandidateAndFindBasisForGEP(GetElementPtrInst *GEP);
+  // A helper function that scales Idx with ElementSize before invoking
+  // allocateCandidateAndFindBasis.
+  void allocateCandidateAndFindBasisForGEP(const SCEV *B, ConstantInt *Idx,
+                                           Value *S, uint64_t ElementSize,
+                                           Instruction *I);
+  // Adds the given form <CT, B, Idx, S> to Candidates, and finds its immediate
+  // basis.
+  void allocateCandidateAndFindBasis(Candidate::Kind CT, const SCEV *B,
+                                     ConstantInt *Idx, Value *S,
                                      Instruction *I);
   // Rewrites candidate C with respect to Basis.
   void rewriteCandidateWithBasis(const Candidate &C, const Candidate &Basis);
+  // A helper function that factors ArrayIdx into a product of a stride and a
+  // constant index, and invokes allocateCandidateAndFindBasis with the
+  // factorings.
+  void factorArrayIndex(Value *ArrayIdx, const SCEV *Base, uint64_t ElementSize,
+                        GetElementPtrInst *GEP);
+  // Emit code that computes the "bump" from Basis to C. If the candidate is a
+  // GEP and the bump is not divisible by the element size of the GEP, this
+  // function sets the BumpWithUglyGEP flag to notify its caller to bump the
+  // basis using an ugly GEP.
+  static Value *emitBump(const Candidate &Basis, const Candidate &C,
+                         IRBuilder<> &Builder, const DataLayout *DL,
+                         bool &BumpWithUglyGEP);
 
+  const DataLayout *DL;
   DominatorTree *DT;
+  ScalarEvolution *SE;
+  TargetTransformInfo *TTI;
   ilist<Candidate> Candidates;
   // Temporarily holds all instructions that are unlinked (but not deleted) by
   // rewriteCandidateWithBasis. These instructions will be actually removed
@@ -129,6 +195,8 @@ char StraightLineStrengthReduce::ID = 0;
 INITIALIZE_PASS_BEGIN(StraightLineStrengthReduce, "slsr",
                       "Straight line strength reduction", false, false)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_END(StraightLineStrengthReduce, "slsr",
                     "Straight line strength reduction", false, false)
 
@@ -141,9 +209,47 @@ bool StraightLineStrengthReduce::isBasisFor(const Candidate &Basis,
   return (Basis.Ins != C.Ins && // skip the same instruction
           // Basis must dominate C in order to rewrite C with respect to Basis.
           DT->dominates(Basis.Ins->getParent(), C.Ins->getParent()) &&
-          // They share the same base and stride.
+          // They share the same base, stride, and candidate kind.
           Basis.Base == C.Base &&
-          Basis.Stride == C.Stride);
+          Basis.Stride == C.Stride &&
+          Basis.CandidateKind == C.CandidateKind);
+}
+
+static bool isCompletelyFoldable(GetElementPtrInst *GEP,
+                                 const TargetTransformInfo *TTI,
+                                 const DataLayout *DL) {
+  GlobalVariable *BaseGV = nullptr;
+  int64_t BaseOffset = 0;
+  bool HasBaseReg = false;
+  int64_t Scale = 0;
+
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getPointerOperand()))
+    BaseGV = GV;
+  else
+    HasBaseReg = true;
+
+  gep_type_iterator GTI = gep_type_begin(GEP);
+  for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I, ++GTI) {
+    if (isa<SequentialType>(*GTI)) {
+      int64_t ElementSize = DL->getTypeAllocSize(GTI.getIndexedType());
+      if (ConstantInt *ConstIdx = dyn_cast<ConstantInt>(*I)) {
+        BaseOffset += ConstIdx->getSExtValue() * ElementSize;
+      } else {
+        // Needs scale register.
+        if (Scale != 0) {
+          // No addressing mode takes two scale registers.
+          return false;
+        }
+        Scale = ElementSize;
+      }
+    } else {
+      StructType *STy = cast<StructType>(*GTI);
+      uint64_t Field = cast<ConstantInt>(*I)->getZExtValue();
+      BaseOffset += DL->getStructLayout(STy)->getElementOffset(Field);
+    }
+  }
+  return TTI->isLegalAddressingMode(GEP->getType()->getElementType(), BaseGV,
+                                    BaseOffset, HasBaseReg, Scale);
 }
 
 // TODO: We currently implement an algorithm whose time complexity is linear to
@@ -153,11 +259,17 @@ bool StraightLineStrengthReduce::isBasisFor(const Candidate &Basis,
 // table is indexed by the base and the stride of a candidate. Therefore,
 // finding the immediate basis of a candidate boils down to one hash-table look
 // up.
-void StraightLineStrengthReduce::allocateCandidateAndFindBasis(Value *B,
-                                                               ConstantInt *Idx,
-                                                               Value *S,
-                                                               Instruction *I) {
-  Candidate C(B, Idx, S, I);
+void StraightLineStrengthReduce::allocateCandidateAndFindBasis(
+    Candidate::Kind CT, const SCEV *B, ConstantInt *Idx, Value *S,
+    Instruction *I) {
+  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
+    // If &B[Idx * S] fits into an addressing mode, do not turn it into
+    // non-free computation.
+    if (isCompletelyFoldable(GEP, TTI, DL))
+      return;
+  }
+
+  Candidate C(CT, B, Idx, S, I);
   // Try to compute the immediate basis of C.
   unsigned NumIterations = 0;
   // Limit the scan radius to avoid running forever.
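What "completely foldable" means in practice: every GEP index either lands in the constant displacement or claims the target's single scaled register. A sketch with invented names, assuming 4-byte int and an x86-like base + index*scale + displacement addressing mode:

    // A global base, so it can serve as BaseGV in the walk above.
    static int GV[100][16];

    int foldable(long x) {
      // &GV[x][3]: BaseGV = GV, Scale = 64 (x steps over rows of 16 ints,
      // 16 * 4 bytes), BaseOffset = 12 (constant index 3 -> 3 * 4 bytes).
      // One scaled register suffices, i.e. the address [GV + x*64 + 12];
      // isLegalAddressingMode is queried as (i32, GV, 12, false, 64), and
      // if the target accepts it, SLSR leaves this GEP alone. A second
      // variable index would need a second scale and the helper bails out.
      return GV[x][3];
    }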
@@ -176,60 +288,209 @@ void StraightLineStrengthReduce::allocateCandidateAndFindBasis(Value *B,
 }
 
 void StraightLineStrengthReduce::allocateCandidateAndFindBasis(Instruction *I) {
+  switch (I->getOpcode()) {
+  case Instruction::Mul:
+    allocateCandidateAndFindBasisForMul(I);
+    break;
+  case Instruction::GetElementPtr:
+    allocateCandidateAndFindBasisForGEP(cast<GetElementPtrInst>(I));
+    break;
+  }
+}
+
+void StraightLineStrengthReduce::allocateCandidateAndFindBasisForMul(
+    Value *LHS, Value *RHS, Instruction *I) {
   Value *B = nullptr;
   ConstantInt *Idx = nullptr;
-  // "(Base + Index) * Stride" must be a Mul instruction at the first hand.
-  if (I->getOpcode() == Instruction::Mul) {
-    if (IntegerType *ITy = dyn_cast<IntegerType>(I->getType())) {
-      Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
-      for (unsigned Swapped = 0; Swapped < 2; ++Swapped) {
-        // Only handle the canonical operand ordering.
-        if (match(LHS, m_Add(m_Value(B), m_ConstantInt(Idx)))) {
-          // If LHS is in the form of "Base + Index", then I is in the form of
-          // "(Base + Index) * RHS".
-          allocateCandidateAndFindBasis(B, Idx, RHS, I);
-        } else {
-          // Otherwise, at least try the form (LHS + 0) * RHS.
-          allocateCandidateAndFindBasis(LHS, ConstantInt::get(ITy, 0), RHS, I);
-        }
-        // Swap LHS and RHS so that we also cover the cases where LHS is the
-        // stride.
-        if (LHS == RHS)
-          break;
-        std::swap(LHS, RHS);
-      }
-    }
+  // Only handle the canonical operand ordering.
+  if (match(LHS, m_Add(m_Value(B), m_ConstantInt(Idx)))) {
+    // If LHS is in the form of "Base + Index", then I is in the form of
+    // "(Base + Index) * RHS".
+    allocateCandidateAndFindBasis(Candidate::Mul, SE->getSCEV(B), Idx, RHS, I);
+  } else {
+    // Otherwise, at least try the form (LHS + 0) * RHS.
+    ConstantInt *Zero = ConstantInt::get(cast<IntegerType>(I->getType()), 0);
+    allocateCandidateAndFindBasis(Candidate::Mul, SE->getSCEV(LHS), Zero, RHS,
+                                  I);
+  }
+}
+
+void StraightLineStrengthReduce::allocateCandidateAndFindBasisForMul(
+    Instruction *I) {
+  // Try matching (B + i) * S.
+  // TODO: we could extend SLSR to float and vector types.
+  if (!isa<IntegerType>(I->getType()))
+    return;
+
+  Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
+  allocateCandidateAndFindBasisForMul(LHS, RHS, I);
+  if (LHS != RHS) {
+    // Symmetrically, try to split RHS into Base + Index.
+    allocateCandidateAndFindBasisForMul(RHS, LHS, I);
+  }
+}
+
+void StraightLineStrengthReduce::allocateCandidateAndFindBasisForGEP(
+    const SCEV *B, ConstantInt *Idx, Value *S, uint64_t ElementSize,
+    Instruction *I) {
+  // I = B + sext(Idx *nsw S) *nsw ElementSize
+  //   = B + (sext(Idx) * ElementSize) * sext(S)
+  // Casting to IntegerType is safe because we skipped vector GEPs.
+  IntegerType *IntPtrTy = cast<IntegerType>(DL->getIntPtrType(I->getType()));
+  ConstantInt *ScaledIdx = ConstantInt::get(
+      IntPtrTy, Idx->getSExtValue() * (int64_t)ElementSize, true);
+  allocateCandidateAndFindBasis(Candidate::GEP, B, ScaledIdx, S, I);
+}
+
+void StraightLineStrengthReduce::factorArrayIndex(Value *ArrayIdx,
+                                                  const SCEV *Base,
+                                                  uint64_t ElementSize,
+                                                  GetElementPtrInst *GEP) {
+  // At least, ArrayIdx = ArrayIdx *s 1.
+  allocateCandidateAndFindBasisForGEP(
+      Base, ConstantInt::get(cast<IntegerType>(ArrayIdx->getType()), 1),
+      ArrayIdx, ElementSize, GEP);
+  Value *LHS = nullptr;
+  ConstantInt *RHS = nullptr;
+  // TODO: handle shl. e.g., we could treat (S << 2) as (S * 4).
+  //
+  // One alternative is matching the SCEV of ArrayIdx instead of ArrayIdx
+  // itself. This would allow us to handle the shl case for free. However,
+  // matching SCEVs has two issues:
+  //
+  // 1. this would complicate rewriting because the rewriting procedure
+  // would have to translate SCEVs back to IR instructions. This translation
+  // is difficult when LHS is further evaluated to a composite SCEV.
+  //
+  // 2. ScalarEvolution is designed to be control-flow oblivious. It tends
+  // to strip nsw/nuw flags which are critical for SLSR to trace into
+  // sext'ed multiplication.
+  if (match(ArrayIdx, m_NSWMul(m_Value(LHS), m_ConstantInt(RHS)))) {
+    // SLSR is currently unsafe if i * S may overflow.
+    // GEP = Base + sext(LHS *nsw RHS) *nsw ElementSize
+    allocateCandidateAndFindBasisForGEP(Base, RHS, LHS, ElementSize, GEP);
+  }
+}
+
+void StraightLineStrengthReduce::allocateCandidateAndFindBasisForGEP(
+    GetElementPtrInst *GEP) {
+  // TODO: handle vector GEPs
+  if (GEP->getType()->isVectorTy())
+    return;
+
+  const SCEV *GEPExpr = SE->getSCEV(GEP);
+  Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
+
+  gep_type_iterator GTI = gep_type_begin(GEP);
+  for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I) {
+    if (!isa<SequentialType>(*GTI++))
+      continue;
+    Value *ArrayIdx = *I;
+    // Compute the byte offset of this index.
+    uint64_t ElementSize = DL->getTypeAllocSize(*GTI);
+    const SCEV *ElementSizeExpr = SE->getSizeOfExpr(IntPtrTy, *GTI);
+    const SCEV *ArrayIdxExpr = SE->getSCEV(ArrayIdx);
+    ArrayIdxExpr = SE->getTruncateOrSignExtend(ArrayIdxExpr, IntPtrTy);
+    const SCEV *LocalOffset =
+        SE->getMulExpr(ArrayIdxExpr, ElementSizeExpr, SCEV::FlagNSW);
+    // The base of this candidate equals GEPExpr less the byte offset of this
+    // index.
+    const SCEV *Base = SE->getMinusSCEV(GEPExpr, LocalOffset);
+    factorArrayIndex(ArrayIdx, Base, ElementSize, GEP);
+    // When ArrayIdx is the sext of a value, we try to factor that value as
+    // well. Handling this case is important because array indices are
+    // typically sign-extended to the pointer size.
+    Value *TruncatedArrayIdx = nullptr;
+    if (match(ArrayIdx, m_SExt(m_Value(TruncatedArrayIdx))))
+      factorArrayIndex(TruncatedArrayIdx, Base, ElementSize, GEP);
   }
 }
 
+// A helper function that unifies the bitwidth of A and B.
+static void unifyBitWidth(APInt &A, APInt &B) {
+  if (A.getBitWidth() < B.getBitWidth())
+    A = A.sext(B.getBitWidth());
+  else if (A.getBitWidth() > B.getBitWidth())
+    B = B.sext(A.getBitWidth());
+}
+
+Value *StraightLineStrengthReduce::emitBump(const Candidate &Basis,
+                                            const Candidate &C,
+                                            IRBuilder<> &Builder,
+                                            const DataLayout *DL,
+                                            bool &BumpWithUglyGEP) {
+  APInt Idx = C.Index->getValue(), BasisIdx = Basis.Index->getValue();
+  unifyBitWidth(Idx, BasisIdx);
+  APInt IndexOffset = Idx - BasisIdx;
+
+  BumpWithUglyGEP = false;
+  if (Basis.CandidateKind == Candidate::GEP) {
+    APInt ElementSize(
+        IndexOffset.getBitWidth(),
+        DL->getTypeAllocSize(
+            cast<GetElementPtrInst>(Basis.Ins)->getType()->getElementType()));
+    APInt Q, R;
+    APInt::sdivrem(IndexOffset, ElementSize, Q, R);
+    if (R.getSExtValue() == 0)
+      IndexOffset = Q;
+    else
+      BumpWithUglyGEP = true;
+  }
+  // Compute Bump = C - Basis = (i' - i) * S.
+  // Common case 1: if (i' - i) is 1, Bump = S.
+  if (IndexOffset.getSExtValue() == 1)
+    return C.Stride;
+  // Common case 2: if (i' - i) is -1, Bump = -S.
+  if (IndexOffset.getSExtValue() == -1)
+    return Builder.CreateNeg(C.Stride);
+  // Otherwise, Bump = (i' - i) * sext/trunc(S).
+  ConstantInt *Delta = ConstantInt::get(Basis.Ins->getContext(), IndexOffset);
+  Value *ExtendedStride = Builder.CreateSExtOrTrunc(C.Stride, Delta->getType());
+  return Builder.CreateMul(ExtendedStride, Delta);
+}
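A worked instance of the bump arithmetic above, with invented numbers:

    // Two GEP candidates over i32 elements: Basis.Index = 8 and C.Index = 28.
    // GEP candidate indices arrive pre-scaled to bytes (see
    // allocateCandidateAndFindBasisForGEP above), so these correspond to
    // &B[2 * S] and &B[7 * S]. Then
    //   IndexOffset = 28 - 8 = 20, and sdivrem(20, 4) gives Q = 5, R = 0,
    // so IndexOffset becomes the element count 5 and the bump is 5 * S,
    // usable directly as an ordinary GEP index. Were IndexOffset not a
    // multiple of 4 (possible when basis and candidate view the same base
    // through differently sized element types), R != 0 would set
    // BumpWithUglyGEP, telling the caller to add the raw byte offset
    // through an i8* ("ugly") GEP instead.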
 
 void StraightLineStrengthReduce::rewriteCandidateWithBasis(
     const Candidate &C, const Candidate &Basis) {
+  assert(C.CandidateKind == Basis.CandidateKind && C.Base == Basis.Base &&
+         C.Stride == Basis.Stride);
+
   // An instruction can correspond to multiple candidates. Therefore, instead of
   // simply deleting an instruction when we rewrite it, we mark its parent as
   // nullptr (i.e. unlink it) so that we can skip the candidates whose
   // instruction is already rewritten.
   if (!C.Ins->getParent())
     return;
-  assert(C.Base == Basis.Base && C.Stride == Basis.Stride);
-  // Basis = (B + i) * S
-  // C     = (B + i') * S
-  //   ==>
-  // C = Basis + (i' - i) * S
+
   IRBuilder<> Builder(C.Ins);
-  ConstantInt *IndexOffset = ConstantInt::get(
-      C.Ins->getContext(), C.Index->getValue() - Basis.Index->getValue());
-  Value *Reduced;
-  // TODO: preserve nsw/nuw in some cases.
-  if (IndexOffset->isOne()) {
-    // If (i' - i) is 1, fold C into Basis + S.
-    Reduced = Builder.CreateAdd(Basis.Ins, C.Stride);
-  } else if (IndexOffset->isMinusOne()) {
-    // If (i' - i) is -1, fold C into Basis - S.
-    Reduced = Builder.CreateSub(Basis.Ins, C.Stride);
-  } else {
-    Value *Bump = Builder.CreateMul(C.Stride, IndexOffset);
+  bool BumpWithUglyGEP;
+  Value *Bump = emitBump(Basis, C, Builder, DL, BumpWithUglyGEP);
+  Value *Reduced = nullptr; // equivalent to but weaker than C.Ins
+  switch (C.CandidateKind) {
+  case Candidate::Mul:
     Reduced = Builder.CreateAdd(Basis.Ins, Bump);
-  }
+    break;
+  case Candidate::GEP:
+    {
+      Type *IntPtrTy = DL->getIntPtrType(C.Ins->getType());
+      if (BumpWithUglyGEP) {
+        // C = (char *)Basis + Bump
+        unsigned AS = Basis.Ins->getType()->getPointerAddressSpace();
+        Type *CharTy = Type::getInt8PtrTy(Basis.Ins->getContext(), AS);
+        Reduced = Builder.CreateBitCast(Basis.Ins, CharTy);
+        // We only considered inbounds GEPs as candidates.
+        Reduced = Builder.CreateInBoundsGEP(Reduced, Bump);
+        Reduced = Builder.CreateBitCast(Reduced, C.Ins->getType());
+      } else {
+        // C = gep Basis, Bump
+        // Canonicalize bump to pointer size.
+        Bump = Builder.CreateSExtOrTrunc(Bump, IntPtrTy);
+        Reduced = Builder.CreateInBoundsGEP(Basis.Ins, Bump);
+      }
+    }
+    break;
+  default:
+    llvm_unreachable("C.CandidateKind is invalid");
+  };
   Reduced->takeName(C.Ins);
   C.Ins->replaceAllUsesWith(Reduced);
   C.Ins->dropAllReferences();
@@ -243,15 +504,15 @@ bool StraightLineStrengthReduce::runOnFunction(Function &F) {
   if (skipOptnoneFunction(F))
     return false;
 
+  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  SE = &getAnalysis<ScalarEvolution>();
   // Traverse the dominator tree in the depth-first order. This order makes sure
   // all bases of a candidate are in Candidates when we process it.
   for (auto node = GraphTraits<DominatorTree *>::nodes_begin(DT);
        node != GraphTraits<DominatorTree *>::nodes_end(DT); ++node) {
-    BasicBlock *B = node->getBlock();
-    for (auto I = B->begin(); I != B->end(); ++I) {
-      allocateCandidateAndFindBasis(I);
-    }
+    for (auto &I : *node->getBlock())
+      allocateCandidateAndFindBasis(&I);
   }
 
   // Rewrite candidates in the reverse depth-first order. This order makes sure
diff --git a/lib/Transforms/Scalar/StructurizeCFG.cpp b/lib/Transforms/Scalar/StructurizeCFG.cpp
index aaf6f9a..6c3ce58 100644
--- a/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -9,8 +9,8 @@
 
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/SCCIterator.h"
 #include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SCCIterator.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/RegionInfo.h"
 #include "llvm/Analysis/RegionIterator.h"
@@ -18,6 +18,7 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
 using namespace llvm;
 
diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 715ddeb..9eef132 100644
--- a/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -54,8 +54,8 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/Analysis/InlineCost.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/Loads.h"
@@ -87,7 +87,6 @@ STATISTIC(NumAccumAdded, "Number of accumulators introduced");
 namespace {
   struct TailCallElim : public FunctionPass {
     const TargetTransformInfo *TTI;
-    const DataLayout *DL;
 
     static char ID; // Pass identification, replacement for typeid
     TailCallElim() : FunctionPass(ID) {
@@ -159,8 +158,6 @@ bool TailCallElim::runOnFunction(Function &F) {
   if (skipOptnoneFunction(F))
     return false;
 
-  DL = F.getParent()->getDataLayout();
-
   bool AllCallsAreTailCalls = false;
   bool Modified = markTails(F, AllCallsAreTailCalls);
   if (AllCallsAreTailCalls)
@@ -392,10 +389,9 @@ bool TailCallElim::runTRE(Function &F) {
   SmallVector<PHINode*, 8> ArgumentPHIs;
   bool MadeChange = false;
 
-  // CanTRETailMarkedCall - If false, we cannot perform TRE on tail calls
-  // marked with the 'tail' attribute, because doing so would cause the stack
-  // size to increase (real TRE would deallocate variable sized allocas, TRE
-  // doesn't).
+  // If false, we cannot perform TRE on tail calls marked with the 'tail'
+  // attribute, because doing so would cause the stack size to increase (real
+  // TRE would deallocate variable-sized allocas; this TRE doesn't).
   bool CanTRETailMarkedCall = CanTRE(F);
 
   // Change any tail recursive calls to loops.
@@ -404,28 +400,19 @@ bool TailCallElim::runTRE(Function &F) {
   // alloca' is changed from being a static alloca to being a dynamic alloca.
   // Until this is resolved, disable this transformation if that would ever
   // happen. This bug is PR962.
-  SmallVector<BasicBlock*, 8> BBToErase;
-  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+  for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; /*in loop*/) {
+    BasicBlock *BB = BBI++; // FoldReturnAndProcessPred may delete BB.
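The shape of the rewritten loop is the standard erase-safe iteration idiom: the iterator is advanced past the block before anything that might delete it runs. A minimal sketch, with a hypothetical mayEraseBlock helper standing in for the real processing:

    // Advance the iterator first; after that, erasing *BB cannot
    // invalidate the loop's iterator.
    for (Function::iterator I = F.begin(), E = F.end(); I != E; /*in loop*/) {
      BasicBlock *BB = I++;  // step past BB before touching it
      mayEraseBlock(BB);     // hypothetical: may call BB->eraseFromParent()
    }

This is what makes the deferred BBToErase bookkeeping, removed above and in the hunk that follows, unnecessary.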
     if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) {
       bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,
                                           ArgumentPHIs, !CanTRETailMarkedCall);
-      if (!Change && BB->getFirstNonPHIOrDbg() == Ret) {
+      if (!Change && BB->getFirstNonPHIOrDbg() == Ret)
         Change = FoldReturnAndProcessPred(BB, Ret, OldEntry,
                                           TailCallsAreMarkedTail, ArgumentPHIs,
                                           !CanTRETailMarkedCall);
-        // FoldReturnAndProcessPred may have emptied some BB. Remember to
-        // erase them.
-        if (Change && BB->empty())
-          BBToErase.push_back(BB);
-
-      }
       MadeChange |= Change;
     }
   }
 
-  for (auto BB: BBToErase)
-    BB->eraseFromParent();
-
   // If we eliminated any tail recursions, it's possible that we inserted some
   // silly PHI nodes which just merge an initial value (the incoming operand)
   // with themselves. Check to see if we did and clean up our mess if so. This
@@ -435,7 +422,7 @@ bool TailCallElim::runTRE(Function &F) {
     PHINode *PN = ArgumentPHIs[i];
 
     // If the PHI Node is a dynamic constant, replace it with the value it is.
-    if (Value *PNV = SimplifyInstruction(PN)) {
+    if (Value *PNV = SimplifyInstruction(PN, F.getParent()->getDataLayout())) {
       PN->replaceAllUsesWith(PNV);
       PN->eraseFromParent();
     }
@@ -445,7 +432,7 @@ bool TailCallElim::runTRE(Function &F) {
 }
 
-/// CanMoveAboveCall - Return true if it is safe to move the specified
+/// Return true if it is safe to move the specified
 /// instruction from after the call to before the call, assuming that all
 /// instructions between the call and this instruction are movable.
 ///
@@ -464,7 +451,7 @@ bool TailCallElim::CanMoveAboveCall(Instruction *I, CallInst *CI) {
       // being loaded from.
       if (CI->mayWriteToMemory() ||
           !isSafeToLoadUnconditionally(L->getPointerOperand(), L,
-                                       L->getAlignment(), DL))
+                                       L->getAlignment()))
         return false;
     }
   }
@@ -480,13 +467,11 @@ bool TailCallElim::CanMoveAboveCall(Instruction *I, CallInst *CI) {
   return true;
 }
 
-// isDynamicConstant - Return true if the specified value is the same when the
-// return would exit as it was when the initial iteration of the recursive
-// function was executed.
-//
-// We currently handle static constants and arguments that are not modified as
-// part of the recursion.
-//
+/// Return true if the specified value is the same when the return would exit
+/// as it was when the initial iteration of the recursive function was executed.
+///
+/// We currently handle static constants and arguments that are not modified as
+/// part of the recursion.
 static bool isDynamicConstant(Value *V, CallInst *CI, ReturnInst *RI) {
   if (isa<Constant>(V)) return true; // Static constants are always dyn consts
 
@@ -518,10 +503,9 @@ static bool isDynamicConstant(Value *V, CallInst *CI, ReturnInst *RI) {
   return false;
 }
 
-// getCommonReturnValue - Check to see if the function containing the specified
-// tail call consistently returns the same runtime-constant value at all exit
-// points except for IgnoreRI. If so, return the returned value.
-//
+/// Check to see if the function containing the specified tail call consistently
+/// returns the same runtime-constant value at all exit points except for
+/// IgnoreRI. If so, return the returned value.
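As an illustration of the property this helper detects (an invented C sketch, not code from the patch): every return other than the recursive one produces the same runtime constant, and that constant then seeds the accumulator in the transform documented just below.

    // Both paths out of the recursion return the constant 0, so
    // getCommonReturnValue(IgnoreRI = the recursive ret, CI) reports 0;
    // accumulator recursion elimination can then turn the addition around
    // the recursive call into a loop whose accumulator starts at 0.
    int count_zeros(const int *A, int N) {
      if (N == 0)
        return 0;                                     // the common value
      return (A[0] == 0) + count_zeros(A + 1, N - 1); // recursive exit
    }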
 static Value *getCommonReturnValue(ReturnInst *IgnoreRI, CallInst *CI) {
   Function *F = CI->getParent()->getParent();
   Value *ReturnedValue = nullptr;
@@ -545,10 +529,9 @@ static Value *getCommonReturnValue(ReturnInst *IgnoreRI, CallInst *CI) {
   return ReturnedValue;
 }
 
-/// CanTransformAccumulatorRecursion - If the specified instruction can be
-/// transformed using accumulator recursion elimination, return the constant
-/// which is the start of the accumulator value. Otherwise return null.
-///
+/// If the specified instruction can be transformed using accumulator recursion
+/// elimination, return the constant which is the start of the accumulator
+/// value. Otherwise return null.
 Value *TailCallElim::CanTransformAccumulatorRecursion(Instruction *I,
                                                       CallInst *CI) {
   if (!I->isAssociative() || !I->isCommutative()) return nullptr;
@@ -836,14 +819,11 @@ bool TailCallElim::FoldReturnAndProcessPred(BasicBlock *BB,
     ReturnInst *RI = FoldReturnIntoUncondBranch(Ret, BB, Pred);
 
     // Cleanup: if all predecessors of BB have been eliminated by
-    // FoldReturnIntoUncondBranch, we would like to delete it, but we
-    // can not just nuke it as it is being used as an iterator by our caller.
-    // Just empty it, and the caller will erase it when it is safe to do so.
-    // It is important to empty it, because the ret instruction in there is
-    // still using a value which EliminateRecursiveTailCall will attempt
-    // to remove.
+    // FoldReturnIntoUncondBranch, delete it. It is important to do so now,
+    // because the ret instruction in there is still using a value which
+    // EliminateRecursiveTailCall will attempt to remove.
     if (!BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB))
-      BB->getInstList().clear();
+      BB->eraseFromParent();
 
     EliminateRecursiveTailCall(CI, RI, OldEntry, TailCallsAreMarkedTail,
                                ArgumentPHIs,