Diffstat (limited to 'lib/Transforms/Scalar')
43 files changed, 8965 insertions, 2001 deletions
diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp index 3d91984..d6fc916 100644 --- a/lib/Transforms/Scalar/ADCE.cpp +++ b/lib/Transforms/Scalar/ADCE.cpp @@ -32,19 +32,18 @@ using namespace llvm; STATISTIC(NumRemoved, "Number of instructions removed"); namespace { - struct ADCE : public FunctionPass { - static char ID; // Pass identification, replacement for typeid - ADCE() : FunctionPass(ID) { - initializeADCEPass(*PassRegistry::getPassRegistry()); - } - - bool runOnFunction(Function& F) override; +struct ADCE : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + ADCE() : FunctionPass(ID) { + initializeADCEPass(*PassRegistry::getPassRegistry()); + } - void getAnalysisUsage(AnalysisUsage& AU) const override { - AU.setPreservesCFG(); - } + bool runOnFunction(Function& F) override; - }; + void getAnalysisUsage(AnalysisUsage& AU) const override { + AU.setPreservesCFG(); + } +}; } char ADCE::ID = 0; @@ -54,46 +53,45 @@ bool ADCE::runOnFunction(Function& F) { if (skipOptnoneFunction(F)) return false; - SmallPtrSet<Instruction*, 128> alive; - SmallVector<Instruction*, 128> worklist; + SmallPtrSet<Instruction*, 128> Alive; + SmallVector<Instruction*, 128> Worklist; // Collect the set of "root" instructions that are known live. - for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) - if (isa<TerminatorInst>(I.getInstructionIterator()) || - isa<DbgInfoIntrinsic>(I.getInstructionIterator()) || - isa<LandingPadInst>(I.getInstructionIterator()) || - I->mayHaveSideEffects()) { - alive.insert(I.getInstructionIterator()); - worklist.push_back(I.getInstructionIterator()); + for (Instruction &I : inst_range(F)) { + if (isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) || + isa<LandingPadInst>(I) || I.mayHaveSideEffects()) { + Alive.insert(&I); + Worklist.push_back(&I); } + } // Propagate liveness backwards to operands. - while (!worklist.empty()) { - Instruction* curr = worklist.pop_back_val(); - for (Instruction::op_iterator OI = curr->op_begin(), OE = curr->op_end(); - OI != OE; ++OI) - if (Instruction* Inst = dyn_cast<Instruction>(OI)) - if (alive.insert(Inst).second) - worklist.push_back(Inst); + while (!Worklist.empty()) { + Instruction *Curr = Worklist.pop_back_val(); + for (Use &OI : Curr->operands()) { + if (Instruction *Inst = dyn_cast<Instruction>(OI)) + if (Alive.insert(Inst).second) + Worklist.push_back(Inst); + } } // The inverse of the live set is the dead set. These are those instructions // which have no side effects and do not influence the control flow or return // value of the function, and may therefore be deleted safely. - // NOTE: We reuse the worklist vector here for memory efficiency. - for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) - if (!alive.count(I.getInstructionIterator())) { - worklist.push_back(I.getInstructionIterator()); - I->dropAllReferences(); + // NOTE: We reuse the Worklist vector here for memory efficiency. 
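// A minimal, self-contained sketch of the same mark-and-sweep scheme,
// shown here for illustration only (Inst and findDead are made-up
// stand-ins, not LLVM API): roots seed the live set, liveness flows
// backwards through operands, and whatever is never marked is dead.
#include <unordered_set>
#include <vector>

struct Inst {
  bool HasSideEffects = false;  // terminators, stores, calls, ... are roots
  std::vector<Inst *> Operands; // values this instruction reads
};

std::vector<Inst *> findDead(const std::vector<Inst *> &Func) {
  std::unordered_set<Inst *> Alive;
  std::vector<Inst *> Worklist;

  // Collect the set of "root" instructions that are known live.
  for (Inst *I : Func)
    if (I->HasSideEffects && Alive.insert(I).second)
      Worklist.push_back(I);

  // Propagate liveness backwards to operands.
  while (!Worklist.empty()) {
    Inst *Curr = Worklist.back();
    Worklist.pop_back();
    for (Inst *Op : Curr->Operands)
      if (Alive.insert(Op).second)
        Worklist.push_back(Op);
  }

  // The inverse of the live set is the dead set.
  std::vector<Inst *> Dead;
  for (Inst *I : Func)
    if (!Alive.count(I))
      Dead.push_back(I);
  return Dead;
}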
+ for (Instruction &I : inst_range(F)) { + if (!Alive.count(&I)) { + Worklist.push_back(&I); + I.dropAllReferences(); } + } - for (SmallVectorImpl<Instruction *>::iterator I = worklist.begin(), - E = worklist.end(); I != E; ++I) { + for (Instruction *&I : Worklist) { ++NumRemoved; - (*I)->eraseFromParent(); + I->eraseFromParent(); } - return !worklist.empty(); + return !Worklist.empty(); } FunctionPass *llvm::createAggressiveDCEPass() { diff --git a/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp index 06c3dfd..5c74885 100644 --- a/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp +++ b/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp @@ -21,7 +21,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AssumptionTracker.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/ScalarEvolution.h" @@ -53,12 +53,12 @@ struct AlignmentFromAssumptions : public FunctionPass { bool runOnFunction(Function &F); virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<AssumptionTracker>(); + AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<ScalarEvolution>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.setPreservesCFG(); - AU.addPreserved<LoopInfo>(); + AU.addPreserved<LoopInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<ScalarEvolution>(); } @@ -69,7 +69,6 @@ struct AlignmentFromAssumptions : public FunctionPass { // another assumption later, then we may change the alignment at that point. DenseMap<MemTransferInst *, unsigned> NewDestAlignments, NewSrcAlignments; - AssumptionTracker *AT; ScalarEvolution *SE; DominatorTree *DT; const DataLayout *DL; @@ -84,7 +83,7 @@ char AlignmentFromAssumptions::ID = 0; static const char aip_name[] = "Alignment from assumptions"; INITIALIZE_PASS_BEGIN(AlignmentFromAssumptions, AA_NAME, aip_name, false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionTracker) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_END(AlignmentFromAssumptions, AA_NAME, @@ -411,7 +410,7 @@ bool AlignmentFromAssumptions::processAssumption(CallInst *ACall) { bool AlignmentFromAssumptions::runOnFunction(Function &F) { bool Changed = false; - AT = &getAnalysis<AssumptionTracker>(); + auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); SE = &getAnalysis<ScalarEvolution>(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); @@ -420,8 +419,9 @@ bool AlignmentFromAssumptions::runOnFunction(Function &F) { NewDestAlignments.clear(); NewSrcAlignments.clear(); - for (auto &I : AT->assumptions(&F)) - Changed |= processAssumption(I); + for (auto &AssumeVH : AC.assumptions()) + if (AssumeVH) + Changed |= processAssumption(cast<CallInst>(AssumeVH)); return Changed; } diff --git a/lib/Transforms/Scalar/Android.mk b/lib/Transforms/Scalar/Android.mk index 9028b42..ed803cd 100644 --- a/lib/Transforms/Scalar/Android.mk +++ b/lib/Transforms/Scalar/Android.mk @@ -2,6 +2,7 @@ LOCAL_PATH:= $(call my-dir) transforms_scalar_SRC_FILES := \ ADCE.cpp \ + BDCE.cpp \ AlignmentFromAssumptions.cpp \ ConstantProp.cpp \ ConstantHoisting.cpp \ @@ -12,6 +13,7 @@ transforms_scalar_SRC_FILES := \ FlattenCFGPass.cpp \ GVN.cpp \ IndVarSimplify.cpp \ 
+ InductiveRangeCheckElimination.cpp \ JumpThreading.cpp \ LICM.cpp \ LoadCombine.cpp \ @@ -24,11 +26,14 @@ transforms_scalar_SRC_FILES := \ LoopUnrollPass.cpp \ LoopUnswitch.cpp \ LowerAtomic.cpp \ + LowerExpectIntrinsic.cpp \ MemCpyOptimizer.cpp \ MergedLoadStoreMotion.cpp \ PartiallyInlineLibCalls.cpp \ + PlaceSafepoints.cpp \ Reassociate.cpp \ Reg2Mem.cpp \ + RewriteStatepointsForGC.cpp \ SCCP.cpp \ SROA.cpp \ SampleProfile.cpp \ @@ -38,6 +43,7 @@ transforms_scalar_SRC_FILES := \ SeparateConstOffsetFromGEP.cpp \ SimplifyCFGPass.cpp \ Sink.cpp \ + StraightLineStrengthReduce.cpp \ StructurizeCFG.cpp \ TailRecursionElimination.cpp diff --git a/lib/Transforms/Scalar/BDCE.cpp b/lib/Transforms/Scalar/BDCE.cpp new file mode 100644 index 0000000..c7bd79d --- /dev/null +++ b/lib/Transforms/Scalar/BDCE.cpp @@ -0,0 +1,411 @@ +//===---- BDCE.cpp - Bit-tracking dead code elimination -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the Bit-Tracking Dead Code Elimination pass. Some +// instructions (shifts, some ands, ors, etc.) kill some of their input bits. +// We track these dead bits and remove instructions that compute only these +// dead bits. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "bdce" + +STATISTIC(NumRemoved, "Number of instructions removed (unused)"); +STATISTIC(NumSimplified, "Number of instructions trivialized (dead bits)"); + +namespace { +struct BDCE : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + BDCE() : FunctionPass(ID) { + initializeBDCEPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function& F) override; + + void getAnalysisUsage(AnalysisUsage& AU) const override { + AU.setPreservesCFG(); + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<DominatorTreeWrapperPass>(); + } + + void determineLiveOperandBits(const Instruction *UserI, + const Instruction *I, unsigned OperandNo, + const APInt &AOut, APInt &AB, + APInt &KnownZero, APInt &KnownOne, + APInt &KnownZero2, APInt &KnownOne2); + + AssumptionCache *AC; + const DataLayout *DL; + DominatorTree *DT; +}; +} + +char BDCE::ID = 0; +INITIALIZE_PASS_BEGIN(BDCE, "bdce", "Bit-Tracking Dead Code Elimination", + false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(BDCE, "bdce", "Bit-Tracking Dead Code Elimination", + false, false) + +static bool isAlwaysLive(Instruction *I) { + return isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) || + isa<LandingPadInst>(I) || 
I->mayHaveSideEffects(); +} + +void BDCE::determineLiveOperandBits(const Instruction *UserI, + const Instruction *I, unsigned OperandNo, + const APInt &AOut, APInt &AB, + APInt &KnownZero, APInt &KnownOne, + APInt &KnownZero2, APInt &KnownOne2) { + unsigned BitWidth = AB.getBitWidth(); + + // We're called once per operand, but for some instructions, we need to + // compute known bits of both operands in order to determine the live bits of + // either (when both operands are instructions themselves). We don't, + // however, want to do this twice, so we cache the result in APInts that live + // in the caller. For the two-relevant-operands case, both operand values are + // provided here. + auto ComputeKnownBits = [&](unsigned BitWidth, const Value *V1, + const Value *V2) { + KnownZero = APInt(BitWidth, 0); + KnownOne = APInt(BitWidth, 0); + computeKnownBits(const_cast<Value*>(V1), KnownZero, KnownOne, DL, 0, AC, + UserI, DT); + + if (V2) { + KnownZero2 = APInt(BitWidth, 0); + KnownOne2 = APInt(BitWidth, 0); + computeKnownBits(const_cast<Value*>(V2), KnownZero2, KnownOne2, DL, 0, AC, + UserI, DT); + } + }; + + switch (UserI->getOpcode()) { + default: break; + case Instruction::Call: + case Instruction::Invoke: + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(UserI)) + switch (II->getIntrinsicID()) { + default: break; + case Intrinsic::bswap: + // The alive bits of the input are the swapped alive bits of + // the output. + AB = AOut.byteSwap(); + break; + case Intrinsic::ctlz: + if (OperandNo == 0) { + // We need some output bits, so we need all bits of the + // input to the left of, and including, the leftmost bit + // known to be one. + ComputeKnownBits(BitWidth, I, nullptr); + AB = APInt::getHighBitsSet(BitWidth, + std::min(BitWidth, KnownOne.countLeadingZeros()+1)); + } + break; + case Intrinsic::cttz: + if (OperandNo == 0) { + // We need some output bits, so we need all bits of the + // input to the right of, and including, the rightmost bit + // known to be one. + ComputeKnownBits(BitWidth, I, nullptr); + AB = APInt::getLowBitsSet(BitWidth, + std::min(BitWidth, KnownOne.countTrailingZeros()+1)); + } + break; + } + break; + case Instruction::Add: + case Instruction::Sub: + // Find the highest live output bit. We don't need any more input + // bits than that (adds, and thus subtracts, ripple only to the + // left). + AB = APInt::getLowBitsSet(BitWidth, AOut.getActiveBits()); + break; + case Instruction::Shl: + if (OperandNo == 0) + if (ConstantInt *CI = + dyn_cast<ConstantInt>(UserI->getOperand(1))) { + uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1); + AB = AOut.lshr(ShiftAmt); + + // If the shift is nuw/nsw, then the high bits are not dead + // (because we've promised that they *must* be zero). + const ShlOperator *S = cast<ShlOperator>(UserI); + if (S->hasNoSignedWrap()) + AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt+1); + else if (S->hasNoUnsignedWrap()) + AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt); + } + break; + case Instruction::LShr: + if (OperandNo == 0) + if (ConstantInt *CI = + dyn_cast<ConstantInt>(UserI->getOperand(1))) { + uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1); + AB = AOut.shl(ShiftAmt); + + // If the shift is exact, then the low bits are not dead + // (they must be zero). 
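// A worked example of the shift rules above, using plain 32-bit masks
// in place of APInt; AOut and AB mirror the alive-bit sets, and the
// concrete values are made up for illustration.
#include <cassert>
#include <cstdint>

int main() {
  // Suppose only the low byte of (x << 4) is consumed downstream.
  uint32_t AOut = 0x000000FFu; // alive bits of the shl result
  unsigned ShiftAmt = 4;

  // shl moves input bit i to output bit i + ShiftAmt, so the alive
  // input bits are the alive output bits shifted back to the right.
  uint32_t AB = AOut >> ShiftAmt;
  assert(AB == 0x0000000Fu);

  // For lshr the direction flips: the alive input bits are the alive
  // output bits shifted back to the left.
  uint32_t ABLshr = AOut << ShiftAmt;
  assert(ABLshr == 0x00000FF0u);
  return 0;
}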
+ if (cast<LShrOperator>(UserI)->isExact()) + AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt); + } + break; + case Instruction::AShr: + if (OperandNo == 0) + if (ConstantInt *CI = + dyn_cast<ConstantInt>(UserI->getOperand(1))) { + uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1); + AB = AOut.shl(ShiftAmt); + // Because the high input bit is replicated into the + // high-order bits of the result, if we need any of those + // bits, then we must keep the highest input bit. + if ((AOut & APInt::getHighBitsSet(BitWidth, ShiftAmt)) + .getBoolValue()) + AB.setBit(BitWidth-1); + + // If the shift is exact, then the low bits are not dead + // (they must be zero). + if (cast<AShrOperator>(UserI)->isExact()) + AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt); + } + break; + case Instruction::And: + AB = AOut; + + // For bits that are known zero, the corresponding bits in the + // other operand are dead (unless they're both zero, in which + // case they can't both be dead, so just mark the LHS bits as + // dead). + if (OperandNo == 0) { + ComputeKnownBits(BitWidth, I, UserI->getOperand(1)); + AB &= ~KnownZero2; + } else { + if (!isa<Instruction>(UserI->getOperand(0))) + ComputeKnownBits(BitWidth, UserI->getOperand(0), I); + AB &= ~(KnownZero & ~KnownZero2); + } + break; + case Instruction::Or: + AB = AOut; + + // For bits that are known one, the corresponding bits in the + // other operand are dead (unless they're both one, in which + // case they can't both be dead, so just mark the LHS bits as + // dead). + if (OperandNo == 0) { + ComputeKnownBits(BitWidth, I, UserI->getOperand(1)); + AB &= ~KnownOne2; + } else { + if (!isa<Instruction>(UserI->getOperand(0))) + ComputeKnownBits(BitWidth, UserI->getOperand(0), I); + AB &= ~(KnownOne & ~KnownOne2); + } + break; + case Instruction::Xor: + case Instruction::PHI: + AB = AOut; + break; + case Instruction::Trunc: + AB = AOut.zext(BitWidth); + break; + case Instruction::ZExt: + AB = AOut.trunc(BitWidth); + break; + case Instruction::SExt: + AB = AOut.trunc(BitWidth); + // Because the high input bit is replicated into the + // high-order bits of the result, if we need any of those + // bits, then we must keep the highest input bit. + if ((AOut & APInt::getHighBitsSet(AOut.getBitWidth(), + AOut.getBitWidth() - BitWidth)) + .getBoolValue()) + AB.setBit(BitWidth-1); + break; + case Instruction::Select: + if (OperandNo != 0) + AB = AOut; + break; + } +} + +bool BDCE::runOnFunction(Function& F) { + if (skipOptnoneFunction(F)) + return false; + + AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + DL = F.getParent()->getDataLayout(); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + + DenseMap<Instruction *, APInt> AliveBits; + SmallVector<Instruction*, 128> Worklist; + + // The set of visited instructions (non-integer-typed only). + SmallPtrSet<Instruction*, 128> Visited; + + // Collect the set of "root" instructions that are known live. + for (Instruction &I : inst_range(F)) { + if (!isAlwaysLive(&I)) + continue; + + DEBUG(dbgs() << "BDCE: Root: " << I << "\n"); + // For integer-valued instructions, set up an initial empty set of alive + // bits and add the instruction to the work list. For other instructions + // add their operands to the work list (for integer values operands, mark + // all bits as live). + if (IntegerType *IT = dyn_cast<IntegerType>(I.getType())) { + if (!AliveBits.count(&I)) { + AliveBits[&I] = APInt(IT->getBitWidth(), 0); + Worklist.push_back(&I); + } + + continue; + } + + // Non-integer-typed instructions... 
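// A worked example of the And rule above, again with plain masks and
// made-up values: wherever one operand is known to be zero, the
// corresponding bits of the other operand are dead, because the result
// bit is zero regardless of what they hold.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t AOut = 0xFFFFFFFFu;       // every bit of (LHS & RHS) is consumed
  uint32_t KnownZero2 = 0xFFFF0000u; // RHS known to fit in 16 bits
  uint32_t AB = AOut & ~KnownZero2;  // alive bits of the LHS
  assert(AB == 0x0000FFFFu);         // the high LHS bits are dead
  return 0;
}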
+    for (Use &OI : I.operands()) {
+      if (Instruction *J = dyn_cast<Instruction>(OI)) {
+        if (IntegerType *IT = dyn_cast<IntegerType>(J->getType()))
+          AliveBits[J] = APInt::getAllOnesValue(IT->getBitWidth());
+        Worklist.push_back(J);
+      }
+    }
+    // To save memory, we don't add I to the Visited set here. Instead, we
+    // check isAlwaysLive on every instruction when searching for dead
+    // instructions later (we need to check isAlwaysLive for the
+    // integer-typed instructions anyway).
+  }
+
+  // Propagate liveness backwards to operands.
+  while (!Worklist.empty()) {
+    Instruction *UserI = Worklist.pop_back_val();
+
+    DEBUG(dbgs() << "BDCE: Visiting: " << *UserI);
+    APInt AOut;
+    if (UserI->getType()->isIntegerTy()) {
+      AOut = AliveBits[UserI];
+      DEBUG(dbgs() << " Alive Out: " << AOut);
+    }
+    DEBUG(dbgs() << "\n");
+
+    if (!UserI->getType()->isIntegerTy())
+      Visited.insert(UserI);
+
+    APInt KnownZero, KnownOne, KnownZero2, KnownOne2;
+    // Compute the set of alive bits for each operand. These are anded into the
+    // existing set, if any, and if that changes the set of alive bits, the
+    // operand is added to the work-list.
+    for (Use &OI : UserI->operands()) {
+      if (Instruction *I = dyn_cast<Instruction>(OI)) {
+        if (IntegerType *IT = dyn_cast<IntegerType>(I->getType())) {
+          unsigned BitWidth = IT->getBitWidth();
+          APInt AB = APInt::getAllOnesValue(BitWidth);
+          if (UserI->getType()->isIntegerTy() && !AOut &&
+              !isAlwaysLive(UserI)) {
+            // If all bits of the output are dead, then all bits of the input
+            // are also dead.
+            AB = APInt(BitWidth, 0);
+          } else {
+            // Bits of each operand that are used to compute alive bits of the
+            // output are alive, all others are dead.
+            determineLiveOperandBits(UserI, I, OI.getOperandNo(), AOut, AB,
+                                     KnownZero, KnownOne,
+                                     KnownZero2, KnownOne2);
+          }
+
+          // If we've added to the set of alive bits (or the operand has not
+          // been previously visited), then re-queue the operand to be visited
+          // again.
+          APInt ABPrev(BitWidth, 0);
+          auto ABI = AliveBits.find(I);
+          if (ABI != AliveBits.end())
+            ABPrev = ABI->second;
+
+          APInt ABNew = AB | ABPrev;
+          if (ABNew != ABPrev || ABI == AliveBits.end()) {
+            AliveBits[I] = std::move(ABNew);
+            Worklist.push_back(I);
+          }
+        } else if (!Visited.count(I)) {
+          Worklist.push_back(I);
+        }
+      }
+    }
+  }
+
+  bool Changed = false;
+  // The inverse of the live set is the dead set. These are those instructions
+  // which have no side effects and do not influence the control flow or return
+  // value of the function, and may therefore be deleted safely.
+  // NOTE: We reuse the Worklist vector here for memory efficiency.
+  for (Instruction &I : inst_range(F)) {
+    // For live instructions that have all dead bits, first make them dead by
+    // replacing all uses with something else. Then, if they don't need to
+    // remain live (because they have side effects, etc.) we can remove them.
+    if (I.getType()->isIntegerTy()) {
+      auto ABI = AliveBits.find(&I);
+      if (ABI != AliveBits.end()) {
+        if (ABI->second.getBoolValue())
+          continue;
+
+        DEBUG(dbgs() << "BDCE: Trivializing: " << I << " (all bits dead)\n");
+        // FIXME: In theory we could substitute undef here instead of zero.
+        // This should be reconsidered once we settle on the semantics of
+        // undef, poison, etc.
+ Value *Zero = ConstantInt::get(I.getType(), 0); + ++NumSimplified; + I.replaceAllUsesWith(Zero); + Changed = true; + } + } else if (Visited.count(&I)) { + continue; + } + + if (isAlwaysLive(&I)) + continue; + + Worklist.push_back(&I); + I.dropAllReferences(); + Changed = true; + } + + for (Instruction *&I : Worklist) { + ++NumRemoved; + I->eraseFromParent(); + } + + return Changed; +} + +FunctionPass *llvm::createBitTrackingDCEPass() { + return new BDCE(); +} + diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index b3ee11e..d297eb1 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -1,6 +1,7 @@ add_llvm_library(LLVMScalarOpts ADCE.cpp AlignmentFromAssumptions.cpp + BDCE.cpp ConstantHoisting.cpp ConstantProp.cpp CorrelatedValuePropagation.cpp @@ -9,6 +10,7 @@ add_llvm_library(LLVMScalarOpts EarlyCSE.cpp FlattenCFGPass.cpp GVN.cpp + InductiveRangeCheckElimination.cpp IndVarSimplify.cpp JumpThreading.cpp LICM.cpp @@ -22,11 +24,14 @@ add_llvm_library(LLVMScalarOpts LoopUnrollPass.cpp LoopUnswitch.cpp LowerAtomic.cpp + LowerExpectIntrinsic.cpp MemCpyOptimizer.cpp MergedLoadStoreMotion.cpp PartiallyInlineLibCalls.cpp + PlaceSafepoints.cpp Reassociate.cpp Reg2Mem.cpp + RewriteStatepointsForGC.cpp SCCP.cpp SROA.cpp SampleProfile.cpp @@ -36,8 +41,13 @@ add_llvm_library(LLVMScalarOpts SeparateConstOffsetFromGEP.cpp SimplifyCFGPass.cpp Sink.cpp + StraightLineStrengthReduce.cpp StructurizeCFG.cpp TailRecursionElimination.cpp + + ADDITIONAL_HEADER_DIRS + ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms + ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms/Scalar ) add_dependencies(LLVMScalarOpts intrinsics_gen) diff --git a/lib/Transforms/Scalar/ConstantHoisting.cpp b/lib/Transforms/Scalar/ConstantHoisting.cpp index 27c177a..e3aab4b 100644 --- a/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -131,14 +131,14 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<TargetTransformInfo>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); } private: /// \brief Initialize the pass. void setup(Function &Fn) { DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - TTI = &getAnalysis<TargetTransformInfo>(); + TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn); Entry = &Fn.getEntryBlock(); } @@ -176,7 +176,7 @@ char ConstantHoisting::ID = 0; INITIALIZE_PASS_BEGIN(ConstantHoisting, "consthoist", "Constant Hoisting", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(ConstantHoisting, "consthoist", "Constant Hoisting", false, false) @@ -186,6 +186,9 @@ FunctionPass *llvm::createConstantHoistingPass() { /// \brief Perform the constant hoisting optimization for the given function. 
bool ConstantHoisting::runOnFunction(Function &Fn) { + if (skipOptnoneFunction(Fn)) + return false; + DEBUG(dbgs() << "********** Begin Constant Hoisting **********\n"); DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n'); diff --git a/lib/Transforms/Scalar/ConstantProp.cpp b/lib/Transforms/Scalar/ConstantProp.cpp index dd51ce1..29d4e05 100644 --- a/lib/Transforms/Scalar/ConstantProp.cpp +++ b/lib/Transforms/Scalar/ConstantProp.cpp @@ -26,7 +26,7 @@ #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" #include "llvm/Pass.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include <set> using namespace llvm; @@ -45,7 +45,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<TargetLibraryInfo>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } }; } @@ -53,7 +53,7 @@ namespace { char ConstantPropagation::ID = 0; INITIALIZE_PASS_BEGIN(ConstantPropagation, "constprop", "Simple constant propagation", false, false) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(ConstantPropagation, "constprop", "Simple constant propagation", false, false) @@ -70,7 +70,8 @@ bool ConstantPropagation::runOnFunction(Function &F) { bool Changed = false; DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr; - TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); + TargetLibraryInfo *TLI = + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); while (!WorkList.empty()) { Instruction *I = *WorkList.begin(); diff --git a/lib/Transforms/Scalar/DCE.cpp b/lib/Transforms/Scalar/DCE.cpp index 99fac75..3b262a2 100644 --- a/lib/Transforms/Scalar/DCE.cpp +++ b/lib/Transforms/Scalar/DCE.cpp @@ -21,7 +21,7 @@ #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" #include "llvm/Pass.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -42,7 +42,8 @@ namespace { bool runOnBasicBlock(BasicBlock &BB) override { if (skipOptnoneFunction(BB)) return false; - TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); + auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); + TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr; bool Changed = false; for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) { Instruction *Inst = DI++; @@ -95,7 +96,8 @@ bool DCE::runOnFunction(Function &F) { if (skipOptnoneFunction(F)) return false; - TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); + auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); + TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr; // Start out with all of the instructions in the worklist... 
std::vector<Instruction*> WorkList; diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp index a1ddc00..c2ce1d5 100644 --- a/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -33,7 +33,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp index cd2ecad..9309623 100644 --- a/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/lib/Transforms/Scalar/EarlyCSE.cpp @@ -12,12 +12,13 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/EarlyCSE.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/ScopedHashTable.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AssumptionTracker.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" @@ -26,7 +27,8 @@ #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/RecyclingAllocator.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" #include <deque> using namespace llvm; @@ -40,49 +42,44 @@ STATISTIC(NumCSELoad, "Number of load instructions CSE'd"); STATISTIC(NumCSECall, "Number of call instructions CSE'd"); STATISTIC(NumDSE, "Number of trivial dead stores removed"); -static unsigned getHash(const void *V) { - return DenseMapInfo<const void*>::getHashValue(V); -} - //===----------------------------------------------------------------------===// // SimpleValue //===----------------------------------------------------------------------===// namespace { - /// SimpleValue - Instances of this struct represent available values in the - /// scoped hash table. - struct SimpleValue { - Instruction *Inst; +/// \brief Struct representing the available values in the scoped hash table. +struct SimpleValue { + Instruction *Inst; - SimpleValue(Instruction *I) : Inst(I) { - assert((isSentinel() || canHandle(I)) && "Inst can't be handled!"); - } + SimpleValue(Instruction *I) : Inst(I) { + assert((isSentinel() || canHandle(I)) && "Inst can't be handled!"); + } - bool isSentinel() const { - return Inst == DenseMapInfo<Instruction*>::getEmptyKey() || - Inst == DenseMapInfo<Instruction*>::getTombstoneKey(); - } + bool isSentinel() const { + return Inst == DenseMapInfo<Instruction *>::getEmptyKey() || + Inst == DenseMapInfo<Instruction *>::getTombstoneKey(); + } - static bool canHandle(Instruction *Inst) { - // This can only handle non-void readnone functions. - if (CallInst *CI = dyn_cast<CallInst>(Inst)) - return CI->doesNotAccessMemory() && !CI->getType()->isVoidTy(); - return isa<CastInst>(Inst) || isa<BinaryOperator>(Inst) || - isa<GetElementPtrInst>(Inst) || isa<CmpInst>(Inst) || - isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) || - isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst) || - isa<ExtractValueInst>(Inst) || isa<InsertValueInst>(Inst); - } - }; + static bool canHandle(Instruction *Inst) { + // This can only handle non-void readnone functions. 
+ if (CallInst *CI = dyn_cast<CallInst>(Inst)) + return CI->doesNotAccessMemory() && !CI->getType()->isVoidTy(); + return isa<CastInst>(Inst) || isa<BinaryOperator>(Inst) || + isa<GetElementPtrInst>(Inst) || isa<CmpInst>(Inst) || + isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) || + isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst) || + isa<ExtractValueInst>(Inst) || isa<InsertValueInst>(Inst); + } +}; } namespace llvm { -template<> struct DenseMapInfo<SimpleValue> { +template <> struct DenseMapInfo<SimpleValue> { static inline SimpleValue getEmptyKey() { - return DenseMapInfo<Instruction*>::getEmptyKey(); + return DenseMapInfo<Instruction *>::getEmptyKey(); } static inline SimpleValue getTombstoneKey() { - return DenseMapInfo<Instruction*>::getTombstoneKey(); + return DenseMapInfo<Instruction *>::getTombstoneKey(); } static unsigned getHashValue(SimpleValue Val); static bool isEqual(SimpleValue LHS, SimpleValue RHS); @@ -92,7 +89,7 @@ template<> struct DenseMapInfo<SimpleValue> { unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) { Instruction *Inst = Val.Inst; // Hash in all of the operands as pointers. - if (BinaryOperator* BinOp = dyn_cast<BinaryOperator>(Inst)) { + if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst)) { Value *LHS = BinOp->getOperand(0); Value *RHS = BinOp->getOperand(1); if (BinOp->isCommutative() && BinOp->getOperand(0) > BinOp->getOperand(1)) @@ -101,8 +98,9 @@ unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) { if (isa<OverflowingBinaryOperator>(BinOp)) { // Hash the overflow behavior unsigned Overflow = - BinOp->hasNoSignedWrap() * OverflowingBinaryOperator::NoSignedWrap | - BinOp->hasNoUnsignedWrap() * OverflowingBinaryOperator::NoUnsignedWrap; + BinOp->hasNoSignedWrap() * OverflowingBinaryOperator::NoSignedWrap | + BinOp->hasNoUnsignedWrap() * + OverflowingBinaryOperator::NoUnsignedWrap; return hash_combine(BinOp->getOpcode(), Overflow, LHS, RHS); } @@ -135,12 +133,13 @@ unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) { assert((isa<CallInst>(Inst) || isa<BinaryOperator>(Inst) || isa<GetElementPtrInst>(Inst) || isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) || - isa<ShuffleVectorInst>(Inst)) && "Invalid/unknown instruction"); + isa<ShuffleVectorInst>(Inst)) && + "Invalid/unknown instruction"); // Mix in the opcode. 
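// A sketch of why getHashValue sorts commutative operands first:
// `a + b` and `b + a` must land on the same hash for the table to
// unify them. hashCommutativeBinOp and the boost-style mixing are
// illustrative stand-ins for llvm::hash_combine, not the real code.
#include <cstddef>
#include <functional>
#include <utility>

struct Value;

size_t hashCommutativeBinOp(unsigned Opcode, Value *LHS, Value *RHS) {
  if (LHS > RHS)
    std::swap(LHS, RHS); // canonical operand order, as above
  size_t H = std::hash<unsigned>()(Opcode);
  H ^= std::hash<Value *>()(LHS) + 0x9e3779b9 + (H << 6) + (H >> 2);
  H ^= std::hash<Value *>()(RHS) + 0x9e3779b9 + (H << 6) + (H >> 2);
  return H;
}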
- return hash_combine(Inst->getOpcode(), - hash_combine_range(Inst->value_op_begin(), - Inst->value_op_end())); + return hash_combine( + Inst->getOpcode(), + hash_combine_range(Inst->value_op_begin(), Inst->value_op_end())); } bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) { @@ -149,22 +148,24 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) { if (LHS.isSentinel() || RHS.isSentinel()) return LHSI == RHSI; - if (LHSI->getOpcode() != RHSI->getOpcode()) return false; - if (LHSI->isIdenticalTo(RHSI)) return true; + if (LHSI->getOpcode() != RHSI->getOpcode()) + return false; + if (LHSI->isIdenticalTo(RHSI)) + return true; // If we're not strictly identical, we still might be a commutable instruction if (BinaryOperator *LHSBinOp = dyn_cast<BinaryOperator>(LHSI)) { if (!LHSBinOp->isCommutative()) return false; - assert(isa<BinaryOperator>(RHSI) - && "same opcode, but different instruction type?"); + assert(isa<BinaryOperator>(RHSI) && + "same opcode, but different instruction type?"); BinaryOperator *RHSBinOp = cast<BinaryOperator>(RHSI); // Check overflow attributes if (isa<OverflowingBinaryOperator>(LHSBinOp)) { - assert(isa<OverflowingBinaryOperator>(RHSBinOp) - && "same opcode, but different operator type?"); + assert(isa<OverflowingBinaryOperator>(RHSBinOp) && + "same opcode, but different operator type?"); if (LHSBinOp->hasNoUnsignedWrap() != RHSBinOp->hasNoUnsignedWrap() || LHSBinOp->hasNoSignedWrap() != RHSBinOp->hasNoSignedWrap()) return false; @@ -172,16 +173,16 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) { // Commuted equality return LHSBinOp->getOperand(0) == RHSBinOp->getOperand(1) && - LHSBinOp->getOperand(1) == RHSBinOp->getOperand(0); + LHSBinOp->getOperand(1) == RHSBinOp->getOperand(0); } if (CmpInst *LHSCmp = dyn_cast<CmpInst>(LHSI)) { - assert(isa<CmpInst>(RHSI) - && "same opcode, but different instruction type?"); + assert(isa<CmpInst>(RHSI) && + "same opcode, but different instruction type?"); CmpInst *RHSCmp = cast<CmpInst>(RHSI); // Commuted equality return LHSCmp->getOperand(0) == RHSCmp->getOperand(1) && - LHSCmp->getOperand(1) == RHSCmp->getOperand(0) && - LHSCmp->getSwappedPredicate() == RHSCmp->getPredicate(); + LHSCmp->getOperand(1) == RHSCmp->getOperand(0) && + LHSCmp->getSwappedPredicate() == RHSCmp->getPredicate(); } return false; @@ -192,57 +193,52 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) { //===----------------------------------------------------------------------===// namespace { - /// CallValue - Instances of this struct represent available call values in - /// the scoped hash table. - struct CallValue { - Instruction *Inst; +/// \brief Struct representing the available call values in the scoped hash +/// table. +struct CallValue { + Instruction *Inst; - CallValue(Instruction *I) : Inst(I) { - assert((isSentinel() || canHandle(I)) && "Inst can't be handled!"); - } + CallValue(Instruction *I) : Inst(I) { + assert((isSentinel() || canHandle(I)) && "Inst can't be handled!"); + } - bool isSentinel() const { - return Inst == DenseMapInfo<Instruction*>::getEmptyKey() || - Inst == DenseMapInfo<Instruction*>::getTombstoneKey(); - } + bool isSentinel() const { + return Inst == DenseMapInfo<Instruction *>::getEmptyKey() || + Inst == DenseMapInfo<Instruction *>::getTombstoneKey(); + } - static bool canHandle(Instruction *Inst) { - // Don't value number anything that returns void. 
- if (Inst->getType()->isVoidTy()) - return false; + static bool canHandle(Instruction *Inst) { + // Don't value number anything that returns void. + if (Inst->getType()->isVoidTy()) + return false; - CallInst *CI = dyn_cast<CallInst>(Inst); - if (!CI || !CI->onlyReadsMemory()) - return false; - return true; - } - }; + CallInst *CI = dyn_cast<CallInst>(Inst); + if (!CI || !CI->onlyReadsMemory()) + return false; + return true; + } +}; } namespace llvm { - template<> struct DenseMapInfo<CallValue> { - static inline CallValue getEmptyKey() { - return DenseMapInfo<Instruction*>::getEmptyKey(); - } - static inline CallValue getTombstoneKey() { - return DenseMapInfo<Instruction*>::getTombstoneKey(); - } - static unsigned getHashValue(CallValue Val); - static bool isEqual(CallValue LHS, CallValue RHS); - }; +template <> struct DenseMapInfo<CallValue> { + static inline CallValue getEmptyKey() { + return DenseMapInfo<Instruction *>::getEmptyKey(); + } + static inline CallValue getTombstoneKey() { + return DenseMapInfo<Instruction *>::getTombstoneKey(); + } + static unsigned getHashValue(CallValue Val); + static bool isEqual(CallValue LHS, CallValue RHS); +}; } + unsigned DenseMapInfo<CallValue>::getHashValue(CallValue Val) { Instruction *Inst = Val.Inst; - // Hash in all of the operands as pointers. - unsigned Res = 0; - for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i) { - assert(!Inst->getOperand(i)->getType()->isMetadataTy() && - "Cannot value number calls with metadata operands"); - Res ^= getHash(Inst->getOperand(i)) << (i & 0xF); - } - - // Mix in the opcode. - return (Res << 1) ^ Inst->getOpcode(); + // Hash all of the operands as pointers and mix in the opcode. + return hash_combine( + Inst->getOpcode(), + hash_combine_range(Inst->value_op_begin(), Inst->value_op_end())); } bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) { @@ -252,103 +248,106 @@ bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) { return LHSI->isIdenticalTo(RHSI); } - //===----------------------------------------------------------------------===// -// EarlyCSE pass. +// EarlyCSE implementation //===----------------------------------------------------------------------===// namespace { - -/// EarlyCSE - This pass does a simple depth-first walk over the dominator -/// tree, eliminating trivially redundant instructions and using instsimplify -/// to canonicalize things as it goes. It is intended to be fast and catch -/// obvious cases so that instcombine and other passes are more effective. It -/// is expected that a later pass of GVN will catch the interesting/hard -/// cases. -class EarlyCSE : public FunctionPass { +/// \brief A simple and fast domtree-based CSE pass. +/// +/// This pass does a simple depth-first walk over the dominator tree, +/// eliminating trivially redundant instructions and using instsimplify to +/// canonicalize things as it goes. It is intended to be fast and catch obvious +/// cases so that instcombine and other passes are more effective. It is +/// expected that a later pass of GVN will catch the interesting/hard cases. 
+class EarlyCSE { public: + Function &F; const DataLayout *DL; - const TargetLibraryInfo *TLI; - DominatorTree *DT; - AssumptionTracker *AT; - typedef RecyclingAllocator<BumpPtrAllocator, - ScopedHashTableVal<SimpleValue, Value*> > AllocatorTy; - typedef ScopedHashTable<SimpleValue, Value*, DenseMapInfo<SimpleValue>, + const TargetLibraryInfo &TLI; + const TargetTransformInfo &TTI; + DominatorTree &DT; + AssumptionCache &AC; + typedef RecyclingAllocator< + BumpPtrAllocator, ScopedHashTableVal<SimpleValue, Value *>> AllocatorTy; + typedef ScopedHashTable<SimpleValue, Value *, DenseMapInfo<SimpleValue>, AllocatorTy> ScopedHTType; - /// AvailableValues - This scoped hash table contains the current values of - /// all of our simple scalar expressions. As we walk down the domtree, we - /// look to see if instructions are in this: if so, we replace them with what - /// we find, otherwise we insert them so that dominated values can succeed in - /// their lookup. - ScopedHTType *AvailableValues; - - /// AvailableLoads - This scoped hash table contains the current values - /// of loads. This allows us to get efficient access to dominating loads when - /// we have a fully redundant load. In addition to the most recent load, we - /// keep track of a generation count of the read, which is compared against - /// the current generation count. The current generation count is - /// incremented after every possibly writing memory operation, which ensures - /// that we only CSE loads with other loads that have no intervening store. - typedef RecyclingAllocator<BumpPtrAllocator, - ScopedHashTableVal<Value*, std::pair<Value*, unsigned> > > LoadMapAllocator; - typedef ScopedHashTable<Value*, std::pair<Value*, unsigned>, - DenseMapInfo<Value*>, LoadMapAllocator> LoadHTType; - LoadHTType *AvailableLoads; - - /// AvailableCalls - This scoped hash table contains the current values - /// of read-only call values. It uses the same generation count as loads. - typedef ScopedHashTable<CallValue, std::pair<Value*, unsigned> > CallHTType; - CallHTType *AvailableCalls; - - /// CurrentGeneration - This is the current generation of the memory value. + /// \brief A scoped hash table of the current values of all of our simple + /// scalar expressions. + /// + /// As we walk down the domtree, we look to see if instructions are in this: + /// if so, we replace them with what we find, otherwise we insert them so + /// that dominated values can succeed in their lookup. + ScopedHTType AvailableValues; + + /// \brief A scoped hash table of the current values of loads. + /// + /// This allows us to get efficient access to dominating loads when we have + /// a fully redundant load. In addition to the most recent load, we keep + /// track of a generation count of the read, which is compared against the + /// current generation count. The current generation count is incremented + /// after every possibly writing memory operation, which ensures that we only + /// CSE loads with other loads that have no intervening store. + typedef RecyclingAllocator< + BumpPtrAllocator, + ScopedHashTableVal<Value *, std::pair<Value *, unsigned>>> + LoadMapAllocator; + typedef ScopedHashTable<Value *, std::pair<Value *, unsigned>, + DenseMapInfo<Value *>, LoadMapAllocator> LoadHTType; + LoadHTType AvailableLoads; + + /// \brief A scoped hash table of the current values of read-only call + /// values. + /// + /// It uses the same generation count as loads. 
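// A minimal model of the generation scheme described above
// (AvailableLoadsModel is illustrative, not the pass's real scoped
// hash table): a remembered load may only be reused if nothing that
// could write memory has executed since it was recorded.
#include <unordered_map>
#include <utility>

struct Value;

struct AvailableLoadsModel {
  unsigned CurrentGeneration = 0;
  std::unordered_map<Value *, std::pair<Value *, unsigned>> Table;

  void recordLoad(Value *Ptr, Value *Loaded) {
    Table[Ptr] = {Loaded, CurrentGeneration};
  }

  void recordMayWriteMemory() { ++CurrentGeneration; } // stales every entry

  Value *lookup(Value *Ptr) const {
    auto It = Table.find(Ptr);
    if (It != Table.end() && It->second.second == CurrentGeneration)
      return It->second.first; // no intervening store: safe to reuse
    return nullptr;
  }
};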
+  typedef ScopedHashTable<CallValue, std::pair<Value *, unsigned>> CallHTType;
+  CallHTType AvailableCalls;
+
+  /// \brief This is the current generation of the memory value.
   unsigned CurrentGeneration;
 
-  static char ID;
-  explicit EarlyCSE() : FunctionPass(ID) {
-    initializeEarlyCSEPass(*PassRegistry::getPassRegistry());
+  /// \brief Set up the EarlyCSE runner for a particular function.
+  EarlyCSE(Function &F, const DataLayout *DL, const TargetLibraryInfo &TLI,
+           const TargetTransformInfo &TTI, DominatorTree &DT,
+           AssumptionCache &AC)
+      : F(F), DL(DL), TLI(TLI), TTI(TTI), DT(DT), AC(AC), CurrentGeneration(0) {
   }
 
-  bool runOnFunction(Function &F) override;
+  bool run();
 
 private:
-
-  // NodeScope - almost a POD, but needs to call the constructors for the
-  // scoped hash tables so that a new scope gets pushed on. These are RAII so
-  // that the scope gets popped when the NodeScope is destroyed.
+  // Almost a POD, but needs to call the constructors for the scoped hash
+  // tables so that a new scope gets pushed on. These are RAII so that the
+  // scope gets popped when the NodeScope is destroyed.
   class NodeScope {
-   public:
-    NodeScope(ScopedHTType *availableValues,
-              LoadHTType *availableLoads,
-              CallHTType *availableCalls) :
-      Scope(*availableValues),
-      LoadScope(*availableLoads),
-      CallScope(*availableCalls) {}
-
-   private:
-    NodeScope(const NodeScope&) LLVM_DELETED_FUNCTION;
-    void operator=(const NodeScope&) LLVM_DELETED_FUNCTION;
+  public:
+    NodeScope(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
+              CallHTType &AvailableCalls)
+        : Scope(AvailableValues), LoadScope(AvailableLoads),
+          CallScope(AvailableCalls) {}
+
+  private:
+    NodeScope(const NodeScope &) = delete;
+    void operator=(const NodeScope &) = delete;
 
     ScopedHTType::ScopeTy Scope;
     LoadHTType::ScopeTy LoadScope;
     CallHTType::ScopeTy CallScope;
   };
 
-  // StackNode - contains all the needed information to create a stack for
-  // doing a depth first tranversal of the tree. This includes scopes for
-  // values, loads, and calls as well as the generation. There is a child
-  // iterator so that the children do not need to be store spearately.
+  // Contains all the needed information to create a stack for doing a depth
+  // first traversal of the tree. This includes scopes for values, loads, and
+  // calls as well as the generation. There is a child iterator so that the
+  // children do not need to be stored separately.
   class StackNode {
-   public:
-    StackNode(ScopedHTType *availableValues,
-              LoadHTType *availableLoads,
-              CallHTType *availableCalls,
-              unsigned cg, DomTreeNode *n,
-              DomTreeNode::iterator child, DomTreeNode::iterator end) :
-      CurrentGeneration(cg), ChildGeneration(cg), Node(n),
-      ChildIter(child), EndIter(end),
-      Scopes(availableValues, availableLoads, availableCalls),
-      Processed(false) {}
+  public:
+    StackNode(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
+              CallHTType &AvailableCalls, unsigned cg, DomTreeNode *n,
+              DomTreeNode::iterator child, DomTreeNode::iterator end)
+        : CurrentGeneration(cg), ChildGeneration(cg), Node(n), ChildIter(child),
+          EndIter(end), Scopes(AvailableValues, AvailableLoads, AvailableCalls),
+          Processed(false) {}
 
     // Accessors.
unsigned currentGeneration() { return CurrentGeneration; } @@ -365,9 +364,9 @@ private: bool isProcessed() { return Processed; } void process() { Processed = true; } - private: - StackNode(const StackNode&) LLVM_DELETED_FUNCTION; - void operator=(const StackNode&) LLVM_DELETED_FUNCTION; + private: + StackNode(const StackNode &) = delete; + void operator=(const StackNode &) = delete; // Members. unsigned CurrentGeneration; @@ -379,31 +378,78 @@ private: bool Processed; }; + /// \brief Wrapper class to handle memory instructions, including loads, + /// stores and intrinsic loads and stores defined by the target. + class ParseMemoryInst { + public: + ParseMemoryInst(Instruction *Inst, const TargetTransformInfo &TTI) + : Load(false), Store(false), Vol(false), MayReadFromMemory(false), + MayWriteToMemory(false), MatchingId(-1), Ptr(nullptr) { + MayReadFromMemory = Inst->mayReadFromMemory(); + MayWriteToMemory = Inst->mayWriteToMemory(); + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { + MemIntrinsicInfo Info; + if (!TTI.getTgtMemIntrinsic(II, Info)) + return; + if (Info.NumMemRefs == 1) { + Store = Info.WriteMem; + Load = Info.ReadMem; + MatchingId = Info.MatchingId; + MayReadFromMemory = Info.ReadMem; + MayWriteToMemory = Info.WriteMem; + Vol = Info.Vol; + Ptr = Info.PtrVal; + } + } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { + Load = true; + Vol = !LI->isSimple(); + Ptr = LI->getPointerOperand(); + } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + Store = true; + Vol = !SI->isSimple(); + Ptr = SI->getPointerOperand(); + } + } + bool isLoad() { return Load; } + bool isStore() { return Store; } + bool isVolatile() { return Vol; } + bool isMatchingMemLoc(const ParseMemoryInst &Inst) { + return Ptr == Inst.Ptr && MatchingId == Inst.MatchingId; + } + bool isValid() { return Ptr != nullptr; } + int getMatchingId() { return MatchingId; } + Value *getPtr() { return Ptr; } + bool mayReadFromMemory() { return MayReadFromMemory; } + bool mayWriteToMemory() { return MayWriteToMemory; } + + private: + bool Load; + bool Store; + bool Vol; + bool MayReadFromMemory; + bool MayWriteToMemory; + // For regular (non-intrinsic) loads/stores, this is set to -1. For + // intrinsic loads/stores, the id is retrieved from the corresponding + // field in the MemIntrinsicInfo structure. That field contains + // non-negative values only. + int MatchingId; + Value *Ptr; + }; + bool processNode(DomTreeNode *Node); - // This transformation requires dominator postdominator info - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionTracker>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<TargetLibraryInfo>(); - AU.setPreservesCFG(); + Value *getOrCreateResult(Value *Inst, Type *ExpectedType) const { + if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) + return LI; + else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) + return SI->getValueOperand(); + assert(isa<IntrinsicInst>(Inst) && "Instruction not supported"); + return TTI.getOrCreateResultFromMemIntrinsic(cast<IntrinsicInst>(Inst), + ExpectedType); } }; } -char EarlyCSE::ID = 0; - -// createEarlyCSEPass - The public interface to this file. 
-FunctionPass *llvm::createEarlyCSEPass() { - return new EarlyCSE(); -} - -INITIALIZE_PASS_BEGIN(EarlyCSE, "early-cse", "Early CSE", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionTracker) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) -INITIALIZE_PASS_END(EarlyCSE, "early-cse", "Early CSE", false, false) - bool EarlyCSE::processNode(DomTreeNode *Node) { BasicBlock *BB = Node->getBlock(); @@ -420,17 +466,17 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { /// as long as there in no instruction that reads memory. If we see a store /// to the same location, we delete the dead store. This zaps trivial dead /// stores which can occur in bitfield code among other things. - StoreInst *LastStore = nullptr; + Instruction *LastStore = nullptr; bool Changed = false; // See if any instructions in the block can be eliminated. If so, do it. If // not, add them to AvailableValues. - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { Instruction *Inst = I++; // Dead instructions should just be removed. - if (isInstructionTriviallyDead(Inst, TLI)) { + if (isInstructionTriviallyDead(Inst, &TLI)) { DEBUG(dbgs() << "EarlyCSE DCE: " << *Inst << '\n'); Inst->eraseFromParent(); Changed = true; @@ -449,7 +495,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // If the instruction can be simplified (e.g. X+0 = X) then replace it with // its simpler value. - if (Value *V = SimplifyInstruction(Inst, DL, TLI, DT, AT)) { + if (Value *V = SimplifyInstruction(Inst, DL, &TLI, &DT, &AC)) { DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << " to: " << *V << '\n'); Inst->replaceAllUsesWith(V); Inst->eraseFromParent(); @@ -461,7 +507,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // If this is a simple instruction that we can value number, process it. if (SimpleValue::canHandle(Inst)) { // See if the instruction has an available value. If so, use it. - if (Value *V = AvailableValues->lookup(Inst)) { + if (Value *V = AvailableValues.lookup(Inst)) { DEBUG(dbgs() << "EarlyCSE CSE: " << *Inst << " to: " << *V << '\n'); Inst->replaceAllUsesWith(V); Inst->eraseFromParent(); @@ -471,52 +517,66 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { } // Otherwise, just remember that this value is available. - AvailableValues->insert(Inst, Inst); + AvailableValues.insert(Inst, Inst); continue; } + ParseMemoryInst MemInst(Inst, TTI); // If this is a non-volatile load, process it. - if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { + if (MemInst.isValid() && MemInst.isLoad()) { // Ignore volatile loads. - if (!LI->isSimple()) { + if (MemInst.isVolatile()) { LastStore = nullptr; + // Don't CSE across synchronization boundaries. + if (Inst->mayWriteToMemory()) + ++CurrentGeneration; continue; } // If we have an available version of this load, and if it is the right // generation, replace this instruction. 
- std::pair<Value*, unsigned> InVal = - AvailableLoads->lookup(Inst->getOperand(0)); + std::pair<Value *, unsigned> InVal = + AvailableLoads.lookup(MemInst.getPtr()); if (InVal.first != nullptr && InVal.second == CurrentGeneration) { - DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst << " to: " - << *InVal.first << '\n'); - if (!Inst->use_empty()) Inst->replaceAllUsesWith(InVal.first); - Inst->eraseFromParent(); - Changed = true; - ++NumCSELoad; - continue; + Value *Op = getOrCreateResult(InVal.first, Inst->getType()); + if (Op != nullptr) { + DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst + << " to: " << *InVal.first << '\n'); + if (!Inst->use_empty()) + Inst->replaceAllUsesWith(Op); + Inst->eraseFromParent(); + Changed = true; + ++NumCSELoad; + continue; + } } // Otherwise, remember that we have this instruction. - AvailableLoads->insert(Inst->getOperand(0), - std::pair<Value*, unsigned>(Inst, CurrentGeneration)); + AvailableLoads.insert(MemInst.getPtr(), std::pair<Value *, unsigned>( + Inst, CurrentGeneration)); LastStore = nullptr; continue; } // If this instruction may read from memory, forget LastStore. - if (Inst->mayReadFromMemory()) + // Load/store intrinsics will indicate both a read and a write to + // memory. The target may override this (e.g. so that a store intrinsic + // does not read from memory, and thus will be treated the same as a + // regular store for commoning purposes). + if (Inst->mayReadFromMemory() && + !(MemInst.isValid() && !MemInst.mayReadFromMemory())) LastStore = nullptr; // If this is a read-only call, process it. if (CallValue::canHandle(Inst)) { // If we have an available version of this call, and if it is the right // generation, replace this instruction. - std::pair<Value*, unsigned> InVal = AvailableCalls->lookup(Inst); + std::pair<Value *, unsigned> InVal = AvailableCalls.lookup(Inst); if (InVal.first != nullptr && InVal.second == CurrentGeneration) { - DEBUG(dbgs() << "EarlyCSE CSE CALL: " << *Inst << " to: " - << *InVal.first << '\n'); - if (!Inst->use_empty()) Inst->replaceAllUsesWith(InVal.first); + DEBUG(dbgs() << "EarlyCSE CSE CALL: " << *Inst + << " to: " << *InVal.first << '\n'); + if (!Inst->use_empty()) + Inst->replaceAllUsesWith(InVal.first); Inst->eraseFromParent(); Changed = true; ++NumCSECall; @@ -524,8 +584,8 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { } // Otherwise, remember that we have this instruction. - AvailableCalls->insert(Inst, - std::pair<Value*, unsigned>(Inst, CurrentGeneration)); + AvailableCalls.insert( + Inst, std::pair<Value *, unsigned>(Inst, CurrentGeneration)); continue; } @@ -535,17 +595,19 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { if (Inst->mayWriteToMemory()) { ++CurrentGeneration; - if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + if (MemInst.isValid() && MemInst.isStore()) { // We do a trivial form of DSE if there are two stores to the same // location with no intervening loads. Delete the earlier store. 
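// The LastStore protocol below, reduced to a toy (Store and the names
// here are made up): remember the most recent store, forget it on any
// read, and if a second store to the same location arrives first, the
// remembered store was dead.
#include <cstdio>

struct Store { void *Ptr; int Val; };

int main() {
  int Loc = 0; // a single memory location
  Store A{&Loc, 1}, B{&Loc, 2};

  const Store *LastStore = &A; // saw "store 1 to Loc"
  // ... no instruction reads Loc in between ...
  if (LastStore && LastStore->Ptr == B.Ptr)
    std::printf("store of %d is dead\n", LastStore->Val); // A is dead
  LastStore = &B; // B becomes the new DSE candidate
  return 0;
}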
-      if (LastStore &&
-          LastStore->getPointerOperand() == SI->getPointerOperand()) {
-        DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore << " due to: "
-              << *Inst << '\n');
-        LastStore->eraseFromParent();
-        Changed = true;
-        ++NumDSE;
-        LastStore = nullptr;
+      if (LastStore) {
+        ParseMemoryInst LastStoreMemInst(LastStore, TTI);
+        if (LastStoreMemInst.isMatchingMemLoc(MemInst)) {
+          DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
+                       << " due to: " << *Inst << '\n');
+          LastStore->eraseFromParent();
+          Changed = true;
+          ++NumDSE;
+          LastStore = nullptr;
+        }
 
         // fallthrough - we can exploit information about this store
       }
 
@@ -554,12 +616,12 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
       // version of the pointer. It is safe to forward from volatile stores
       // to non-volatile loads, so we don't have to check for volatility of
       // the store.
-      AvailableLoads->insert(SI->getPointerOperand(),
-          std::pair<Value*, unsigned>(SI->getValueOperand(), CurrentGeneration));
+      AvailableLoads.insert(MemInst.getPtr(), std::pair<Value *, unsigned>(
+                                                  Inst, CurrentGeneration));
 
       // Remember that this was the last store we saw for DSE.
-      if (SI->isSimple())
-        LastStore = SI;
+      if (!MemInst.isVolatile())
+        LastStore = Inst;
       }
     }
   }
 
@@ -567,40 +629,20 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
   return Changed;
 }
-
-bool EarlyCSE::runOnFunction(Function &F) {
-  if (skipOptnoneFunction(F))
-    return false;
-
-  // Note, deque is being used here because there is significant performance gains
-  // over vector when the container becomes very large due to the specific access
-  // patterns. For more information see the mailing list discussion on this:
+bool EarlyCSE::run() {
+  // Note, deque is being used here because there are significant performance
+  // gains over vector when the container becomes very large due to the
+  // specific access patterns. For more information see the mailing list
+  // discussion on this:
   // http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20120116/135228.html
   std::deque<StackNode *> nodesToProcess;
 
-  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
-  DL = DLP ? &DLP->getDataLayout() : nullptr;
-  TLI = &getAnalysis<TargetLibraryInfo>();
-  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-  AT = &getAnalysis<AssumptionTracker>();
-
-  // Tables that the pass uses when walking the domtree.
-  ScopedHTType AVTable;
-  AvailableValues = &AVTable;
-  LoadHTType LoadTable;
-  AvailableLoads = &LoadTable;
-  CallHTType CallTable;
-  AvailableCalls = &CallTable;
-
-  CurrentGeneration = 0;
   bool Changed = false;
 
   // Process the root node.
-  nodesToProcess.push_back(
-      new StackNode(AvailableValues, AvailableLoads, AvailableCalls,
-                    CurrentGeneration, DT->getRootNode(),
-                    DT->getRootNode()->begin(),
-                    DT->getRootNode()->end()));
+  nodesToProcess.push_back(new StackNode(
+      AvailableValues, AvailableLoads, AvailableCalls, CurrentGeneration,
      DT.getRootNode(), DT.getRootNode()->begin(), DT.getRootNode()->end()));
 
   // Save the current generation.
   unsigned LiveOutGeneration = CurrentGeneration;
 
@@ -624,11 +666,9 @@ bool EarlyCSE::runOnFunction(Function &F) {
       // Push the next child onto the stack.
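// The shape of run()'s explicit-stack walk, extracted into a toy
// (Node, Frame, and dfs are illustrative): recursion over the
// dominator tree is replaced by a stack of frames, so deep trees
// cannot overflow the call stack, and per-node state pops with its
// frame much like the NodeScope RAII objects above.
#include <cstddef>
#include <vector>

struct Node { std::vector<Node *> Children; };

void dfs(Node *Root) {
  struct Frame { Node *N; size_t NextChild; bool Processed; };
  std::vector<Frame> Stack{{Root, 0, false}};

  while (!Stack.empty()) {
    Frame &F = Stack.back();
    if (!F.Processed) {
      // First visit: process the node's block (push scopes, CSE, ...).
      F.Processed = true;
    } else if (F.NextChild < F.N->Children.size()) {
      // Queue the next dominator-tree child.
      Stack.push_back({F.N->Children[F.NextChild++], 0, false});
    } else {
      // All children done; scopes for this node pop with the frame.
      Stack.pop_back();
    }
  }
}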
DomTreeNode *child = NodeToProcess->nextChild(); nodesToProcess.push_back( - new StackNode(AvailableValues, - AvailableLoads, - AvailableCalls, - NodeToProcess->childGeneration(), child, - child->begin(), child->end())); + new StackNode(AvailableValues, AvailableLoads, AvailableCalls, + NodeToProcess->childGeneration(), child, child->begin(), + child->end())); } else { // It has been processed, and there are no more children to process, // so delete it and pop it off the stack. @@ -642,3 +682,78 @@ bool EarlyCSE::runOnFunction(Function &F) { return Changed; } + +PreservedAnalyses EarlyCSEPass::run(Function &F, + AnalysisManager<Function> *AM) { + const DataLayout *DL = F.getParent()->getDataLayout(); + + auto &TLI = AM->getResult<TargetLibraryAnalysis>(F); + auto &TTI = AM->getResult<TargetIRAnalysis>(F); + auto &DT = AM->getResult<DominatorTreeAnalysis>(F); + auto &AC = AM->getResult<AssumptionAnalysis>(F); + + EarlyCSE CSE(F, DL, TLI, TTI, DT, AC); + + if (!CSE.run()) + return PreservedAnalyses::all(); + + // CSE preserves the dominator tree because it doesn't mutate the CFG. + // FIXME: Bundle this with other CFG-preservation. + PreservedAnalyses PA; + PA.preserve<DominatorTreeAnalysis>(); + return PA; +} + +namespace { +/// \brief A simple and fast domtree-based CSE pass. +/// +/// This pass does a simple depth-first walk over the dominator tree, +/// eliminating trivially redundant instructions and using instsimplify to +/// canonicalize things as it goes. It is intended to be fast and catch obvious +/// cases so that instcombine and other passes are more effective. It is +/// expected that a later pass of GVN will catch the interesting/hard cases. +class EarlyCSELegacyPass : public FunctionPass { +public: + static char ID; + + EarlyCSELegacyPass() : FunctionPass(ID) { + initializeEarlyCSELegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + if (skipOptnoneFunction(F)) + return false; + + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + auto *DL = DLP ? 
&DLP->getDataLayout() : nullptr; + auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + + EarlyCSE CSE(F, DL, TLI, TTI, DT, AC); + + return CSE.run(); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.setPreservesCFG(); + } +}; +} + +char EarlyCSELegacyPass::ID = 0; + +FunctionPass *llvm::createEarlyCSEPass() { return new EarlyCSELegacyPass(); } + +INITIALIZE_PASS_BEGIN(EarlyCSELegacyPass, "early-cse", "Early CSE", false, + false) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(EarlyCSELegacyPass, "early-cse", "Early CSE", false, false) diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 7dba4e2..73a1f25 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -20,11 +20,12 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/MapVector.h" +#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/AssumptionTracker.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" @@ -44,7 +45,7 @@ #include "llvm/Support/Allocator.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" @@ -457,7 +458,7 @@ uint32_t ValueTable::lookup_or_add(Value *V) { return e; } -/// lookup - Returns the value number of the specified value. Fails if +/// Returns the value number of the specified value. Fails if /// the value has not yet been numbered. uint32_t ValueTable::lookup(Value *V) const { DenseMap<Value*, uint32_t>::const_iterator VI = valueNumbering.find(V); @@ -465,7 +466,7 @@ uint32_t ValueTable::lookup(Value *V) const { return VI->second; } -/// lookup_or_add_cmp - Returns the value number of the given comparison, +/// Returns the value number of the given comparison, /// assigning it a new number if it did not have one before. Useful when /// we deduced the result of a comparison, but don't immediately have an /// instruction realizing that comparison to hand. @@ -478,14 +479,14 @@ uint32_t ValueTable::lookup_or_add_cmp(unsigned Opcode, return e; } -/// clear - Remove all entries from the ValueTable. +/// Remove all entries from the ValueTable. void ValueTable::clear() { valueNumbering.clear(); expressionNumbering.clear(); nextValueNumber = 1; } -/// erase - Remove a value from the value numbering. +/// Remove a value from the value numbering. 
void ValueTable::erase(Value *V) { valueNumbering.erase(V); } @@ -581,8 +582,8 @@ namespace { return cast<MemIntrinsic>(Val.getPointer()); } - /// MaterializeAdjustedValue - Emit code into this block to adjust the value - /// defined here to the specified type. This handles various coercion cases. + /// Emit code into this block to adjust the value defined here to the + /// specified type. This handles various coercion cases. Value *MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) const; }; @@ -592,12 +593,12 @@ namespace { DominatorTree *DT; const DataLayout *DL; const TargetLibraryInfo *TLI; - AssumptionTracker *AT; + AssumptionCache *AC; SetVector<BasicBlock *> DeadBlocks; ValueTable VN; - /// LeaderTable - A mapping from value numbers to lists of Value*'s that + /// A mapping from value numbers to lists of Value*'s that /// have that value number. Use findLeader to query it. struct LeaderTableEntry { Value *Val; @@ -622,7 +623,7 @@ namespace { bool runOnFunction(Function &F) override; - /// markInstructionForDeletion - This removes the specified instruction from + /// This removes the specified instruction from /// our various maps and marks it for deletion. void markInstructionForDeletion(Instruction *I) { VN.erase(I); @@ -634,8 +635,7 @@ namespace { AliasAnalysis *getAliasAnalysis() const { return VN.getAliasAnalysis(); } MemoryDependenceAnalysis &getMemDep() const { return *MD; } private: - /// addToLeaderTable - Push a new Value to the LeaderTable onto the list for - /// its value number. + /// Push a new Value to the LeaderTable onto the list for its value number. void addToLeaderTable(uint32_t N, Value *V, const BasicBlock *BB) { LeaderTableEntry &Curr = LeaderTable[N]; if (!Curr.Val) { @@ -651,7 +651,7 @@ namespace { Curr.Next = Node; } - /// removeFromLeaderTable - Scan the list of values corresponding to a given + /// Scan the list of values corresponding to a given /// value number, and remove the given instruction if encountered. void removeFromLeaderTable(uint32_t N, Instruction *I, BasicBlock *BB) { LeaderTableEntry* Prev = nullptr; @@ -682,9 +682,9 @@ namespace { // This transformation requires dominator postdominator info void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionTracker>(); + AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<TargetLibraryInfo>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); if (!NoLoads) AU.addRequired<MemoryDependenceAnalysis>(); AU.addRequired<AliasAnalysis>(); @@ -709,6 +709,9 @@ namespace { void dump(DenseMap<uint32_t, Value*> &d); bool iterateOnFunction(Function &F); bool performPRE(Function &F); + bool performScalarPRE(Instruction *I); + bool performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, + unsigned int ValNo); Value *findLeader(const BasicBlock *BB, uint32_t num); void cleanupGlobalSets(); void verifyRemoved(const Instruction *I) const; @@ -725,16 +728,16 @@ namespace { char GVN::ID = 0; } -// createGVNPass - The public interface to this file... +// The public interface to this file... 
FunctionPass *llvm::createGVNPass(bool NoLoads) { return new GVN(NoLoads); } INITIALIZE_PASS_BEGIN(GVN, "gvn", "Global Value Numbering", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionTracker) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_END(GVN, "gvn", "Global Value Numbering", false, false) @@ -750,7 +753,7 @@ void GVN::dump(DenseMap<uint32_t, Value*>& d) { } #endif -/// IsValueFullyAvailableInBlock - Return true if we can prove that the value +/// Return true if we can prove that the value /// we're analyzing is fully available in the specified block. As we go, keep /// track of which blocks we know are fully alive in FullyAvailableBlocks. This /// map is actually a tri-state map with the following values: @@ -796,7 +799,7 @@ static bool IsValueFullyAvailableInBlock(BasicBlock *BB, return true; -// SpeculationFailure - If we get here, we found out that this is not, after +// If we get here, we found out that this is not, after // all, a fully-available block. We have a problem if we speculated on this and // used the speculation to mark other blocks as available. SpeculationFailure: @@ -831,8 +834,7 @@ SpeculationFailure: } -/// CanCoerceMustAliasedValueToLoad - Return true if -/// CoerceAvailableValueToLoadType will succeed. +/// Return true if CoerceAvailableValueToLoadType will succeed. static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy, const DataLayout &DL) { @@ -851,7 +853,7 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, return true; } -/// CoerceAvailableValueToLoadType - If we saw a store of a value to memory, and +/// If we saw a store of a value to memory, and /// then a load from a must-aliased pointer of a different type, try to coerce /// the stored value. LoadedTy is the type of the load we want to replace and /// InsertPt is the place to insert new instructions. @@ -936,7 +938,7 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, return new BitCastInst(StoredVal, LoadedTy, "bitcast", InsertPt); } -/// AnalyzeLoadFromClobberingWrite - This function is called when we have a +/// This function is called when we have a /// memdep query of a load that ends up being a clobbering memory write (store, /// memset, memcpy, memmove). This means that the write *may* provide bits used /// by the load but we can't be sure because the pointers don't mustalias. @@ -1016,7 +1018,7 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr, return LoadOffset-StoreOffset; } -/// AnalyzeLoadFromClobberingStore - This function is called when we have a +/// This function is called when we have a /// memdep query of a load that ends up being a clobbering store. static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr, StoreInst *DepSI, @@ -1032,7 +1034,7 @@ static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr, StorePtr, StoreSize, DL); } -/// AnalyzeLoadFromClobberingLoad - This function is called when we have a +/// This function is called when we have a /// memdep query of a load that ends up being clobbered by another load. See if /// the other load can feed into the second load. 
static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, @@ -1108,7 +1110,7 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, } -/// GetStoreValueForLoad - This function is called when we have a +/// This function is called when we have a /// memdep query of a load that ends up being a clobbering store. This means /// that the store provides bits used by the load but we the pointers don't /// mustalias. Check this case to see if there is anything more we can do @@ -1147,7 +1149,7 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, return CoerceAvailableValueToLoadType(SrcVal, LoadTy, InsertPt, DL); } -/// GetLoadValueForLoad - This function is called when we have a +/// This function is called when we have a /// memdep query of a load that ends up being a clobbering load. This means /// that the load *may* provide bits used by the load but we can't be sure /// because the pointers don't mustalias. Check this case to see if there is @@ -1210,7 +1212,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, } -/// GetMemInstValueForLoad - This function is called when we have a +/// This function is called when we have a /// memdep query of a load that ends up being a clobbering mem intrinsic. static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, Type *LoadTy, Instruction *InsertPt, @@ -1267,7 +1269,7 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, } -/// ConstructSSAForLoadSet - Given a set of loads specified by ValuesPerBlock, +/// Given a set of loads specified by ValuesPerBlock, /// construct SSA form, allowing us to eliminate LI. This returns the value /// that should be used at LI's definition site. static Value *ConstructSSAForLoadSet(LoadInst *LI, @@ -1621,7 +1623,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // If all preds have a single successor, then we know it is safe to insert // the load on the pred (?!?), so we can insert code to materialize the // pointer if it is not available. - PHITransAddr Address(LI->getPointerOperand(), DL, AT); + PHITransAddr Address(LI->getPointerOperand(), DL, AC); Value *LoadPtr = nullptr; LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred, *DT, NewInsts); @@ -1702,13 +1704,12 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, return true; } -/// processNonLocalLoad - Attempt to eliminate a load whose dependencies are +/// Attempt to eliminate a load whose dependencies are /// non-local by performing PHI construction. bool GVN::processNonLocalLoad(LoadInst *LI) { // Step 1: Find the non-local dependencies of the load. LoadDepVect Deps; - AliasAnalysis::Location Loc = VN.getAliasAnalysis()->getLocation(LI); - MD->getNonLocalPointerDependency(Loc, true, LI->getParent(), Deps); + MD->getNonLocalPointerDependency(LI, Deps); // If we had to process more than one hundred blocks to find the // dependencies, this load isn't worth worrying about. Optimizing @@ -1729,6 +1730,15 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { return false; } + // If this load follows a GEP, see if we can PRE the indices before analyzing. 
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LI->getOperand(0))) { + for (GetElementPtrInst::op_iterator OI = GEP->idx_begin(), + OE = GEP->idx_end(); + OI != OE; ++OI) + if (Instruction *I = dyn_cast<Instruction>(OI->get())) + performScalarPRE(I); + } + // Step 2: Analyze the availability of the load AvailValInBlkVect ValuesPerBlock; UnavailBlkVect UnavailableBlocks; @@ -1807,7 +1817,7 @@ static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) { I->replaceAllUsesWith(Repl); } -/// processLoad - Attempt to eliminate a load, first by eliminating it +/// Attempt to eliminate a load, first by eliminating it /// locally, and then attempting non-local elimination if that fails. bool GVN::processLoad(LoadInst *L) { if (!MD) @@ -2006,7 +2016,7 @@ bool GVN::processLoad(LoadInst *L) { return false; } -// findLeader - In order to find a leader for a given value number at a +// In order to find a leader for a given value number at a // specific basic block, we first obtain the list of all Values for that number, // and then scan the list to find one whose block dominates the block in // question. This is fast because dominator tree queries consist of only @@ -2034,9 +2044,8 @@ Value *GVN::findLeader(const BasicBlock *BB, uint32_t num) { return Val; } -/// replaceAllDominatedUsesWith - Replace all uses of 'From' with 'To' if the -/// use is dominated by the given basic block. Returns the number of uses that -/// were replaced. +/// Replace all uses of 'From' with 'To' if the use is dominated by the given +/// basic block. Returns the number of uses that were replaced. unsigned GVN::replaceAllDominatedUsesWith(Value *From, Value *To, const BasicBlockEdge &Root) { unsigned Count = 0; @@ -2052,7 +2061,7 @@ unsigned GVN::replaceAllDominatedUsesWith(Value *From, Value *To, return Count; } -/// isOnlyReachableViaThisEdge - There is an edge from 'Src' to 'Dst'. Return +/// There is an edge from 'Src' to 'Dst'. Return /// true if every path from the entry block to 'Dst' passes via this edge. In /// particular 'Dst' must not be reachable via another edge from 'Src'. static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E, @@ -2069,7 +2078,7 @@ static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E, return Pred != nullptr; } -/// propagateEquality - The given values are known to be equal in every block +/// The given values are known to be equal in every block /// dominated by 'Root'. Exploit this, for example by replacing 'LHS' with /// 'RHS' everywhere in the scope. Returns whether a change was made. bool GVN::propagateEquality(Value *LHS, Value *RHS, @@ -2096,15 +2105,15 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, std::swap(LHS, RHS); assert((isa<Argument>(LHS) || isa<Instruction>(LHS)) && "Unexpected value!"); - // If there is no obvious reason to prefer the left-hand side over the right- - // hand side, ensure the longest lived term is on the right-hand side, so the - // shortest lived term will be replaced by the longest lived. This tends to - // expose more simplifications. + // If there is no obvious reason to prefer the left-hand side over the + // right-hand side, ensure the longest lived term is on the right-hand side, + // so the shortest lived term will be replaced by the longest lived. + // This tends to expose more simplifications. 
uint32_t LVN = VN.lookup_or_add(LHS); if ((isa<Argument>(LHS) && isa<Argument>(RHS)) || (isa<Instruction>(LHS) && isa<Instruction>(RHS))) { - // Move the 'oldest' value to the right-hand side, using the value number as - // a proxy for age. + // Move the 'oldest' value to the right-hand side, using the value number + // as a proxy for age. uint32_t RVN = VN.lookup_or_add(RHS); if (LVN < RVN) { std::swap(LHS, RHS); @@ -2133,10 +2142,10 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, NumGVNEqProp += NumReplacements; } - // Now try to deduce additional equalities from this one. For example, if the - // known equality was "(A != B)" == "false" then it follows that A and B are - // equal in the scope. Only boolean equalities with an explicit true or false - // RHS are currently supported. + // Now try to deduce additional equalities from this one. For example, if + // the known equality was "(A != B)" == "false" then it follows that A and B + // are equal in the scope. Only boolean equalities with an explicit true or + // false RHS are currently supported. if (!RHS->getType()->isIntegerTy(1)) // Not a boolean equality - bail out. continue; @@ -2161,7 +2170,7 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, // If we are propagating an equality like "(A == B)" == "true" then also // propagate the equality A == B. When propagating a comparison such as // "(A >= B)" == "true", replace all instances of "A < B" with "false". - if (ICmpInst *Cmp = dyn_cast<ICmpInst>(LHS)) { + if (CmpInst *Cmp = dyn_cast<CmpInst>(LHS)) { Value *Op0 = Cmp->getOperand(0), *Op1 = Cmp->getOperand(1); // If "A == B" is known true, or "A != B" is known false, then replace @@ -2170,12 +2179,28 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, (isKnownFalse && Cmp->getPredicate() == CmpInst::ICMP_NE)) Worklist.push_back(std::make_pair(Op0, Op1)); + // Handle the floating point versions of equality comparisons too. + if ((isKnownTrue && Cmp->getPredicate() == CmpInst::FCMP_OEQ) || + (isKnownFalse && Cmp->getPredicate() == CmpInst::FCMP_UNE)) { + + // Floating point -0.0 and 0.0 compare equal, so we can only + // propagate values if we know that we have a constant and that + // its value is non-zero. + + // FIXME: We should do this optimization if 'no signed zeros' is + // applicable via an instruction-level fast-math-flag or some other + // indicator that relaxed FP semantics are being used. + + if (isa<ConstantFP>(Op1) && !cast<ConstantFP>(Op1)->isZero()) + Worklist.push_back(std::make_pair(Op0, Op1)); + } + // If "A >= B" is known true, replace "A < B" with false everywhere. CmpInst::Predicate NotPred = Cmp->getInversePredicate(); Constant *NotVal = ConstantInt::get(Cmp->getType(), isKnownFalse); - // Since we don't have the instruction "A < B" immediately to hand, work out - // the value number that it would have and use that to find an appropriate - // instruction (if any). + // Since we don't have the instruction "A < B" immediately to hand, work + // out the value number that it would have and use that to find an + // appropriate instruction (if any). 
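The signed-zero guard in the new FCMP_OEQ/FCMP_UNE handling above merits a concrete demonstration: under IEEE-754, -0.0 == 0.0 evaluates true even though the two values behave differently, so a known-true equality with 0.0 must not be used to rewrite the other operand unless the constant is known non-zero. A standalone illustration (not part of the patch; assumes IEEE-754 doubles):

#include <cstdio>

int main() {
  double X = -0.0;
  // The equality is true, so naive propagation would replace X with 0.0
  // and silently flip the sign of 1.0 / X from -inf to +inf.
  std::printf("X == 0.0 : %d\n", X == 0.0);   // 1
  std::printf("1.0 / X  : %g\n", 1.0 / X);    // -inf
  std::printf("1.0 / 0.0: %g\n", 1.0 / 0.0);  // inf
  return 0;
}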
uint32_t NextNum = VN.getNextUnusedValueNumber(); uint32_t Num = VN.lookup_or_add_cmp(Cmp->getOpcode(), NotPred, Op0, Op1); // If the number we were assigned was brand new then there is no point in @@ -2203,7 +2228,7 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, return Changed; } -/// processInstruction - When calculating availability, handle an instruction +/// When calculating availability, handle an instruction /// by inserting it into the appropriate sets bool GVN::processInstruction(Instruction *I) { // Ignore dbg info intrinsics. @@ -2214,7 +2239,7 @@ bool GVN::processInstruction(Instruction *I) { // to value numbering it. Value numbering often exposes redundancies, for // example if it determines that %y is equal to %x then the instruction // "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify. - if (Value *V = SimplifyInstruction(I, DL, TLI, DT, AT)) { + if (Value *V = SimplifyInstruction(I, DL, TLI, DT, AC)) { I->replaceAllUsesWith(V); if (MD && V->getType()->getScalarType()->isPointerTy()) MD->invalidateCachedPointerInfo(V); @@ -2334,8 +2359,8 @@ bool GVN::runOnFunction(Function& F) { DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); DL = DLP ? &DLP->getDataLayout() : nullptr; - AT = &getAnalysis<AssumptionTracker>(); - TLI = &getAnalysis<TargetLibraryInfo>(); + AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); VN.setAliasAnalysis(&getAnalysis<AliasAnalysis>()); VN.setMemDep(MD); VN.setDomTree(DT); @@ -2348,7 +2373,8 @@ bool GVN::runOnFunction(Function& F) { for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ) { BasicBlock *BB = FI++; - bool removedBlock = MergeBlockIntoPredecessor(BB, this); + bool removedBlock = MergeBlockIntoPredecessor( + BB, DT, /* LoopInfo */ nullptr, VN.getAliasAnalysis(), MD); if (removedBlock) ++NumGVNBlocks; Changed |= removedBlock; @@ -2431,175 +2457,204 @@ bool GVN::processBlock(BasicBlock *BB) { return ChangedFunction; } -/// performPRE - Perform a purely local form of PRE that looks for diamond -/// control flow patterns and attempts to perform simple PRE at the join point. -bool GVN::performPRE(Function &F) { - bool Changed = false; +// Instantiate an expression in a predecessor that lacked it. +bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, + unsigned int ValNo) { + // Because we are going top-down through the block, all value numbers + // will be available in the predecessor by the time we need them. Any + // that weren't originally present will have been instantiated earlier + // in this loop. + bool success = true; + for (unsigned i = 0, e = Instr->getNumOperands(); i != e; ++i) { + Value *Op = Instr->getOperand(i); + if (isa<Argument>(Op) || isa<Constant>(Op) || isa<GlobalValue>(Op)) + continue; + + if (Value *V = findLeader(Pred, VN.lookup(Op))) { + Instr->setOperand(i, V); + } else { + success = false; + break; + } + } + + // Fail out if we encounter an operand that is not available in + // the PRE predecessor. This is typically because of loads which + // are not value numbered precisely. + if (!success) + return false; + + Instr->insertBefore(Pred->getTerminator()); + Instr->setName(Instr->getName() + ".pre"); + Instr->setDebugLoc(Instr->getDebugLoc()); + VN.add(Instr, ValNo); + + // Update the availability map to include the new instruction. 
+ addToLeaderTable(ValNo, Instr, Pred); + return true; +} + +bool GVN::performScalarPRE(Instruction *CurInst) { SmallVector<std::pair<Value*, BasicBlock*>, 8> predMap; - for (BasicBlock *CurrentBlock : depth_first(&F.getEntryBlock())) { - // Nothing to PRE in the entry block. - if (CurrentBlock == &F.getEntryBlock()) continue; - // Don't perform PRE on a landing pad. - if (CurrentBlock->isLandingPad()) continue; + if (isa<AllocaInst>(CurInst) || isa<TerminatorInst>(CurInst) || + isa<PHINode>(CurInst) || CurInst->getType()->isVoidTy() || + CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() || + isa<DbgInfoIntrinsic>(CurInst)) + return false; - for (BasicBlock::iterator BI = CurrentBlock->begin(), - BE = CurrentBlock->end(); BI != BE; ) { - Instruction *CurInst = BI++; + // Don't do PRE on compares. The PHI would prevent CodeGenPrepare from + // sinking the compare again, and it would force the code generator to + // move the i1 from processor flags or predicate registers into a general + // purpose register. + if (isa<CmpInst>(CurInst)) + return false; - if (isa<AllocaInst>(CurInst) || - isa<TerminatorInst>(CurInst) || isa<PHINode>(CurInst) || - CurInst->getType()->isVoidTy() || - CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() || - isa<DbgInfoIntrinsic>(CurInst)) - continue; + // We don't currently value number ANY inline asm calls. + if (CallInst *CallI = dyn_cast<CallInst>(CurInst)) + if (CallI->isInlineAsm()) + return false; - // Don't do PRE on compares. The PHI would prevent CodeGenPrepare from - // sinking the compare again, and it would force the code generator to - // move the i1 from processor flags or predicate registers into a general - // purpose register. - if (isa<CmpInst>(CurInst)) - continue; + uint32_t ValNo = VN.lookup(CurInst); + + // Look for the predecessors for PRE opportunities. We're + // only trying to solve the basic diamond case, where + // a value is computed in the successor and one predecessor, + // but not the other. We also explicitly disallow cases + // where the successor is its own predecessor, because they're + // more complicated to get right. + unsigned NumWith = 0; + unsigned NumWithout = 0; + BasicBlock *PREPred = nullptr; + BasicBlock *CurrentBlock = CurInst->getParent(); + predMap.clear(); + + for (pred_iterator PI = pred_begin(CurrentBlock), PE = pred_end(CurrentBlock); + PI != PE; ++PI) { + BasicBlock *P = *PI; + // We're not interested in PRE where the block is its + // own predecessor, or in blocks with predecessors + // that are not reachable. + if (P == CurrentBlock) { + NumWithout = 2; + break; + } else if (!DT->isReachableFromEntry(P)) { + NumWithout = 2; + break; + } - // We don't currently value number ANY inline asm calls. - if (CallInst *CallI = dyn_cast<CallInst>(CurInst)) - if (CallI->isInlineAsm()) - continue; + Value *predV = findLeader(P, ValNo); + if (!predV) { + predMap.push_back(std::make_pair(static_cast<Value *>(nullptr), P)); + PREPred = P; + ++NumWithout; + } else if (predV == CurInst) { + /* CurInst dominates this predecessor. */ + NumWithout = 2; + break; + } else { + predMap.push_back(std::make_pair(predV, P)); + ++NumWith; + } + } - uint32_t ValNo = VN.lookup(CurInst); - - // Look for the predecessors for PRE opportunities. We're - // only trying to solve the basic diamond case, where - // a value is computed in the successor and one predecessor, - // but not the other. 
We also explicitly disallow cases - // where the successor is its own predecessor, because they're - // more complicated to get right. - unsigned NumWith = 0; - unsigned NumWithout = 0; - BasicBlock *PREPred = nullptr; - predMap.clear(); - - for (pred_iterator PI = pred_begin(CurrentBlock), - PE = pred_end(CurrentBlock); PI != PE; ++PI) { - BasicBlock *P = *PI; - // We're not interested in PRE where the block is its - // own predecessor, or in blocks with predecessors - // that are not reachable. - if (P == CurrentBlock) { - NumWithout = 2; - break; - } else if (!DT->isReachableFromEntry(P)) { - NumWithout = 2; - break; - } + // Don't do PRE when it might increase code size, i.e. when + // we would need to insert instructions in more than one pred. + if (NumWithout > 1 || NumWith == 0) + return false; - Value* predV = findLeader(P, ValNo); - if (!predV) { - predMap.push_back(std::make_pair(static_cast<Value *>(nullptr), P)); - PREPred = P; - ++NumWithout; - } else if (predV == CurInst) { - /* CurInst dominates this predecessor. */ - NumWithout = 2; - break; - } else { - predMap.push_back(std::make_pair(predV, P)); - ++NumWith; - } - } + // We may have a case where all predecessors have the instruction, + // and we just need to insert a phi node. Otherwise, perform + // insertion. + Instruction *PREInstr = nullptr; - // Don't do PRE when it might increase code size, i.e. when - // we would need to insert instructions in more than one pred. - if (NumWithout != 1 || NumWith == 0) - continue; + if (NumWithout != 0) { + // Don't do PRE across indirect branch. + if (isa<IndirectBrInst>(PREPred->getTerminator())) + return false; - // Don't do PRE across indirect branch. - if (isa<IndirectBrInst>(PREPred->getTerminator())) - continue; + // We can't do PRE safely on a critical edge, so instead we schedule + // the edge to be split and perform the PRE the next time we iterate + // on the function. + unsigned SuccNum = GetSuccessorNumber(PREPred, CurrentBlock); + if (isCriticalEdge(PREPred->getTerminator(), SuccNum)) { + toSplit.push_back(std::make_pair(PREPred->getTerminator(), SuccNum)); + return false; + } + // We need to insert somewhere, so let's give it a shot + PREInstr = CurInst->clone(); + if (!performScalarPREInsertion(PREInstr, PREPred, ValNo)) { + // If we failed insertion, make sure we remove the instruction. + DEBUG(verifyRemoved(PREInstr)); + delete PREInstr; + return false; + } + } - // We can't do PRE safely on a critical edge, so instead we schedule - // the edge to be split and perform the PRE the next time we iterate - // on the function. - unsigned SuccNum = GetSuccessorNumber(PREPred, CurrentBlock); - if (isCriticalEdge(PREPred->getTerminator(), SuccNum)) { - toSplit.push_back(std::make_pair(PREPred->getTerminator(), SuccNum)); - continue; - } + // Either we should have filled in the PRE instruction, or we should + // not have needed insertions. + assert (PREInstr != nullptr || NumWithout == 0); - // Instantiate the expression in the predecessor that lacked it. - // Because we are going top-down through the block, all value numbers - // will be available in the predecessor by the time we need them. Any - // that weren't originally present will have been instantiated earlier - // in this loop. 
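The NumWith/NumWithout bookkeeping above boils down to a small predicate: insert in at most one predecessor, and only when at least one other predecessor already has the value (zero missing predecessors is the phi-only case). A compact restatement with illustrative types; note the pass additionally forces NumWithout to 2 to reject self-predecessors and unreachable predecessors, as seen in the loop above:

#include <vector>

// One entry per predecessor: true if findLeader() located the value there.
bool shouldAttemptPRE(const std::vector<bool> &PredHasValue) {
  unsigned NumWith = 0, NumWithout = 0;
  for (bool Has : PredHasValue)
    Has ? ++NumWith : ++NumWithout;
  // Inserting into more than one predecessor could grow code size;
  // NumWithout == 0 means only a phi node is needed.
  return NumWithout <= 1 && NumWith > 0;
}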
- Instruction *PREInstr = CurInst->clone(); - bool success = true; - for (unsigned i = 0, e = CurInst->getNumOperands(); i != e; ++i) { - Value *Op = PREInstr->getOperand(i); - if (isa<Argument>(Op) || isa<Constant>(Op) || isa<GlobalValue>(Op)) - continue; + ++NumGVNPRE; - if (Value *V = findLeader(PREPred, VN.lookup(Op))) { - PREInstr->setOperand(i, V); - } else { - success = false; - break; - } - } + // Create a PHI to make the value available in this block. + PHINode *Phi = + PHINode::Create(CurInst->getType(), predMap.size(), + CurInst->getName() + ".pre-phi", CurrentBlock->begin()); + for (unsigned i = 0, e = predMap.size(); i != e; ++i) { + if (Value *V = predMap[i].first) + Phi->addIncoming(V, predMap[i].second); + else + Phi->addIncoming(PREInstr, PREPred); + } + + VN.add(Phi, ValNo); + addToLeaderTable(ValNo, Phi, CurrentBlock); + Phi->setDebugLoc(CurInst->getDebugLoc()); + CurInst->replaceAllUsesWith(Phi); + if (Phi->getType()->getScalarType()->isPointerTy()) { + // Because we have added a PHI-use of the pointer value, it has now + // "escaped" from alias analysis' perspective. We need to inform + // AA of this. + for (unsigned ii = 0, ee = Phi->getNumIncomingValues(); ii != ee; ++ii) { + unsigned jj = PHINode::getOperandNumForIncomingValue(ii); + VN.getAliasAnalysis()->addEscapingUse(Phi->getOperandUse(jj)); + } - // Fail out if we encounter an operand that is not available in - // the PRE predecessor. This is typically because of loads which - // are not value numbered precisely. - if (!success) { - DEBUG(verifyRemoved(PREInstr)); - delete PREInstr; - continue; - } + if (MD) + MD->invalidateCachedPointerInfo(Phi); + } + VN.erase(CurInst); + removeFromLeaderTable(ValNo, CurInst, CurrentBlock); - PREInstr->insertBefore(PREPred->getTerminator()); - PREInstr->setName(CurInst->getName() + ".pre"); - PREInstr->setDebugLoc(CurInst->getDebugLoc()); - VN.add(PREInstr, ValNo); - ++NumGVNPRE; - - // Update the availability map to include the new instruction. - addToLeaderTable(ValNo, PREInstr, PREPred); - - // Create a PHI to make the value available in this block. - PHINode* Phi = PHINode::Create(CurInst->getType(), predMap.size(), - CurInst->getName() + ".pre-phi", - CurrentBlock->begin()); - for (unsigned i = 0, e = predMap.size(); i != e; ++i) { - if (Value *V = predMap[i].first) - Phi->addIncoming(V, predMap[i].second); - else - Phi->addIncoming(PREInstr, PREPred); - } + DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n'); + if (MD) + MD->removeInstruction(CurInst); + DEBUG(verifyRemoved(CurInst)); + CurInst->eraseFromParent(); + ++NumGVNInstr; + + return true; +} - VN.add(Phi, ValNo); - addToLeaderTable(ValNo, Phi, CurrentBlock); - Phi->setDebugLoc(CurInst->getDebugLoc()); - CurInst->replaceAllUsesWith(Phi); - if (Phi->getType()->getScalarType()->isPointerTy()) { - // Because we have added a PHI-use of the pointer value, it has now - // "escaped" from alias analysis' perspective. We need to inform - // AA of this. - for (unsigned ii = 0, ee = Phi->getNumIncomingValues(); ii != ee; - ++ii) { - unsigned jj = PHINode::getOperandNumForIncomingValue(ii); - VN.getAliasAnalysis()->addEscapingUse(Phi->getOperandUse(jj)); - } +/// Perform a purely local form of PRE that looks for diamond +/// control flow patterns and attempts to perform simple PRE at the join point. +bool GVN::performPRE(Function &F) { + bool Changed = false; + for (BasicBlock *CurrentBlock : depth_first(&F.getEntryBlock())) { + // Nothing to PRE in the entry block. 
+    if (CurrentBlock == &F.getEntryBlock())
+      continue;
-      if (MD)
-        MD->invalidateCachedPointerInfo(Phi);
-      }
-      VN.erase(CurInst);
-      removeFromLeaderTable(ValNo, CurInst, CurrentBlock);
+    // Don't perform PRE on a landing pad.
+    if (CurrentBlock->isLandingPad())
+      continue;
-      DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n');
-      if (MD) MD->removeInstruction(CurInst);
-      DEBUG(verifyRemoved(CurInst));
-      CurInst->eraseFromParent();
-      Changed = true;
+    for (BasicBlock::iterator BI = CurrentBlock->begin(),
+                              BE = CurrentBlock->end();
+         BI != BE;) {
+      Instruction *CurInst = BI++;
+      Changed |= performScalarPRE(CurInst);
     }
   }
@@ -2612,50 +2667,48 @@ bool GVN::performPRE(Function &F) {
 /// Split the critical edge connecting the given two blocks, and return
 /// the block inserted to the critical edge.
 BasicBlock *GVN::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) {
-  BasicBlock *BB = SplitCriticalEdge(Pred, Succ, this);
+  BasicBlock *BB = SplitCriticalEdge(
+      Pred, Succ, CriticalEdgeSplittingOptions(getAliasAnalysis(), DT));
   if (MD)
     MD->invalidateCachedPredecessors();
   return BB;
 }

-/// splitCriticalEdges - Split critical edges found during the previous
+/// Split critical edges found during the previous
 /// iteration that may enable further optimization.
 bool GVN::splitCriticalEdges() {
   if (toSplit.empty())
     return false;
   do {
     std::pair<TerminatorInst*, unsigned> Edge = toSplit.pop_back_val();
-    SplitCriticalEdge(Edge.first, Edge.second, this);
+    SplitCriticalEdge(Edge.first, Edge.second,
+                      CriticalEdgeSplittingOptions(getAliasAnalysis(), DT));
   } while (!toSplit.empty());
   if (MD) MD->invalidateCachedPredecessors();
   return true;
 }

-/// iterateOnFunction - Executes one iteration of GVN
+/// Executes one iteration of GVN
 bool GVN::iterateOnFunction(Function &F) {
   cleanupGlobalSets();

   // Top-down walk of the dominator tree
   bool Changed = false;
-#if 0
-  // Needed for value numbering with phi construction to work.
-  ReversePostOrderTraversal<Function*> RPOT(&F);
-  for (ReversePostOrderTraversal<Function*>::rpo_iterator RI = RPOT.begin(),
-       RE = RPOT.end(); RI != RE; ++RI)
-    Changed |= processBlock(*RI);
-#else
   // Save the blocks this function have before transformation begins. GVN may
   // split critical edge, and hence may invalidate the RPO/DT iterator.
   //
   std::vector<BasicBlock *> BBVect;
   BBVect.reserve(256);
-  for (DomTreeNode *X : depth_first(DT->getRootNode()))
-    BBVect.push_back(X->getBlock());
+  // Needed for value numbering with phi construction to work.
+  ReversePostOrderTraversal<Function *> RPOT(&F);
+  for (ReversePostOrderTraversal<Function *>::rpo_iterator RI = RPOT.begin(),
+                                                           RE = RPOT.end();
+       RI != RE; ++RI)
+    BBVect.push_back(*RI);

   for (std::vector<BasicBlock *>::iterator I = BBVect.begin(), E = BBVect.end();
        I != E; I++)
     Changed |= processBlock(*I);
-#endif

   return Changed;
 }
@@ -2666,7 +2719,7 @@ void GVN::cleanupGlobalSets() {
   TableAllocator.Reset();
 }

-/// verifyRemoved - Verify that the specified instruction does not occur in our
+/// Verify that the specified instruction does not occur in our
 /// internal data structures.
 void GVN::verifyRemoved(const Instruction *Inst) const {
   VN.verifyRemoved(Inst);
@@ -2685,11 +2738,10 @@ void GVN::verifyRemoved(const Instruction *Inst) const {
   }
 }

-// BB is declared dead, which implied other blocks become dead as well. This
-// function is to add all these blocks to "DeadBlocks". For the dead blocks'
-// live successors, update their phi nodes by replacing the operands
-// corresponding to dead blocks with UndefVal.
-//
+/// BB is declared dead, which implies other blocks become dead as well. This
+/// function adds all these blocks to "DeadBlocks". For the dead blocks'
+/// live successors, update their phi nodes by replacing the operands
+/// corresponding to dead blocks with UndefVal.
 void GVN::addDeadBlock(BasicBlock *BB) {
   SmallVector<BasicBlock *, 4> NewDead;
   SmallSetVector<BasicBlock *, 4> DF;
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index c01f57f..f99ebbc 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -44,7 +44,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/SimplifyIndVar.h"
@@ -91,7 +91,7 @@ namespace {
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<DominatorTreeWrapperPass>();
-      AU.addRequired<LoopInfo>();
+      AU.addRequired<LoopInfoWrapperPass>();
       AU.addRequired<ScalarEvolution>();
       AU.addRequiredID(LoopSimplifyID);
       AU.addRequiredID(LCSSAID);
@@ -126,7 +126,7 @@ char IndVarSimplify::ID = 0;
 INITIALIZE_PASS_BEGIN(IndVarSimplify, "indvars",
                 "Induction Variable Simplification", false, false)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_DEPENDENCY(LCSSA)
@@ -1929,13 +1929,15 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
   if (!L->isLoopSimplifyForm())
     return false;

-  LI = &getAnalysis<LoopInfo>();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   SE = &getAnalysis<ScalarEvolution>();
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
   DL = DLP ? &DLP->getDataLayout() : nullptr;
-  TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
-  TTI = getAnalysisIfAvailable<TargetTransformInfo>();
+  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+  TLI = TLIP ? &TLIP->getTLI() : nullptr;
+  auto *TTIP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>();
+  TTI = TTIP ? &TTIP->getTTI(*L->getHeader()->getParent()) : nullptr;

   DeadInsts.clear();
   Changed = false;
diff --git a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
new file mode 100644
index 0000000..8559e63
--- /dev/null
+++ b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
@@ -0,0 +1,1422 @@
+//===-- InductiveRangeCheckElimination.cpp - ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// The InductiveRangeCheckElimination pass splits a loop's iteration space into
+// three disjoint ranges. It does this in such a way that the middle loop
+// provably does not need range checks. As an example, it will convert
+//
+//   len = < known positive >
+//   for (i = 0; i < n; i++) {
+//     if (0 <= i && i < len) {
+//       do_something();
+//     } else {
+//       throw_out_of_bounds();
+//     }
+//   }
+//
+// to
+//
+//   len = < known positive >
+//   limit = smin(n, len)
+//   // no first segment
+//   for (i = 0; i < limit; i++) {
+//     if (0 <= i && i < len) { // this check is fully redundant
+//       do_something();
+//     } else {
+//       throw_out_of_bounds();
+//     }
+//   }
+//   for (i = limit; i < n; i++) {
+//     if (0 <= i && i < len) {
+//       do_something();
+//     } else {
+//       throw_out_of_bounds();
+//     }
+//   }
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Optional.h"
+
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ValueTracking.h"
+
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/Verifier.h"
+
+#include "llvm/Support/Debug.h"
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SimplifyIndVar.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+
+#include "llvm/Pass.h"
+
+#include <array>
+
+using namespace llvm;
+
+static cl::opt<unsigned> LoopSizeCutoff("irce-loop-size-cutoff", cl::Hidden,
+                                        cl::init(64));
+
+static cl::opt<bool> PrintChangedLoops("irce-print-changed-loops", cl::Hidden,
+                                       cl::init(false));
+
+static cl::opt<int> MaxExitProbReciprocal("irce-max-exit-prob-reciprocal",
+                                          cl::Hidden, cl::init(10));
+
+#define DEBUG_TYPE "irce"
+
+namespace {
+
+/// An inductive range check is a conditional branch in a loop with
+///
+///  1. a very cold successor (i.e. the branch jumps to that successor very
+///     rarely)
+///
+///  and
+///
+///  2. a condition that is provably true for some range of values taken by the
+///     containing loop's induction variable.
+///
+/// Currently all inductive range checks are branches conditional on an
+/// expression of the form
+///
+///   0 <= (Offset + Scale * I) < Length
+///
+/// where `I' is the canonical induction variable of a loop, with respect to
+/// which Offset and Scale are loop invariant, and Length is >= 0. Currently
+/// the 'false' branch is considered cold; verifying that against profiling
+/// data is a TODO.
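Tying the branch form above back to the loop splitting sketched in the file header: for the common Scale == 1 case, 0 <= Offset + i < Length holds exactly for i in [-Offset, Length - Offset), and clipping the original iteration space to that window yields the check-free middle loop. A plain-integer sketch of that computation (illustrative only; the real pass works over SCEVs and must also rule out overflow):

#include <algorithm>
#include <cstdio>

struct Range {
  long Begin, End; // half-open [Begin, End); empty when End <= Begin
};

// Iterations i for which the check 0 <= Offset + i < Length passes.
Range safeIterationSpace(long Offset, long Length) {
  return {-Offset, Length - Offset};
}

// The "middle" loop runs the original range clipped to the safe window.
Range intersect(Range A, Range B) {
  return {std::max(A.Begin, B.Begin), std::min(A.End, B.End)};
}

int main() {
  long n = 100, len = 40; // loop bound and array length from the header example
  Range Middle = intersect({0, n}, safeIterationSpace(0, len));
  std::printf("middle loop: [%ld, %ld)\n", Middle.Begin, Middle.End); // [0, 40)
  return 0;
}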
+
+class InductiveRangeCheck {
+  const SCEV *Offset;
+  const SCEV *Scale;
+  Value *Length;
+  BranchInst *Branch;
+
+  InductiveRangeCheck() :
+    Offset(nullptr), Scale(nullptr), Length(nullptr), Branch(nullptr) { }
+
+public:
+  const SCEV *getOffset() const { return Offset; }
+  const SCEV *getScale() const { return Scale; }
+  Value *getLength() const { return Length; }
+
+  void print(raw_ostream &OS) const {
+    OS << "InductiveRangeCheck:\n";
+    OS << "  Offset: ";
+    Offset->print(OS);
+    OS << "  Scale: ";
+    Scale->print(OS);
+    OS << "  Length: ";
+    Length->print(OS);
+    OS << "  Branch: ";
+    getBranch()->print(OS);
+    OS << "\n";
+  }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  void dump() {
+    print(dbgs());
+  }
+#endif
+
+  BranchInst *getBranch() const { return Branch; }
+
+  /// Represents a signed integer range [Range.getBegin(), Range.getEnd()). If
+  /// R.getEnd() sle R.getBegin(), then R denotes the empty range.
+
+  class Range {
+    const SCEV *Begin;
+    const SCEV *End;
+
+  public:
+    Range(const SCEV *Begin, const SCEV *End) : Begin(Begin), End(End) {
+      assert(Begin->getType() == End->getType() && "ill-typed range!");
+    }
+
+    Type *getType() const { return Begin->getType(); }
+    const SCEV *getBegin() const { return Begin; }
+    const SCEV *getEnd() const { return End; }
+  };
+
+  typedef SpecificBumpPtrAllocator<InductiveRangeCheck> AllocatorTy;
+
+  /// This is the value the condition of the branch needs to evaluate to for the
+  /// branch to take the hot successor (see (1) above).
+  bool getPassingDirection() { return true; }
+
+  /// Computes a range for the induction variable (IndVar) in which the range
+  /// check is redundant and can be constant-folded away. The induction
+  /// variable is not required to be the canonical {0,+,1} induction variable.
+  Optional<Range> computeSafeIterationSpace(ScalarEvolution &SE,
+                                            const SCEVAddRecExpr *IndVar,
+                                            IRBuilder<> &B) const;
+
+  /// Create an inductive range check out of BI if possible, else return
+  /// nullptr.
+ static InductiveRangeCheck *create(AllocatorTy &Alloc, BranchInst *BI, + Loop *L, ScalarEvolution &SE, + BranchProbabilityInfo &BPI); +}; + +class InductiveRangeCheckElimination : public LoopPass { + InductiveRangeCheck::AllocatorTy Allocator; + +public: + static char ID; + InductiveRangeCheckElimination() : LoopPass(ID) { + initializeInductiveRangeCheckEliminationPass( + *PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LoopInfoWrapperPass>(); + AU.addRequiredID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + AU.addRequired<ScalarEvolution>(); + AU.addRequired<BranchProbabilityInfo>(); + } + + bool runOnLoop(Loop *L, LPPassManager &LPM) override; +}; + +char InductiveRangeCheckElimination::ID = 0; +} + +INITIALIZE_PASS(InductiveRangeCheckElimination, "irce", + "Inductive range check elimination", false, false) + +static bool IsLowerBoundCheck(Value *Check, Value *&IndexV) { + using namespace llvm::PatternMatch; + + ICmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE; + Value *LHS = nullptr, *RHS = nullptr; + + if (!match(Check, m_ICmp(Pred, m_Value(LHS), m_Value(RHS)))) + return false; + + switch (Pred) { + default: + return false; + + case ICmpInst::ICMP_SLE: + std::swap(LHS, RHS); + // fallthrough + case ICmpInst::ICMP_SGE: + if (!match(RHS, m_ConstantInt<0>())) + return false; + IndexV = LHS; + return true; + + case ICmpInst::ICMP_SLT: + std::swap(LHS, RHS); + // fallthrough + case ICmpInst::ICMP_SGT: + if (!match(RHS, m_ConstantInt<-1>())) + return false; + IndexV = LHS; + return true; + } +} + +static bool IsUpperBoundCheck(Value *Check, Value *Index, Value *&UpperLimit) { + using namespace llvm::PatternMatch; + + ICmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE; + Value *LHS = nullptr, *RHS = nullptr; + + if (!match(Check, m_ICmp(Pred, m_Value(LHS), m_Value(RHS)))) + return false; + + switch (Pred) { + default: + return false; + + case ICmpInst::ICMP_SGT: + std::swap(LHS, RHS); + // fallthrough + case ICmpInst::ICMP_SLT: + if (LHS != Index) + return false; + UpperLimit = RHS; + return true; + + case ICmpInst::ICMP_UGT: + std::swap(LHS, RHS); + // fallthrough + case ICmpInst::ICMP_ULT: + if (LHS != Index) + return false; + UpperLimit = RHS; + return true; + } +} + +/// Split a condition into something semantically equivalent to (0 <= I < +/// Limit), both comparisons signed and Len loop invariant on L and positive. +/// On success, return true and set Index to I and UpperLimit to Limit. Return +/// false on failure (we may still write to UpperLimit and Index on failure). +/// It does not try to interpret I as a loop index. +/// +static bool SplitRangeCheckCondition(Loop *L, ScalarEvolution &SE, + Value *Condition, const SCEV *&Index, + Value *&UpperLimit) { + + // TODO: currently this catches some silly cases like comparing "%idx slt 1". + // Our transformations are still correct, but less likely to be profitable in + // those cases. We have to come up with some heuristics that pick out the + // range checks that are more profitable to clone a loop for. This function + // in general can be made more robust. + + using namespace llvm::PatternMatch; + + Value *A = nullptr; + Value *B = nullptr; + ICmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE; + + // In these early checks we assume that the matched UpperLimit is positive. + // We'll verify that fact later, before returning true. 
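One identity worth spelling out before the matcher bodies that follow: the ICMP_ULT cases here are accepted without a separate lower-bound check because, once the limit is known non-negative, a single unsigned comparison encodes both bounds. A standalone demonstration of the equivalence (illustrative, not part of the patch):

#include <cassert>

// For Limit >= 0, a negative I wraps to a huge unsigned value and fails
// the unsigned test, so one compare covers both bounds.
bool inBoundsSigned(int I, int Limit) { return 0 <= I && I < Limit; }
bool inBoundsUnsigned(int I, int Limit) {
  return static_cast<unsigned>(I) < static_cast<unsigned>(Limit);
}

int main() {
  for (int I = -4; I <= 8; ++I)
    assert(inBoundsSigned(I, 5) == inBoundsUnsigned(I, 5));
  return 0;
}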
+ + if (match(Condition, m_And(m_Value(A), m_Value(B)))) { + Value *IndexV = nullptr; + Value *ExpectedUpperBoundCheck = nullptr; + + if (IsLowerBoundCheck(A, IndexV)) + ExpectedUpperBoundCheck = B; + else if (IsLowerBoundCheck(B, IndexV)) + ExpectedUpperBoundCheck = A; + else + return false; + + if (!IsUpperBoundCheck(ExpectedUpperBoundCheck, IndexV, UpperLimit)) + return false; + + Index = SE.getSCEV(IndexV); + + if (isa<SCEVCouldNotCompute>(Index)) + return false; + + } else if (match(Condition, m_ICmp(Pred, m_Value(A), m_Value(B)))) { + switch (Pred) { + default: + return false; + + case ICmpInst::ICMP_SGT: + std::swap(A, B); + // fall through + case ICmpInst::ICMP_SLT: + UpperLimit = B; + Index = SE.getSCEV(A); + if (isa<SCEVCouldNotCompute>(Index) || !SE.isKnownNonNegative(Index)) + return false; + break; + + case ICmpInst::ICMP_UGT: + std::swap(A, B); + // fall through + case ICmpInst::ICMP_ULT: + UpperLimit = B; + Index = SE.getSCEV(A); + if (isa<SCEVCouldNotCompute>(Index)) + return false; + break; + } + } else { + return false; + } + + const SCEV *UpperLimitSCEV = SE.getSCEV(UpperLimit); + if (isa<SCEVCouldNotCompute>(UpperLimitSCEV) || + !SE.isKnownNonNegative(UpperLimitSCEV)) + return false; + + if (SE.getLoopDisposition(UpperLimitSCEV, L) != + ScalarEvolution::LoopInvariant) { + DEBUG(dbgs() << " in function: " << L->getHeader()->getParent()->getName() + << " "; + dbgs() << " UpperLimit is not loop invariant: " + << UpperLimit->getName() << "\n";); + return false; + } + + return true; +} + + +InductiveRangeCheck * +InductiveRangeCheck::create(InductiveRangeCheck::AllocatorTy &A, BranchInst *BI, + Loop *L, ScalarEvolution &SE, + BranchProbabilityInfo &BPI) { + + if (BI->isUnconditional() || BI->getParent() == L->getLoopLatch()) + return nullptr; + + BranchProbability LikelyTaken(15, 16); + + if (BPI.getEdgeProbability(BI->getParent(), (unsigned) 0) < LikelyTaken) + return nullptr; + + Value *Length = nullptr; + const SCEV *IndexSCEV = nullptr; + + if (!SplitRangeCheckCondition(L, SE, BI->getCondition(), IndexSCEV, Length)) + return nullptr; + + assert(IndexSCEV && Length && "contract with SplitRangeCheckCondition!"); + + const SCEVAddRecExpr *IndexAddRec = dyn_cast<SCEVAddRecExpr>(IndexSCEV); + bool IsAffineIndex = + IndexAddRec && (IndexAddRec->getLoop() == L) && IndexAddRec->isAffine(); + + if (!IsAffineIndex) + return nullptr; + + InductiveRangeCheck *IRC = new (A.Allocate()) InductiveRangeCheck; + IRC->Length = Length; + IRC->Offset = IndexAddRec->getStart(); + IRC->Scale = IndexAddRec->getStepRecurrence(SE); + IRC->Branch = BI; + return IRC; +} + +namespace { + +// Keeps track of the structure of a loop. This is similar to llvm::Loop, +// except that it is more lightweight and can track the state of a loop through +// changing and potentially invalid IR. This structure also formalizes the +// kinds of loops we can deal with -- ones that have a single latch that is also +// an exiting block *and* have a canonical induction variable. +struct LoopStructure { + const char *Tag; + + BasicBlock *Header; + BasicBlock *Latch; + + // `Latch's terminator instruction is `LatchBr', and it's `LatchBrExitIdx'th + // successor is `LatchExit', the exit block of the loop. 
+ BranchInst *LatchBr; + BasicBlock *LatchExit; + unsigned LatchBrExitIdx; + + Value *IndVarNext; + Value *IndVarStart; + Value *LoopExitAt; + bool IndVarIncreasing; + + LoopStructure() + : Tag(""), Header(nullptr), Latch(nullptr), LatchBr(nullptr), + LatchExit(nullptr), LatchBrExitIdx(-1), IndVarNext(nullptr), + IndVarStart(nullptr), LoopExitAt(nullptr), IndVarIncreasing(false) {} + + template <typename M> LoopStructure map(M Map) const { + LoopStructure Result; + Result.Tag = Tag; + Result.Header = cast<BasicBlock>(Map(Header)); + Result.Latch = cast<BasicBlock>(Map(Latch)); + Result.LatchBr = cast<BranchInst>(Map(LatchBr)); + Result.LatchExit = cast<BasicBlock>(Map(LatchExit)); + Result.LatchBrExitIdx = LatchBrExitIdx; + Result.IndVarNext = Map(IndVarNext); + Result.IndVarStart = Map(IndVarStart); + Result.LoopExitAt = Map(LoopExitAt); + Result.IndVarIncreasing = IndVarIncreasing; + return Result; + } + + static Optional<LoopStructure> parseLoopStructure(ScalarEvolution &, + BranchProbabilityInfo &BPI, + Loop &, + const char *&); +}; + +/// This class is used to constrain loops to run within a given iteration space. +/// The algorithm this class implements is given a Loop and a range [Begin, +/// End). The algorithm then tries to break out a "main loop" out of the loop +/// it is given in a way that the "main loop" runs with the induction variable +/// in a subset of [Begin, End). The algorithm emits appropriate pre and post +/// loops to run any remaining iterations. The pre loop runs any iterations in +/// which the induction variable is < Begin, and the post loop runs any +/// iterations in which the induction variable is >= End. +/// +class LoopConstrainer { + // The representation of a clone of the original loop we started out with. + struct ClonedLoop { + // The cloned blocks + std::vector<BasicBlock *> Blocks; + + // `Map` maps values in the clonee into values in the cloned version + ValueToValueMapTy Map; + + // An instance of `LoopStructure` for the cloned loop + LoopStructure Structure; + }; + + // Result of rewriting the range of a loop. See changeIterationSpaceEnd for + // more details on what these fields mean. + struct RewrittenRangeInfo { + BasicBlock *PseudoExit; + BasicBlock *ExitSelector; + std::vector<PHINode *> PHIValuesAtPseudoExit; + PHINode *IndVarEnd; + + RewrittenRangeInfo() + : PseudoExit(nullptr), ExitSelector(nullptr), IndVarEnd(nullptr) {} + }; + + // Calculated subranges we restrict the iteration space of the main loop to. + // See the implementation of `calculateSubRanges' for more details on how + // these fields are computed. `LowLimit` is None if there is no restriction + // on low end of the restricted iteration space of the main loop. `HighLimit` + // is None if there is no restriction on high end of the restricted iteration + // space of the main loop. + + struct SubRanges { + Optional<const SCEV *> LowLimit; + Optional<const SCEV *> HighLimit; + }; + + // A utility function that does a `replaceUsesOfWith' on the incoming block + // set of a `PHINode' -- replaces instances of `Block' in the `PHINode's + // incoming block list with `ReplaceBy'. + static void replacePHIBlock(PHINode *PN, BasicBlock *Block, + BasicBlock *ReplaceBy); + + // Compute a safe set of limits for the main loop to run in -- effectively the + // intersection of `Range' and the iteration space of the original loop. + // Return None if unable to compute the set of subranges. 
+ //
+ Optional<SubRanges> calculateSubRanges() const;
+
+ // Clone `OriginalLoop' and return the result in CLResult. The IR after
+ // running `cloneLoop' is well formed except for the PHI nodes in CLResult --
+ // the PHI nodes say that there is an incoming edge from `OriginalPreheader`
+ // but there is no such edge.
+ //
+ void cloneLoop(ClonedLoop &CLResult, const char *Tag) const;
+
+ // Rewrite the iteration space of the loop denoted by (LS, Preheader). The
+ // iteration space of the rewritten loop ends at ExitLoopAt. The start of the
+ // iteration space is not changed. `ExitLoopAt' is assumed to be slt
+ // `OriginalHeaderCount'.
+ //
+ // If there are iterations left to execute, control is made to jump to
+ // `ContinuationBlock', otherwise they take the normal loop exit. The
+ // returned `RewrittenRangeInfo' object is populated as follows:
+ //
+ // .PseudoExit is a basic block that unconditionally branches to
+ // `ContinuationBlock'.
+ //
+ // .ExitSelector is a basic block that decides, on exit from the loop,
+ // whether to branch to the "true" exit or to `PseudoExit'.
+ //
+ // .PHIValuesAtPseudoExit are PHINodes in `PseudoExit' that compute the value
+ // for each PHINode in the loop header on taking the pseudo exit.
+ //
+ // After changeIterationSpaceEnd, `Preheader' is no longer a legitimate
+ // preheader because it is made to branch to the loop header only
+ // conditionally.
+ //
+ RewrittenRangeInfo
+ changeIterationSpaceEnd(const LoopStructure &LS, BasicBlock *Preheader,
+ Value *ExitLoopAt,
+ BasicBlock *ContinuationBlock) const;
+
+ // The loop denoted by `LS' has `OldPreheader' as its preheader. This
+ // function creates a new preheader for `LS' and returns it.
+ //
+ BasicBlock *createPreheader(const LoopStructure &LS, BasicBlock *OldPreheader,
+ const char *Tag) const;
+
+ // `ContinuationBlockAndPreheader' was the continuation block for some call to
+ // `changeIterationSpaceEnd' and is the preheader to the loop denoted by `LS'.
+ // This function rewrites the PHI nodes in `LS.Header' to start with the
+ // correct value.
+ void rewriteIncomingValuesForPHIs(
+ LoopStructure &LS, BasicBlock *ContinuationBlockAndPreheader,
+ const LoopConstrainer::RewrittenRangeInfo &RRI) const;
+
+ // Even though we do not preserve any passes at this time, we at least need to
+ // keep the parent loop structure consistent. The `LPPassManager' seems to
+ // verify this after running a loop pass. This function adds the list of
+ // blocks denoted by BBs to this loop's parent loop if required.
+ void addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs);
+
+ // Some global state.
+ Function &F;
+ LLVMContext &Ctx;
+ ScalarEvolution &SE;
+
+ // Information about the original loop we started out with.
+ Loop &OriginalLoop;
+ LoopInfo &OriginalLoopInfo;
+ const SCEV *LatchTakenCount;
+ BasicBlock *OriginalPreheader;
+
+ // The preheader of the main loop. This may or may not be different from
+ // `OriginalPreheader'.
+ BasicBlock *MainLoopPreheader;
+
+ // The range we need to run the main loop in.
+ InductiveRangeCheck::Range Range; + + // The structure of the main loop (see comment at the beginning of this class + // for a definition) + LoopStructure MainLoopStructure; + +public: + LoopConstrainer(Loop &L, LoopInfo &LI, const LoopStructure &LS, + ScalarEvolution &SE, InductiveRangeCheck::Range R) + : F(*L.getHeader()->getParent()), Ctx(L.getHeader()->getContext()), + SE(SE), OriginalLoop(L), OriginalLoopInfo(LI), LatchTakenCount(nullptr), + OriginalPreheader(nullptr), MainLoopPreheader(nullptr), Range(R), + MainLoopStructure(LS) {} + + // Entry point for the algorithm. Returns true on success. + bool run(); +}; + +} + +void LoopConstrainer::replacePHIBlock(PHINode *PN, BasicBlock *Block, + BasicBlock *ReplaceBy) { + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) + if (PN->getIncomingBlock(i) == Block) + PN->setIncomingBlock(i, ReplaceBy); +} + +static bool CanBeSMax(ScalarEvolution &SE, const SCEV *S) { + APInt SMax = + APInt::getSignedMaxValue(cast<IntegerType>(S->getType())->getBitWidth()); + return SE.getSignedRange(S).contains(SMax) && + SE.getUnsignedRange(S).contains(SMax); +} + +static bool CanBeSMin(ScalarEvolution &SE, const SCEV *S) { + APInt SMin = + APInt::getSignedMinValue(cast<IntegerType>(S->getType())->getBitWidth()); + return SE.getSignedRange(S).contains(SMin) && + SE.getUnsignedRange(S).contains(SMin); +} + +Optional<LoopStructure> +LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BPI, + Loop &L, const char *&FailureReason) { + assert(L.isLoopSimplifyForm() && "should follow from addRequired<>"); + + BasicBlock *Latch = L.getLoopLatch(); + if (!L.isLoopExiting(Latch)) { + FailureReason = "no loop latch"; + return None; + } + + BasicBlock *Header = L.getHeader(); + BasicBlock *Preheader = L.getLoopPreheader(); + if (!Preheader) { + FailureReason = "no preheader"; + return None; + } + + BranchInst *LatchBr = dyn_cast<BranchInst>(&*Latch->rbegin()); + if (!LatchBr || LatchBr->isUnconditional()) { + FailureReason = "latch terminator not conditional branch"; + return None; + } + + unsigned LatchBrExitIdx = LatchBr->getSuccessor(0) == Header ? 1 : 0; + + BranchProbability ExitProbability = + BPI.getEdgeProbability(LatchBr->getParent(), LatchBrExitIdx); + + if (ExitProbability > BranchProbability(1, MaxExitProbReciprocal)) { + FailureReason = "short running loop, not profitable"; + return None; + } + + ICmpInst *ICI = dyn_cast<ICmpInst>(LatchBr->getCondition()); + if (!ICI || !isa<IntegerType>(ICI->getOperand(0)->getType())) { + FailureReason = "latch terminator branch not conditional on integral icmp"; + return None; + } + + const SCEV *LatchCount = SE.getExitCount(&L, Latch); + if (isa<SCEVCouldNotCompute>(LatchCount)) { + FailureReason = "could not compute latch count"; + return None; + } + + ICmpInst::Predicate Pred = ICI->getPredicate(); + Value *LeftValue = ICI->getOperand(0); + const SCEV *LeftSCEV = SE.getSCEV(LeftValue); + IntegerType *IndVarTy = cast<IntegerType>(LeftValue->getType()); + + Value *RightValue = ICI->getOperand(1); + const SCEV *RightSCEV = SE.getSCEV(RightValue); + + // We canonicalize `ICI` such that `LeftSCEV` is an add recurrence. 
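+ // For example, given a hypothetical latch condition `icmp sgt %n, %i.next`
+ // in which only %i.next is an add recurrence, the operands are swapped and
+ // the predicate flipped to slt so that the recurrence ends up on the left.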
+ if (!isa<SCEVAddRecExpr>(LeftSCEV)) { + if (isa<SCEVAddRecExpr>(RightSCEV)) { + std::swap(LeftSCEV, RightSCEV); + std::swap(LeftValue, RightValue); + Pred = ICmpInst::getSwappedPredicate(Pred); + } else { + FailureReason = "no add recurrences in the icmp"; + return None; + } + } + + auto IsInductionVar = [&SE](const SCEVAddRecExpr *AR, bool &IsIncreasing) { + if (!AR->isAffine()) + return false; + + IntegerType *Ty = cast<IntegerType>(AR->getType()); + IntegerType *WideTy = + IntegerType::get(Ty->getContext(), Ty->getBitWidth() * 2); + + // Currently we only work with induction variables that have been proved to + // not wrap. This restriction can potentially be lifted in the future. + + const SCEVAddRecExpr *ExtendAfterOp = + dyn_cast<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy)); + if (!ExtendAfterOp) + return false; + + const SCEV *ExtendedStart = SE.getSignExtendExpr(AR->getStart(), WideTy); + const SCEV *ExtendedStep = + SE.getSignExtendExpr(AR->getStepRecurrence(SE), WideTy); + + bool NoSignedWrap = ExtendAfterOp->getStart() == ExtendedStart && + ExtendAfterOp->getStepRecurrence(SE) == ExtendedStep; + + if (!NoSignedWrap) + return false; + + if (const SCEVConstant *StepExpr = + dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE))) { + ConstantInt *StepCI = StepExpr->getValue(); + if (StepCI->isOne() || StepCI->isMinusOne()) { + IsIncreasing = StepCI->isOne(); + return true; + } + } + + return false; + }; + + // `ICI` is interpreted as taking the backedge if the *next* value of the + // induction variable satisfies some constraint. + + const SCEVAddRecExpr *IndVarNext = cast<SCEVAddRecExpr>(LeftSCEV); + bool IsIncreasing = false; + if (!IsInductionVar(IndVarNext, IsIncreasing)) { + FailureReason = "LHS in icmp not induction variable"; + return None; + } + + ConstantInt *One = ConstantInt::get(IndVarTy, 1); + // TODO: generalize the predicates here to also match their unsigned variants. + if (IsIncreasing) { + bool FoundExpectedPred = + (Pred == ICmpInst::ICMP_SLT && LatchBrExitIdx == 1) || + (Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 0); + + if (!FoundExpectedPred) { + FailureReason = "expected icmp slt semantically, found something else"; + return None; + } + + if (LatchBrExitIdx == 0) { + if (CanBeSMax(SE, RightSCEV)) { + // TODO: this restriction is easily removable -- we just have to + // remember that the icmp was an slt and not an sle. + FailureReason = "limit may overflow when coercing sle to slt"; + return None; + } + + IRBuilder<> B(&*Preheader->rbegin()); + RightValue = B.CreateAdd(RightValue, One); + } + + } else { + bool FoundExpectedPred = + (Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 1) || + (Pred == ICmpInst::ICMP_SLT && LatchBrExitIdx == 0); + + if (!FoundExpectedPred) { + FailureReason = "expected icmp sgt semantically, found something else"; + return None; + } + + if (LatchBrExitIdx == 0) { + if (CanBeSMin(SE, RightSCEV)) { + // TODO: this restriction is easily removable -- we just have to + // remember that the icmp was an sgt and not an sge. 
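+ // (For illustration, assuming a hypothetical i8 induction variable:
+ // coercing `indvar >= -128` to `indvar > -129` is impossible because
+ // -129 is not representable in i8, hence this bailout.)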
+ FailureReason = "limit may overflow when coercing sge to sgt"; + return None; + } + + IRBuilder<> B(&*Preheader->rbegin()); + RightValue = B.CreateSub(RightValue, One); + } + } + + const SCEV *StartNext = IndVarNext->getStart(); + const SCEV *Addend = SE.getNegativeSCEV(IndVarNext->getStepRecurrence(SE)); + const SCEV *IndVarStart = SE.getAddExpr(StartNext, Addend); + + BasicBlock *LatchExit = LatchBr->getSuccessor(LatchBrExitIdx); + + assert(SE.getLoopDisposition(LatchCount, &L) == + ScalarEvolution::LoopInvariant && + "loop variant exit count doesn't make sense!"); + + assert(!L.contains(LatchExit) && "expected an exit block!"); + + Value *IndVarStartV = SCEVExpander(SE, "irce").expandCodeFor( + IndVarStart, IndVarTy, &*Preheader->rbegin()); + IndVarStartV->setName("indvar.start"); + + LoopStructure Result; + + Result.Tag = "main"; + Result.Header = Header; + Result.Latch = Latch; + Result.LatchBr = LatchBr; + Result.LatchExit = LatchExit; + Result.LatchBrExitIdx = LatchBrExitIdx; + Result.IndVarStart = IndVarStartV; + Result.IndVarNext = LeftValue; + Result.IndVarIncreasing = IsIncreasing; + Result.LoopExitAt = RightValue; + + FailureReason = nullptr; + + return Result; +} + +Optional<LoopConstrainer::SubRanges> +LoopConstrainer::calculateSubRanges() const { + IntegerType *Ty = cast<IntegerType>(LatchTakenCount->getType()); + + if (Range.getType() != Ty) + return None; + + LoopConstrainer::SubRanges Result; + + // I think we can be more aggressive here and make this nuw / nsw if the + // addition that feeds into the icmp for the latch's terminating branch is nuw + // / nsw. In any case, a wrapping 2's complement addition is safe. + ConstantInt *One = ConstantInt::get(Ty, 1); + const SCEV *Start = SE.getSCEV(MainLoopStructure.IndVarStart); + const SCEV *End = SE.getSCEV(MainLoopStructure.LoopExitAt); + + bool Increasing = MainLoopStructure.IndVarIncreasing; + // We compute `Smallest` and `Greatest` such that [Smallest, Greatest) is the + // range of values the induction variable takes. + const SCEV *Smallest = + Increasing ? Start : SE.getAddExpr(End, SE.getSCEV(One)); + const SCEV *Greatest = + Increasing ? 
End : SE.getAddExpr(Start, SE.getSCEV(One)); + + auto Clamp = [this, Smallest, Greatest](const SCEV *S) { + return SE.getSMaxExpr(Smallest, SE.getSMinExpr(Greatest, S)); + }; + + // In some cases we can prove that we don't need a pre or post loop + + bool ProvablyNoPreloop = + SE.isKnownPredicate(ICmpInst::ICMP_SLE, Range.getBegin(), Smallest); + if (!ProvablyNoPreloop) + Result.LowLimit = Clamp(Range.getBegin()); + + bool ProvablyNoPostLoop = + SE.isKnownPredicate(ICmpInst::ICMP_SLE, Greatest, Range.getEnd()); + if (!ProvablyNoPostLoop) + Result.HighLimit = Clamp(Range.getEnd()); + + return Result; +} + +void LoopConstrainer::cloneLoop(LoopConstrainer::ClonedLoop &Result, + const char *Tag) const { + for (BasicBlock *BB : OriginalLoop.getBlocks()) { + BasicBlock *Clone = CloneBasicBlock(BB, Result.Map, Twine(".") + Tag, &F); + Result.Blocks.push_back(Clone); + Result.Map[BB] = Clone; + } + + auto GetClonedValue = [&Result](Value *V) { + assert(V && "null values not in domain!"); + auto It = Result.Map.find(V); + if (It == Result.Map.end()) + return V; + return static_cast<Value *>(It->second); + }; + + Result.Structure = MainLoopStructure.map(GetClonedValue); + Result.Structure.Tag = Tag; + + for (unsigned i = 0, e = Result.Blocks.size(); i != e; ++i) { + BasicBlock *ClonedBB = Result.Blocks[i]; + BasicBlock *OriginalBB = OriginalLoop.getBlocks()[i]; + + assert(Result.Map[OriginalBB] == ClonedBB && "invariant!"); + + for (Instruction &I : *ClonedBB) + RemapInstruction(&I, Result.Map, + RF_NoModuleLevelChanges | RF_IgnoreMissingEntries); + + // Exit blocks will now have one more predecessor and their PHI nodes need + // to be edited to reflect that. No phi nodes need to be introduced because + // the loop is in LCSSA. + + for (auto SBBI = succ_begin(OriginalBB), SBBE = succ_end(OriginalBB); + SBBI != SBBE; ++SBBI) { + + if (OriginalLoop.contains(*SBBI)) + continue; // not an exit block + + for (Instruction &I : **SBBI) { + if (!isa<PHINode>(&I)) + break; + + PHINode *PN = cast<PHINode>(&I); + Value *OldIncoming = PN->getIncomingValueForBlock(OriginalBB); + PN->addIncoming(GetClonedValue(OldIncoming), ClonedBB); + } + } + } +} + +LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd( + const LoopStructure &LS, BasicBlock *Preheader, Value *ExitSubloopAt, + BasicBlock *ContinuationBlock) const { + + // We start with a loop with a single latch: + // + // +--------------------+ + // | | + // | preheader | + // | | + // +--------+-----------+ + // | ----------------\ + // | / | + // +--------v----v------+ | + // | | | + // | header | | + // | | | + // +--------------------+ | + // | + // ..... | + // | + // +--------------------+ | + // | | | + // | latch >----------/ + // | | + // +-------v------------+ + // | + // | + // | +--------------------+ + // | | | + // +---> original exit | + // | | + // +--------------------+ + // + // We change the control flow to look like + // + // + // +--------------------+ + // | | + // | preheader >-------------------------+ + // | | | + // +--------v-----------+ | + // | /-------------+ | + // | / | | + // +--------v--v--------+ | | + // | | | | + // | header | | +--------+ | + // | | | | | | + // +--------------------+ | | +-----v-----v-----------+ + // | | | | + // | | | .pseudo.exit | + // | | | | + // | | +-----------v-----------+ + // | | | + // ..... 
| | | + // | | +--------v-------------+ + // +--------------------+ | | | | + // | | | | | ContinuationBlock | + // | latch >------+ | | | + // | | | +----------------------+ + // +---------v----------+ | + // | | + // | | + // | +---------------^-----+ + // | | | + // +-----> .exit.selector | + // | | + // +----------v----------+ + // | + // +--------------------+ | + // | | | + // | original exit <----+ + // | | + // +--------------------+ + // + + RewrittenRangeInfo RRI; + + auto BBInsertLocation = std::next(Function::iterator(LS.Latch)); + RRI.ExitSelector = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".exit.selector", + &F, BBInsertLocation); + RRI.PseudoExit = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".pseudo.exit", &F, + BBInsertLocation); + + BranchInst *PreheaderJump = cast<BranchInst>(&*Preheader->rbegin()); + bool Increasing = LS.IndVarIncreasing; + + IRBuilder<> B(PreheaderJump); + + // EnterLoopCond - is it okay to start executing this `LS'? + Value *EnterLoopCond = Increasing + ? B.CreateICmpSLT(LS.IndVarStart, ExitSubloopAt) + : B.CreateICmpSGT(LS.IndVarStart, ExitSubloopAt); + + B.CreateCondBr(EnterLoopCond, LS.Header, RRI.PseudoExit); + PreheaderJump->eraseFromParent(); + + LS.LatchBr->setSuccessor(LS.LatchBrExitIdx, RRI.ExitSelector); + B.SetInsertPoint(LS.LatchBr); + Value *TakeBackedgeLoopCond = + Increasing ? B.CreateICmpSLT(LS.IndVarNext, ExitSubloopAt) + : B.CreateICmpSGT(LS.IndVarNext, ExitSubloopAt); + Value *CondForBranch = LS.LatchBrExitIdx == 1 + ? TakeBackedgeLoopCond + : B.CreateNot(TakeBackedgeLoopCond); + + LS.LatchBr->setCondition(CondForBranch); + + B.SetInsertPoint(RRI.ExitSelector); + + // IterationsLeft - are there any more iterations left, given the original + // upper bound on the induction variable? If not, we branch to the "real" + // exit. + Value *IterationsLeft = Increasing + ? B.CreateICmpSLT(LS.IndVarNext, LS.LoopExitAt) + : B.CreateICmpSGT(LS.IndVarNext, LS.LoopExitAt); + B.CreateCondBr(IterationsLeft, RRI.PseudoExit, LS.LatchExit); + + BranchInst *BranchToContinuation = + BranchInst::Create(ContinuationBlock, RRI.PseudoExit); + + // We emit PHI nodes into `RRI.PseudoExit' that compute the "latest" value of + // each of the PHI nodes in the loop header. This feeds into the initial + // value of the same PHI nodes if/when we continue execution. + for (Instruction &I : *LS.Header) { + if (!isa<PHINode>(&I)) + break; + + PHINode *PN = cast<PHINode>(&I); + + PHINode *NewPHI = PHINode::Create(PN->getType(), 2, PN->getName() + ".copy", + BranchToContinuation); + + NewPHI->addIncoming(PN->getIncomingValueForBlock(Preheader), Preheader); + NewPHI->addIncoming(PN->getIncomingValueForBlock(LS.Latch), + RRI.ExitSelector); + RRI.PHIValuesAtPseudoExit.push_back(NewPHI); + } + + RRI.IndVarEnd = PHINode::Create(LS.IndVarNext->getType(), 2, "indvar.end", + BranchToContinuation); + RRI.IndVarEnd->addIncoming(LS.IndVarStart, Preheader); + RRI.IndVarEnd->addIncoming(LS.IndVarNext, RRI.ExitSelector); + + // The latch exit now has a branch from `RRI.ExitSelector' instead of + // `LS.Latch'. The PHI nodes need to be updated to reflect that. 
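+ // For a hypothetical LCSSA PHI in the latch exit, such as
+ //   %v.lcssa = phi i32 [ %v, %latch ]
+ // the incoming block is rewritten to name the exit selector instead:
+ //   %v.lcssa = phi i32 [ %v, %main.exit.selector ]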
+ for (Instruction &I : *LS.LatchExit) { + if (PHINode *PN = dyn_cast<PHINode>(&I)) + replacePHIBlock(PN, LS.Latch, RRI.ExitSelector); + else + break; + } + + return RRI; +} + +void LoopConstrainer::rewriteIncomingValuesForPHIs( + LoopStructure &LS, BasicBlock *ContinuationBlock, + const LoopConstrainer::RewrittenRangeInfo &RRI) const { + + unsigned PHIIndex = 0; + for (Instruction &I : *LS.Header) { + if (!isa<PHINode>(&I)) + break; + + PHINode *PN = cast<PHINode>(&I); + + for (unsigned i = 0, e = PN->getNumIncomingValues(); i < e; ++i) + if (PN->getIncomingBlock(i) == ContinuationBlock) + PN->setIncomingValue(i, RRI.PHIValuesAtPseudoExit[PHIIndex++]); + } + + LS.IndVarStart = RRI.IndVarEnd; +} + +BasicBlock *LoopConstrainer::createPreheader(const LoopStructure &LS, + BasicBlock *OldPreheader, + const char *Tag) const { + + BasicBlock *Preheader = BasicBlock::Create(Ctx, Tag, &F, LS.Header); + BranchInst::Create(LS.Header, Preheader); + + for (Instruction &I : *LS.Header) { + if (!isa<PHINode>(&I)) + break; + + PHINode *PN = cast<PHINode>(&I); + for (unsigned i = 0, e = PN->getNumIncomingValues(); i < e; ++i) + replacePHIBlock(PN, OldPreheader, Preheader); + } + + return Preheader; +} + +void LoopConstrainer::addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs) { + Loop *ParentLoop = OriginalLoop.getParentLoop(); + if (!ParentLoop) + return; + + for (BasicBlock *BB : BBs) + ParentLoop->addBasicBlockToLoop(BB, OriginalLoopInfo); +} + +bool LoopConstrainer::run() { + BasicBlock *Preheader = nullptr; + LatchTakenCount = SE.getExitCount(&OriginalLoop, MainLoopStructure.Latch); + Preheader = OriginalLoop.getLoopPreheader(); + assert(!isa<SCEVCouldNotCompute>(LatchTakenCount) && Preheader != nullptr && + "preconditions!"); + + OriginalPreheader = Preheader; + MainLoopPreheader = Preheader; + + Optional<SubRanges> MaybeSR = calculateSubRanges(); + if (!MaybeSR.hasValue()) { + DEBUG(dbgs() << "irce: could not compute subranges\n"); + return false; + } + + SubRanges SR = MaybeSR.getValue(); + bool Increasing = MainLoopStructure.IndVarIncreasing; + IntegerType *IVTy = + cast<IntegerType>(MainLoopStructure.IndVarNext->getType()); + + SCEVExpander Expander(SE, "irce"); + Instruction *InsertPt = OriginalPreheader->getTerminator(); + + // It would have been better to make `PreLoop' and `PostLoop' + // `Optional<ClonedLoop>'s, but `ValueToValueMapTy' does not have a copy + // constructor. + ClonedLoop PreLoop, PostLoop; + bool NeedsPreLoop = + Increasing ? SR.LowLimit.hasValue() : SR.HighLimit.hasValue(); + bool NeedsPostLoop = + Increasing ? SR.HighLimit.hasValue() : SR.LowLimit.hasValue(); + + Value *ExitPreLoopAt = nullptr; + Value *ExitMainLoopAt = nullptr; + const SCEVConstant *MinusOneS = + cast<SCEVConstant>(SE.getConstant(IVTy, -1, true /* isSigned */)); + + if (NeedsPreLoop) { + const SCEV *ExitPreLoopAtSCEV = nullptr; + + if (Increasing) + ExitPreLoopAtSCEV = *SR.LowLimit; + else { + if (CanBeSMin(SE, *SR.HighLimit)) { + DEBUG(dbgs() << "irce: could not prove no-overflow when computing " + << "preloop exit limit. 
HighLimit = " << *(*SR.HighLimit) + << "\n"); + return false; + } + ExitPreLoopAtSCEV = SE.getAddExpr(*SR.HighLimit, MinusOneS); + } + + ExitPreLoopAt = Expander.expandCodeFor(ExitPreLoopAtSCEV, IVTy, InsertPt); + ExitPreLoopAt->setName("exit.preloop.at"); + } + + if (NeedsPostLoop) { + const SCEV *ExitMainLoopAtSCEV = nullptr; + + if (Increasing) + ExitMainLoopAtSCEV = *SR.HighLimit; + else { + if (CanBeSMin(SE, *SR.LowLimit)) { + DEBUG(dbgs() << "irce: could not prove no-overflow when computing " + << "mainloop exit limit. LowLimit = " << *(*SR.LowLimit) + << "\n"); + return false; + } + ExitMainLoopAtSCEV = SE.getAddExpr(*SR.LowLimit, MinusOneS); + } + + ExitMainLoopAt = Expander.expandCodeFor(ExitMainLoopAtSCEV, IVTy, InsertPt); + ExitMainLoopAt->setName("exit.mainloop.at"); + } + + // We clone these ahead of time so that we don't have to deal with changing + // and temporarily invalid IR as we transform the loops. + if (NeedsPreLoop) + cloneLoop(PreLoop, "preloop"); + if (NeedsPostLoop) + cloneLoop(PostLoop, "postloop"); + + RewrittenRangeInfo PreLoopRRI; + + if (NeedsPreLoop) { + Preheader->getTerminator()->replaceUsesOfWith(MainLoopStructure.Header, + PreLoop.Structure.Header); + + MainLoopPreheader = + createPreheader(MainLoopStructure, Preheader, "mainloop"); + PreLoopRRI = changeIterationSpaceEnd(PreLoop.Structure, Preheader, + ExitPreLoopAt, MainLoopPreheader); + rewriteIncomingValuesForPHIs(MainLoopStructure, MainLoopPreheader, + PreLoopRRI); + } + + BasicBlock *PostLoopPreheader = nullptr; + RewrittenRangeInfo PostLoopRRI; + + if (NeedsPostLoop) { + PostLoopPreheader = + createPreheader(PostLoop.Structure, Preheader, "postloop"); + PostLoopRRI = changeIterationSpaceEnd(MainLoopStructure, MainLoopPreheader, + ExitMainLoopAt, PostLoopPreheader); + rewriteIncomingValuesForPHIs(PostLoop.Structure, PostLoopPreheader, + PostLoopRRI); + } + + BasicBlock *NewMainLoopPreheader = + MainLoopPreheader != Preheader ? MainLoopPreheader : nullptr; + BasicBlock *NewBlocks[] = {PostLoopPreheader, PreLoopRRI.PseudoExit, + PreLoopRRI.ExitSelector, PostLoopRRI.PseudoExit, + PostLoopRRI.ExitSelector, NewMainLoopPreheader}; + + // Some of the above may be nullptr, filter them out before passing to + // addToParentLoopIfNeeded. + auto NewBlocksEnd = + std::remove(std::begin(NewBlocks), std::end(NewBlocks), nullptr); + + addToParentLoopIfNeeded(makeArrayRef(std::begin(NewBlocks), NewBlocksEnd)); + addToParentLoopIfNeeded(PreLoop.Blocks); + addToParentLoopIfNeeded(PostLoop.Blocks); + + return true; +} + +/// Computes and returns a range of values for the induction variable (IndVar) +/// in which the range check can be safely elided. If it cannot compute such a +/// range, returns None. +Optional<InductiveRangeCheck::Range> +InductiveRangeCheck::computeSafeIterationSpace(ScalarEvolution &SE, + const SCEVAddRecExpr *IndVar, + IRBuilder<> &) const { + // IndVar is of the form "A + B * I" (where "I" is the canonical induction + // variable, that may or may not exist as a real llvm::Value in the loop) and + // this inductive range check is a range check on the "C + D * I" ("C" is + // getOffset() and "D" is getScale()). We rewrite the value being range + // checked to "M + N * IndVar" where "N" = "D * B^(-1)" and "M" = "C - NA". + // Currently we support this only for "B" = "D" = { 1 or -1 }, but the code + // can be generalized as needed. + // + // The actual inequalities we solve are of the form + // + // 0 <= M + 1 * IndVar < L given L >= 0 (i.e. 
N == 1) + // + // The inequality is satisfied by -M <= IndVar < (L - M) [^1]. All additions + // and subtractions are twos-complement wrapping and comparisons are signed. + // + // Proof: + // + // If there exists IndVar such that -M <= IndVar < (L - M) then it follows + // that -M <= (-M + L) [== Eq. 1]. Since L >= 0, if (-M + L) sign-overflows + // then (-M + L) < (-M). Hence by [Eq. 1], (-M + L) could not have + // overflown. + // + // This means IndVar = t + (-M) for t in [0, L). Hence (IndVar + M) = t. + // Hence 0 <= (IndVar + M) < L + + // [^1]: Note that the solution does _not_ apply if L < 0; consider values M = + // 127, IndVar = 126 and L = -2 in an i8 world. + + if (!IndVar->isAffine()) + return None; + + const SCEV *A = IndVar->getStart(); + const SCEVConstant *B = dyn_cast<SCEVConstant>(IndVar->getStepRecurrence(SE)); + if (!B) + return None; + + const SCEV *C = getOffset(); + const SCEVConstant *D = dyn_cast<SCEVConstant>(getScale()); + if (D != B) + return None; + + ConstantInt *ConstD = D->getValue(); + if (!(ConstD->isMinusOne() || ConstD->isOne())) + return None; + + const SCEV *M = SE.getMinusSCEV(C, A); + + const SCEV *Begin = SE.getNegativeSCEV(M); + const SCEV *End = SE.getMinusSCEV(SE.getSCEV(getLength()), M); + + return InductiveRangeCheck::Range(Begin, End); +} + +static Optional<InductiveRangeCheck::Range> +IntersectRange(ScalarEvolution &SE, + const Optional<InductiveRangeCheck::Range> &R1, + const InductiveRangeCheck::Range &R2, IRBuilder<> &B) { + if (!R1.hasValue()) + return R2; + auto &R1Value = R1.getValue(); + + // TODO: we could widen the smaller range and have this work; but for now we + // bail out to keep things simple. + if (R1Value.getType() != R2.getType()) + return None; + + const SCEV *NewBegin = SE.getSMaxExpr(R1Value.getBegin(), R2.getBegin()); + const SCEV *NewEnd = SE.getSMinExpr(R1Value.getEnd(), R2.getEnd()); + + return InductiveRangeCheck::Range(NewBegin, NewEnd); +} + +bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) { + if (L->getBlocks().size() >= LoopSizeCutoff) { + DEBUG(dbgs() << "irce: giving up constraining loop, too large\n";); + return false; + } + + BasicBlock *Preheader = L->getLoopPreheader(); + if (!Preheader) { + DEBUG(dbgs() << "irce: loop has no preheader, leaving\n"); + return false; + } + + LLVMContext &Context = Preheader->getContext(); + InductiveRangeCheck::AllocatorTy IRCAlloc; + SmallVector<InductiveRangeCheck *, 16> RangeChecks; + ScalarEvolution &SE = getAnalysis<ScalarEvolution>(); + BranchProbabilityInfo &BPI = getAnalysis<BranchProbabilityInfo>(); + + for (auto BBI : L->getBlocks()) + if (BranchInst *TBI = dyn_cast<BranchInst>(BBI->getTerminator())) + if (InductiveRangeCheck *IRC = + InductiveRangeCheck::create(IRCAlloc, TBI, L, SE, BPI)) + RangeChecks.push_back(IRC); + + if (RangeChecks.empty()) + return false; + + DEBUG(dbgs() << "irce: looking at loop "; L->print(dbgs()); + dbgs() << "irce: loop has " << RangeChecks.size() + << " inductive range checks: \n"; + for (InductiveRangeCheck *IRC : RangeChecks) + IRC->print(dbgs()); + ); + + const char *FailureReason = nullptr; + Optional<LoopStructure> MaybeLoopStructure = + LoopStructure::parseLoopStructure(SE, BPI, *L, FailureReason); + if (!MaybeLoopStructure.hasValue()) { + DEBUG(dbgs() << "irce: could not parse loop structure: " << FailureReason + << "\n";); + return false; + } + LoopStructure LS = MaybeLoopStructure.getValue(); + bool Increasing = LS.IndVarIncreasing; + const SCEV *MinusOne = + 
SE.getConstant(LS.IndVarNext->getType(), Increasing ? -1 : 1, true); + const SCEVAddRecExpr *IndVar = + cast<SCEVAddRecExpr>(SE.getAddExpr(SE.getSCEV(LS.IndVarNext), MinusOne)); + + Optional<InductiveRangeCheck::Range> SafeIterRange; + Instruction *ExprInsertPt = Preheader->getTerminator(); + + SmallVector<InductiveRangeCheck *, 4> RangeChecksToEliminate; + + IRBuilder<> B(ExprInsertPt); + for (InductiveRangeCheck *IRC : RangeChecks) { + auto Result = IRC->computeSafeIterationSpace(SE, IndVar, B); + if (Result.hasValue()) { + auto MaybeSafeIterRange = + IntersectRange(SE, SafeIterRange, Result.getValue(), B); + if (MaybeSafeIterRange.hasValue()) { + RangeChecksToEliminate.push_back(IRC); + SafeIterRange = MaybeSafeIterRange.getValue(); + } + } + } + + if (!SafeIterRange.hasValue()) + return false; + + LoopConstrainer LC(*L, getAnalysis<LoopInfoWrapperPass>().getLoopInfo(), LS, + SE, SafeIterRange.getValue()); + bool Changed = LC.run(); + + if (Changed) { + auto PrintConstrainedLoopInfo = [L]() { + dbgs() << "irce: in function "; + dbgs() << L->getHeader()->getParent()->getName() << ": "; + dbgs() << "constrained "; + L->print(dbgs()); + }; + + DEBUG(PrintConstrainedLoopInfo()); + + if (PrintChangedLoops) + PrintConstrainedLoopInfo(); + + // Optimize away the now-redundant range checks. + + for (InductiveRangeCheck *IRC : RangeChecksToEliminate) { + ConstantInt *FoldedRangeCheck = IRC->getPassingDirection() + ? ConstantInt::getTrue(Context) + : ConstantInt::getFalse(Context); + IRC->getBranch()->setCondition(FoldedRangeCheck); + } + } + + return Changed; +} + +Pass *llvm::createInductiveRangeCheckEliminationPass() { + return new InductiveRangeCheckElimination; +} diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index 60a4925..8b54abd 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -32,7 +32,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" @@ -115,7 +115,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<LazyValueInfo>(); AU.addPreserved<LazyValueInfo>(); - AU.addRequired<TargetLibraryInfo>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } void FindLoopHeaders(Function &F); @@ -145,7 +145,7 @@ char JumpThreading::ID = 0; INITIALIZE_PASS_BEGIN(JumpThreading, "jump-threading", "Jump Threading", false, false) INITIALIZE_PASS_DEPENDENCY(LazyValueInfo) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(JumpThreading, "jump-threading", "Jump Threading", false, false) @@ -161,7 +161,7 @@ bool JumpThreading::runOnFunction(Function &F) { DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n"); DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); DL = DLP ? &DLP->getDataLayout() : nullptr; - TLI = &getAnalysis<TargetLibraryInfo>(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); LVI = &getAnalysis<LazyValueInfo>(); // Remove unreachable blocks from function as they may result in infinite @@ -188,7 +188,7 @@ bool JumpThreading::runOnFunction(Function &F) { // If the block is trivially dead, zap it. 
This eliminates the successor // edges which simplifies the CFG. - if (pred_begin(BB) == pred_end(BB) && + if (pred_empty(BB) && BB != &BB->getParent()->getEntryBlock()) { DEBUG(dbgs() << " JT: Deleting dead block '" << BB->getName() << "' with terminator: " << *BB->getTerminator() << '\n'); @@ -662,7 +662,7 @@ static bool hasAddressTakenAndUsed(BasicBlock *BB) { bool JumpThreading::ProcessBlock(BasicBlock *BB) { // If the block is trivially dead, just return and let the caller nuke it. // This simplifies other transformations. - if (pred_begin(BB) == pred_end(BB) && + if (pred_empty(BB) && BB != &BB->getParent()->getEntryBlock()) return false; @@ -797,7 +797,7 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { } } else if (CondBr && CondConst && CondBr->isConditional()) { - // There might be an invairant in the same block with the conditional + // There might be an invariant in the same block with the conditional // that can determine the predicate. LazyValueInfo::Tristate Ret = @@ -902,8 +902,8 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // only happen in dead loops. if (AvailableVal == LI) AvailableVal = UndefValue::get(LI->getType()); if (AvailableVal->getType() != LI->getType()) - AvailableVal = CastInst::Create(CastInst::BitCast, AvailableVal, - LI->getType(), "", LI); + AvailableVal = + CastInst::CreateBitOrPointerCast(AvailableVal, LI->getType(), "", LI); LI->replaceAllUsesWith(AvailableVal); LI->eraseFromParent(); return true; @@ -993,7 +993,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // Split them out to their own block. UnavailablePred = - SplitBlockPredecessors(LoadBB, PredsToSplit, "thread-pre-split", this); + SplitBlockPredecessors(LoadBB, PredsToSplit, "thread-pre-split"); } // If the value isn't available in all predecessors, then there will be @@ -1040,8 +1040,8 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // predecessor use the same bitcast. Value *&PredV = I->second; if (PredV->getType() != LI->getType()) - PredV = CastInst::Create(CastInst::BitCast, PredV, LI->getType(), "", - P->getTerminator()); + PredV = CastInst::CreateBitOrPointerCast(PredV, LI->getType(), "", + P->getTerminator()); PN->addIncoming(PredV, I->first); } @@ -1418,7 +1418,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, else { DEBUG(dbgs() << " Factoring out " << PredBBs.size() << " common predecessors.\n"); - PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm", this); + PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm"); } // And finally, do it! @@ -1561,7 +1561,7 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, else { DEBUG(dbgs() << " Factoring out " << PredBBs.size() << " common predecessors.\n"); - PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm", this); + PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm"); } // Okay, we decided to do this! 
Clone all the instructions in BB onto the end @@ -1575,7 +1575,7 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, BranchInst *OldPredBranch = dyn_cast<BranchInst>(PredBB->getTerminator()); if (!OldPredBranch || !OldPredBranch->isUnconditional()) { - PredBB = SplitEdge(PredBB, BB, this); + PredBB = SplitEdge(PredBB, BB); OldPredBranch = cast<BranchInst>(PredBB->getTerminator()); } diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index 5f00bb9..14af38b 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -52,7 +52,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" @@ -71,6 +71,27 @@ static cl::opt<bool> DisablePromotion("disable-licm-promotion", cl::Hidden, cl::desc("Disable memory promotion in LICM pass")); +static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI); +static bool isNotUsedInLoop(Instruction &I, Loop *CurLoop); +static bool hoist(Instruction &I, BasicBlock *Preheader); +static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, + Loop *CurLoop, AliasSetTracker *CurAST ); +static bool isGuaranteedToExecute(Instruction &Inst, DominatorTree *DT, + Loop *CurLoop, LICMSafetyInfo * SafetyInfo); +static bool isSafeToExecuteUnconditionally(Instruction &Inst,DominatorTree *DT, + const DataLayout *DL, Loop *CurLoop, + LICMSafetyInfo * SafetyInfo); +static bool pointerInvalidatedByLoop(Value *V, uint64_t Size, + const AAMDNodes &AAInfo, + AliasSetTracker *CurAST); +static Instruction *CloneInstructionInExitBlock(Instruction &I, + BasicBlock &ExitBlock, + PHINode &PN, LoopInfo *LI); +static bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, + DominatorTree *DT, const DataLayout *DL, + Loop *CurLoop, AliasSetTracker *CurAST, + LICMSafetyInfo * SafetyInfo); + namespace { struct LICM : public LoopPass { static char ID; // Pass identification, replacement for typeid @@ -86,7 +107,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfo>(); + AU.addRequired<LoopInfoWrapperPass>(); AU.addRequiredID(LoopSimplifyID); AU.addPreservedID(LoopSimplifyID); AU.addRequiredID(LCSSAID); @@ -94,7 +115,7 @@ namespace { AU.addRequired<AliasAnalysis>(); AU.addPreserved<AliasAnalysis>(); AU.addPreserved<ScalarEvolution>(); - AU.addRequired<TargetLibraryInfo>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } using llvm::Pass::doFinalization; @@ -117,9 +138,6 @@ namespace { BasicBlock *Preheader; // The preheader block of the current loop... Loop *CurLoop; // The current loop we are working on... AliasSetTracker *CurAST; // AliasSet information for the current loop... - bool MayThrow; // The current loop contains an instruction which - // may throw, thus preventing code motion of - // instructions with side effects. DenseMap<Loop*, AliasSetTracker*> LoopToAliasSetMap; /// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info. @@ -132,88 +150,17 @@ namespace { /// Simple Analysis hook. Delete loop L from alias set map. 
void deleteAnalysisLoop(Loop *L) override; - - /// SinkRegion - Walk the specified region of the CFG (defined by all blocks - /// dominated by the specified block, and that are in the current loop) in - /// reverse depth first order w.r.t the DominatorTree. This allows us to - /// visit uses before definitions, allowing us to sink a loop body in one - /// pass without iteration. - /// - void SinkRegion(DomTreeNode *N); - - /// HoistRegion - Walk the specified region of the CFG (defined by all - /// blocks dominated by the specified block, and that are in the current - /// loop) in depth first order w.r.t the DominatorTree. This allows us to - /// visit definitions before uses, allowing us to hoist a loop body in one - /// pass without iteration. - /// - void HoistRegion(DomTreeNode *N); - - /// inSubLoop - Little predicate that returns true if the specified basic - /// block is in a subloop of the current one, not the current one itself. - /// - bool inSubLoop(BasicBlock *BB) { - assert(CurLoop->contains(BB) && "Only valid if BB is IN the loop"); - return LI->getLoopFor(BB) != CurLoop; - } - - /// sink - When an instruction is found to only be used outside of the loop, - /// this function moves it to the exit blocks and patches up SSA form as - /// needed. - /// - void sink(Instruction &I); - - /// hoist - When an instruction is found to only use loop invariant operands - /// that is safe to hoist, this instruction is called to do the dirty work. - /// - void hoist(Instruction &I); - - /// isSafeToExecuteUnconditionally - Only sink or hoist an instruction if it - /// is not a trapping instruction or if it is a trapping instruction and is - /// guaranteed to execute. - /// - bool isSafeToExecuteUnconditionally(Instruction &I); - - /// isGuaranteedToExecute - Check that the instruction is guaranteed to - /// execute. - /// - bool isGuaranteedToExecute(Instruction &I); - - /// pointerInvalidatedByLoop - Return true if the body of this loop may - /// store into the memory location pointed to by V. - /// - bool pointerInvalidatedByLoop(Value *V, uint64_t Size, - const AAMDNodes &AAInfo) { - // Check to see if any of the basic blocks in CurLoop invalidate *V. - return CurAST->getAliasSetForPointer(V, Size, AAInfo).isMod(); - } - - bool canSinkOrHoistInst(Instruction &I); - bool isNotUsedInLoop(Instruction &I); - - void PromoteAliasSet(AliasSet &AS, - SmallVectorImpl<BasicBlock*> &ExitBlocks, - SmallVectorImpl<Instruction*> &InsertPts, - PredIteratorCache &PIC); - - /// \brief Create a copy of the instruction in the exit block and patch up - /// SSA. - /// PN is a user of I in ExitBlock that can be used to get the number and - /// list of predecessors fast. - Instruction *CloneInstructionInExitBlock(Instruction &I, - BasicBlock &ExitBlock, - PHINode &PN); }; } char LICM::ID = 0; INITIALIZE_PASS_BEGIN(LICM, "licm", "Loop Invariant Code Motion", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_END(LICM, "licm", "Loop Invariant Code Motion", false, false) @@ -230,13 +177,13 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { Changed = false; // Get our Loop and Alias Analysis information... 
- LI = &getAnalysis<LoopInfo>(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); AA = &getAnalysis<AliasAnalysis>(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); DL = DLP ? &DLP->getDataLayout() : nullptr; - TLI = &getAnalysis<TargetLibraryInfo>(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form."); @@ -273,14 +220,9 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { CurAST->add(*BB); // Incorporate the specified basic block } - MayThrow = false; - // TODO: We've already searched for instructions which may throw in subloops. - // We may want to reuse this information. - for (Loop::block_iterator BB = L->block_begin(), BBE = L->block_end(); - (BB != BBE) && !MayThrow ; ++BB) - for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); - (I != E) && !MayThrow; ++I) - MayThrow |= I->mayThrow(); + // Compute loop safety information. + LICMSafetyInfo SafetyInfo; + computeLICMSafetyInfo(&SafetyInfo, CurLoop); // We want to visit all of the instructions in this loop... that are not parts // of our subloops (they have already had their invariants hoisted out of @@ -293,9 +235,11 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { // instructions, we perform another pass to hoist them out of the loop. // if (L->hasDedicatedExits()) - SinkRegion(DT->getNode(L->getHeader())); + Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, DL, TLI, + CurLoop, CurAST, &SafetyInfo); if (Preheader) - HoistRegion(DT->getNode(L->getHeader())); + Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, DL, TLI, + CurLoop, CurAST, &SafetyInfo); // Now that all loop invariants have been removed from the loop, promote any // memory references to scalars that we can. @@ -307,7 +251,9 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { // Loop over all of the alias sets in the tracker object. for (AliasSetTracker::iterator I = CurAST->begin(), E = CurAST->end(); I != E; ++I) - PromoteAliasSet(*I, ExitBlocks, InsertPts, PIC); + Changed |= promoteLoopAccessesToScalars(*I, ExitBlocks, InsertPts, + PIC, LI, DT, CurLoop, + CurAST, &SafetyInfo); // Once we have promoted values across the loop body we have to recursively // reform LCSSA as any nested loop may now have values defined within the @@ -316,7 +262,8 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { // SSAUpdater strategy during promotion that was LCSSA aware and reformed // it as it went. if (Changed) - formLCSSARecursively(*L, *DT, getAnalysisIfAvailable<ScalarEvolution>()); + formLCSSARecursively(*L, *DT, LI, + getAnalysisIfAvailable<ScalarEvolution>()); } // Check that neither this loop nor its parent have had LCSSA broken. LICM is @@ -339,27 +286,36 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { return Changed; } -/// SinkRegion - Walk the specified region of the CFG (defined by all blocks -/// dominated by the specified block, and that are in the current loop) in -/// reverse depth first order w.r.t the DominatorTree. This allows us to visit -/// uses before definitions, allowing us to sink a loop body in one pass without -/// iteration. +/// Walk the specified region of the CFG (defined by all blocks dominated by +/// the specified block, and that are in the current loop) in reverse depth +/// first order w.r.t the DominatorTree. This allows us to visit uses before +/// definitions, allowing us to sink a loop body in one pass without iteration. 
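+/// For illustration, assuming a hypothetical value computed inside the loop
+/// but only used after it, sinkRegion moves that computation into the exit
+/// block(s), cloning it once per exit block when there are several.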
/// -void LICM::SinkRegion(DomTreeNode *N) { - assert(N != nullptr && "Null dominator tree node?"); +bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, + DominatorTree *DT, const DataLayout *DL, + TargetLibraryInfo *TLI, Loop *CurLoop, + AliasSetTracker *CurAST, LICMSafetyInfo * SafetyInfo) { + + // Verify inputs. + assert(N != nullptr && AA != nullptr && LI != nullptr && + DT != nullptr && CurLoop != nullptr && CurAST != nullptr && + SafetyInfo != nullptr && "Unexpected input to sinkRegion"); + + // Set changed as false. + bool Changed = false; + // Get basic block BasicBlock *BB = N->getBlock(); - // If this subregion is not in the top level loop at all, exit. - if (!CurLoop->contains(BB)) return; + if (!CurLoop->contains(BB)) return Changed; // We are processing blocks in reverse dfo, so process children first. const std::vector<DomTreeNode*> &Children = N->getChildren(); for (unsigned i = 0, e = Children.size(); i != e; ++i) - SinkRegion(Children[i]); - + Changed |= sinkRegion(Children[i], AA, LI, DT, DL, TLI, CurLoop, + CurAST, SafetyInfo); // Only need to process the contents of this block if it is not part of a // subloop (which would already have been processed). - if (inSubLoop(BB)) return; + if (inSubLoop(BB,CurLoop,LI)) return Changed; for (BasicBlock::iterator II = BB->end(); II != BB->begin(); ) { Instruction &I = *--II; @@ -380,31 +336,39 @@ void LICM::SinkRegion(DomTreeNode *N) { // outside of the loop. In this case, it doesn't even matter if the // operands of the instruction are loop invariant. // - if (isNotUsedInLoop(I) && canSinkOrHoistInst(I)) { + if (isNotUsedInLoop(I, CurLoop) && + canSinkOrHoistInst(I, AA, DT, DL, CurLoop, CurAST, SafetyInfo)) { ++II; - sink(I); + Changed |= sink(I, LI, DT, CurLoop, CurAST); } } + return Changed; } -/// HoistRegion - Walk the specified region of the CFG (defined by all blocks -/// dominated by the specified block, and that are in the current loop) in depth -/// first order w.r.t the DominatorTree. This allows us to visit definitions -/// before uses, allowing us to hoist a loop body in one pass without iteration. +/// Walk the specified region of the CFG (defined by all blocks dominated by +/// the specified block, and that are in the current loop) in depth first +/// order w.r.t the DominatorTree. This allows us to visit definitions before +/// uses, allowing us to hoist a loop body in one pass without iteration. /// -void LICM::HoistRegion(DomTreeNode *N) { - assert(N != nullptr && "Null dominator tree node?"); +bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, + DominatorTree *DT, const DataLayout *DL, + TargetLibraryInfo *TLI, Loop *CurLoop, + AliasSetTracker *CurAST, LICMSafetyInfo *SafetyInfo) { + // Verify inputs. + assert(N != nullptr && AA != nullptr && LI != nullptr && + DT != nullptr && CurLoop != nullptr && CurAST != nullptr && + SafetyInfo != nullptr && "Unexpected input to hoistRegion"); + // Set changed as false. + bool Changed = false; + // Get basic block BasicBlock *BB = N->getBlock(); - // If this subregion is not in the top level loop at all, exit. - if (!CurLoop->contains(BB)) return; - + if (!CurLoop->contains(BB)) return Changed; // Only need to process the contents of this block if it is not part of a // subloop (which would already have been processed). - if (!inSubLoop(BB)) + if (!inSubLoop(BB, CurLoop, LI)) for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ) { Instruction &I = *II++; - // Try constant folding this instruction. 
If all the operands are
// constants, it is technically hoistable, but it would be better to just
// fold it.
@@ -421,20 +385,49 @@ void LICM::HoistRegion(DomTreeNode *N) {
// if all of the operands of the instruction are loop invariant and if it
// is safe to hoist the instruction.
//
- if (CurLoop->hasLoopInvariantOperands(&I) && canSinkOrHoistInst(I) &&
- isSafeToExecuteUnconditionally(I))
- hoist(I);
+ if (CurLoop->hasLoopInvariantOperands(&I) &&
+ canSinkOrHoistInst(I, AA, DT, DL, CurLoop, CurAST, SafetyInfo) &&
+ isSafeToExecuteUnconditionally(I, DT, DL, CurLoop, SafetyInfo))
+ Changed |= hoist(I, CurLoop->getLoopPreheader());
}
const std::vector<DomTreeNode*> &Children = N->getChildren();
for (unsigned i = 0, e = Children.size(); i != e; ++i)
- HoistRegion(Children[i]);
+ Changed |= hoistRegion(Children[i], AA, LI, DT, DL, TLI, CurLoop,
+ CurAST, SafetyInfo);
+ return Changed;
+}
+
+/// Computes loop safety information, checking the loop body and header
+/// for instructions that may throw an exception.
+///
+void llvm::computeLICMSafetyInfo(LICMSafetyInfo * SafetyInfo, Loop * CurLoop) {
+ assert(CurLoop != nullptr && "CurLoop can't be null");
+ BasicBlock *Header = CurLoop->getHeader();
+ // Setting default safety values.
+ SafetyInfo->MayThrow = false;
+ SafetyInfo->HeaderMayThrow = false;
+ // Iterate over header and compute safety info.
+ for (BasicBlock::iterator I = Header->begin(), E = Header->end();
+ (I != E) && !SafetyInfo->HeaderMayThrow; ++I)
+ SafetyInfo->HeaderMayThrow |= I->mayThrow();
+
+ SafetyInfo->MayThrow = SafetyInfo->HeaderMayThrow;
+ // Iterate over loop instructions and compute safety info.
+ for (Loop::block_iterator BB = CurLoop->block_begin(),
+ BBE = CurLoop->block_end(); (BB != BBE) && !SafetyInfo->MayThrow; ++BB)
+ for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end();
+ (I != E) && !SafetyInfo->MayThrow; ++I)
+ SafetyInfo->MayThrow |= I->mayThrow();
+}
/// canSinkOrHoistInst - Return true if the hoister and sinker can handle this
/// instruction.
///
-bool LICM::canSinkOrHoistInst(Instruction &I) {
+bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA,
+ DominatorTree *DT, const DataLayout *DL,
+ Loop *CurLoop, AliasSetTracker *CurAST,
+ LICMSafetyInfo * SafetyInfo) {
// Loads have extra constraints we have to verify before we can hoist them.
if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
if (!LI->isUnordered())
return false;
@@ -455,7 +448,7 @@ bool LICM::canSinkOrHoistInst(Instruction &I) {
AAMDNodes AAInfo;
LI->getAAMetadata(AAInfo);
- return !pointerInvalidatedByLoop(LI->getOperand(0), Size, AAInfo);
+ return !pointerInvalidatedByLoop(LI->getOperand(0), Size, AAInfo, CurAST);
} else if (CallInst *CI = dyn_cast<CallInst>(&I)) {
// Don't sink or hoist dbg info; it's legal, but not useful.
if (isa<DbgInfoIntrinsic>(I))
return false;
@@ -494,14 +487,14 @@ bool LICM::canSinkOrHoistInst(Instruction &I) {
!isa<InsertValueInst>(I))
return false;
- return isSafeToExecuteUnconditionally(I);
+ return isSafeToExecuteUnconditionally(I, DT, DL, CurLoop, SafetyInfo);
}
-/// \brief Returns true if a PHINode is a trivially replaceable with an
+/// Returns true if a PHINode is trivially replaceable with an
/// Instruction.
+/// This is true when all incoming values are that instruction.
+/// This pattern occurs most often with LCSSA PHI nodes.
///
-/// This is true when all incoming values are that instruction. This pattern
-/// occurs most often with LCSSA PHI nodes.
static bool isTriviallyReplacablePHI(PHINode &PN, Instruction &I) {
for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)
if (PN.getIncomingValue(i) != &I)
return false;
@@ -510,11 +503,11 @@ static bool isTriviallyReplacablePHI(PHINode &PN, Instruction &I) {
return true;
}
-/// isNotUsedInLoop - Return true if the only users of this instruction are
-/// outside of the loop. If this is true, we can sink the instruction to the
-/// exit blocks of the loop.
+/// Return true if the only users of this instruction are outside of
+/// the loop. If this is true, we can sink the instruction to the exit
+/// blocks of the loop.
///
-bool LICM::isNotUsedInLoop(Instruction &I) {
+static bool isNotUsedInLoop(Instruction &I, Loop *CurLoop) {
for (User *U : I.users()) {
Instruction *UI = cast<Instruction>(U);
if (PHINode *PN = dyn_cast<PHINode>(UI)) {
@@ -545,9 +538,9 @@ bool LICM::isNotUsedInLoop(Instruction &I) {
return true;
}
-Instruction *LICM::CloneInstructionInExitBlock(Instruction &I,
- BasicBlock &ExitBlock,
- PHINode &PN) {
+static Instruction *CloneInstructionInExitBlock(Instruction &I,
+ BasicBlock &ExitBlock,
+ PHINode &PN, LoopInfo *LI) {
Instruction *New = I.clone();
ExitBlock.getInstList().insert(ExitBlock.getFirstInsertionPt(), New);
if (!I.getName().empty()) New->setName(I.getName() + ".le");
@@ -574,14 +567,15 @@ Instruction *LICM::CloneInstructionInExitBlock(Instruction &I,
return New;
}
-/// sink - When an instruction is found to only be used outside of the loop,
-/// this function moves it to the exit blocks and patches up SSA form as needed.
+/// When an instruction is found to only be used outside of the loop, this
+/// function moves it to the exit blocks and patches up SSA form as needed.
/// This method is guaranteed to remove the original instruction from its
/// position, and may either delete it or move it to outside of the loop.
///
-void LICM::sink(Instruction &I) {
+static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
+ Loop *CurLoop, AliasSetTracker *CurAST) {
DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
-
+ bool Changed = false;
if (isa<LoadInst>(I)) ++NumMovedLoads;
else if (isa<CallInst>(I)) ++NumMovedCalls;
++NumSunk;
@@ -590,7 +584,8 @@ void LICM::sink(Instruction &I) {
#ifndef NDEBUG
SmallVector<BasicBlock *, 32> ExitBlocks;
CurLoop->getUniqueExitBlocks(ExitBlocks);
- SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(), ExitBlocks.end());
+ SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(),
+ ExitBlocks.end());
#endif
// Clones of this instruction. Don't create more than one per exit block!
@@ -618,7 +613,7 @@ void LICM::sink(Instruction &I) {
New = It->second;
else
New = SunkCopies[ExitBlock] =
- CloneInstructionInExitBlock(I, *ExitBlock, *PN);
+ CloneInstructionInExitBlock(I, *ExitBlock, *PN, LI);
PN->replaceAllUsesWith(New);
PN->eraseFromParent();
@@ -626,44 +621,41 @@ void LICM::sink(Instruction &I) {
CurAST->deleteValue(&I);
I.eraseFromParent();
+ return Changed;
}
-/// hoist - When an instruction is found to only use loop invariant operands
-/// that is safe to hoist, this instruction is called to do the dirty work.
+/// When an instruction is found to only use loop invariant operands and
+/// is safe to hoist, this function is called to do the dirty work.
///
-void LICM::hoist(Instruction &I) {
+static bool hoist(Instruction &I, BasicBlock *Preheader) {
DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": " << I
<< "\n");
-
// Move the new node to the Preheader, before its terminator.
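// For a hypothetical loop-invariant computation such as
//   %x = add i32 %a, %b
// the instruction itself is moved rather than cloned, so every use inside
// the loop now reads the hoisted value from the preheader.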
I.moveBefore(Preheader->getTerminator()); if (isa<LoadInst>(I)) ++NumMovedLoads; else if (isa<CallInst>(I)) ++NumMovedCalls; ++NumHoisted; - Changed = true; + return true; } -/// isSafeToExecuteUnconditionally - Only sink or hoist an instruction if it is -/// not a trapping instruction or if it is a trapping instruction and is -/// guaranteed to execute. +/// Only sink or hoist an instruction if it is not a trapping instruction +/// or if it is a trapping instruction and is guaranteed to execute. /// -bool LICM::isSafeToExecuteUnconditionally(Instruction &Inst) { +static bool isSafeToExecuteUnconditionally(Instruction &Inst, DominatorTree *DT, + const DataLayout *DL, Loop *CurLoop, + LICMSafetyInfo * SafetyInfo) { // If it is not a trapping instruction, it is always safe to hoist. if (isSafeToSpeculativelyExecute(&Inst, DL)) return true; - return isGuaranteedToExecute(Inst); + return isGuaranteedToExecute(Inst, DT, CurLoop, SafetyInfo); } -bool LICM::isGuaranteedToExecute(Instruction &Inst) { - - // Somewhere in this loop there is an instruction which may throw and make us - // exit the loop. - if (MayThrow) - return false; +static bool isGuaranteedToExecute(Instruction &Inst, DominatorTree *DT, + Loop *CurLoop, LICMSafetyInfo * SafetyInfo) { - // Otherwise we have to check to make sure that the instruction dominates all + // We have to check to make sure that the instruction dominates all // of the exit blocks. If it doesn't, then there is a path out of the loop // which does not execute this instruction, so we can't hoist it. @@ -671,7 +663,14 @@ bool LICM::isGuaranteedToExecute(Instruction &Inst) { // common), it is always guaranteed to dominate the exit blocks. Since this // is a common case, and can save some work, check it now. if (Inst.getParent() == CurLoop->getHeader()) - return true; + // If there's a throw in the header block, we can't guarantee we'll reach + // Inst. + return !SafetyInfo->HeaderMayThrow; + + // Somewhere in this loop there is an instruction which may throw and make us + // exit the loop. + if (SafetyInfo->MayThrow) + return false; // Get the exit blocks for the current loop. SmallVector<BasicBlock*, 8> ExitBlocks; @@ -768,25 +767,37 @@ namespace { }; } // end anon namespace -/// PromoteAliasSet - Try to promote memory values to scalars by sinking -/// stores out of the loop and moving loads to before the loop. We do this by -/// looping over the stores in the loop, looking for stores to Must pointers -/// which are loop invariant. +/// Try to promote memory values to scalars by sinking stores out of the +/// loop and moving loads to before the loop. We do this by looping over +/// the stores in the loop, looking for stores to Must pointers which are +/// loop invariant. /// -void LICM::PromoteAliasSet(AliasSet &AS, - SmallVectorImpl<BasicBlock*> &ExitBlocks, - SmallVectorImpl<Instruction*> &InsertPts, - PredIteratorCache &PIC) { +bool llvm::promoteLoopAccessesToScalars(AliasSet &AS, + SmallVectorImpl<BasicBlock*>&ExitBlocks, + SmallVectorImpl<Instruction*>&InsertPts, + PredIteratorCache &PIC, LoopInfo *LI, + DominatorTree *DT, Loop *CurLoop, + AliasSetTracker *CurAST, + LICMSafetyInfo * SafetyInfo) { + // Verify inputs. + assert(LI != nullptr && DT != nullptr && + CurLoop != nullptr && CurAST != nullptr && + SafetyInfo != nullptr && + "Unexpected Input to promoteLoopAccessesToScalars"); + // Initially set Changed status to false. 
+ bool Changed = false; // We can promote this alias set if it has a store, if it is a "Must" alias // set, if the pointer is loop invariant, and if we are not eliminating any // volatile loads or stores. if (AS.isForwardingAliasSet() || !AS.isMod() || !AS.isMustAlias() || AS.isVolatile() || !CurLoop->isLoopInvariant(AS.begin()->getValue())) - return; + return Changed; assert(!AS.empty() && "Must alias set should have at least one pointer element in it!"); + Value *SomePtr = AS.begin()->getValue(); + BasicBlock * Preheader = CurLoop->getLoopPreheader(); // It isn't safe to promote a load/store from the loop if the load/store is // conditional. For example, turning: @@ -810,6 +821,7 @@ void LICM::PromoteAliasSet(AliasSet &AS, // us to prove better alignment. unsigned Alignment = 1; AAMDNodes AATags; + bool HasDedicatedExits = CurLoop->hasDedicatedExits(); // Check that all of the pointers in the alias set have the same type. We // cannot (yet) promote a memory location that is loaded and stored in @@ -822,7 +834,7 @@ void LICM::PromoteAliasSet(AliasSet &AS, // cannot (yet) promote a memory location that is loaded and stored in // different sizes. if (SomePtr->getType() != ASIV->getType()) - return; + return Changed; for (User *U : ASIV->users()) { // Ignore instructions that are outside the loop. @@ -835,7 +847,7 @@ void LICM::PromoteAliasSet(AliasSet &AS, if (LoadInst *load = dyn_cast<LoadInst>(UI)) { assert(!load->isVolatile() && "AST broken"); if (!load->isSimple()) - return; + return Changed; } else if (StoreInst *store = dyn_cast<StoreInst>(UI)) { // Stores *of* the pointer are not interesting, only stores *to* the // pointer. @@ -843,7 +855,14 @@ void LICM::PromoteAliasSet(AliasSet &AS, continue; assert(!store->isVolatile() && "AST broken"); if (!store->isSimple()) - return; + return Changed; + // Don't sink stores from loops without dedicated block exits. Exits + // containing indirect branches are not transformed by loop simplify, + // make sure we catch that. An additional load may be generated in the + // preheader for SSA updater, so also avoid sinking when no preheader + // is available. + if (!HasDedicatedExits || !Preheader) + return Changed; // Note that we only check GuaranteedToExecute inside the store case // so that we do not introduce stores where they did not exist before @@ -855,16 +874,17 @@ void LICM::PromoteAliasSet(AliasSet &AS, // Larger is better, with the exception of 0 being the best alignment. unsigned InstAlignment = store->getAlignment(); if ((InstAlignment > Alignment || InstAlignment == 0) && Alignment != 0) - if (isGuaranteedToExecute(*UI)) { + if (isGuaranteedToExecute(*UI, DT, CurLoop, SafetyInfo)) { GuaranteedToExecute = true; Alignment = InstAlignment; } if (!GuaranteedToExecute) - GuaranteedToExecute = isGuaranteedToExecute(*UI); + GuaranteedToExecute = isGuaranteedToExecute(*UI, DT, + CurLoop, SafetyInfo); } else - return; // Not a load or store. + return Changed; // Not a load or store. // Merge the AA tags. if (LoopUses.empty()) { @@ -880,7 +900,7 @@ void LICM::PromoteAliasSet(AliasSet &AS, // If there isn't a guaranteed-to-execute instruction, we can't promote. if (!GuaranteedToExecute) - return; + return Changed; // Otherwise, this is safe to promote, lets do it! DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " <<*SomePtr<<'\n'); @@ -925,10 +945,12 @@ void LICM::PromoteAliasSet(AliasSet &AS, // If the SSAUpdater didn't use the load in the preheader, just zap it now. 
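For readers unfamiliar with the transformation promoteLoopAccessesToScalars performs, here is a source-level before/after sketch (plain C++, not the IR the pass actually sees): when the store is guaranteed to execute, the memory cell lives in a register for the whole loop, with one load in the preheader and one store in each dedicated exit.

#include <cstdio>

static void before(int *p, const int *a, int n) {
  for (int i = 0; i < n; ++i)
    *p += a[i];            // load + store of *p on every iteration
}

static void after(int *p, const int *a, int n) {
  int t = *p;              // load hoisted into the preheader
  for (int i = 0; i < n; ++i)
    t += a[i];             // loop now runs entirely in a register
  *p = t;                  // store sunk to the (dedicated) exit
}

int main() {
  int x = 0, y = 0, a[4] = {1, 2, 3, 4};
  before(&x, a, 4);
  after(&y, a, 4);
  std::printf("%d %d\n", x, y); // 10 10
}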
if (PreheaderLoad->use_empty()) PreheaderLoad->eraseFromParent(); -} + return Changed; +} -/// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info. +/// Simple Analysis hook. Clone alias set info. +/// void LICM::cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, Loop *L) { AliasSetTracker *AST = LoopToAliasSetMap.lookup(L); if (!AST) @@ -937,8 +959,8 @@ void LICM::cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, Loop *L) { AST->copyValue(From, To); } -/// deleteAnalysisValue - Simple Analysis hook. Delete value V from alias -/// set. +/// Simple Analysis hook. Delete value V from alias set +/// void LICM::deleteAnalysisValue(Value *V, Loop *L) { AliasSetTracker *AST = LoopToAliasSetMap.lookup(L); if (!AST) @@ -948,6 +970,7 @@ void LICM::deleteAnalysisValue(Value *V, Loop *L) { } /// Simple Analysis hook. Delete value L from alias set map. +/// void LICM::deleteAnalysisLoop(Loop *L) { AliasSetTracker *AST = LoopToAliasSetMap.lookup(L); if (!AST) @@ -956,3 +979,23 @@ void LICM::deleteAnalysisLoop(Loop *L) { delete AST; LoopToAliasSetMap.erase(L); } + + +/// Return true if the body of this loop may store into the memory +/// location pointed to by V. +/// +static bool pointerInvalidatedByLoop(Value *V, uint64_t Size, + const AAMDNodes &AAInfo, + AliasSetTracker *CurAST) { + // Check to see if any of the basic blocks in CurLoop invalidate *V. + return CurAST->getAliasSetForPointer(V, Size, AAInfo).isMod(); +} + +/// Little predicate that returns true if the specified basic block is in +/// a subloop of the current one, not the current one itself. +/// +static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI) { + assert(CurLoop->contains(BB) && "Only valid if BB is IN the loop"); + return LI->getLoopFor(BB) != CurLoop; +} + diff --git a/lib/Transforms/Scalar/LLVMBuild.txt b/lib/Transforms/Scalar/LLVMBuild.txt index 2bb49a3..deea9e2 100644 --- a/lib/Transforms/Scalar/LLVMBuild.txt +++ b/lib/Transforms/Scalar/LLVMBuild.txt @@ -20,4 +20,4 @@ type = Library name = Scalar parent = Transforms library_name = ScalarOpts -required_libraries = Analysis Core InstCombine ProfileData Support Target TransformUtils +required_libraries = Analysis Core InstCombine ProfileData Support TransformUtils diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp index 1d1f33a..98b068e 100644 --- a/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/lib/Transforms/Scalar/LoopDeletion.cpp @@ -39,14 +39,14 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfo>(); + AU.addRequired<LoopInfoWrapperPass>(); AU.addRequired<ScalarEvolution>(); AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); AU.addPreserved<ScalarEvolution>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<LoopInfo>(); + AU.addPreserved<LoopInfoWrapperPass>(); AU.addPreservedID(LoopSimplifyID); AU.addPreservedID(LCSSAID); } @@ -63,7 +63,7 @@ char LoopDeletion::ID = 0; INITIALIZE_PASS_BEGIN(LoopDeletion, "loop-deletion", "Delete dead loops", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) @@ -236,7 +236,7 @@ bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &LPM) { // Finally, the blocks from loopinfo. 
This has to happen late because // otherwise our loop iterators won't work. - LoopInfo &loopInfo = getAnalysis<LoopInfo>(); + LoopInfo &loopInfo = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); SmallPtrSet<BasicBlock*, 8> blocks; blocks.insert(L->block_begin(), L->block_end()); for (BasicBlock *BB : blocks) diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index a12f5a7..243c624 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -56,7 +56,7 @@ #include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -163,8 +163,8 @@ namespace { /// loop preheaders be inserted into the CFG. /// void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<LoopInfo>(); - AU.addPreserved<LoopInfo>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); AU.addRequiredID(LoopSimplifyID); AU.addPreservedID(LoopSimplifyID); AU.addRequiredID(LCSSAID); @@ -175,8 +175,8 @@ namespace { AU.addPreserved<ScalarEvolution>(); AU.addPreserved<DominatorTreeWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<TargetLibraryInfo>(); - AU.addRequired<TargetTransformInfo>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); } const DataLayout *getDataLayout() { @@ -197,11 +197,16 @@ namespace { } TargetLibraryInfo *getTargetLibraryInfo() { - return TLI ? TLI : (TLI = &getAnalysis<TargetLibraryInfo>()); + if (!TLI) + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + + return TLI; } const TargetTransformInfo *getTargetTransformInfo() { - return TTI ? TTI : (TTI = &getAnalysis<TargetTransformInfo>()); + return TTI ? TTI + : (TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( + *CurLoop->getHeader()->getParent())); } Loop *getLoop() const { return CurLoop; } @@ -215,14 +220,14 @@ namespace { char LoopIdiomRecognize::ID = 0; INITIALIZE_PASS_BEGIN(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms", false, false) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) -INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms", false, false) @@ -232,44 +237,13 @@ Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognize(); } /// and zero out all the operands of this instruction. If any of them become /// dead, delete them and the computation tree that feeds them. /// -static void deleteDeadInstruction(Instruction *I, ScalarEvolution &SE, +static void deleteDeadInstruction(Instruction *I, const TargetLibraryInfo *TLI) { - SmallVector<Instruction*, 32> NowDeadInsts; - - NowDeadInsts.push_back(I); - - // Before we touch this instruction, remove it from SE! - do { - Instruction *DeadInst = NowDeadInsts.pop_back_val(); - - // This instruction is dead, zap it, in stages. Start by removing it from - // SCEV. 
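The hand-rolled staged deletion being removed above is replaced by a much shorter helper (visible just below): replace all uses with undef, erase the instruction, then let RecursivelyDeleteTriviallyDeadInstructions clean up any operands that became dead. A toy standalone model of that cascade, with a hypothetical Node in place of Instruction and "trivially dead" meaning no users and no side effects:

#include <cstdio>
#include <vector>

struct Node {
  const char *Name;
  int Uses = 0;
  bool SideEffects = false;
  std::vector<Node *> Operands;
  bool Erased = false;
};

static void recursivelyDeleteTriviallyDead(Node *N) {
  if (N->Erased || N->Uses != 0 || N->SideEffects)
    return;
  N->Erased = true;
  std::printf("erasing %s\n", N->Name);
  for (Node *Op : N->Operands) {
    --Op->Uses; // this user is gone
    recursivelyDeleteTriviallyDead(Op);
  }
}

int main() {
  Node A{"a"}, B{"b"}, Root{"root"};
  Root.Operands = {&A, &B};
  A.Uses = B.Uses = 1; // only the root uses them
  // Deleting the root cascades to its now-unused operands.
  recursivelyDeleteTriviallyDead(&Root);
}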
- SE.forgetValue(DeadInst); - - for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) { - Value *Op = DeadInst->getOperand(op); - DeadInst->setOperand(op, nullptr); - - // If this operand just became dead, add it to the NowDeadInsts list. - if (!Op->use_empty()) continue; - - if (Instruction *OpI = dyn_cast<Instruction>(Op)) - if (isInstructionTriviallyDead(OpI, TLI)) - NowDeadInsts.push_back(OpI); - } - - DeadInst->eraseFromParent(); - - } while (!NowDeadInsts.empty()); -} - -/// deleteIfDeadInstruction - If the specified value is a dead instruction, -/// delete it and any recursively used instructions. -static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE, - const TargetLibraryInfo *TLI) { - if (Instruction *I = dyn_cast<Instruction>(V)) - if (isInstructionTriviallyDead(I, TLI)) - deleteDeadInstruction(I, SE, TLI); + SmallVector<Value *, 16> Operands(I->value_op_begin(), I->value_op_end()); + I->replaceAllUsesWith(UndefValue::get(I->getType())); + I->eraseFromParent(); + for (Value *Op : Operands) + RecursivelyDeleteTriviallyDeadInstructions(Op, TLI); } //===----------------------------------------------------------------------===// @@ -285,7 +259,7 @@ static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE, // the concern of breaking data dependence. bool LIRUtil::isAlmostEmpty(BasicBlock *BB) { if (BranchInst *Br = getBranch(BB)) { - return Br->isUnconditional() && BB->size() == 1; + return Br->isUnconditional() && Br == BB->begin(); } return false; } @@ -542,7 +516,7 @@ void NclPopcountRecognize::transform(Instruction *CntInst, cast<ICmpInst>(Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1)); PreCond->replaceAllUsesWith(NewPreCond); - deleteDeadInstruction(PreCond, *SE, TLI); + RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI); } // Step 3: Note that the population count is exactly the trip count of the @@ -592,15 +566,7 @@ void NclPopcountRecognize::transform(Instruction *CntInst, // Step 4: All the references to the original population counter outside // the loop are replaced with the NewCount -- the value returned from // __builtin_ctpop(). - { - SmallVector<Value *, 4> CntUses; - for (User *U : CntInst->users()) - if (cast<Instruction>(U)->getParent() != Body) - CntUses.push_back(U); - for (unsigned Idx = 0; Idx < CntUses.size(); Idx++) { - (cast<Instruction>(CntUses[Idx]))->replaceUsesOfWith(CntInst, NewCount); - } - } + CntInst->replaceUsesOutsideBlock(NewCount, Body); // step 5: Forget the "non-computable" trip-count SCEV associated with the // loop. The loop would otherwise not be deleted even if it becomes empty. @@ -666,8 +632,8 @@ bool LoopIdiomRecognize::runOnCountableLoop() { // set DT (void)getDominatorTree(); - LoopInfo &LI = getAnalysis<LoopInfo>(); - TLI = &getAnalysis<TargetLibraryInfo>(); + LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); // set TLI (void)getTargetLibraryInfo(); @@ -997,7 +963,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, StoreSize, getAnalysis<AliasAnalysis>(), TheStore)) { Expander.clear(); // If we generated new code for the base pointer, clean up. - deleteIfDeadInstruction(BasePtr, *SE, TLI); + RecursivelyDeleteTriviallyDeadInstructions(BasePtr, TLI); return false; } @@ -1053,7 +1019,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, // Okay, the memset has been formed. Zap the original store and anything that // feeds into it. 
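For context on the idiom being formed in these hunks, a source-level before/after sketch of the store pattern LoopIdiomRecognize turns into a memset (plain C++, not the pass's actual output):

#include <cstdio>
#include <cstring>

static void before(unsigned char *p, int n) {
  for (int i = 0; i < n; ++i)
    p[i] = 0; // unit-stride store of a loop-invariant byte value
}

static void after(unsigned char *p, int n) {
  if (n > 0)
    std::memset(p, 0, (size_t)n); // the recognized idiom
}

int main() {
  unsigned char a[8], b[8];
  std::memset(a, 0xff, 8);
  std::memset(b, 0xff, 8);
  before(a, 8);
  after(b, 8);
  std::printf("%d %d\n", a[0], b[0]); // 0 0
}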
- deleteDeadInstruction(TheStore, *SE, TLI); + deleteDeadInstruction(TheStore, TLI); ++NumMemSet; return true; } @@ -1094,7 +1060,7 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, getAnalysis<AliasAnalysis>(), SI)) { Expander.clear(); // If we generated new code for the base pointer, clean up. - deleteIfDeadInstruction(StoreBasePtr, *SE, TLI); + RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI); return false; } @@ -1109,8 +1075,8 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, StoreSize, getAnalysis<AliasAnalysis>(), SI)) { Expander.clear(); // If we generated new code for the base pointer, clean up. - deleteIfDeadInstruction(LoadBasePtr, *SE, TLI); - deleteIfDeadInstruction(StoreBasePtr, *SE, TLI); + RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI); + RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI); return false; } @@ -1143,7 +1109,7 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, // Okay, the memset has been formed. Zap the original store and anything that // feeds into it. - deleteDeadInstruction(SI, *SE, TLI); + deleteDeadInstruction(SI, TLI); ++NumMemCpy; return true; } diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp index 8fd7c8f..6dc600e 100644 --- a/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -14,15 +14,16 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AssumptionTracker.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/Support/Debug.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -42,13 +43,13 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<AssumptionTracker>(); - AU.addRequired<LoopInfo>(); + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<LoopInfoWrapperPass>(); AU.addRequiredID(LoopSimplifyID); AU.addPreservedID(LoopSimplifyID); AU.addPreservedID(LCSSAID); - AU.addPreserved("scalar-evolution"); - AU.addRequired<TargetLibraryInfo>(); + AU.addPreserved<ScalarEvolution>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } }; } @@ -56,10 +57,10 @@ namespace { char LoopInstSimplify::ID = 0; INITIALIZE_PASS_BEGIN(LoopInstSimplify, "loop-instsimplify", "Simplify instructions in loops", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionTracker) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_END(LoopInstSimplify, "loop-instsimplify", "Simplify instructions in loops", false, false) @@ -75,11 +76,13 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { DominatorTreeWrapperPass *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); DominatorTree *DT = DTWP ? 
&DTWP->getDomTree() : nullptr; - LoopInfo *LI = &getAnalysis<LoopInfo>(); + LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr; - const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); - AssumptionTracker *AT = &getAnalysis<AssumptionTracker>(); + const TargetLibraryInfo *TLI = + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache( + *L->getHeader()->getParent()); SmallVector<BasicBlock*, 8> ExitBlocks; L->getUniqueExitBlocks(ExitBlocks); @@ -120,7 +123,7 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // Don't bother simplifying unused instructions. if (!I->use_empty()) { - Value *V = SimplifyInstruction(I, DL, TLI, DT, AT); + Value *V = SimplifyInstruction(I, DL, TLI, DT, &AC); if (V && LI->replacementPreservesLCSSAForm(I, V)) { // Mark all uses for resimplification next time round the loop. for (User *U : I->users()) diff --git a/lib/Transforms/Scalar/LoopRerollPass.cpp b/lib/Transforms/Scalar/LoopRerollPass.cpp index 8f12204..fdf7e3b 100644 --- a/lib/Transforms/Scalar/LoopRerollPass.cpp +++ b/lib/Transforms/Scalar/LoopRerollPass.cpp @@ -12,7 +12,9 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -28,7 +30,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" @@ -43,6 +45,12 @@ static cl::opt<unsigned> MaxInc("max-reroll-increment", cl::init(2048), cl::Hidden, cl::desc("The maximum increment for loop rerolling")); +static cl::opt<unsigned> +NumToleratedFailedMatches("reroll-num-tolerated-failed-matches", cl::init(400), + cl::Hidden, + cl::desc("The maximum number of failures to tolerate" + " during fuzzy matching. (default: 400)")); + // This loop re-rolling transformation aims to transform loops like this: // // int foo(int a); @@ -119,6 +127,16 @@ MaxInc("max-reroll-increment", cl::init(2048), cl::Hidden, // br %cmp, header, exit namespace { + enum IterationLimits { + /// The maximum number of iterations that we'll try and reroll. This + /// has to be less than 25 in order to fit into a SmallBitVector. + IL_MaxRerollIterations = 16, + /// The bitvector index used by loop induction variables and other + /// instructions that belong to all iterations. 
+ IL_All, + IL_End + }; + class LoopReroll : public LoopPass { public: static char ID; // Pass ID, replacement for typeid @@ -130,15 +148,15 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AliasAnalysis>(); - AU.addRequired<LoopInfo>(); - AU.addPreserved<LoopInfo>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); AU.addRequired<ScalarEvolution>(); - AU.addRequired<TargetLibraryInfo>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } -protected: + protected: AliasAnalysis *AA; LoopInfo *LI; ScalarEvolution *SE; @@ -311,26 +329,116 @@ protected: DenseSet<int> Reds; }; + // A DAGRootSet models an induction variable being used in a rerollable + // loop. For example, + // + // x[i*3+0] = y1 + // x[i*3+1] = y2 + // x[i*3+2] = y3 + // + // Base instruction -> i*3 + // +---+----+ + // / | \ + // ST[y1] +1 +2 <-- Roots + // | | + // ST[y2] ST[y3] + // + // There may be multiple DAGRoots, for example: + // + // x[i*2+0] = ... (1) + // x[i*2+1] = ... (1) + // x[i*2+4] = ... (2) + // x[i*2+5] = ... (2) + // x[(i+1234)*2+5678] = ... (3) + // x[(i+1234)*2+5679] = ... (3) + // + // The loop will be rerolled by adding a new loop induction variable, + // one for the Base instruction in each DAGRootSet. + // + struct DAGRootSet { + Instruction *BaseInst; + SmallInstructionVector Roots; + // The instructions between IV and BaseInst (but not including BaseInst). + SmallInstructionSet SubsumedInsts; + }; + + // The set of all DAG roots, and state tracking of all roots + // for a particular induction variable. + struct DAGRootTracker { + DAGRootTracker(LoopReroll *Parent, Loop *L, Instruction *IV, + ScalarEvolution *SE, AliasAnalysis *AA, + TargetLibraryInfo *TLI, const DataLayout *DL) + : Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI), + DL(DL), IV(IV) { + } + + /// Stage 1: Find all the DAG roots for the induction variable. + bool findRoots(); + /// Stage 2: Validate if the found roots are valid. + bool validate(ReductionTracker &Reductions); + /// Stage 3: Assuming validate() returned true, perform the + /// replacement. + /// @param IterCount The maximum iteration count of L. + void replace(const SCEV *IterCount); + + protected: + typedef MapVector<Instruction*, SmallBitVector> UsesTy; + + bool findRootsRecursive(Instruction *IVU, + SmallInstructionSet SubsumedInsts); + bool findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts); + bool collectPossibleRoots(Instruction *Base, + std::map<int64_t,Instruction*> &Roots); + + bool collectUsedInstructions(SmallInstructionSet &PossibleRedSet); + void collectInLoopUserSet(const SmallInstructionVector &Roots, + const SmallInstructionSet &Exclude, + const SmallInstructionSet &Final, + DenseSet<Instruction *> &Users); + void collectInLoopUserSet(Instruction *Root, + const SmallInstructionSet &Exclude, + const SmallInstructionSet &Final, + DenseSet<Instruction *> &Users); + + UsesTy::iterator nextInstr(int Val, UsesTy &In, + const SmallInstructionSet &Exclude, + UsesTy::iterator *StartI=nullptr); + bool isBaseInst(Instruction *I); + bool isRootInst(Instruction *I); + bool instrDependsOn(Instruction *I, + UsesTy::iterator Start, + UsesTy::iterator End); + + LoopReroll *Parent; + + // Members of Parent, replicated here for brevity. + Loop *L; + ScalarEvolution *SE; + AliasAnalysis *AA; + TargetLibraryInfo *TLI; + const DataLayout *DL; + + // The loop induction variable. 
+ Instruction *IV; + // Loop step amount. + uint64_t Inc; + // Loop reroll count; if Inc == 1, this records the scaling applied + // to the indvar: a[i*2+0] = ...; a[i*2+1] = ... ; + // If Inc is not 1, Scale = Inc. + uint64_t Scale; + // The roots themselves. + SmallVector<DAGRootSet,16> RootSets; + // All increment instructions for IV. + SmallInstructionVector LoopIncs; + // Map of all instructions in the loop (in order) to the iterations + // they are used in (or specially, IL_All for instructions + // used in the loop increment mechanism). + UsesTy Uses; + }; + void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs); void collectPossibleReductions(Loop *L, ReductionTracker &Reductions); - void collectInLoopUserSet(Loop *L, - const SmallInstructionVector &Roots, - const SmallInstructionSet &Exclude, - const SmallInstructionSet &Final, - DenseSet<Instruction *> &Users); - void collectInLoopUserSet(Loop *L, - Instruction * Root, - const SmallInstructionSet &Exclude, - const SmallInstructionSet &Final, - DenseSet<Instruction *> &Users); - bool findScaleFromMul(Instruction *RealIV, uint64_t &Scale, - Instruction *&IV, - SmallInstructionVector &LoopIncs); - bool collectAllRoots(Loop *L, uint64_t Inc, uint64_t Scale, Instruction *IV, - SmallVector<SmallInstructionVector, 32> &Roots, - SmallInstructionSet &AllRoots, - SmallInstructionVector &LoopIncs); bool reroll(Instruction *IV, Loop *L, BasicBlock *Header, const SCEV *IterCount, ReductionTracker &Reductions); }; @@ -339,10 +447,10 @@ protected: char LoopReroll::ID = 0; INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(LoopReroll, "loop-reroll", "Reroll loops", false, false) Pass *llvm::createLoopRerollPass() { @@ -353,10 +461,10 @@ Pass *llvm::createLoopRerollPass() { // This operates like Instruction::isUsedOutsideOfBlock, but considers PHIs in // non-loop blocks to be outside the loop. static bool hasUsesOutsideLoop(Instruction *I, Loop *L) { - for (User *U : I->users()) + for (User *U : I->users()) { if (!L->contains(cast<Instruction>(U))) return true; - + } return false; } @@ -403,6 +511,8 @@ void LoopReroll::SimpleLoopReduction::add(Loop *L) { // (including the PHI), except for the last value (which is used by the PHI // and also outside the loop). Instruction *C = Instructions.front(); + if (C->user_empty()) + return; do { C = cast<Instruction>(*C->user_begin()); @@ -424,11 +534,12 @@ void LoopReroll::SimpleLoopReduction::add(Loop *L) { return; // C is now the (potential) last instruction in the reduction chain. - for (User *U : C->users()) + for (User *U : C->users()) { // The only in-loop user can be the initial PHI. if (L->contains(cast<Instruction>(U))) if (cast<Instruction>(U) != Instructions.front()) return; + } Instructions.push_back(C); Valid = true; @@ -467,7 +578,7 @@ void LoopReroll::collectPossibleReductions(Loop *L, // if they are users, but their users are not added. This is used, for // example, to prevent a reduction update from forcing all later reduction // updates into the use set. 
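The Uses map declared in the new DAGRootTracker above drives the whole analysis: every instruction in the loop body is tagged with the iteration(s) that reference it, with the dedicated IL_All bit for the increment machinery shared by all iterations. A standalone model of that bookkeeping and of the "exactly one iteration" validity check, with std::bitset standing in for SmallBitVector and strings standing in for instructions:

#include <bitset>
#include <cstdio>
#include <map>
#include <string>

// 16 tracked iterations (indexes 0..15), plus one shared "all" bit.
constexpr int AllIdx = 16, End = 17;

int main() {
  std::map<std::string, std::bitset<End>> Uses;
  Uses["x[3*i]=..."].set(0);
  Uses["x[3*i+1]=..."].set(1);
  Uses["x[3*i+2]=..."].set(2);
  Uses["i=i+1"].set(AllIdx); // loop increment: belongs to all iterations
  bool OK = true;
  for (auto &KV : Uses)
    if (KV.second.count() != 1) // used by zero or several iterations: bail
      OK = false;
  std::printf("rerollable: %d\n", OK);
}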
-void LoopReroll::collectInLoopUserSet(Loop *L, +void LoopReroll::DAGRootTracker::collectInLoopUserSet( Instruction *Root, const SmallInstructionSet &Exclude, const SmallInstructionSet &Final, DenseSet<Instruction *> &Users) { @@ -504,14 +615,14 @@ void LoopReroll::collectInLoopUserSet(Loop *L, // Collect all of the users of all of the provided root instructions (combined // into a single set). -void LoopReroll::collectInLoopUserSet(Loop *L, +void LoopReroll::DAGRootTracker::collectInLoopUserSet( const SmallInstructionVector &Roots, const SmallInstructionSet &Exclude, const SmallInstructionSet &Final, DenseSet<Instruction *> &Users) { for (SmallInstructionVector::const_iterator I = Roots.begin(), IE = Roots.end(); I != IE; ++I) - collectInLoopUserSet(L, *I, Exclude, Final, Users); + collectInLoopUserSet(*I, Exclude, Final, Users); } static bool isSimpleLoadStore(Instruction *I) { @@ -524,289 +635,372 @@ static bool isSimpleLoadStore(Instruction *I) { return false; } -// Recognize loops that are setup like this: -// -// %iv = phi [ (preheader, ...), (body, %iv.next) ] -// %scaled.iv = mul %iv, scale -// f(%scaled.iv) -// %scaled.iv.1 = add %scaled.iv, 1 -// f(%scaled.iv.1) -// %scaled.iv.2 = add %scaled.iv, 2 -// f(%scaled.iv.2) -// %scaled.iv.scale_m_1 = add %scaled.iv, scale-1 -// f(%scaled.iv.scale_m_1) -// ... -// %iv.next = add %iv, 1 -// %cmp = icmp(%iv, ...) -// br %cmp, header, exit -// -// and, if found, set IV = %scaled.iv, and add %iv.next to LoopIncs. -bool LoopReroll::findScaleFromMul(Instruction *RealIV, uint64_t &Scale, - Instruction *&IV, - SmallInstructionVector &LoopIncs) { - // This is a special case: here we're looking for all uses (except for - // the increment) to be multiplied by a common factor. The increment must - // be by one. This is to capture loops like: - // for (int i = 0; i < 500; ++i) { - // foo(3*i); foo(3*i+1); foo(3*i+2); - // } - if (RealIV->getNumUses() != 2) - return false; - const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(RealIV)); - Instruction *User1 = cast<Instruction>(*RealIV->user_begin()), - *User2 = cast<Instruction>(*std::next(RealIV->user_begin())); - if (!SE->isSCEVable(User1->getType()) || !SE->isSCEVable(User2->getType())) - return false; - const SCEVAddRecExpr *User1SCEV = - dyn_cast<SCEVAddRecExpr>(SE->getSCEV(User1)), - *User2SCEV = - dyn_cast<SCEVAddRecExpr>(SE->getSCEV(User2)); - if (!User1SCEV || !User1SCEV->isAffine() || - !User2SCEV || !User2SCEV->isAffine()) +/// Return true if IVU is a "simple" arithmetic operation. +/// This is used for narrowing the search space for DAGRoots; only arithmetic +/// and GEPs can be part of a DAGRoot. +static bool isSimpleArithmeticOp(User *IVU) { + if (Instruction *I = dyn_cast<Instruction>(IVU)) { + switch (I->getOpcode()) { + default: return false; + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + case Instruction::Shl: + case Instruction::AShr: + case Instruction::LShr: + case Instruction::GetElementPtr: + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + return true; + } + } + return false; +} + +static bool isLoopIncrement(User *U, Instruction *IV) { + BinaryOperator *BO = dyn_cast<BinaryOperator>(U); + if (!BO || BO->getOpcode() != Instruction::Add) return false; - // We assume below that User1 is the scale multiply and User2 is the - // increment. If this can't be true, then swap them. 
- if (User1SCEV == RealIVSCEV->getPostIncExpr(*SE)) { - std::swap(User1, User2); - std::swap(User1SCEV, User2SCEV); + for (auto *UU : BO->users()) { + PHINode *PN = dyn_cast<PHINode>(UU); + if (PN && PN == IV) + return true; } + return false; +} + +bool LoopReroll::DAGRootTracker:: +collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) { + SmallInstructionVector BaseUsers; + + for (auto *I : Base->users()) { + ConstantInt *CI = nullptr; + + if (isLoopIncrement(I, IV)) { + LoopIncs.push_back(cast<Instruction>(I)); + continue; + } + + // The root nodes must be either GEPs, ORs or ADDs. + if (auto *BO = dyn_cast<BinaryOperator>(I)) { + if (BO->getOpcode() == Instruction::Add || + BO->getOpcode() == Instruction::Or) + CI = dyn_cast<ConstantInt>(BO->getOperand(1)); + } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) { + Value *LastOperand = GEP->getOperand(GEP->getNumOperands()-1); + CI = dyn_cast<ConstantInt>(LastOperand); + } + + if (!CI) { + if (Instruction *II = dyn_cast<Instruction>(I)) { + BaseUsers.push_back(II); + continue; + } else { + DEBUG(dbgs() << "LRR: Aborting due to non-instruction: " << *I << "\n"); + return false; + } + } + + int64_t V = CI->getValue().getSExtValue(); + if (Roots.find(V) != Roots.end()) + // No duplicates, please. + return false; - if (User2SCEV != RealIVSCEV->getPostIncExpr(*SE)) + // FIXME: Add support for negative values. + if (V < 0) { + DEBUG(dbgs() << "LRR: Aborting due to negative value: " << V << "\n"); + return false; + } + + Roots[V] = cast<Instruction>(I); + } + + if (Roots.empty()) return false; - assert(User2SCEV->getStepRecurrence(*SE)->isOne() && - "Invalid non-unit step for multiplicative scaling"); - LoopIncs.push_back(User2); - - if (const SCEVConstant *MulScale = - dyn_cast<SCEVConstant>(User1SCEV->getStepRecurrence(*SE))) { - // Make sure that both the start and step have the same multiplier. - if (RealIVSCEV->getStart()->getType() != MulScale->getType()) + + // If we found non-loop-inc, non-root users of Base, assume they are + // for the zeroth root index. This is because "add %a, 0" gets optimized + // away. + if (BaseUsers.size()) { + if (Roots.find(0) != Roots.end()) { + DEBUG(dbgs() << "LRR: Multiple roots found for base - aborting!\n"); return false; - if (SE->getMulExpr(RealIVSCEV->getStart(), MulScale) != - User1SCEV->getStart()) + } + Roots[0] = Base; + } + + // Calculate the number of users of the base, or lowest indexed, iteration. + unsigned NumBaseUses = BaseUsers.size(); + if (NumBaseUses == 0) + NumBaseUses = Roots.begin()->second->getNumUses(); + + // Check that every node has the same number of users. + for (auto &KV : Roots) { + if (KV.first == 0) + continue; + if (KV.second->getNumUses() != NumBaseUses) { + DEBUG(dbgs() << "LRR: Aborting - Root and Base #users not the same: " + << "#Base=" << NumBaseUses << ", #Root=" << + KV.second->getNumUses() << "\n"); return false; + } + } + + return true; +} - ConstantInt *MulScaleCI = MulScale->getValue(); - if (!MulScaleCI->uge(2) || MulScaleCI->uge(MaxInc)) +bool LoopReroll::DAGRootTracker:: +findRootsRecursive(Instruction *I, SmallInstructionSet SubsumedInsts) { + // Does the user look like it could be part of a root set? + // All its users must be simple arithmetic ops. 
+ if (I->getNumUses() > IL_MaxRerollIterations) + return false; + + if ((I->getOpcode() == Instruction::Mul || + I->getOpcode() == Instruction::PHI) && + I != IV && + findRootsBase(I, SubsumedInsts)) + return true; + + SubsumedInsts.insert(I); + + for (User *V : I->users()) { + Instruction *I = dyn_cast<Instruction>(V); + if (std::find(LoopIncs.begin(), LoopIncs.end(), I) != LoopIncs.end()) + continue; + + if (!I || !isSimpleArithmeticOp(I) || + !findRootsRecursive(I, SubsumedInsts)) return false; - Scale = MulScaleCI->getZExtValue(); - IV = User1; - } else + } + return true; +} + +bool LoopReroll::DAGRootTracker:: +findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts) { + + // The base instruction needs to be a multiply so + // that we can erase it. + if (IVU->getOpcode() != Instruction::Mul && + IVU->getOpcode() != Instruction::PHI) return false; - DEBUG(dbgs() << "LRR: Found possible scaling " << *User1 << "\n"); + std::map<int64_t, Instruction*> V; + if (!collectPossibleRoots(IVU, V)) + return false; + + // If we didn't get a root for index zero, then IVU must be + // subsumed. + if (V.find(0) == V.end()) + SubsumedInsts.insert(IVU); + + // Partition the vector into monotonically increasing indexes. + DAGRootSet DRS; + DRS.BaseInst = nullptr; + + for (auto &KV : V) { + if (!DRS.BaseInst) { + DRS.BaseInst = KV.second; + DRS.SubsumedInsts = SubsumedInsts; + } else if (DRS.Roots.empty()) { + DRS.Roots.push_back(KV.second); + } else if (V.find(KV.first - 1) != V.end()) { + DRS.Roots.push_back(KV.second); + } else { + // Linear sequence terminated. + RootSets.push_back(DRS); + DRS.BaseInst = KV.second; + DRS.SubsumedInsts = SubsumedInsts; + DRS.Roots.clear(); + } + } + RootSets.push_back(DRS); + return true; } -// Collect all root increments with respect to the provided induction variable -// (normally the PHI, but sometimes a multiply). A root increment is an -// instruction, normally an add, with a positive constant less than Scale. In a -// rerollable loop, each of these increments is the root of an instruction -// graph isomorphic to the others. Also, we collect the final induction -// increment (the increment equal to the Scale), and its users in LoopIncs. 
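findRootsBase above partitions the constant offsets it collected into maximal runs of consecutive indexes, one DAGRootSet per run, relying on std::map's ordered traversal. A standalone model of just that partitioning step (names stand in for the root instructions):

#include <cstdio>
#include <map>
#include <vector>

int main() {
  // offset from the base -> the user of base+offset (modeled as a name)
  std::map<long, const char *> Roots = {
      {0, "s0"}, {1, "s1"}, {4, "s4"}, {5, "s5"}};
  std::vector<std::vector<const char *>> RootSets;
  long Prev = 0;
  for (auto &KV : Roots) {
    if (RootSets.empty() || KV.first != Prev + 1)
      RootSets.emplace_back(); // a run of consecutive offsets ended
    RootSets.back().push_back(KV.second);
    Prev = KV.first;
  }
  std::printf("%zu root sets\n", RootSets.size()); // 2: {0,1} and {4,5}
}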
-bool LoopReroll::collectAllRoots(Loop *L, uint64_t Inc, uint64_t Scale, - Instruction *IV, - SmallVector<SmallInstructionVector, 32> &Roots, - SmallInstructionSet &AllRoots, - SmallInstructionVector &LoopIncs) { - for (User *U : IV->users()) { - Instruction *UI = cast<Instruction>(U); - if (!SE->isSCEVable(UI->getType())) - continue; - if (UI->getType() != IV->getType()) - continue; - if (!L->contains(UI)) - continue; - if (hasUsesOutsideLoop(UI, L)) - continue; +bool LoopReroll::DAGRootTracker::findRoots() { - if (const SCEVConstant *Diff = dyn_cast<SCEVConstant>(SE->getMinusSCEV( - SE->getSCEV(UI), SE->getSCEV(IV)))) { - uint64_t Idx = Diff->getValue()->getValue().getZExtValue(); - if (Idx > 0 && Idx < Scale) { - Roots[Idx-1].push_back(UI); - AllRoots.insert(UI); - } else if (Idx == Scale && Inc > 1) { - LoopIncs.push_back(UI); - } + const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(IV)); + Inc = cast<SCEVConstant>(RealIVSCEV->getOperand(1))-> + getValue()->getZExtValue(); + + assert(RootSets.empty() && "Unclean state!"); + if (Inc == 1) { + for (auto *IVU : IV->users()) { + if (isLoopIncrement(IVU, IV)) + LoopIncs.push_back(cast<Instruction>(IVU)); } + if (!findRootsRecursive(IV, SmallInstructionSet())) + return false; + LoopIncs.push_back(IV); + } else { + if (!findRootsBase(IV, SmallInstructionSet())) + return false; } - if (Roots[0].empty()) + // Ensure all sets have the same size. + if (RootSets.empty()) { + DEBUG(dbgs() << "LRR: Aborting because no root sets found!\n"); return false; - bool AllSame = true; - for (unsigned i = 1; i < Scale-1; ++i) - if (Roots[i].size() != Roots[0].size()) { - AllSame = false; - break; + } + for (auto &V : RootSets) { + if (V.Roots.empty() || V.Roots.size() != RootSets[0].Roots.size()) { + DEBUG(dbgs() + << "LRR: Aborting because not all root sets have the same size\n"); + return false; } + } - if (!AllSame) + // And ensure all loop iterations are consecutive. We rely on std::map + // providing ordered traversal. + for (auto &V : RootSets) { + const auto *ADR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(V.BaseInst)); + if (!ADR) + return false; + + // Consider a DAGRootSet with N-1 roots (so N different values including + // BaseInst). + // Define d = Roots[0] - BaseInst, which should be the same as + // Roots[I] - Roots[I-1] for all I in [1..N). + // Define D = BaseInst@J - BaseInst@J-1, where "@J" means the value at the + // loop iteration J. + // + // Now, For the loop iterations to be consecutive: + // D = d * N + + unsigned N = V.Roots.size() + 1; + const SCEV *StepSCEV = SE->getMinusSCEV(SE->getSCEV(V.Roots[0]), ADR); + const SCEV *ScaleSCEV = SE->getConstant(StepSCEV->getType(), N); + if (ADR->getStepRecurrence(*SE) != SE->getMulExpr(StepSCEV, ScaleSCEV)) { + DEBUG(dbgs() << "LRR: Aborting because iterations are not consecutive\n"); + return false; + } + } + Scale = RootSets[0].Roots.size() + 1; + + if (Scale > IL_MaxRerollIterations) { + DEBUG(dbgs() << "LRR: Aborting - too many iterations found. " + << "#Found=" << Scale << ", #Max=" << IL_MaxRerollIterations + << "\n"); return false; + } + + DEBUG(dbgs() << "LRR: Successfully found roots: Scale=" << Scale << "\n"); return true; } -// Validate the selected reductions. All iterations must have an isomorphic -// part of the reduction chain and, for non-associative reductions, the chain -// entries must appear in order. -bool LoopReroll::ReductionTracker::validateSelected() { - // For a non-associative reduction, the chain entries must appear in order. 
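That ordering requirement is the crux: for a non-associative (e.g. floating-point) reduction, rerolling must not change the order in which chain entries execute. A standalone sketch of the check, under the same rule the code enforces, that iteration tags along the chain may only repeat or step forward by one unless the operation is associative:

#include <cstdio>
#include <vector>

static bool chainInOrder(const std::vector<int> &IterTags, bool Associative) {
  if (Associative)
    return true; // integer adds etc. may appear in any order
  int Prev = 0;
  for (int Iter : IterTags) {
    if (Iter != Prev && Iter != Prev + 1)
      return false; // out-of-order entry: cannot reroll this reduction
    Prev = Iter;
  }
  return true;
}

int main() {
  std::printf("%d\n", chainInOrder({0, 0, 1, 1, 2}, false)); // 1
  std::printf("%d\n", chainInOrder({0, 2, 1}, false));       // 0
}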
- for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end(); - RI != RIE; ++RI) { - int i = *RI; - int PrevIter = 0, BaseCount = 0, Count = 0; - for (Instruction *J : PossibleReds[i]) { - // Note that all instructions in the chain must have been found because - // all instructions in the function must have been assigned to some - // iteration. - int Iter = PossibleRedIter[J]; - if (Iter != PrevIter && Iter != PrevIter + 1 && - !PossibleReds[i].getReducedValue()->isAssociative()) { - DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " << - J << "\n"); - return false; - } +bool LoopReroll::DAGRootTracker::collectUsedInstructions(SmallInstructionSet &PossibleRedSet) { + // Populate the MapVector with all instructions in the block, in order first, + // so we can iterate over the contents later in perfect order. + for (auto &I : *L->getHeader()) { + Uses[&I].resize(IL_End); + } - if (Iter != PrevIter) { - if (Count != BaseCount) { - DEBUG(dbgs() << "LRR: Iteration " << PrevIter << - " reduction use count " << Count << - " is not equal to the base use count " << - BaseCount << "\n"); - return false; - } + SmallInstructionSet Exclude; + for (auto &DRS : RootSets) { + Exclude.insert(DRS.Roots.begin(), DRS.Roots.end()); + Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end()); + Exclude.insert(DRS.BaseInst); + } + Exclude.insert(LoopIncs.begin(), LoopIncs.end()); - Count = 0; + for (auto &DRS : RootSets) { + DenseSet<Instruction*> VBase; + collectInLoopUserSet(DRS.BaseInst, Exclude, PossibleRedSet, VBase); + for (auto *I : VBase) { + Uses[I].set(0); + } + + unsigned Idx = 1; + for (auto *Root : DRS.Roots) { + DenseSet<Instruction*> V; + collectInLoopUserSet(Root, Exclude, PossibleRedSet, V); + + // While we're here, check the use sets are the same size. + if (V.size() != VBase.size()) { + DEBUG(dbgs() << "LRR: Aborting - use sets are different sizes\n"); + return false; } - ++Count; - if (Iter == 0) - ++BaseCount; + for (auto *I : V) { + Uses[I].set(Idx); + } + ++Idx; + } - PrevIter = Iter; + // Make sure our subsumed instructions are remembered too. + for (auto *I : DRS.SubsumedInsts) { + Uses[I].set(IL_All); } } - return true; -} - -// For all selected reductions, remove all parts except those in the first -// iteration (and the PHI). Replace outside uses of the reduced value with uses -// of the first-iteration reduced value (in other words, reroll the selected -// reductions). -void LoopReroll::ReductionTracker::replaceSelected() { - // Fixup reductions to refer to the last instruction associated with the - // first iteration (not the last). - for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end(); - RI != RIE; ++RI) { - int i = *RI; - int j = 0; - for (int e = PossibleReds[i].size(); j != e; ++j) - if (PossibleRedIter[PossibleReds[i][j]] != 0) { - --j; - break; - } + // Make sure the loop increments are also accounted for. - // Replace users with the new end-of-chain value. 
- SmallInstructionVector Users; - for (User *U : PossibleReds[i].getReducedValue()->users()) - Users.push_back(cast<Instruction>(U)); + Exclude.clear(); + for (auto &DRS : RootSets) { + Exclude.insert(DRS.Roots.begin(), DRS.Roots.end()); + Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end()); + Exclude.insert(DRS.BaseInst); + } - for (SmallInstructionVector::iterator J = Users.begin(), - JE = Users.end(); J != JE; ++J) - (*J)->replaceUsesOfWith(PossibleReds[i].getReducedValue(), - PossibleReds[i][j]); + DenseSet<Instruction*> V; + collectInLoopUserSet(LoopIncs, Exclude, PossibleRedSet, V); + for (auto *I : V) { + Uses[I].set(IL_All); } -} -// Reroll the provided loop with respect to the provided induction variable. -// Generally, we're looking for a loop like this: -// -// %iv = phi [ (preheader, ...), (body, %iv.next) ] -// f(%iv) -// %iv.1 = add %iv, 1 <-- a root increment -// f(%iv.1) -// %iv.2 = add %iv, 2 <-- a root increment -// f(%iv.2) -// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment -// f(%iv.scale_m_1) -// ... -// %iv.next = add %iv, scale -// %cmp = icmp(%iv, ...) -// br %cmp, header, exit -// -// Notably, we do not require that f(%iv), f(%iv.1), etc. be isolated groups of -// instructions. In other words, the instructions in f(%iv), f(%iv.1), etc. can -// be intermixed with eachother. The restriction imposed by this algorithm is -// that the relative order of the isomorphic instructions in f(%iv), f(%iv.1), -// etc. be the same. -// -// First, we collect the use set of %iv, excluding the other increment roots. -// This gives us f(%iv). Then we iterate over the loop instructions (scale-1) -// times, having collected the use set of f(%iv.(i+1)), during which we: -// - Ensure that the next unmatched instruction in f(%iv) is isomorphic to -// the next unmatched instruction in f(%iv.(i+1)). -// - Ensure that both matched instructions don't have any external users -// (with the exception of last-in-chain reduction instructions). -// - Track the (aliasing) write set, and other side effects, of all -// instructions that belong to future iterations that come before the matched -// instructions. If the matched instructions read from that write set, then -// f(%iv) or f(%iv.(i+1)) has some dependency on instructions in -// f(%iv.(j+1)) for some j > i, and we cannot reroll the loop. Similarly, -// if any of these future instructions had side effects (could not be -// speculatively executed), and so do the matched instructions, when we -// cannot reorder those side-effect-producing instructions, and rerolling -// fails. -// -// Finally, we make sure that all loop instructions are either loop increment -// roots, belong to simple latch code, parts of validated reductions, part of -// f(%iv) or part of some f(%iv.i). If all of that is true (and all reductions -// have been validated), then we reroll the loop. -bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, - const SCEV *IterCount, - ReductionTracker &Reductions) { - const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(IV)); - uint64_t Inc = cast<SCEVConstant>(RealIVSCEV->getOperand(1))-> - getValue()->getZExtValue(); - // The collection of loop increment instructions. - SmallInstructionVector LoopIncs; - uint64_t Scale = Inc; - - // The effective induction variable, IV, is normally also the real induction - // variable. 
When we're dealing with a loop like: - // for (int i = 0; i < 500; ++i) - // x[3*i] = ...; - // x[3*i+1] = ...; - // x[3*i+2] = ...; - // then the real IV is still i, but the effective IV is (3*i). - Instruction *RealIV = IV; - if (Inc == 1 && !findScaleFromMul(RealIV, Scale, IV, LoopIncs)) - return false; + return true; - assert(Scale <= MaxInc && "Scale is too large"); - assert(Scale > 1 && "Scale must be at least 2"); +} - // The set of increment instructions for each increment value. - SmallVector<SmallInstructionVector, 32> Roots(Scale-1); - SmallInstructionSet AllRoots; - if (!collectAllRoots(L, Inc, Scale, IV, Roots, AllRoots, LoopIncs)) - return false; +/// Get the next instruction in "In" that is a member of set Val. +/// Start searching from StartI, and do not return anything in Exclude. +/// If StartI is not given, start from In.begin(). +LoopReroll::DAGRootTracker::UsesTy::iterator +LoopReroll::DAGRootTracker::nextInstr(int Val, UsesTy &In, + const SmallInstructionSet &Exclude, + UsesTy::iterator *StartI) { + UsesTy::iterator I = StartI ? *StartI : In.begin(); + while (I != In.end() && (I->second.test(Val) == 0 || + Exclude.count(I->first) != 0)) + ++I; + return I; +} - DEBUG(dbgs() << "LRR: Found all root induction increments for: " << - *RealIV << "\n"); +bool LoopReroll::DAGRootTracker::isBaseInst(Instruction *I) { + for (auto &DRS : RootSets) { + if (DRS.BaseInst == I) + return true; + } + return false; +} - // An array of just the possible reductions for this scale factor. When we - // collect the set of all users of some root instructions, these reduction - // instructions are treated as 'final' (their uses are not considered). - // This is important because we don't want the root use set to search down - // the reduction chain. - SmallInstructionSet PossibleRedSet; - SmallInstructionSet PossibleRedLastSet, PossibleRedPHISet; - Reductions.restrictToScale(Scale, PossibleRedSet, PossibleRedPHISet, - PossibleRedLastSet); +bool LoopReroll::DAGRootTracker::isRootInst(Instruction *I) { + for (auto &DRS : RootSets) { + if (std::find(DRS.Roots.begin(), DRS.Roots.end(), I) != DRS.Roots.end()) + return true; + } + return false; +} +/// Return true if instruction I depends on any instruction between +/// Start and End. +bool LoopReroll::DAGRootTracker::instrDependsOn(Instruction *I, + UsesTy::iterator Start, + UsesTy::iterator End) { + for (auto *U : I->users()) { + for (auto It = Start; It != End; ++It) + if (U == It->first) + return true; + } + return false; +} + +bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) { // We now need to check for equivalence of the use graph of each root with // that of the primary induction variable (excluding the roots). Our goal // here is not to solve the full graph isomorphism problem, but rather to @@ -815,121 +1009,167 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, // is the same (although we will not make an assumption about how the // different iterations are intermixed). Note that while the order must be // the same, the instructions may not be in the same basic block. - SmallInstructionSet Exclude(AllRoots); - Exclude.insert(LoopIncs.begin(), LoopIncs.end()); - DenseSet<Instruction *> BaseUseSet; - collectInLoopUserSet(L, IV, Exclude, PossibleRedSet, BaseUseSet); + // An array of just the possible reductions for this scale factor. When we + // collect the set of all users of some root instructions, these reduction + // instructions are treated as 'final' (their uses are not considered). 
+ // This is important because we don't want the root use set to search down + // the reduction chain. + SmallInstructionSet PossibleRedSet; + SmallInstructionSet PossibleRedLastSet; + SmallInstructionSet PossibleRedPHISet; + Reductions.restrictToScale(Scale, PossibleRedSet, + PossibleRedPHISet, PossibleRedLastSet); - DenseSet<Instruction *> AllRootUses; - std::vector<DenseSet<Instruction *> > RootUseSets(Scale-1); + // Populate "Uses" with where each instruction is used. + if (!collectUsedInstructions(PossibleRedSet)) + return false; - bool MatchFailed = false; - for (unsigned i = 0; i < Scale-1 && !MatchFailed; ++i) { - DenseSet<Instruction *> &RootUseSet = RootUseSets[i]; - collectInLoopUserSet(L, Roots[i], SmallInstructionSet(), - PossibleRedSet, RootUseSet); + // Make sure we mark the reduction PHIs as used in all iterations. + for (auto *I : PossibleRedPHISet) { + Uses[I].set(IL_All); + } - DEBUG(dbgs() << "LRR: base use set size: " << BaseUseSet.size() << - " vs. iteration increment " << (i+1) << - " use set size: " << RootUseSet.size() << "\n"); + // Make sure all instructions in the loop are in one and only one + // set. + for (auto &KV : Uses) { + if (KV.second.count() != 1) { + DEBUG(dbgs() << "LRR: Aborting - instruction is not used in 1 iteration: " + << *KV.first << " (#uses=" << KV.second.count() << ")\n"); + return false; + } + } - if (BaseUseSet.size() != RootUseSet.size()) { - MatchFailed = true; - break; + DEBUG( + for (auto &KV : Uses) { + dbgs() << "LRR: " << KV.second.find_first() << "\t" << *KV.first << "\n"; } + ); + for (unsigned Iter = 1; Iter < Scale; ++Iter) { // In addition to regular aliasing information, we need to look for // instructions from later (future) iterations that have side effects // preventing us from reordering them past other instructions with side // effects. bool FutureSideEffects = false; AliasSetTracker AST(*AA); - // The map between instructions in f(%iv.(i+1)) and f(%iv). DenseMap<Value *, Value *> BaseMap; - assert(L->getNumBlocks() == 1 && "Cannot handle multi-block loops"); - for (BasicBlock::iterator J1 = Header->begin(), J2 = Header->begin(), - JE = Header->end(); J1 != JE && !MatchFailed; ++J1) { - if (cast<Instruction>(J1) == RealIV) - continue; - if (cast<Instruction>(J1) == IV) - continue; - if (!BaseUseSet.count(J1)) - continue; - if (PossibleRedPHISet.count(J1)) // Skip reduction PHIs. - continue; - - while (J2 != JE && (!RootUseSet.count(J2) || - std::find(Roots[i].begin(), Roots[i].end(), J2) != - Roots[i].end())) { - // As we iterate through the instructions, instructions that don't - // belong to previous iterations (or the base case), must belong to - // future iterations. We want to track the alias set of writes from - // previous iterations. - if (!isa<PHINode>(J2) && !BaseUseSet.count(J2) && - !AllRootUses.count(J2)) { - if (J2->mayWriteToMemory()) - AST.add(J2); - - // Note: This is specifically guarded by a check on isa<PHINode>, - // which while a valid (somewhat arbitrary) micro-optimization, is - // needed because otherwise isSafeToSpeculativelyExecute returns - // false on PHI nodes. - if (!isSimpleLoadStore(J2) && !isSafeToSpeculativelyExecute(J2, DL)) - FutureSideEffects = true; + // Compare iteration Iter to the base. 
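The comparison loop that follows walks two cursors through the ordered Uses map, one over instructions tagged iteration 0 (the base) and one over instructions tagged iteration Iter, demanding that the paired instructions perform the same operation. A stripped-down standalone model of that two-cursor walk, with strings standing in for opcodes and none of the aliasing or reduction handling:

#include <cstdio>
#include <string>
#include <vector>

struct Entry { std::string Op; int Iter; };

static size_t nextInstr(const std::vector<Entry> &In, int Iter, size_t I) {
  while (I < In.size() && In[I].Iter != Iter)
    ++I;
  return I;
}

int main() {
  std::vector<Entry> Uses = {{"add", 0}, {"add", 1}, {"store", 0},
                             {"store", 1}};
  size_t B = nextInstr(Uses, 0, 0), R = nextInstr(Uses, 1, 0);
  bool Matched = true;
  while (B < Uses.size() && R < Uses.size()) {
    if (Uses[B].Op != Uses[R].Op) { Matched = false; break; }
    B = nextInstr(Uses, 0, B + 1); // advance both cursors in lockstep
    R = nextInstr(Uses, 1, R + 1);
  }
  std::printf("iterations isomorphic: %d\n", Matched);
}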
+ SmallInstructionSet Visited; + auto BaseIt = nextInstr(0, Uses, Visited); + auto RootIt = nextInstr(Iter, Uses, Visited); + auto LastRootIt = Uses.begin(); + + while (BaseIt != Uses.end() && RootIt != Uses.end()) { + Instruction *BaseInst = BaseIt->first; + Instruction *RootInst = RootIt->first; + + // Skip over the IV or root instructions; only match their users. + bool Continue = false; + if (isBaseInst(BaseInst)) { + Visited.insert(BaseInst); + BaseIt = nextInstr(0, Uses, Visited); + Continue = true; + } + if (isRootInst(RootInst)) { + LastRootIt = RootIt; + Visited.insert(RootInst); + RootIt = nextInstr(Iter, Uses, Visited); + Continue = true; + } + if (Continue) continue; + + if (!BaseInst->isSameOperationAs(RootInst)) { + // Last chance saloon. We don't try and solve the full isomorphism + // problem, but try and at least catch the case where two instructions + // *of different types* are round the wrong way. We won't be able to + // efficiently tell, given two ADD instructions, which way around we + // should match them, but given an ADD and a SUB, we can at least infer + // which one is which. + // + // This should allow us to deal with a greater subset of the isomorphism + // problem. It does however change a linear algorithm into a quadratic + // one, so limit the number of probes we do. + auto TryIt = RootIt; + unsigned N = NumToleratedFailedMatches; + while (TryIt != Uses.end() && + !BaseInst->isSameOperationAs(TryIt->first) && + N--) { + ++TryIt; + TryIt = nextInstr(Iter, Uses, Visited, &TryIt); } - ++J2; + if (TryIt == Uses.end() || TryIt == RootIt || + instrDependsOn(TryIt->first, RootIt, TryIt)) { + DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst << + " vs. " << *RootInst << "\n"); + return false; + } + + RootIt = TryIt; + RootInst = TryIt->first; } - if (!J1->isSameOperationAs(J2)) { - DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << - " vs. " << *J2 << "\n"); - MatchFailed = true; - break; + // All instructions between the last root and this root + // may belong to some other iteration. If they belong to a + // future iteration, then they're dangerous to alias with. + // + // Note that because we allow a limited amount of flexibility in the order + // that we visit nodes, LastRootIt might be *before* RootIt, in which + // case we've already checked this set of instructions so we shouldn't + // do anything. + for (; LastRootIt < RootIt; ++LastRootIt) { + Instruction *I = LastRootIt->first; + if (LastRootIt->second.find_first() < (int)Iter) + continue; + if (I->mayWriteToMemory()) + AST.add(I); + // Note: This is specifically guarded by a check on isa<PHINode>, + // which while a valid (somewhat arbitrary) micro-optimization, is + // needed because otherwise isSafeToSpeculativelyExecute returns + // false on PHI nodes. + if (!isa<PHINode>(I) && !isSimpleLoadStore(I) && + !isSafeToSpeculativelyExecute(I, DL)) + // Intervening instructions cause side effects. + FutureSideEffects = true; } // Make sure that this instruction, which is in the use set of this // root instruction, does not also belong to the base set or the set of - // some previous root instruction. - if (BaseUseSet.count(J2) || AllRootUses.count(J2)) { - DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << - " vs. " << *J2 << " (prev. case overlap)\n"); - MatchFailed = true; - break; + // some other root instruction. + if (RootIt->second.count() > 1) { + DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst << + " vs. " << *RootInst << " (prev. 
case overlap)\n"); + return false; } // Make sure that we don't alias with any instruction in the alias set // tracker. If we do, then we depend on a future iteration, and we // can't reroll. - if (J2->mayReadFromMemory()) { - for (AliasSetTracker::iterator K = AST.begin(), KE = AST.end(); - K != KE && !MatchFailed; ++K) { - if (K->aliasesUnknownInst(J2, *AA)) { - DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << - " vs. " << *J2 << " (depends on future store)\n"); - MatchFailed = true; - break; + if (RootInst->mayReadFromMemory()) + for (auto &K : AST) { + if (K.aliasesUnknownInst(RootInst, *AA)) { + DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst << + " vs. " << *RootInst << " (depends on future store)\n"); + return false; } } - } // If we've past an instruction from a future iteration that may have // side effects, and this instruction might also, then we can't reorder // them, and this matching fails. As an exception, we allow the alias // set tracker to handle regular (simple) load/store dependencies. if (FutureSideEffects && - ((!isSimpleLoadStore(J1) && - !isSafeToSpeculativelyExecute(J1, DL)) || - (!isSimpleLoadStore(J2) && - !isSafeToSpeculativelyExecute(J2, DL)))) { - DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << - " vs. " << *J2 << + ((!isSimpleLoadStore(BaseInst) && + !isSafeToSpeculativelyExecute(BaseInst, DL)) || + (!isSimpleLoadStore(RootInst) && + !isSafeToSpeculativelyExecute(RootInst, DL)))) { + DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst << + " vs. " << *RootInst << " (side effects prevent reordering)\n"); - MatchFailed = true; - break; + return false; } // For instructions that are part of a reduction, if the operation is @@ -942,42 +1182,46 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, // x += a[i]; x += b[i]; // x += a[i+1]; x += b[i+1]; // x += b[i+2]; x += a[i+2]; - bool InReduction = Reductions.isPairInSame(J1, J2); + bool InReduction = Reductions.isPairInSame(BaseInst, RootInst); - if (!(InReduction && J1->isAssociative())) { + if (!(InReduction && BaseInst->isAssociative())) { bool Swapped = false, SomeOpMatched = false; - for (unsigned j = 0; j < J1->getNumOperands() && !MatchFailed; ++j) { - Value *Op2 = J2->getOperand(j); + for (unsigned j = 0; j < BaseInst->getNumOperands(); ++j) { + Value *Op2 = RootInst->getOperand(j); // If this is part of a reduction (and the operation is not // associatve), then we match all operands, but not those that are // part of the reduction. if (InReduction) if (Instruction *Op2I = dyn_cast<Instruction>(Op2)) - if (Reductions.isPairInSame(J2, Op2I)) + if (Reductions.isPairInSame(RootInst, Op2I)) continue; DenseMap<Value *, Value *>::iterator BMI = BaseMap.find(Op2); - if (BMI != BaseMap.end()) + if (BMI != BaseMap.end()) { Op2 = BMI->second; - else if (std::find(Roots[i].begin(), Roots[i].end(), - (Instruction*) Op2) != Roots[i].end()) - Op2 = IV; + } else { + for (auto &DRS : RootSets) { + if (DRS.Roots[Iter-1] == (Instruction*) Op2) { + Op2 = DRS.BaseInst; + break; + } + } + } - if (J1->getOperand(Swapped ? unsigned(!j) : j) != Op2) { + if (BaseInst->getOperand(Swapped ? unsigned(!j) : j) != Op2) { // If we've not already decided to swap the matched operands, and // we've not already matched our first operand (note that we could // have skipped matching the first operand because it is part of a // reduction above), and the instruction is commutative, then try // the swapped match. 
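A standalone model of the swapped-match fallback described just above: a commutative operation gets one chance to match with its operands exchanged, the swap is decided on the first mismatch and then applied consistently for the remaining operands. Here the j == 0 test stands in for the code's "no operand matched yet" condition:

#include <cstdio>
#include <string>

static bool operandsMatch(const std::string B[2], const std::string R[2],
                          bool Commutative) {
  bool Swapped = false;
  for (unsigned j = 0; j < 2; ++j) {
    const std::string &Op2 = R[j];
    if (B[Swapped ? 1 - j : j] != Op2) {
      if (!Swapped && Commutative && j == 0 && B[1 - j] == Op2)
        Swapped = true; // commit to the swapped pairing
      else
        return false;
    }
  }
  return true;
}

int main() {
  std::string Base[2] = {"x", "y"}, Root[2] = {"y", "x"};
  std::printf("%d\n", operandsMatch(Base, Root, true));  // 1
  std::printf("%d\n", operandsMatch(Base, Root, false)); // 0
}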
- if (!Swapped && J1->isCommutative() && !SomeOpMatched && - J1->getOperand(!j) == Op2) { + if (!Swapped && BaseInst->isCommutative() && !SomeOpMatched && + BaseInst->getOperand(!j) == Op2) { Swapped = true; } else { - DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << - " vs. " << *J2 << " (operand " << j << ")\n"); - MatchFailed = true; - break; + DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst + << " vs. " << *RootInst << " (operand " << j << ")\n"); + return false; } } @@ -985,81 +1229,41 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, } } - if ((!PossibleRedLastSet.count(J1) && hasUsesOutsideLoop(J1, L)) || - (!PossibleRedLastSet.count(J2) && hasUsesOutsideLoop(J2, L))) { - DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << - " vs. " << *J2 << " (uses outside loop)\n"); - MatchFailed = true; - break; + if ((!PossibleRedLastSet.count(BaseInst) && + hasUsesOutsideLoop(BaseInst, L)) || + (!PossibleRedLastSet.count(RootInst) && + hasUsesOutsideLoop(RootInst, L))) { + DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst << + " vs. " << *RootInst << " (uses outside loop)\n"); + return false; } - if (!MatchFailed) - BaseMap.insert(std::pair<Value *, Value *>(J2, J1)); - - AllRootUses.insert(J2); - Reductions.recordPair(J1, J2, i+1); + Reductions.recordPair(BaseInst, RootInst, Iter); + BaseMap.insert(std::make_pair(RootInst, BaseInst)); - ++J2; + LastRootIt = RootIt; + Visited.insert(BaseInst); + Visited.insert(RootInst); + BaseIt = nextInstr(0, Uses, Visited); + RootIt = nextInstr(Iter, Uses, Visited); } + assert (BaseIt == Uses.end() && RootIt == Uses.end() && + "Mismatched set sizes!"); } - if (MatchFailed) - return false; - DEBUG(dbgs() << "LRR: Matched all iteration increments for " << - *RealIV << "\n"); - - DenseSet<Instruction *> LoopIncUseSet; - collectInLoopUserSet(L, LoopIncs, SmallInstructionSet(), - SmallInstructionSet(), LoopIncUseSet); - DEBUG(dbgs() << "LRR: Loop increment set size: " << - LoopIncUseSet.size() << "\n"); - - // Make sure that all instructions in the loop have been included in some - // use set. - for (BasicBlock::iterator J = Header->begin(), JE = Header->end(); - J != JE; ++J) { - if (isa<DbgInfoIntrinsic>(J)) - continue; - if (cast<Instruction>(J) == RealIV) - continue; - if (cast<Instruction>(J) == IV) - continue; - if (BaseUseSet.count(J) || AllRootUses.count(J) || - (LoopIncUseSet.count(J) && (J->isTerminator() || - isSafeToSpeculativelyExecute(J, DL)))) - continue; - - if (AllRoots.count(J)) - continue; - - if (Reductions.isSelectedPHI(J)) - continue; + *IV << "\n"); - DEBUG(dbgs() << "LRR: aborting reroll based on " << *RealIV << - " unprocessed instruction found: " << *J << "\n"); - MatchFailed = true; - break; - } - - if (MatchFailed) - return false; - - DEBUG(dbgs() << "LRR: all instructions processed from " << - *RealIV << "\n"); - - if (!Reductions.validateSelected()) - return false; - - // At this point, we've validated the rerolling, and we're committed to - // making changes! - - Reductions.replaceSelected(); + return true; +} +void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) { + BasicBlock *Header = L->getHeader(); // Remove instructions associated with non-base iterations. 
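 // (For example, assuming Scale == 3: an instruction whose first use-set
 // bit is 1 or 2 belongs to the second or third unrolled copy and is
 // erased below; bit 0 marks the base iteration, and IL_All marks the
 // loop-increment code shared by all iterations, both of which are kept.)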
for (BasicBlock::reverse_iterator J = Header->rbegin(); J != Header->rend();) { - if (AllRootUses.count(&*J)) { + unsigned I = Uses[&*J].find_first(); + if (I > 0 && I < IL_All) { Instruction *D = &*J; DEBUG(dbgs() << "LRR: removing: " << *D << "\n"); D->eraseFromParent(); @@ -1069,57 +1273,198 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, ++J; } - // Insert the new induction variable. - const SCEV *Start = RealIVSCEV->getStart(); - if (Inc == 1) - Start = SE->getMulExpr(Start, - SE->getConstant(Start->getType(), Scale)); - const SCEVAddRecExpr *H = - cast<SCEVAddRecExpr>(SE->getAddRecExpr(Start, - SE->getConstant(RealIVSCEV->getType(), 1), - L, SCEV::FlagAnyWrap)); - { // Limit the lifetime of SCEVExpander. - SCEVExpander Expander(*SE, "reroll"); - Value *NewIV = Expander.expandCodeFor(H, IV->getType(), Header->begin()); - - for (DenseSet<Instruction *>::iterator J = BaseUseSet.begin(), - JE = BaseUseSet.end(); J != JE; ++J) - (*J)->replaceUsesOfWith(IV, NewIV); - - if (BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator())) { - if (LoopIncUseSet.count(BI)) { - const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE); - if (Inc == 1) - ICSCEV = - SE->getMulExpr(ICSCEV, SE->getConstant(ICSCEV->getType(), Scale)); - // Iteration count SCEV minus 1 - const SCEV *ICMinus1SCEV = - SE->getMinusSCEV(ICSCEV, SE->getConstant(ICSCEV->getType(), 1)); - - Value *ICMinus1; // Iteration count minus 1 - if (isa<SCEVConstant>(ICMinus1SCEV)) { - ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(), BI); - } else { - BasicBlock *Preheader = L->getLoopPreheader(); - if (!Preheader) - Preheader = InsertPreheaderForLoop(L, this); - - ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(), - Preheader->getTerminator()); - } + // We need to create a new induction variable for each different BaseInst. + for (auto &DRS : RootSets) { + // Insert the new induction variable. + const SCEVAddRecExpr *RealIVSCEV = + cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst)); + const SCEV *Start = RealIVSCEV->getStart(); + const SCEVAddRecExpr *H = cast<SCEVAddRecExpr> + (SE->getAddRecExpr(Start, + SE->getConstant(RealIVSCEV->getType(), 1), + L, SCEV::FlagAnyWrap)); + { // Limit the lifetime of SCEVExpander. + SCEVExpander Expander(*SE, "reroll"); + Value *NewIV = Expander.expandCodeFor(H, IV->getType(), Header->begin()); + + for (auto &KV : Uses) { + if (KV.second.find_first() == 0) + KV.first->replaceUsesOfWith(DRS.BaseInst, NewIV); + } - Value *Cond = + if (BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator())) { + // FIXME: Why do we need this check? 
+ if (Uses[BI].find_first() == IL_All) { + const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE); + + // Iteration count SCEV minus 1 + const SCEV *ICMinus1SCEV = + SE->getMinusSCEV(ICSCEV, SE->getConstant(ICSCEV->getType(), 1)); + + Value *ICMinus1; // Iteration count minus 1 + if (isa<SCEVConstant>(ICMinus1SCEV)) { + ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(), BI); + } else { + BasicBlock *Preheader = L->getLoopPreheader(); + if (!Preheader) + Preheader = InsertPreheaderForLoop(L, Parent); + + ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(), + Preheader->getTerminator()); + } + + Value *Cond = new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, ICMinus1, "exitcond"); - BI->setCondition(Cond); + BI->setCondition(Cond); - if (BI->getSuccessor(1) != Header) - BI->swapSuccessors(); + if (BI->getSuccessor(1) != Header) + BI->swapSuccessors(); + } } } } SimplifyInstructionsInBlock(Header, DL, TLI); DeleteDeadPHIs(Header, TLI); +} + +// Validate the selected reductions. All iterations must have an isomorphic +// part of the reduction chain and, for non-associative reductions, the chain +// entries must appear in order. +bool LoopReroll::ReductionTracker::validateSelected() { + // For a non-associative reduction, the chain entries must appear in order. + for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end(); + RI != RIE; ++RI) { + int i = *RI; + int PrevIter = 0, BaseCount = 0, Count = 0; + for (Instruction *J : PossibleReds[i]) { + // Note that all instructions in the chain must have been found because + // all instructions in the function must have been assigned to some + // iteration. + int Iter = PossibleRedIter[J]; + if (Iter != PrevIter && Iter != PrevIter + 1 && + !PossibleReds[i].getReducedValue()->isAssociative()) { + DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " << + J << "\n"); + return false; + } + + if (Iter != PrevIter) { + if (Count != BaseCount) { + DEBUG(dbgs() << "LRR: Iteration " << PrevIter << + " reduction use count " << Count << + " is not equal to the base use count " << + BaseCount << "\n"); + return false; + } + + Count = 0; + } + + ++Count; + if (Iter == 0) + ++BaseCount; + + PrevIter = Iter; + } + } + + return true; +} + +// For all selected reductions, remove all parts except those in the first +// iteration (and the PHI). Replace outside uses of the reduced value with uses +// of the first-iteration reduced value (in other words, reroll the selected +// reductions). +void LoopReroll::ReductionTracker::replaceSelected() { + // Fixup reductions to refer to the last instruction associated with the + // first iteration (not the last). + for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end(); + RI != RIE; ++RI) { + int i = *RI; + int j = 0; + for (int e = PossibleReds[i].size(); j != e; ++j) + if (PossibleRedIter[PossibleReds[i][j]] != 0) { + --j; + break; + } + + // Replace users with the new end-of-chain value. + SmallInstructionVector Users; + for (User *U : PossibleReds[i].getReducedValue()->users()) { + Users.push_back(cast<Instruction>(U)); + } + + for (SmallInstructionVector::iterator J = Users.begin(), + JE = Users.end(); J != JE; ++J) + (*J)->replaceUsesOfWith(PossibleReds[i].getReducedValue(), + PossibleReds[i][j]); + } +} + +// Reroll the provided loop with respect to the provided induction variable. 
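+// As a source-level sketch (illustrative only; the array names are
+// hypothetical), rerolling turns
+//
+//   for (int i = 0; i < 3000; i += 3) {
+//     a[i]   += b[i];
+//     a[i+1] += b[i+1];
+//     a[i+2] += b[i+2];
+//   }
+//
+// back into
+//
+//   for (int i = 0; i < 3000; ++i)
+//     a[i] += b[i];
+//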
+// Generally, we're looking for a loop like this:
+//
+// %iv = phi [ (preheader, ...), (body, %iv.next) ]
+// f(%iv)
+// %iv.1 = add %iv, 1 <-- a root increment
+// f(%iv.1)
+// %iv.2 = add %iv, 2 <-- a root increment
+// f(%iv.2)
+// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment
+// f(%iv.scale_m_1)
+// ...
+// %iv.next = add %iv, scale
+// %cmp = icmp(%iv, ...)
+// br %cmp, header, exit
+//
+// Notably, we do not require that f(%iv), f(%iv.1), etc. be isolated groups of
+// instructions. In other words, the instructions in f(%iv), f(%iv.1), etc. can
+// be intermixed with each other. The restriction imposed by this algorithm is
+// that the relative order of the isomorphic instructions in f(%iv), f(%iv.1),
+// etc. be the same.
+//
+// First, we collect the use set of %iv, excluding the other increment roots.
+// This gives us f(%iv). Then we iterate over the loop instructions (scale-1)
+// times, having collected the use set of f(%iv.(i+1)), during which we:
+// - Ensure that the next unmatched instruction in f(%iv) is isomorphic to
+// the next unmatched instruction in f(%iv.(i+1)).
+// - Ensure that both matched instructions don't have any external users
+// (with the exception of last-in-chain reduction instructions).
+// - Track the (aliasing) write set, and other side effects, of all
+// instructions that belong to future iterations that come before the matched
+// instructions. If the matched instructions read from that write set, then
+// f(%iv) or f(%iv.(i+1)) has some dependency on instructions in
+// f(%iv.(j+1)) for some j > i, and we cannot reroll the loop. Similarly,
+// if any of these future instructions had side effects (could not be
+// speculatively executed), and so do the matched instructions, then we
+// cannot reorder those side-effect-producing instructions, and rerolling
+// fails.
+//
+// Finally, we make sure that all loop instructions are either loop increment
+// roots, simple latch code, parts of validated reductions, parts of
+// f(%iv), or parts of some f(%iv.i). If all of that is true (and all reductions
+// have been validated), then we reroll the loop.
+bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
+ const SCEV *IterCount,
+ ReductionTracker &Reductions) {
+ DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI, DL);
+
+ if (!DAGRoots.findRoots())
+ return false;
+ DEBUG(dbgs() << "LRR: Found all root induction increments for: " <<
+ *IV << "\n");
+
+ if (!DAGRoots.validate(Reductions))
+ return false;
+ if (!Reductions.validateSelected())
+ return false;
+ // At this point, we've validated the rerolling, and we're committed to
+ // making changes!
+
+ Reductions.replaceSelected();
+ DAGRoots.replace(IterCount);
+
+ ++NumRerolledLoops;
 return true;
 }
@@ -1129,9 +1474,9 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) {
 return false;
 
 AA = &getAnalysis<AliasAnalysis>();
- LI = &getAnalysis<LoopInfo>();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
 SE = &getAnalysis<ScalarEvolution>();
- TLI = &getAnalysis<TargetLibraryInfo>();
+ TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
 DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
 DL = DLP ? 
&DLP->getDataLayout() : nullptr; DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp index afd2eca..4d12349 100644 --- a/lib/Transforms/Scalar/LoopRotation.cpp +++ b/lib/Transforms/Scalar/LoopRotation.cpp @@ -13,7 +13,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AssumptionTracker.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopPass.h" @@ -54,16 +54,16 @@ namespace { // LCSSA form makes instruction renaming easier. void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionTracker>(); + AU.addRequired<AssumptionCacheTracker>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfo>(); - AU.addPreserved<LoopInfo>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); AU.addRequiredID(LoopSimplifyID); AU.addPreservedID(LoopSimplifyID); AU.addRequiredID(LCSSAID); AU.addPreservedID(LCSSAID); AU.addPreserved<ScalarEvolution>(); - AU.addRequired<TargetTransformInfo>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); } bool runOnLoop(Loop *L, LPPassManager &LPM) override; @@ -74,15 +74,16 @@ namespace { unsigned MaxHeaderSize; LoopInfo *LI; const TargetTransformInfo *TTI; - AssumptionTracker *AT; + AssumptionCache *AC; + DominatorTree *DT; }; } char LoopRotate::ID = 0; INITIALIZE_PASS_BEGIN(LoopRotate, "loop-rotate", "Rotate Loops", false, false) -INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) -INITIALIZE_PASS_DEPENDENCY(AssumptionTracker) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_END(LoopRotate, "loop-rotate", "Rotate Loops", false, false) @@ -100,9 +101,13 @@ bool LoopRotate::runOnLoop(Loop *L, LPPassManager &LPM) { // Save the loop metadata. MDNode *LoopMD = L->getLoopID(); - LI = &getAnalysis<LoopInfo>(); - TTI = &getAnalysis<TargetTransformInfo>(); - AT = &getAnalysis<AssumptionTracker>(); + Function &F = *L->getHeader()->getParent(); + + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + DT = DTWP ? &DTWP->getDomTree() : nullptr; // Simplify the loop latch before attempting to rotate the header // upward. Rotation may not be needed if the loop tail can be folded into the @@ -225,20 +230,17 @@ static bool shouldSpeculateInstrs(BasicBlock::iterator Begin, case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: { - Value *IVOpnd = nullptr; - if (isa<ConstantInt>(I->getOperand(0))) - IVOpnd = I->getOperand(1); - - if (isa<ConstantInt>(I->getOperand(1))) { - if (IVOpnd) - return false; - - IVOpnd = I->getOperand(0); - } + Value *IVOpnd = !isa<Constant>(I->getOperand(0)) + ? I->getOperand(0) + : !isa<Constant>(I->getOperand(1)) + ? I->getOperand(1) + : nullptr; + if (!IVOpnd) + return false; // If increment operand is used outside of the loop, this speculation // could cause extra live range interference. 
- if (MultiExitLoop && IVOpnd) {
+ if (MultiExitLoop) {
 for (User *UseI : IVOpnd->users()) {
 auto *UserInst = cast<Instruction>(UseI);
 if (!L->contains(UserInst))
@@ -307,9 +309,8 @@ bool LoopRotate::simplifyLoopLatch(Loop *L) {
 // Nuke the Latch block.
 assert(Latch->empty() && "unable to evacuate Latch");
 LI->removeBlock(Latch);
- if (DominatorTreeWrapperPass *DTWP =
- getAnalysisIfAvailable<DominatorTreeWrapperPass>())
- DTWP->getDomTree().eraseNode(Latch);
+ if (DT)
+ DT->eraseNode(Latch);
 Latch->eraseFromParent();
 return true;
 }
@@ -356,7 +357,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
 // duplicate blocks inside it.
 {
 SmallPtrSet<const Value *, 32> EphValues;
- CodeMetrics::collectEphemeralValues(L, AT, EphValues);
+ CodeMetrics::collectEphemeralValues(L, AC, EphValues);
 
 CodeMetrics Metrics;
 Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues);
@@ -441,7 +442,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
 // With the operands remapped, see if the instruction constant folds or is
 // otherwise simplifiable. This commonly occurs because the entry from PHI
 // nodes allows icmps and other instructions to fold.
- // FIXME: Provide DL, TLI, DT, AT to SimplifyInstruction.
+ // FIXME: Provide DL, TLI, DT, AC to SimplifyInstruction.
 Value *V = SimplifyInstruction(C);
 if (V && LI->replacementPreservesLCSSAForm(C, V)) {
 // If so, then delete the temporary instruction and stick the folded value
@@ -494,31 +495,31 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
 // The conditional branch can't be folded, handle the general case.
 // Update DominatorTree to reflect the CFG change we just made. Then split
 // edges as necessary to preserve LoopSimplify form.
- if (DominatorTreeWrapperPass *DTWP =
- getAnalysisIfAvailable<DominatorTreeWrapperPass>()) {
- DominatorTree &DT = DTWP->getDomTree();
+ if (DT) {
 // Everything that was dominated by the old loop header is now dominated
 // by the original loop preheader. Conceptually the header was merged
 // into the preheader, even though we reuse the actual block as a new
 // loop latch.
- DomTreeNode *OrigHeaderNode = DT.getNode(OrigHeader);
+ DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader);
 SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(),
 OrigHeaderNode->end());
- DomTreeNode *OrigPreheaderNode = DT.getNode(OrigPreheader);
+ DomTreeNode *OrigPreheaderNode = DT->getNode(OrigPreheader);
 for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I)
- DT.changeImmediateDominator(HeaderChildren[I], OrigPreheaderNode);
+ DT->changeImmediateDominator(HeaderChildren[I], OrigPreheaderNode);
 
- assert(DT.getNode(Exit)->getIDom() == OrigPreheaderNode);
- assert(DT.getNode(NewHeader)->getIDom() == OrigPreheaderNode);
+ assert(DT->getNode(Exit)->getIDom() == OrigPreheaderNode);
+ assert(DT->getNode(NewHeader)->getIDom() == OrigPreheaderNode);
 
 // Update OrigHeader to be dominated by the new header block.
- DT.changeImmediateDominator(OrigHeader, OrigLatch);
+ DT->changeImmediateDominator(OrigHeader, OrigLatch);
 }
 
 // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and
 // thus is not a preheader anymore.
 // Split the edge to form a real preheader. 
- BasicBlock *NewPH = SplitCriticalEdge(OrigPreheader, NewHeader, this); + BasicBlock *NewPH = SplitCriticalEdge( + OrigPreheader, NewHeader, + CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA()); NewPH->setName(NewHeader->getName() + ".lr.ph"); // Preserve canonical loop form, which means that 'Exit' should have only @@ -534,8 +535,11 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { Loop *PredLoop = LI->getLoopFor(*PI); if (!PredLoop || PredLoop->contains(Exit)) continue; + if (isa<IndirectBrInst>((*PI)->getTerminator())) + continue; SplitLatchEdge |= L->getLoopLatch() == *PI; - BasicBlock *ExitSplit = SplitCriticalEdge(*PI, Exit, this); + BasicBlock *ExitSplit = SplitCriticalEdge( + *PI, Exit, CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA()); ExitSplit->moveBefore(Exit); } assert(SplitLatchEdge && @@ -549,17 +553,15 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { PHBI->eraseFromParent(); // With our CFG finalized, update DomTree if it is available. - if (DominatorTreeWrapperPass *DTWP = - getAnalysisIfAvailable<DominatorTreeWrapperPass>()) { - DominatorTree &DT = DTWP->getDomTree(); + if (DT) { // Update OrigHeader to be dominated by the new header block. - DT.changeImmediateDominator(NewHeader, OrigPreheader); - DT.changeImmediateDominator(OrigHeader, OrigLatch); + DT->changeImmediateDominator(NewHeader, OrigPreheader); + DT->changeImmediateDominator(OrigHeader, OrigLatch); // Brute force incremental dominator tree update. Call // findNearestCommonDominator on all CFG predecessors of each child of the // original header. - DomTreeNode *OrigHeaderNode = DT.getNode(OrigHeader); + DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader); SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(), OrigHeaderNode->end()); bool Changed; @@ -572,11 +574,11 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { pred_iterator PI = pred_begin(BB); BasicBlock *NearestDom = *PI; for (pred_iterator PE = pred_end(BB); PI != PE; ++PI) - NearestDom = DT.findNearestCommonDominator(NearestDom, *PI); + NearestDom = DT->findNearestCommonDominator(NearestDom, *PI); // Remember if this changes the DomTree. if (Node->getIDom()->getBlock() != NearestDom) { - DT.changeImmediateDominator(BB, NearestDom); + DT->changeImmediateDominator(BB, NearestDom); Changed = true; } } @@ -594,7 +596,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // the OrigHeader block into OrigLatch. This will succeed if they are // connected by an unconditional branch. This is just a cleanup so the // emitted code isn't too gross in this common case. - MergeBlockIntoPredecessor(OrigHeader, this); + MergeBlockIntoPredecessor(OrigHeader, DT, LI); DEBUG(dbgs() << "LoopRotation: into "; L->dump()); diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 7b60373..318065e 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -1327,11 +1327,9 @@ void LSRUse::DeleteFormula(Formula &F) { /// RecomputeRegs - Recompute the Regs field, and update RegUses. void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) { // Now that we've filtered out some formulae, recompute the Regs set. 
- SmallPtrSet<const SCEV *, 4> OldRegs = Regs; + SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs); Regs.clear(); - for (SmallVectorImpl<Formula>::const_iterator I = Formulae.begin(), - E = Formulae.end(); I != E; ++I) { - const Formula &F = *I; + for (const Formula &F : Formulae) { if (F.ScaledReg) Regs.insert(F.ScaledReg); Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end()); } @@ -4728,12 +4726,14 @@ void LSRInstance::RewriteForPHI(PHINode *PN, // Split the critical edge. BasicBlock *NewBB = nullptr; if (!Parent->isLandingPad()) { - NewBB = SplitCriticalEdge(BB, Parent, P, - /*MergeIdenticalEdges=*/true, - /*DontDeleteUselessPhis=*/true); + NewBB = SplitCriticalEdge(BB, Parent, + CriticalEdgeSplittingOptions(&DT, &LI) + .setMergeIdenticalEdges() + .setDontDeleteUselessPHIs()); } else { SmallVector<BasicBlock*, 2> NewBBs; - SplitLandingPadPredecessors(Parent, BB, "", "", P, NewBBs); + SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, + /*AliasAnalysis*/ nullptr, &DT, &LI); NewBB = NewBBs[0]; } // If NewBB==NULL, then SplitCriticalEdge refused to split because all @@ -4863,9 +4863,10 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, LSRInstance::LSRInstance(Loop *L, Pass *P) : IU(P->getAnalysis<IVUsers>()), SE(P->getAnalysis<ScalarEvolution>()), DT(P->getAnalysis<DominatorTreeWrapperPass>().getDomTree()), - LI(P->getAnalysis<LoopInfo>()), - TTI(P->getAnalysis<TargetTransformInfo>()), L(L), Changed(false), - IVIncInsertPos(nullptr) { + LI(P->getAnalysis<LoopInfoWrapperPass>().getLoopInfo()), + TTI(P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI( + *L->getHeader()->getParent())), + L(L), Changed(false), IVIncInsertPos(nullptr) { // If LoopSimplify form is not available, stay out of trouble. if (!L->isLoopSimplifyForm()) return; @@ -5041,11 +5042,11 @@ private: char LoopStrengthReduce::ID = 0; INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce", "Loop Strength Reduction", false, false) -INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(IVUsers) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce", "Loop Strength Reduction", false, false) @@ -5064,8 +5065,8 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const { // many analyses if they are around. 
AU.addPreservedID(LoopSimplifyID);
- AU.addRequired<LoopInfo>();
- AU.addPreserved<LoopInfo>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
 AU.addRequiredID(LoopSimplifyID);
 AU.addRequired<DominatorTreeWrapperPass>();
 AU.addPreserved<DominatorTreeWrapperPass>();
@@ -5076,7 +5077,7 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
 AU.addRequiredID(LoopSimplifyID);
 AU.addRequired<IVUsers>();
 AU.addPreserved<IVUsers>();
- AU.addRequired<TargetTransformInfo>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
 }
 
 bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
@@ -5098,7 +5099,8 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
 #endif
 unsigned numFolded = Rewriter.replaceCongruentIVs(
 L, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(), DeadInsts,
- &getAnalysis<TargetTransformInfo>());
+ &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+ *L->getHeader()->getParent()));
 if (numFolded) {
 Changed = true;
 DeleteTriviallyDeadInstructions(DeadInsts);
diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp
index f60d990..924be16 100644
--- a/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -13,11 +13,12 @@
//===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/FunctionTargetTransformInfo.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DiagnosticInfo.h"
@@ -28,6 +29,8 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/UnrollLoop.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/Analysis/InstructionSimplify.h"
 #include <climits>
 
 using namespace llvm;
@@ -38,6 +41,22 @@ static cl::opt<unsigned>
 UnrollThreshold("unroll-threshold", cl::init(150), cl::Hidden,
 cl::desc("The cut-off point for automatic loop unrolling"));
 
+static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze(
+ "unroll-max-iteration-count-to-analyze", cl::init(0), cl::Hidden,
+ cl::desc("Don't allow loop unrolling to simulate more than this number of "
+ "iterations when checking full unroll profitability"));
+
+static cl::opt<unsigned> UnrollMinPercentOfOptimized(
+ "unroll-percent-of-optimized-for-complete-unroll", cl::init(20), cl::Hidden,
+ cl::desc("If complete unrolling could trigger further optimizations, and, "
+ "by doing so, remove the given percent of instructions, perform the "
+ "complete unroll even if it's beyond the threshold"));
+
+static cl::opt<unsigned> UnrollAbsoluteThreshold(
+ "unroll-absolute-threshold", cl::init(2000), cl::Hidden,
+ cl::desc("Don't unroll if the unrolled size is bigger than this threshold,"
+ " even if we can remove a big portion of the instructions later."));
+
 static cl::opt<unsigned>
 UnrollCount("unroll-count", cl::init(0), cl::Hidden,
 cl::desc("Use this unroll count for all loops including those with "
@@ -63,11 +82,16 @@ namespace {
 static char ID; // Pass ID, replacement for typeid
 LoopUnroll(int T = -1, int C = -1, int P = -1, int R = -1) : LoopPass(ID) {
 CurrentThreshold = (T == -1) ? 
UnrollThreshold : unsigned(T);
+ CurrentAbsoluteThreshold = UnrollAbsoluteThreshold;
+ CurrentMinPercentOfOptimized = UnrollMinPercentOfOptimized;
 CurrentCount = (C == -1) ? UnrollCount : unsigned(C);
 CurrentAllowPartial = (P == -1) ? UnrollAllowPartial : (bool)P;
 CurrentRuntime = (R == -1) ? UnrollRuntime : (bool)R;
 
 UserThreshold = (T != -1) || (UnrollThreshold.getNumOccurrences() > 0);
+ UserAbsoluteThreshold = (UnrollAbsoluteThreshold.getNumOccurrences() > 0);
+ UserPercentOfOptimized =
+ (UnrollMinPercentOfOptimized.getNumOccurrences() > 0);
 UserAllowPartial = (P != -1) ||
 (UnrollAllowPartial.getNumOccurrences() > 0);
 UserRuntime = (R != -1) || (UnrollRuntime.getNumOccurrences() > 0);
@@ -91,10 +115,16 @@ namespace {
 
 unsigned CurrentCount;
 unsigned CurrentThreshold;
+ unsigned CurrentAbsoluteThreshold;
+ unsigned CurrentMinPercentOfOptimized;
 bool CurrentAllowPartial;
 bool CurrentRuntime;
 bool UserCount; // CurrentCount is user-specified.
 bool UserThreshold; // CurrentThreshold is user-specified.
+ bool UserAbsoluteThreshold; // CurrentAbsoluteThreshold is
+ // user-specified.
+ bool UserPercentOfOptimized; // CurrentMinPercentOfOptimized is
+ // user-specified.
 bool UserAllowPartial; // CurrentAllowPartial is user-specified.
 bool UserRuntime; // CurrentRuntime is user-specified.
 
@@ -104,17 +134,16 @@ namespace {
 /// loop preheaders be inserted into the CFG...
 ///
 void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionTracker>();
- AU.addRequired<LoopInfo>();
- AU.addPreserved<LoopInfo>();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
 AU.addRequiredID(LoopSimplifyID);
 AU.addPreservedID(LoopSimplifyID);
 AU.addRequiredID(LCSSAID);
 AU.addPreservedID(LCSSAID);
 AU.addRequired<ScalarEvolution>();
 AU.addPreserved<ScalarEvolution>();
- AU.addRequired<TargetTransformInfo>();
- AU.addRequired<FunctionTargetTransformInfo>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
 // FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info.
 // If loop unroll does not preserve dom info then LCSSA pass on next
 // loop will receive invalid dom info.
@@ -124,9 +153,11 @@ namespace {
 
 // Fill in the UnrollingPreferences parameter with values from the
 // TargetTransformInfo.
- void getUnrollingPreferences(Loop *L, const FunctionTargetTransformInfo &FTTI,
+ void getUnrollingPreferences(Loop *L, const TargetTransformInfo &TTI,
 TargetTransformInfo::UnrollingPreferences &UP) {
 UP.Threshold = CurrentThreshold;
+ UP.AbsoluteThreshold = CurrentAbsoluteThreshold;
+ UP.MinPercentOfOptimized = CurrentMinPercentOfOptimized;
 UP.OptSizeThreshold = OptSizeUnrollThreshold;
 UP.PartialThreshold = CurrentThreshold;
 UP.PartialOptSizeThreshold = OptSizeUnrollThreshold;
@@ -134,7 +165,7 @@ namespace {
 UP.MaxCount = UINT_MAX;
 UP.Partial = CurrentAllowPartial;
 UP.Runtime = CurrentRuntime;
- FTTI.getUnrollingPreferences(L, UP);
+ TTI.getUnrollingPreferences(L, UP);
 }
 
 // Select and return an unroll count based on parameters from
@@ -153,18 +184,37 @@ namespace {
 // unrolled loops respectively.
 void selectThresholds(const Loop *L, bool HasPragma,
 const TargetTransformInfo::UnrollingPreferences &UP,
- unsigned &Threshold, unsigned &PartialThreshold) {
+ unsigned &Threshold, unsigned &PartialThreshold,
+ unsigned NumberOfOptimizedInstructions) {
 // Determine the current unrolling threshold. 
While this is
 // normally set from UnrollThreshold, it is overridden to a
 // smaller value if the current function is marked as
 // optimize-for-size, and the unroll threshold was not user
 // specified.
 Threshold = UserThreshold ? CurrentThreshold : UP.Threshold;
+
+ // If we are allowed to completely unroll when we can remove M% of
+ // instructions, and we know that with complete unrolling we'll be able
+ // to kill N instructions, then we can afford to completely unroll loops
+ // with unrolled size up to N*100/M.
+ // Adjust the threshold according to that:
+ unsigned PercentOfOptimizedForCompleteUnroll =
+ UserPercentOfOptimized ? CurrentMinPercentOfOptimized
+ : UP.MinPercentOfOptimized;
+ unsigned AbsoluteThreshold = UserAbsoluteThreshold
+ ? CurrentAbsoluteThreshold
+ : UP.AbsoluteThreshold;
+ if (PercentOfOptimizedForCompleteUnroll)
+ Threshold = std::max<unsigned>(Threshold,
+ NumberOfOptimizedInstructions * 100 /
+ PercentOfOptimizedForCompleteUnroll);
+ // But don't allow unrolling loops bigger than the absolute threshold.
+ Threshold = std::min<unsigned>(Threshold, AbsoluteThreshold);
+
 PartialThreshold = UserThreshold ? CurrentThreshold : UP.PartialThreshold;
 if (!UserThreshold &&
- L->getHeader()->getParent()->getAttributes().
- hasAttribute(AttributeSet::FunctionIndex,
- Attribute::OptimizeForSize)) {
+ L->getHeader()->getParent()->hasFnAttribute(
+ Attribute::OptimizeForSize)) {
 Threshold = UP.OptSizeThreshold;
 PartialThreshold = UP.PartialOptSizeThreshold;
 }
@@ -185,10 +235,9 @@ namespace {
 
 char LoopUnroll::ID = 0;
 INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
-INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
-INITIALIZE_PASS_DEPENDENCY(FunctionTargetTransformInfo)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_DEPENDENCY(LCSSA)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
@@ -203,13 +252,333 @@ Pass *llvm::createSimpleLoopUnrollPass() {
 return llvm::createLoopUnrollPass(-1, -1, 0, 0);
 }
 
+static bool isLoadFromConstantInitializer(Value *V) {
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
+ if (GV->isConstant() && GV->hasDefinitiveInitializer())
+ return GV->getInitializer();
+ return false;
+}
+
+struct FindConstantPointers {
+ bool LoadCanBeConstantFolded;
+ bool IndexIsConstant;
+ APInt Step;
+ APInt StartValue;
+ Value *BaseAddress;
+ const Loop *L;
+ ScalarEvolution &SE;
+ FindConstantPointers(const Loop *loop, ScalarEvolution &SE)
+ : LoadCanBeConstantFolded(true), IndexIsConstant(true), L(loop), SE(SE) {}
+
+ bool follow(const SCEV *S) {
+ if (const SCEVUnknown *SC = dyn_cast<SCEVUnknown>(S)) {
+ // We've reached a leaf node of the SCEV; it's most probably just a
+ // variable. Now it's time to see if it corresponds to a constant
+ // global (in which case we can eliminate the load), or not.
+ BaseAddress = SC->getValue();
+ LoadCanBeConstantFolded =
+ IndexIsConstant && isLoadFromConstantInitializer(BaseAddress);
+ return false;
+ }
+ if (isa<SCEVConstant>(S))
+ return true;
+ if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+ // If the current SCEV expression is AddRec, and its loop isn't the loop
+ // we are about to unroll, then we won't get a constant address after
+ // unrolling, and thus won't be able to eliminate the load. 
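+ // (Example, hypothetical: for a load of g[i] inside the loop L, where g
+ // is a constant global array of i32, the address SCEV is {g,+,4}<L>;
+ // the base g is a SCEVUnknown leaf and the step 4 is constant, so every
+ // unrolled iteration sees a known constant address. An AddRec over an
+ // outer loop would instead leave the address varying after unrolling L.)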
+ if (AR->getLoop() != L)
+ return IndexIsConstant = false;
+ // If the step isn't constant, we won't get constant addresses in the
+ // unrolled version. Bail out.
+ if (const SCEVConstant *StepSE =
+ dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE)))
+ Step = StepSE->getValue()->getValue();
+ else
+ return IndexIsConstant = false;
+
+ return IndexIsConstant;
+ }
+ // If Result is true, continue traversal.
+ // Otherwise, we have found something that prevents us from (possible) load
+ // elimination.
+ return IndexIsConstant;
+ }
+ bool isDone() const { return !IndexIsConstant; }
+};
+
+// This class is used to get an estimate of the optimization effects that we
+// could get from complete loop unrolling. It comes from the fact that some
+// loads might be replaced with concrete constant values and that could trigger
+// a chain of instruction simplifications.
+//
+// E.g. we might have:
+// int a[] = {0, 1, 0};
+// v = 0;
+// for (i = 0; i < 3; i ++)
+// v += b[i]*a[i];
+// If we completely unroll the loop, we would get:
+// v = b[0]*a[0] + b[1]*a[1] + b[2]*a[2]
+// Which then will be simplified to:
+// v = b[0]* 0 + b[1]* 1 + b[2]* 0
+// And finally:
+// v = b[1]
+class UnrollAnalyzer : public InstVisitor<UnrollAnalyzer, bool> {
+ typedef InstVisitor<UnrollAnalyzer, bool> Base;
+ friend class InstVisitor<UnrollAnalyzer, bool>;
+
+ const Loop *L;
+ unsigned TripCount;
+ ScalarEvolution &SE;
+ const TargetTransformInfo &TTI;
+
+ DenseMap<Value *, Constant *> SimplifiedValues;
+ DenseMap<LoadInst *, Value *> LoadBaseAddresses;
+ SmallPtrSet<Instruction *, 32> CountedInstructions;
+
+ /// \brief Count the number of optimized instructions.
+ unsigned NumberOfOptimizedInstructions;
+
+ // Provide base case for our instruction visit.
+ bool visitInstruction(Instruction &I) { return false; };
+ // TODO: We should also visit ICmp, FCmp, GetElementPtr, Trunc, ZExt, SExt,
+ // FPTrunc, FPExt, FPToUI, FPToSI, UIToFP, SIToFP, BitCast, Select,
+ // ExtractElement, InsertElement, ShuffleVector, ExtractValue, InsertValue.
+ //
+ // Probably it's worth hoisting the code for estimating the simplification
+ // effects to a separate class, since we have very similar code in
+ // InlineCost already. 
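+ // For example, if the load feeding %m was mapped to the constant 0 for
+ // this iteration, then visiting '%m = mul i32 %load, %b' folds to 0 via
+ // SimplifyBinOp below; the mul is counted as optimized, and its constant
+ // result is recorded so that the mul's users can fold in turn. (The
+ // names here are illustrative.)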
+ bool visitBinaryOperator(BinaryOperator &I) {
+ Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+ if (!isa<Constant>(LHS))
+ if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
+ LHS = SimpleLHS;
+ if (!isa<Constant>(RHS))
+ if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
+ RHS = SimpleRHS;
+ Value *SimpleV = nullptr;
+ if (auto FI = dyn_cast<FPMathOperator>(&I))
+ SimpleV =
+ SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags());
+ else
+ SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS);
+
+ if (SimpleV && CountedInstructions.insert(&I).second)
+ NumberOfOptimizedInstructions += TTI.getUserCost(&I);
+
+ if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) {
+ SimplifiedValues[&I] = C;
+ return true;
+ }
+ return false;
+ }
+
+ Constant *computeLoadValue(LoadInst *LI, unsigned Iteration) {
+ if (!LI)
+ return nullptr;
+ Value *BaseAddr = LoadBaseAddresses[LI];
+ if (!BaseAddr)
+ return nullptr;
+
+ auto GV = dyn_cast<GlobalVariable>(BaseAddr);
+ if (!GV)
+ return nullptr;
+
+ ConstantDataSequential *CDS =
+ dyn_cast<ConstantDataSequential>(GV->getInitializer());
+ if (!CDS)
+ return nullptr;
+
+ const SCEV *BaseAddrSE = SE.getSCEV(BaseAddr);
+ const SCEV *S = SE.getSCEV(LI->getPointerOperand());
+ const SCEV *OffSE = SE.getMinusSCEV(S, BaseAddrSE);
+
+ APInt StepC, StartC;
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OffSE);
+ if (!AR)
+ return nullptr;
+
+ if (const SCEVConstant *StepSE =
+ dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE)))
+ StepC = StepSE->getValue()->getValue();
+ else
+ return nullptr;
+
+ if (const SCEVConstant *StartSE = dyn_cast<SCEVConstant>(AR->getStart()))
+ StartC = StartSE->getValue()->getValue();
+ else
+ return nullptr;
+
+ unsigned ElemSize = CDS->getElementType()->getPrimitiveSizeInBits() / 8U;
+ unsigned Start = StartC.getLimitedValue();
+ unsigned Step = StepC.getLimitedValue();
+
+ unsigned Index = (Start + Step * Iteration) / ElemSize;
+ if (Index >= CDS->getNumElements())
+ return nullptr;
+
+ Constant *CV = CDS->getElementAsConstant(Index);
+
+ return CV;
+ }
+
+public:
+ UnrollAnalyzer(const Loop *L, unsigned TripCount, ScalarEvolution &SE,
+ const TargetTransformInfo &TTI)
+ : L(L), TripCount(TripCount), SE(SE), TTI(TTI),
+ NumberOfOptimizedInstructions(0) {}
+
+ // Visit all loads in the loop L, and for those that, after complete loop
+ // unrolling, would have a constant address pointing to a known
+ // constant initializer, record the base address for future use. This is used
+ // when we estimate the number of potentially simplified instructions.
+ void findConstFoldableLoads() {
+ for (auto BB : L->getBlocks()) {
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ if (!LI->isSimple())
+ continue;
+ Value *AddrOp = LI->getPointerOperand();
+ const SCEV *S = SE.getSCEV(AddrOp);
+ FindConstantPointers Visitor(L, SE);
+ SCEVTraversal<FindConstantPointers> T(Visitor);
+ T.visitAll(S);
+ if (Visitor.IndexIsConstant && Visitor.LoadCanBeConstantFolded) {
+ LoadBaseAddresses[LI] = Visitor.BaseAddress;
+ }
+ }
+ }
+ }
+ }
+
+ // Given a list of loads that could be constant-folded (LoadBaseAddresses),
+ // estimate the number of optimized instructions after substituting the
+ // concrete values for the given Iteration. Also track how many instructions
+ // become dead through this process. 
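+ // Continuing the a[]/b[] example from the class comment above: for
+ // Iteration == 0, the load of a[0] yields the constant 0, 'b[0] * a[0]'
+ // folds to 0, and both instructions are counted; the dead-code sweep
+ // below then also counts instructions (e.g. the GEP feeding the folded
+ // load) whose only remaining users are dead or simplified.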
+ unsigned estimateNumberOfOptimizedInstructions(unsigned Iteration) {
+ // We keep a set vector for the worklist so that we don't waste space in the
+ // worklist queuing up the same instruction repeatedly. This can happen due
+ // to multiple operands being the same instruction or due to the same
+ // instruction being an operand of lots of things that end up dead or
+ // simplified.
+ SmallSetVector<Instruction *, 8> Worklist;
+
+ // Clear the simplified values and counts for this iteration.
+ SimplifiedValues.clear();
+ CountedInstructions.clear();
+ NumberOfOptimizedInstructions = 0;
+
+ // We start by adding all loads to the worklist.
+ for (auto &LoadDescr : LoadBaseAddresses) {
+ LoadInst *LI = LoadDescr.first;
+ SimplifiedValues[LI] = computeLoadValue(LI, Iteration);
+ if (CountedInstructions.insert(LI).second)
+ NumberOfOptimizedInstructions += TTI.getUserCost(LI);
+
+ for (User *U : LI->users())
+ Worklist.insert(cast<Instruction>(U));
+ }
+
+ // And then we try to simplify every user of every instruction from the
+ // worklist. If we do simplify a user, add it to the worklist to process
+ // its users as well.
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.pop_back_val();
+ if (!L->contains(I))
+ continue;
+ if (!visit(I))
+ continue;
+ for (User *U : I->users())
+ Worklist.insert(cast<Instruction>(U));
+ }
+
+ // Now that we know the potentially simplified instructions, estimate the
+ // number of instructions that would become dead if we do perform the
+ // simplification.
+
+ // The dead instructions are held in a separate set. This is used to
+ // prevent us from re-examining instructions and to make sure we only count
+ // the benefit once. The worklist's internal set handles insertion
+ // deduplication.
+ SmallPtrSet<Instruction *, 16> DeadInstructions;
+
+ // Lambda to enqueue operands onto the worklist.
+ auto EnqueueOperands = [&](Instruction &I) {
+ for (auto *Op : I.operand_values())
+ if (auto *OpI = dyn_cast<Instruction>(Op))
+ if (!OpI->use_empty())
+ Worklist.insert(OpI);
+ };
+
+ // Start by initializing the worklist with simplified instructions.
+ for (auto &FoldedKeyValue : SimplifiedValues)
+ if (auto *FoldedInst = dyn_cast<Instruction>(FoldedKeyValue.first)) {
+ DeadInstructions.insert(FoldedInst);
+
+ // Add each instruction operand of this dead instruction to the
+ // worklist.
+ EnqueueOperands(*FoldedInst);
+ }
+
+ // If a definition of an instruction is only used by simplified or dead
+ // instructions, it's also dead. Check defs of all instructions from the
+ // worklist.
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.pop_back_val();
+ if (!L->contains(I))
+ continue;
+ if (DeadInstructions.count(I))
+ continue;
+
+ if (std::all_of(I->user_begin(), I->user_end(), [&](User *U) {
+ return DeadInstructions.count(cast<Instruction>(U));
+ })) {
+ NumberOfOptimizedInstructions += TTI.getUserCost(I);
+ DeadInstructions.insert(I);
+ EnqueueOperands(*I);
+ }
+ }
+ return NumberOfOptimizedInstructions;
+ }
+};
+
+// Complete loop unrolling can make some loads constant, and we need to know if
+// that would expose any further optimization opportunities.
+// This routine estimates this optimization effect and returns the number of
+// instructions that might potentially be optimized away. 
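+// For example (flag values assumed): with TripCount == 100 and
+// -unroll-max-iteration-count-to-analyze=10, we simulate iterations 0..9,
+// sum their savings, and scale the result by 100/10 == 10 to approximate
+// the whole loop.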
+static unsigned
+approximateNumberOfOptimizedInstructions(const Loop *L, ScalarEvolution &SE,
+ unsigned TripCount,
+ const TargetTransformInfo &TTI) {
+ if (!TripCount || !UnrollMaxIterationsCountToAnalyze)
+ return 0;
+
+ UnrollAnalyzer UA(L, TripCount, SE, TTI);
+ UA.findConstFoldableLoads();
+
+ // Estimate the number of instructions that could be simplified if we
+ // replace a load with the corresponding constant. Since the same load will
+ // take different values on different iterations, we have to go through all
+ // of the loop's iterations. To limit ourselves, we check only the first N
+ // iterations and then scale the resulting number if necessary.
+ unsigned IterationsNumberForEstimate =
+ std::min<unsigned>(UnrollMaxIterationsCountToAnalyze, TripCount);
+ unsigned NumberOfOptimizedInstructions = 0;
+ for (unsigned i = 0; i < IterationsNumberForEstimate; ++i)
+ NumberOfOptimizedInstructions +=
+ UA.estimateNumberOfOptimizedInstructions(i);
+
+ NumberOfOptimizedInstructions *= TripCount / IterationsNumberForEstimate;
+
+ return NumberOfOptimizedInstructions;
+}
+
 /// ApproximateLoopSize - Approximate the size of the loop.
 static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
 bool &NotDuplicatable,
 const TargetTransformInfo &TTI,
- AssumptionTracker *AT) {
+ AssumptionCache *AC) {
 SmallPtrSet<const Value *, 32> EphValues;
- CodeMetrics::collectEphemeralValues(L, AT, EphValues);
+ CodeMetrics::collectEphemeralValues(L, AC, EphValues);
 
 CodeMetrics Metrics;
 for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
@@ -222,8 +591,11 @@ static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
 
 // Don't allow an estimate of size zero. This would allow unrolling of loops
 // with huge iteration counts, which is a compile time problem even if it's
- // not a problem for code quality.
- if (LoopSize == 0) LoopSize = 1;
+ // not a problem for code quality. Also, the code using this size may assume
+ // that each loop has at least three instructions (likely a conditional
+ // branch, a comparison feeding that branch, and some kind of loop increment
+ // feeding that comparison instruction).
+ LoopSize = std::max(LoopSize, 3u);
 
 return LoopSize;
 }
@@ -231,48 +603,31 @@ static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
 // Returns the loop hint metadata node with the given name (for example,
 // "llvm.loop.unroll.count"). If no such metadata node exists, then nullptr is
 // returned.
-static const MDNode *GetUnrollMetadata(const Loop *L, StringRef Name) {
- MDNode *LoopID = L->getLoopID();
- if (!LoopID)
- return nullptr;
-
- // First operand should refer to the loop id itself.
- assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
- assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
-
- for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
- const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
- if (!MD)
- continue;
-
- const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
- if (!S)
- continue;
-
- if (Name.equals(S->getString()))
- return MD;
- }
+static MDNode *GetUnrollMetadataForLoop(const Loop *L, StringRef Name) {
+ if (MDNode *LoopID = L->getLoopID())
+ return GetUnrollMetadata(LoopID, Name);
 
 return nullptr;
 }
 
 // Returns true if the loop has an unroll(full) pragma. 
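// A sketch of the metadata shape being matched (illustrative):
//   br i1 %cond, label %loop, label %exit, !llvm.loop !0
//   !0 = !{!0, !1}
//   !1 = !{!"llvm.loop.unroll.full"}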
static bool HasUnrollFullPragma(const Loop *L) { - return GetUnrollMetadata(L, "llvm.loop.unroll.full"); + return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.full"); } // Returns true if the loop has an unroll(disable) pragma. static bool HasUnrollDisablePragma(const Loop *L) { - return GetUnrollMetadata(L, "llvm.loop.unroll.disable"); + return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.disable"); } // If loop has an unroll_count pragma return the (necessarily // positive) value from the pragma. Otherwise return 0. static unsigned UnrollCountPragmaValue(const Loop *L) { - const MDNode *MD = GetUnrollMetadata(L, "llvm.loop.unroll.count"); + MDNode *MD = GetUnrollMetadataForLoop(L, "llvm.loop.unroll.count"); if (MD) { assert(MD->getNumOperands() == 2 && "Unroll count hint metadata should have two operands."); - unsigned Count = cast<ConstantInt>(MD->getOperand(1))->getZExtValue(); + unsigned Count = + mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue(); assert(Count >= 1 && "Unroll count must be positive."); return Count; } @@ -288,9 +643,9 @@ static void SetLoopAlreadyUnrolled(Loop *L) { if (!LoopID) return; // First remove any existing loop unrolling metadata. - SmallVector<Value *, 4> Vals; + SmallVector<Metadata *, 4> MDs; // Reserve first location for self reference to the LoopID metadata node. - Vals.push_back(nullptr); + MDs.push_back(nullptr); for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { bool IsUnrollMetadata = false; MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); @@ -298,17 +653,18 @@ static void SetLoopAlreadyUnrolled(Loop *L) { const MDString *S = dyn_cast<MDString>(MD->getOperand(0)); IsUnrollMetadata = S && S->getString().startswith("llvm.loop.unroll."); } - if (!IsUnrollMetadata) Vals.push_back(LoopID->getOperand(i)); + if (!IsUnrollMetadata) + MDs.push_back(LoopID->getOperand(i)); } // Add unroll(disable) metadata to disable future unrolling. LLVMContext &Context = L->getHeader()->getContext(); - SmallVector<Value *, 1> DisableOperands; + SmallVector<Metadata *, 1> DisableOperands; DisableOperands.push_back(MDString::get(Context, "llvm.loop.unroll.disable")); MDNode *DisableNode = MDNode::get(Context, DisableOperands); - Vals.push_back(DisableNode); + MDs.push_back(DisableNode); - MDNode *NewLoopID = MDNode::get(Context, Vals); + MDNode *NewLoopID = MDNode::get(Context, MDs); // Set operand 0 to refer to the loop id itself. 
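 // (The loop ID node is self-referential; pointing operand 0 back at the
 // node itself keeps it distinct and uniquely identifies the loop.)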
NewLoopID->replaceOperandWith(0, NewLoopID); L->setLoopID(NewLoopID); @@ -358,12 +714,13 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { if (skipOptnoneFunction(L)) return false; - LoopInfo *LI = &getAnalysis<LoopInfo>(); + Function &F = *L->getHeader()->getParent(); + + LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); ScalarEvolution *SE = &getAnalysis<ScalarEvolution>(); - const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>(); - const FunctionTargetTransformInfo &FTTI = - getAnalysis<FunctionTargetTransformInfo>(); - AssumptionTracker *AT = &getAnalysis<AssumptionTracker>(); + const TargetTransformInfo &TTI = + getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); BasicBlock *Header = L->getHeader(); DEBUG(dbgs() << "Loop Unroll: F[" << Header->getParent()->getName() @@ -377,7 +734,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { bool HasPragma = PragmaFullUnroll || PragmaCount > 0; TargetTransformInfo::UnrollingPreferences UP; - getUnrollingPreferences(L, FTTI, UP); + getUnrollingPreferences(L, TTI, UP); // Find trip count and trip multiple if count is not available unsigned TripCount = 0; @@ -402,9 +759,13 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { unsigned NumInlineCandidates; bool notDuplicatable; unsigned LoopSize = - ApproximateLoopSize(L, NumInlineCandidates, notDuplicatable, TTI, AT); + ApproximateLoopSize(L, NumInlineCandidates, notDuplicatable, TTI, &AC); DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); - uint64_t UnrolledSize = (uint64_t)LoopSize * Count; + + // When computing the unrolled size, note that the conditional branch on the + // backedge and the comparison feeding it are not replicated like the rest of + // the loop body (which is why 2 is subtracted). + uint64_t UnrolledSize = (uint64_t)(LoopSize-2) * Count + 2; if (notDuplicatable) { DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable" << " instructions.\n"); @@ -415,8 +776,14 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { return false; } + unsigned NumberOfOptimizedInstructions = + approximateNumberOfOptimizedInstructions(L, *SE, TripCount, TTI); + DEBUG(dbgs() << " Complete unrolling could save: " + << NumberOfOptimizedInstructions << "\n"); + unsigned Threshold, PartialThreshold; - selectThresholds(L, HasPragma, UP, Threshold, PartialThreshold); + selectThresholds(L, HasPragma, UP, Threshold, PartialThreshold, + NumberOfOptimizedInstructions); // Given Count, TripCount and thresholds determine the type of // unrolling which is to be performed. @@ -449,7 +816,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { } if (PartialThreshold != NoThreshold && UnrolledSize > PartialThreshold) { // Reduce unroll count to be modulo of TripCount for partial unrolling. - Count = PartialThreshold / LoopSize; + Count = (std::max(PartialThreshold, 3u)-2) / (LoopSize-2); while (Count != 0 && TripCount % Count != 0) Count--; } @@ -463,7 +830,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { // the original count which satisfies the threshold limit. while (Count != 0 && UnrolledSize > PartialThreshold) { Count >>= 1; - UnrolledSize = LoopSize * Count; + UnrolledSize = (LoopSize-2) * Count + 2; } if (Count > UP.MaxCount) Count = UP.MaxCount; @@ -509,7 +876,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { // Unroll the loop. 
if (!UnrollLoop(L, Count, TripCount, AllowRuntime, TripMultiple, LI, this,
- &LPM, AT))
+ &LPM, &AC))
 return false;
 
 return true;
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
index ef43483..987dc96 100644
--- a/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -30,7 +30,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -105,7 +105,7 @@ namespace {
 // Analyze the loop. Check its size and determine whether it is possible to
 // unswitch it. Returns true if we can unswitch this loop.
 bool countLoop(const Loop *L, const TargetTransformInfo &TTI,
- AssumptionTracker *AT);
+ AssumptionCache *AC);
 
 // Clean all data related to the given loop.
 void forgetLoop(const Loop *L);
@@ -128,7 +128,7 @@ namespace {
 class LoopUnswitch : public LoopPass {
 LoopInfo *LI; // Loop information
 LPPassManager *LPM;
- AssumptionTracker *AT;
+ AssumptionCache *AC;
 
 // LoopProcessWorklist - Used to check if the second loop needs processing
 // after RewriteLoopBodyWithConditionConstant rewrites the first loop.
@@ -167,16 +167,16 @@ namespace {
 /// loop preheaders be inserted into the CFG.
 ///
 void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionTracker>();
+ AU.addRequired<AssumptionCacheTracker>();
 AU.addRequiredID(LoopSimplifyID);
 AU.addPreservedID(LoopSimplifyID);
- AU.addRequired<LoopInfo>();
- AU.addPreserved<LoopInfo>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
 AU.addRequiredID(LCSSAID);
 AU.addPreservedID(LCSSAID);
 AU.addPreserved<DominatorTreeWrapperPass>();
 AU.addPreserved<ScalarEvolution>();
- AU.addRequired<TargetTransformInfo>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
 }
 
 private:
@@ -217,7 +217,7 @@ namespace {
 // Analyze the loop. Check its size and determine whether it is possible to
 // unswitch it. Returns true if we can unswitch this loop.
 bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI,
- AssumptionTracker *AT) {
+ AssumptionCache *AC) {
 
 LoopPropsMapIt PropsIt;
 bool Inserted;
@@ -235,7 +235,7 @@ bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI,
 
 // This is a very ad-hoc heuristic. 
SmallPtrSet<const Value *, 32> EphValues;
- CodeMetrics::collectEphemeralValues(L, AT, EphValues);
+ CodeMetrics::collectEphemeralValues(L, AC, EphValues);
 
 // FIXME: This is overly conservative because it does not take into
 // consideration code simplification opportunities and code that can
@@ -333,10 +333,10 @@ void LUAnalysisCache::cloneData(const Loop *NewLoop, const Loop *OldLoop,
 
 char LoopUnswitch::ID = 0;
 INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops",
 false, false)
-INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LCSSA)
 INITIALIZE_PASS_END(LoopUnswitch, "loop-unswitch", "Unswitch loops",
 false, false)
@@ -385,8 +385,9 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
 if (skipOptnoneFunction(L))
 return false;
 
- AT = &getAnalysis<AssumptionTracker>();
- LI = &getAnalysis<LoopInfo>();
+ AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+ *L->getHeader()->getParent());
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
 LPM = &LPM_Ref;
 DominatorTreeWrapperPass *DTWP =
 getAnalysisIfAvailable<DominatorTreeWrapperPass>();
@@ -431,8 +432,10 @@ bool LoopUnswitch::processCurrentLoop() {
 
 // We have probably reached the quota of branches for this loop. If so,
 // stop unswitching.
- if (!BranchesInfo.countLoop(currentLoop, getAnalysis<TargetTransformInfo>(),
- AT))
+ if (!BranchesInfo.countLoop(
+ currentLoop, getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+ *currentLoop->getHeader()->getParent()),
+ AC))
 return false;
 
 // Loop over all of the basic blocks in the loop. If we find an interior
@@ -654,9 +657,7 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val) {
 
 // Check to see if it would be profitable to unswitch the current loop.
 // Do not do non-trivial unswitch while optimizing for size.
- if (OptimizeForSize ||
- F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
- Attribute::OptimizeForSize))
+ if (OptimizeForSize || F->hasFnAttribute(Attribute::OptimizeForSize))
 return false;
 
 UnswitchNontrivialCondition(LoopCond, Val, currentLoop);
@@ -674,7 +675,7 @@ static Loop *CloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM,
 for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
 I != E; ++I)
 if (LI->getLoopFor(*I) == L)
- New->addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), LI->getBase());
+ New->addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), *LI);
 
 // Add all of the subloops to the new loop.
 for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
@@ -705,8 +706,9 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
 
 // If either edge is critical, split it. This helps preserve LoopSimplify
 // form for enclosing loops.
- SplitCriticalEdge(BI, 0, this, false, false, true);
- SplitCriticalEdge(BI, 1, this, false, false, true);
+ auto Options = CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA();
+ SplitCriticalEdge(BI, 0, Options);
+ SplitCriticalEdge(BI, 1, Options);
 }
 
 /// UnswitchTrivialCondition - Given a loop that has a trivial unswitchable
@@ -725,7 +727,7 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond,
 // First step, split the preheader, so that we know that there is a safe place
 // to insert the conditional branch. 
We will change loopPreheader to have a // conditional branch on Cond. - BasicBlock *NewPH = SplitEdge(loopPreheader, loopHeader, this); + BasicBlock *NewPH = SplitEdge(loopPreheader, loopHeader, DT, LI); // Now that we have a place to insert the conditional branch, create a place // to branch to: this is the exit block out of the loop that we should @@ -736,7 +738,7 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, // without actually branching to it (the exit block should be dominated by the // loop header, not the preheader). assert(!L->contains(ExitBlock) && "Exit block is in the loop?"); - BasicBlock *NewExit = SplitBlock(ExitBlock, ExitBlock->begin(), this); + BasicBlock *NewExit = SplitBlock(ExitBlock, ExitBlock->begin(), DT, LI); // Okay, now we have a position to branch from and a position to branch to, // insert the new conditional branch. @@ -767,13 +769,9 @@ void LoopUnswitch::SplitExitEdges(Loop *L, // Although SplitBlockPredecessors doesn't preserve loop-simplify in // general, if we call it on all predecessors of all exits then it does. - if (!ExitBlock->isLandingPad()) { - SplitBlockPredecessors(ExitBlock, Preds, ".us-lcssa", this); - } else { - SmallVector<BasicBlock*, 2> NewBBs; - SplitLandingPadPredecessors(ExitBlock, Preds, ".us-lcssa", ".us-lcssa", - this, NewBBs); - } + SplitBlockPredecessors(ExitBlock, Preds, ".us-lcssa", + /*AliasAnalysis*/ nullptr, DT, LI, + /*PreserveLCSSA*/ true); } } @@ -796,7 +794,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, // First step, split the preheader and exit blocks, and add these blocks to // the LoopBlocks list. - BasicBlock *NewPreheader = SplitEdge(loopPreheader, loopHeader, this); + BasicBlock *NewPreheader = SplitEdge(loopPreheader, loopHeader, DT, LI); LoopBlocks.push_back(NewPreheader); // We want the loop to come after the preheader, but before the exit blocks. @@ -836,7 +834,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, // FIXME: We could register any cloned assumptions instead of clearing the // whole function's cache. - AT->forgetCachedAssumptions(F); + AC->clear(); // Now we create the new Loop object for the versioned loop. Loop *NewLoop = CloneLoop(L, L->getParentLoop(), VMap, LI, LPM); @@ -849,14 +847,14 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, if (ParentLoop) { // Make sure to add the cloned preheader and exit blocks to the parent loop // as well. - ParentLoop->addBasicBlockToLoop(NewBlocks[0], LI->getBase()); + ParentLoop->addBasicBlockToLoop(NewBlocks[0], *LI); } for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) { BasicBlock *NewExit = cast<BasicBlock>(VMap[ExitBlocks[i]]); // The new exit block should be in the same loop as the old one. if (Loop *ExitBBLoop = LI->getLoopFor(ExitBlocks[i])) - ExitBBLoop->addBasicBlockToLoop(NewExit, LI->getBase()); + ExitBBLoop->addBasicBlockToLoop(NewExit, *LI); assert(NewExit->getTerminator()->getNumSuccessors() == 1 && "Exit block should have been split to have one successor!"); @@ -1042,7 +1040,7 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, // and hooked up so as to preserve the loop structure, because // trying to update it is complicated. So instead we preserve the // loop structure and put the block on a dead code path. 
- SplitEdge(Switch, SISucc, this); + SplitEdge(Switch, SISucc, DT, LI); // Compute the successors instead of relying on the return value // of SplitEdge, since it may have split the switch successor // after PHI nodes. diff --git a/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp new file mode 100644 index 0000000..0c47cbd --- /dev/null +++ b/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp @@ -0,0 +1,192 @@ +//===- LowerExpectIntrinsic.cpp - Lower expect intrinsic ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass lowers the 'expect' intrinsic to LLVM metadata. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Metadata.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Scalar.h" + +using namespace llvm; + +#define DEBUG_TYPE "lower-expect-intrinsic" + +STATISTIC(ExpectIntrinsicsHandled, + "Number of 'expect' intrinsic instructions handled"); + +static cl::opt<uint32_t> +LikelyBranchWeight("likely-branch-weight", cl::Hidden, cl::init(64), + cl::desc("Weight of the branch likely to be taken (default = 64)")); +static cl::opt<uint32_t> +UnlikelyBranchWeight("unlikely-branch-weight", cl::Hidden, cl::init(4), + cl::desc("Weight of the branch unlikely to be taken (default = 4)")); + +static bool handleSwitchExpect(SwitchInst &SI) { + CallInst *CI = dyn_cast<CallInst>(SI.getCondition()); + if (!CI) + return false; + + Function *Fn = CI->getCalledFunction(); + if (!Fn || Fn->getIntrinsicID() != Intrinsic::expect) + return false; + + Value *ArgValue = CI->getArgOperand(0); + ConstantInt *ExpectedValue = dyn_cast<ConstantInt>(CI->getArgOperand(1)); + if (!ExpectedValue) + return false; + + SwitchInst::CaseIt Case = SI.findCaseValue(ExpectedValue); + unsigned n = SI.getNumCases(); // +1 for default case. 
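// A self-contained sketch (not part of the patch) of the Weights vector
// built just below: one slot per switch successor, slot 0 reserved for the
// default destination, and case i stored at index i + 1. It reuses the
// 64/4 defaults of the cl::opts above.
#include <cstdint>
#include <cstdio>
#include <vector>

static std::vector<uint32_t> expectWeights(unsigned NumCases, int ExpectedCase,
                                           uint32_t Likely = 64,
                                           uint32_t Unlikely = 4) {
  std::vector<uint32_t> W(NumCases + 1, Unlikely); // slot 0 = default
  if (ExpectedCase < 0)
    W[0] = Likely;            // the expected value hits the default case
  else
    W[ExpectedCase + 1] = Likely;
  return W;
}

int main() {
  for (uint32_t w : expectWeights(3, 1)) // 3 cases, case #1 expected
    std::printf("%u ", w);               // prints: 4 4 64 4
}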
+ SmallVector<uint32_t, 16> Weights(n + 1, UnlikelyBranchWeight); + + if (Case == SI.case_default()) + Weights[0] = LikelyBranchWeight; + else + Weights[Case.getCaseIndex() + 1] = LikelyBranchWeight; + + SI.setMetadata(LLVMContext::MD_prof, + MDBuilder(CI->getContext()).createBranchWeights(Weights)); + + SI.setCondition(ArgValue); + return true; +} + +static bool handleBranchExpect(BranchInst &BI) { + if (BI.isUnconditional()) + return false; + + // Handle non-optimized IR code like: + // %expval = call i64 @llvm.expect.i64(i64 %conv1, i64 1) + // %tobool = icmp ne i64 %expval, 0 + // br i1 %tobool, label %if.then, label %if.end + // + // Or the following simpler case: + // %expval = call i1 @llvm.expect.i1(i1 %cmp, i1 1) + // br i1 %expval, label %if.then, label %if.end + + CallInst *CI; + + ICmpInst *CmpI = dyn_cast<ICmpInst>(BI.getCondition()); + if (!CmpI) { + CI = dyn_cast<CallInst>(BI.getCondition()); + } else { + if (CmpI->getPredicate() != CmpInst::ICMP_NE) + return false; + CI = dyn_cast<CallInst>(CmpI->getOperand(0)); + } + + if (!CI) + return false; + + Function *Fn = CI->getCalledFunction(); + if (!Fn || Fn->getIntrinsicID() != Intrinsic::expect) + return false; + + Value *ArgValue = CI->getArgOperand(0); + ConstantInt *ExpectedValue = dyn_cast<ConstantInt>(CI->getArgOperand(1)); + if (!ExpectedValue) + return false; + + MDBuilder MDB(CI->getContext()); + MDNode *Node; + + // If expect value is equal to 1 it means that we are more likely to take + // branch 0, in other case more likely is branch 1. + if (ExpectedValue->isOne()) + Node = MDB.createBranchWeights(LikelyBranchWeight, UnlikelyBranchWeight); + else + Node = MDB.createBranchWeights(UnlikelyBranchWeight, LikelyBranchWeight); + + BI.setMetadata(LLVMContext::MD_prof, Node); + + if (CmpI) + CmpI->setOperand(0, ArgValue); + else + BI.setCondition(ArgValue); + return true; +} + +static bool lowerExpectIntrinsic(Function &F) { + bool Changed = false; + + for (BasicBlock &BB : F) { + // Create "block_weights" metadata. + if (BranchInst *BI = dyn_cast<BranchInst>(BB.getTerminator())) { + if (handleBranchExpect(*BI)) + ExpectIntrinsicsHandled++; + } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB.getTerminator())) { + if (handleSwitchExpect(*SI)) + ExpectIntrinsicsHandled++; + } + + // remove llvm.expect intrinsics. + for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) { + CallInst *CI = dyn_cast<CallInst>(BI++); + if (!CI) + continue; + + Function *Fn = CI->getCalledFunction(); + if (Fn && Fn->getIntrinsicID() == Intrinsic::expect) { + Value *Exp = CI->getArgOperand(0); + CI->replaceAllUsesWith(Exp); + CI->eraseFromParent(); + Changed = true; + } + } + } + + return Changed; +} + +PreservedAnalyses LowerExpectIntrinsicPass::run(Function &F) { + if (lowerExpectIntrinsic(F)) + return PreservedAnalyses::none(); + + return PreservedAnalyses::all(); +} + +namespace { +/// \brief Legacy pass for lowering expect intrinsics out of the IR. +/// +/// When this pass is run over a function it uses expect intrinsics which feed +/// branches and switches to provide branch weight metadata for those +/// terminators. It then removes the expect intrinsics from the IR so the rest +/// of the optimizer can ignore them. 
+class LowerExpectIntrinsic : public FunctionPass { +public: + static char ID; + LowerExpectIntrinsic() : FunctionPass(ID) { + initializeLowerExpectIntrinsicPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { return lowerExpectIntrinsic(F); } +}; +} + +char LowerExpectIntrinsic::ID = 0; +INITIALIZE_PASS(LowerExpectIntrinsic, "lower-expect", + "Lower 'expect' Intrinsics", false, false) + +FunctionPass *llvm::createLowerExpectIntrinsicPass() { + return new LowerExpectIntrinsic(); +} diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index be524be..006b885 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -16,7 +16,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/AssumptionTracker.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" @@ -28,7 +28,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" #include <list> using namespace llvm; @@ -330,11 +330,11 @@ namespace { // This transformation requires dominator postdominator info void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<AssumptionTracker>(); + AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<MemoryDependenceAnalysis>(); AU.addRequired<AliasAnalysis>(); - AU.addRequired<TargetLibraryInfo>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addPreserved<AliasAnalysis>(); AU.addPreserved<MemoryDependenceAnalysis>(); } @@ -363,10 +363,10 @@ FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOpt(); } INITIALIZE_PASS_BEGIN(MemCpyOpt, "memcpyopt", "MemCpy Optimization", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionTracker) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_END(MemCpyOpt, "memcpyopt", "MemCpy Optimization", false, false) @@ -750,6 +750,16 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, // its dependence information by changing its parameter. MD->removeInstruction(C); + // Update AA metadata + // FIXME: MD_tbaa_struct and MD_mem_parallel_loop_access should also be + // handled here, but combineMetadata doesn't support them yet + unsigned KnownIDs[] = { + LLVMContext::MD_tbaa, + LLVMContext::MD_alias_scope, + LLVMContext::MD_noalias, + }; + combineMetadata(C, cpy, KnownIDs); + // Remove the memcpy. MD->removeInstruction(cpy); ++NumMemCpyInstr; @@ -982,11 +992,13 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { // If it is greater than the memcpy, then we check to see if we can force the // source of the memcpy to the alignment we need. If we fail, we bail out. 
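// A sketch of the metadata merge performed by the performCallSlotOptzn hunk
// above via combineMetadata(C, cpy, KnownIDs). The conservative core is an
// intersection: keep a metadata kind only when both instructions agree on
// it. The real helper can also widen kinds such as TBAA instead of dropping
// them; this toy (names assumed for illustration) just drops disagreements.
#include <cstdio>
#include <map>
#include <string>

using MDMap = std::map<std::string, std::string>; // kind -> node id (toy)

static MDMap combineMD(const MDMap &A, const MDMap &B) {
  MDMap Out;
  for (const auto &KV : A) {
    auto It = B.find(KV.first);
    if (It != B.end() && It->second == KV.second)
      Out.insert(KV); // both instructions carry the same node: safe to keep
  }
  return Out;
}

int main() {
  MDMap Call{{"noalias", "!1"}, {"tbaa", "!2"}};
  MDMap Memcpy{{"noalias", "!1"}, {"tbaa", "!3"}};
  for (const auto &KV : combineMD(Call, Memcpy))
    std::printf("%s -> %s\n", KV.first.c_str(), KV.second.c_str());
  // prints only: noalias -> !1
}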
- AssumptionTracker *AT = &getAnalysis<AssumptionTracker>(); + AssumptionCache &AC = + getAnalysis<AssumptionCacheTracker>().getAssumptionCache( + *CS->getParent()->getParent()); DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); if (MDep->getAlignment() < ByValAlign && - getOrEnforceKnownAlignment(MDep->getSource(),ByValAlign, - DL, AT, CS.getInstruction(), &DT) < ByValAlign) + getOrEnforceKnownAlignment(MDep->getSource(), ByValAlign, DL, &AC, + CS.getInstruction(), &DT) < ByValAlign) return false; // Verify that the copied-from memory doesn't change in between the memcpy and @@ -1067,7 +1079,7 @@ bool MemCpyOpt::runOnFunction(Function &F) { MD = &getAnalysis<MemoryDependenceAnalysis>(); DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); DL = DLP ? &DLP->getDataLayout() : nullptr; - TLI = &getAnalysis<TargetLibraryInfo>(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); // If we don't have at least memset and memcpy, there is little point of doing // anything here. These are required by a freestanding implementation, so if diff --git a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp index 8281c59..8fad63f 100644 --- a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp +++ b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp @@ -86,7 +86,7 @@ #include "llvm/Support/Allocator.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include <vector> @@ -115,7 +115,7 @@ public: private: // This transformation requires dominator postdominator info void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<TargetLibraryInfo>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addRequired<MemoryDependenceAnalysis>(); AU.addRequired<AliasAnalysis>(); AU.addPreserved<AliasAnalysis>(); @@ -143,7 +143,9 @@ private: // Routines for sinking stores StoreInst *canSinkFromBlock(BasicBlock *BB, StoreInst *SI); PHINode *getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1); - bool isStoreSinkBarrier(Instruction *Inst); + bool isStoreSinkBarrierInRange(const Instruction& Start, + const Instruction& End, + AliasAnalysis::Location Loc); bool sinkStore(BasicBlock *BB, StoreInst *SinkCand, StoreInst *ElseInst); bool mergeStores(BasicBlock *BB); // The mergeLoad/Store algorithms could have Size0 * Size1 complexity, @@ -166,7 +168,7 @@ FunctionPass *llvm::createMergedLoadStoreMotionPass() { INITIALIZE_PASS_BEGIN(MergedLoadStoreMotion, "mldst-motion", "MergedLoadStoreMotion", false, false) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_END(MergedLoadStoreMotion, "mldst-motion", "MergedLoadStoreMotion", false, false) @@ -239,7 +241,7 @@ bool MergedLoadStoreMotion::isLoadHoistBarrierInRange(const Instruction& Start, const Instruction& End, LoadInst* LI) { AliasAnalysis::Location Loc = AA->getLocation(LI); - return AA->canInstructionRangeModify(Start, End, Loc); + return AA->canInstructionRangeModRef(Start, End, Loc, AliasAnalysis::Mod); } /// @@ -389,26 +391,19 @@ bool MergedLoadStoreMotion::mergeLoads(BasicBlock *BB) { } /// -/// \brief True when instruction is sink barrier for a store -/// -bool 
MergedLoadStoreMotion::isStoreSinkBarrier(Instruction *Inst) { - // FIXME: Conservatively let a load instruction block the store. - // Use alias analysis instead. - if (isa<LoadInst>(Inst)) - return true; - if (isa<CallInst>(Inst)) - return true; - if (isa<TerminatorInst>(Inst) && !isa<BranchInst>(Inst)) - return true; - // Note: mayHaveSideEffects covers all instructions that could - // trigger a change to state. Eg. in-flight stores have to be executed - // before ordered loads or fences, calls could invoke functions that store - // data to memory etc. - if (!isa<StoreInst>(Inst) && Inst->mayHaveSideEffects()) { - return true; - } - DEBUG(dbgs() << "No Sink Barrier\n"); - return false; +/// \brief True when the instruction is a sink barrier for a store +/// located in Loc +/// +/// Whenever an instruction could possibly read or modify the +/// value being stored, or otherwise prevent the store from +/// happening, it is considered a sink barrier. +/// + +bool MergedLoadStoreMotion::isStoreSinkBarrierInRange(const Instruction& Start, + const Instruction& End, + AliasAnalysis::Location + Loc) { + return AA->canInstructionRangeModRef(Start, End, Loc, AliasAnalysis::ModRef); } /// @@ -416,27 +411,30 @@ bool MergedLoadStoreMotion::isStoreSinkBarrier(Instruction *Inst) { /// /// \return The store in \p when it is safe to sink. Otherwise return Null. /// -StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB, - StoreInst *SI) { - StoreInst *I = 0; - DEBUG(dbgs() << "can Sink? : "; SI->dump(); dbgs() << "\n"); - for (BasicBlock::reverse_iterator RBI = BB->rbegin(), RBE = BB->rend(); +StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1, + StoreInst *Store0) { + DEBUG(dbgs() << "can Sink? : "; Store0->dump(); dbgs() << "\n"); + BasicBlock *BB0 = Store0->getParent(); + for (BasicBlock::reverse_iterator RBI = BB1->rbegin(), RBE = BB1->rend(); RBI != RBE; ++RBI) { Instruction *Inst = &*RBI; - // Only move loads if they are used in the block.
- if (isStoreSinkBarrier(Inst)) - break; - if (isa<StoreInst>(Inst)) { - AliasAnalysis::Location LocSI = AA->getLocation(SI); - AliasAnalysis::Location LocInst = AA->getLocation((StoreInst *)Inst); - if (AA->isMustAlias(LocSI, LocInst)) { - I = (StoreInst *)Inst; - break; - } + if (!isa<StoreInst>(Inst)) + continue; + + StoreInst *Store1 = cast<StoreInst>(Inst); + + AliasAnalysis::Location Loc0 = AA->getLocation(Store0); + AliasAnalysis::Location Loc1 = AA->getLocation(Store1); + if (AA->isMustAlias(Loc0, Loc1) && Store0->isSameOperationAs(Store1) && + !isStoreSinkBarrierInRange(*(std::next(BasicBlock::iterator(Store1))), + BB1->back(), Loc1) && + !isStoreSinkBarrierInRange(*(std::next(BasicBlock::iterator(Store0))), + BB0->back(), Loc0)) { + return Store1; } } - return I; + return nullptr; } /// @@ -548,8 +546,7 @@ bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) { Instruction *I = &*RBI; ++RBI; - if (isStoreSinkBarrier(I)) - break; + // Sink move non-simple (atomic, volatile) stores if (!isa<StoreInst>(I)) continue; diff --git a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp index 5c8bed5..31d7df3 100644 --- a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp +++ b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp @@ -18,7 +18,7 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -52,16 +52,18 @@ INITIALIZE_PASS(PartiallyInlineLibCalls, "partially-inline-libcalls", "Partially inline calls to library functions", false, false) void PartiallyInlineLibCalls::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<TargetLibraryInfo>(); - AU.addRequired<TargetTransformInfo>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); FunctionPass::getAnalysisUsage(AU); } bool PartiallyInlineLibCalls::runOnFunction(Function &F) { bool Changed = false; Function::iterator CurrBB; - TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); - const TargetTransformInfo *TTI = &getAnalysis<TargetTransformInfo>(); + TargetLibraryInfo *TLI = + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + const TargetTransformInfo *TTI = + &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE;) { CurrBB = BB++; @@ -126,7 +128,7 @@ bool PartiallyInlineLibCalls::optimizeSQRT(CallInst *Call, // Move all instructions following Call to newly created block JoinBB. // Create phi and replace all uses. - BasicBlock *JoinBB = llvm::SplitBlock(&CurrBB, Call->getNextNode(), this); + BasicBlock *JoinBB = llvm::SplitBlock(&CurrBB, Call->getNextNode()); IRBuilder<> Builder(JoinBB, JoinBB->begin()); PHINode *Phi = Builder.CreatePHI(Call->getType(), 2); Call->replaceAllUsesWith(Phi); diff --git a/lib/Transforms/Scalar/PlaceSafepoints.cpp b/lib/Transforms/Scalar/PlaceSafepoints.cpp new file mode 100644 index 0000000..944725a --- /dev/null +++ b/lib/Transforms/Scalar/PlaceSafepoints.cpp @@ -0,0 +1,989 @@ +//===- PlaceSafepoints.cpp - Place GC Safepoints --------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// Place garbage collection safepoints at appropriate locations in the IR. This +// does not make relocation semantics or variable liveness explicit. That's +// done by RewriteStatepointsForGC. +// +// Terminology: +// - A call is said to be "parseable" if there is a stack map generated for the +// return PC of the call. A runtime can determine where values listed in the +// deopt arguments and (after RewriteStatepointsForGC) gc arguments are located +// on the stack when the code is suspended inside such a call. Every parse +// point is represented by a call wrapped in a gc.statepoint intrinsic. +// - A "poll" is an explicit check in the generated code to determine if the +// runtime needs the generated code to cooperate by calling a helper routine +// and thus suspending its execution at a known state. The call to the helper +// routine will be parseable. The (gc & runtime specific) logic of a poll is +// assumed to be provided in a function of the name "gc.safepoint_poll". +// +// We aim to insert polls such that running code can quickly be brought to a +// well-defined state for inspection by the collector. In the current +// implementation, this is done via the insertion of poll sites at method entry +// and the backedge of most loops. We try to avoid inserting more polls than +// are necessary to ensure a finite period between poll sites. This is not +// because the poll itself is expensive in the generated code; it's not. Polls +// do tend to impact the optimizer itself in negative ways; we'd like to avoid +// perturbing the optimization of the method as much as we can. +// +// We also need to make most call sites parseable. The callee might execute a +// poll (or otherwise be inspected by the GC). If so, the entire stack +// (including the suspended frame of the current method) must be parseable. +// +// This pass will insert: +// - Call parse points ("call safepoints") for any call which may need to +// reach a safepoint during the execution of the callee function. +// - Backedge safepoint polls and entry safepoint polls to ensure that +// executing code reaches a safepoint poll in a finite amount of time. +// +// We do not currently support return statepoints, but adding them would not +// be hard. They are not required for correctness - entry safepoints are an +// alternative - but some GCs may prefer them. Patches welcome.
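// A minimal sketch (not part of this patch; the flag and helper names are
// assumptions) of what a "gc.safepoint_poll" body boils down to: a cheap
// flag test guarding a parseable slow-path call into the runtime.
#include <atomic>

static std::atomic<bool> SafepointRequested{false}; // set by the collector

extern "C" void runtime_enter_safepoint() {
  // Slow path: the runtime inspects the stack via the statepoint's stack
  // map while the thread is parked here. Stubbed out for the sketch.
}

// Conceptual body of @gc.safepoint_poll before it gets inlined:
inline void gc_safepoint_poll() {
  if (SafepointRequested.load(std::memory_order_relaxed))
    runtime_enter_safepoint(); // suspend at a known, parseable state
}

int main() { gc_safepoint_poll(); }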
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Pass.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/ADT/SetOperations.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Statepoint.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" + +#define DEBUG_TYPE "safepoint-placement" +STATISTIC(NumEntrySafepoints, "Number of entry safepoints inserted"); +STATISTIC(NumCallSafepoints, "Number of call safepoints inserted"); +STATISTIC(NumBackedgeSafepoints, "Number of backedge safepoints inserted"); + +STATISTIC(CallInLoop, "Number of loops w/o safepoints due to calls in loop"); +STATISTIC(FiniteExecution, "Number of loops w/o safepoints finite execution"); + +using namespace llvm; + +// Ignore opportunities to avoid placing safepoints on backedges; useful for +// validation +static cl::opt<bool> AllBackedges("spp-all-backedges", cl::Hidden, + cl::init(false)); + +/// If true, do not place backedge safepoints in counted loops. +static cl::opt<bool> SkipCounted("spp-counted", cl::Hidden, cl::init(true)); + +// If true, split the backedge of a loop when placing the safepoint, otherwise +// split the latch block itself. Both are worth supporting for +// experimentation, but in practice, it looks like splitting the backedge +// optimizes better. +static cl::opt<bool> SplitBackedge("spp-split-backedge", cl::Hidden, + cl::init(false)); + +// Print tracing output +static cl::opt<bool> TraceLSP("spp-trace", cl::Hidden, cl::init(false)); + +namespace { + +/** An analysis pass whose purpose is to identify each of the backedges in + the function which require a safepoint poll to be inserted. */ +struct PlaceBackedgeSafepointsImpl : public LoopPass { + static char ID; + + /// The output of the pass - gives a list of each backedge (described by + /// pointing at the branch) which needs a poll inserted. + std::vector<TerminatorInst *> PollLocations; + + /// True unless we're running spp-no-calls in which case we need to disable + /// the call-dependent placement opts. + bool CallSafepointsEnabled; + PlaceBackedgeSafepointsImpl(bool CallSafepoints = false) + : LoopPass(ID), CallSafepointsEnabled(CallSafepoints) { + initializePlaceBackedgeSafepointsImplPass(*PassRegistry::getPassRegistry()); + } + + bool runOnLoop(Loop *, LPPassManager &LPM) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + // needed for determining if the loop is finite + AU.addRequired<ScalarEvolution>(); + // to ensure each loop has a single backedge + // TODO: is this still required? + AU.addRequiredID(LoopSimplifyID); + + // We no longer modify the IR at all in this pass.
Thus all + // analyses are preserved. + AU.setPreservesAll(); + } +}; +} + +static cl::opt<bool> NoEntry("spp-no-entry", cl::Hidden, cl::init(false)); +static cl::opt<bool> NoCall("spp-no-call", cl::Hidden, cl::init(false)); +static cl::opt<bool> NoBackedge("spp-no-backedge", cl::Hidden, cl::init(false)); + +namespace { +struct PlaceSafepoints : public ModulePass { + static char ID; // Pass identification, replacement for typeid + + PlaceSafepoints() : ModulePass(ID) { + initializePlaceSafepointsPass(*PassRegistry::getPassRegistry()); + } + bool runOnModule(Module &M) override { + bool modified = false; + for (Function &F : M) { + modified |= runOnFunction(F); + } + return modified; + } + bool runOnFunction(Function &F); + + void getAnalysisUsage(AnalysisUsage &AU) const override { + // We modify the graph wholesale (inlining, block insertion, etc). We + // preserve nothing at the moment. We could potentially preserve dom tree + // if that was worth doing. + } +}; +} + +// Insert a safepoint poll immediately before the given instruction. Does +// not handle the parseability of state at the runtime call; that's the +// caller's job. +static void +InsertSafepointPoll(DominatorTree &DT, Instruction *after, + std::vector<CallSite> &ParsePointsNeeded /*rval*/); + +static bool isGCLeafFunction(const CallSite &CS); + +static bool needsStatepoint(const CallSite &CS) { + if (isGCLeafFunction(CS)) + return false; + if (CS.isCall()) { + CallInst *call = cast<CallInst>(CS.getInstruction()); + if (call->isInlineAsm()) + return false; + } + if (isStatepoint(CS) || isGCRelocate(CS) || isGCResult(CS)) { + return false; + } + return true; +} + +static Value *ReplaceWithStatepoint(const CallSite &CS, Pass *P); + +/// Returns true if this loop is known to contain a call safepoint which +/// must unconditionally execute on any iteration of the loop which returns +/// to the loop header via an edge from Pred. Returns a conservatively correct +/// answer; i.e. false is always valid. +static bool containsUnconditionalCallSafepoint(Loop *L, BasicBlock *Header, + BasicBlock *Pred, + DominatorTree &DT) { + // In general, we're looking for any cut of the graph which ensures + // there's a call safepoint along every edge between Header and Pred. + // For the moment, we look only for the 'cuts' that consist of a single call + // instruction in a block which is dominated by the Header and dominates the + // loop latch (Pred) block. Somewhat surprisingly, walking the entire chain + // of such dominating blocks gets substantially more occurrences than just + // checking the Pred and Header blocks themselves. This may be due to the + // density of loop exit conditions caused by range and null checks. + // TODO: structure this as an analysis pass, cache the result for subloops, + // avoid dom tree recalculations + assert(DT.dominates(Header, Pred) && "loop latch not dominated by header?"); + + BasicBlock *Current = Pred; + while (true) { + for (Instruction &I : *Current) { + if (CallSite CS = &I) + // Note: Technically, needing a safepoint isn't quite the right + // condition here. We should instead be checking if the target method + // has an unconditional poll. In practice, this is only a theoretical + // concern since we don't have any methods with conditional-only + // safepoint polls.
+ if (needsStatepoint(CS)) + return true; + } + + if (Current == Header) + break; + Current = DT.getNode(Current)->getIDom()->getBlock(); + } + + return false; +} + +/// Returns true if this loop is known to terminate in a finite number of +/// iterations. Note that this function may return false for a loop which +/// does actual terminate in a finite constant number of iterations due to +/// conservatism in the analysis. +static bool mustBeFiniteCountedLoop(Loop *L, ScalarEvolution *SE, + BasicBlock *Pred) { + // Only used when SkipCounted is off + const unsigned upperTripBound = 8192; + + // A conservative bound on the loop as a whole. + const SCEV *MaxTrips = SE->getMaxBackedgeTakenCount(L); + if (MaxTrips != SE->getCouldNotCompute()) { + if (SE->getUnsignedRange(MaxTrips).getUnsignedMax().ult(upperTripBound)) + return true; + if (SkipCounted && + SE->getUnsignedRange(MaxTrips).getUnsignedMax().isIntN(32)) + return true; + } + + // If this is a conditional branch to the header with the alternate path + // being outside the loop, we can ask questions about the execution frequency + // of the exit block. + if (L->isLoopExiting(Pred)) { + // This returns an exact expression only. TODO: We really only need an + // upper bound here, but SE doesn't expose that. + const SCEV *MaxExec = SE->getExitCount(L, Pred); + if (MaxExec != SE->getCouldNotCompute()) { + if (SE->getUnsignedRange(MaxExec).getUnsignedMax().ult(upperTripBound)) + return true; + if (SkipCounted && + SE->getUnsignedRange(MaxExec).getUnsignedMax().isIntN(32)) + return true; + } + } + + return /* not finite */ false; +} + +static void scanOneBB(Instruction *start, Instruction *end, + std::vector<CallInst *> &calls, + std::set<BasicBlock *> &seen, + std::vector<BasicBlock *> &worklist) { + for (BasicBlock::iterator itr(start); + itr != start->getParent()->end() && itr != BasicBlock::iterator(end); + itr++) { + if (CallInst *CI = dyn_cast<CallInst>(&*itr)) { + calls.push_back(CI); + } + // FIXME: This code does not handle invokes + assert(!dyn_cast<InvokeInst>(&*itr) && + "support for invokes in poll code needed"); + // Only add the successor blocks if we reach the terminator instruction + // without encountering end first + if (itr->isTerminator()) { + BasicBlock *BB = itr->getParent(); + for (BasicBlock *Succ : successors(BB)) { + if (seen.count(Succ) == 0) { + worklist.push_back(Succ); + seen.insert(Succ); + } + } + } + } +} +static void scanInlinedCode(Instruction *start, Instruction *end, + std::vector<CallInst *> &calls, + std::set<BasicBlock *> &seen) { + calls.clear(); + std::vector<BasicBlock *> worklist; + seen.insert(start->getParent()); + scanOneBB(start, end, calls, seen, worklist); + while (!worklist.empty()) { + BasicBlock *BB = worklist.back(); + worklist.pop_back(); + scanOneBB(&*BB->begin(), end, calls, seen, worklist); + } +} + +bool PlaceBackedgeSafepointsImpl::runOnLoop(Loop *L, LPPassManager &LPM) { + ScalarEvolution *SE = &getAnalysis<ScalarEvolution>(); + + // Loop through all predecessors of the loop header and identify all + // backedges. We need to place a safepoint on every backedge (potentially). + // Note: Due to LoopSimplify there should only be one. Assert? Or can we + // relax this? + BasicBlock *header = L->getHeader(); + + // TODO: Use the analysis pass infrastructure for this. There is no reason + // to recalculate this here. 
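// A self-contained sketch (illustrative only) of the policy implemented by
// mustBeFiniteCountedLoop above: a backedge poll can be skipped when the
// trip count is provably below a fixed bound, or (with -spp-counted)
// provably fits in 32 bits. std::optional stands in for SCEV's
// "could-not-compute" sentinel.
#include <cstdint>
#include <cstdio>
#include <optional>

static bool canSkipBackedgePoll(std::optional<uint64_t> MaxTrips,
                                bool SkipCounted, uint64_t Bound = 8192) {
  if (!MaxTrips)
    return false;    // unknown trip count: must poll
  if (*MaxTrips < Bound)
    return true;     // provably short loop
  return SkipCounted && *MaxTrips <= UINT32_MAX; // 32-bit counted loop
}

int main() {
  std::printf("%d %d %d\n", canSkipBackedgePoll(100, false),
              canSkipBackedgePoll(std::nullopt, true),
              canSkipBackedgePoll(1u << 20, true)); // prints: 1 0 1
}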
+ DominatorTree DT; + DT.recalculate(*header->getParent()); + + bool modified = false; + for (BasicBlock *pred : predecessors(header)) { + if (!L->contains(pred)) { + // This is not a backedge, it's coming from outside the loop + continue; + } + + // Make a policy decision about whether this loop needs a safepoint or + // not. Note that this is about unburdening the optimizer in loops, not + // avoiding the runtime cost of the actual safepoint. + if (!AllBackedges) { + if (mustBeFiniteCountedLoop(L, SE, pred)) { + if (TraceLSP) + errs() << "skipping safepoint placement in finite loop\n"; + FiniteExecution++; + continue; + } + if (CallSafepointsEnabled && + containsUnconditionalCallSafepoint(L, header, pred, DT)) { + // Note: This is only semantically legal since we won't do any further + // IPO or inlining before the actual call insertion. If we did, we + // might later lose this call safepoint. + if (TraceLSP) + errs() << "skipping safepoint placement due to unconditional call\n"; + CallInLoop++; + continue; + } + } + + // TODO: We can create an inner loop which runs a finite number of + // iterations with an outer loop which contains a safepoint. This would + // not help runtime performance that much, but it might help our ability to + // optimize the inner loop. + + // We're unconditionally going to modify this loop. + modified = true; + + // Safepoint insertion would involve creating a new basic block (as the + // target of the current backedge) which does the safepoint (of all live + // variables) and branches to the true header. + TerminatorInst *term = pred->getTerminator(); + + if (TraceLSP) { + errs() << "[LSP] terminator instruction: "; + term->dump(); + } + + PollLocations.push_back(term); + } + + return modified; +} + +static Instruction *findLocationForEntrySafepoint(Function &F, + DominatorTree &DT) { + + // Conceptually, this poll needs to be on method entry, but in + // practice, we place it as late in the entry block as possible. We + // can place it as late as we want as long as it dominates all calls + // that can grow the stack. This, combined with backedge polls, + // gives us all the progress guarantees we need. + + // Due to the way the frontend generates IR, we may have a couple of initial + // basic blocks before the first bytecode. These will be single-entry + // single-exit blocks which conceptually are just part of the first 'real + // basic block'. Since we don't have deopt state until the first bytecode, + // walk forward until we've found the first unconditional branch or merge. + + // hasNextInstruction and nextInstruction are used to iterate + // through a "straight line" execution sequence. + + auto hasNextInstruction = [](Instruction *I) { + if (!I->isTerminator()) { + return true; + } + BasicBlock *nextBB = I->getParent()->getUniqueSuccessor(); + return nextBB && (nextBB->getUniquePredecessor() != nullptr); + }; + + auto nextInstruction = [&hasNextInstruction](Instruction *I) { + assert(hasNextInstruction(I) && + "first check if there is a next instruction!"); + if (I->isTerminator()) { + return I->getParent()->getUniqueSuccessor()->begin(); + } else { + return std::next(BasicBlock::iterator(I)); + } + }; + + Instruction *cursor = nullptr; + for (cursor = F.getEntryBlock().begin(); hasNextInstruction(cursor); + cursor = nextInstruction(cursor)) { + + // We need to stop going forward as soon as we see a call that can + // grow the stack (i.e. the call target has a non-zero frame + // size).
+ if (CallSite CS = cursor) { + (void)CS; // Silence an unused variable warning by gcc 4.8.2 + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(cursor)) { + // llvm.assume(...) are not really calls. + if (II->getIntrinsicID() == Intrinsic::assume) { + continue; + } + } + break; + } + } + + assert((hasNextInstruction(cursor) || cursor->isTerminator()) && + "either we stopped because of a call, or because of terminator"); + + if (cursor->isTerminator()) { + return cursor; + } + + BasicBlock *BB = cursor->getParent(); + SplitBlock(BB, cursor, nullptr); + + // Note: SplitBlock modifies the DT. Simply passing a Pass (which is a + // module pass) is not enough. + DT.recalculate(F); +#ifndef NDEBUG + // SplitBlock updates the DT + DT.verifyDomTree(); +#endif + + return BB->getTerminator(); +} + +/// Identify the list of call sites which need to have parseable state +static void findCallSafepoints(Function &F, + std::vector<CallSite> &Found /*rval*/) { + assert(Found.empty() && "must be empty!"); + for (Instruction &I : inst_range(F)) { + Instruction *inst = &I; + if (isa<CallInst>(inst) || isa<InvokeInst>(inst)) { + CallSite CS(inst); + + // No safepoint needed or wanted + if (!needsStatepoint(CS)) { + continue; + } + + Found.push_back(CS); + } + } +} + +/// Implement a unique function which doesn't require that we sort the input +/// vector. Sorting would change the output of a couple of tests in ways +/// which make them less useful in testing fused safepoints. +template <typename T> static void unique_unsorted(std::vector<T> &vec) { + std::set<T> seen; + std::vector<T> tmp; + vec.reserve(vec.size()); + std::swap(tmp, vec); + for (auto V : tmp) { + if (seen.insert(V).second) { + vec.push_back(V); + } + } +} + +static std::string GCSafepointPollName("gc.safepoint_poll"); + +static bool isGCSafepointPoll(Function &F) { + return F.getName().equals(GCSafepointPollName); +} + +/// Returns true if this function should be rewritten to include safepoint +/// polls and parseable call sites. The main point of this function is to be +/// an extension point for custom logic. +static bool shouldRewriteFunction(Function &F) { + // TODO: This should check the GCStrategy + if (F.hasGC()) { + const std::string StatepointExampleName("statepoint-example"); + return StatepointExampleName == F.getGC(); + } else + return false; +} + +// TODO: These should become properties of the GCStrategy, possibly with +// command line overrides. +static bool enableEntrySafepoints(Function &F) { return !NoEntry; } +static bool enableBackedgeSafepoints(Function &F) { return !NoBackedge; } +static bool enableCallSafepoints(Function &F) { return !NoCall; } + + +bool PlaceSafepoints::runOnFunction(Function &F) { + if (F.isDeclaration() || F.empty()) { + // This is a declaration, nothing to do. Must exit early to avoid crash in + // dom tree calculation. + return false; + } + + if (isGCSafepointPoll(F)) { + // Given we're inlining this inside of safepoint poll insertion, this + // doesn't make any sense. Note that we do make any contained calls + // parseable after we inline a poll. + return false; + } + + if (!shouldRewriteFunction(F)) + return false; + + bool modified = false; + + // In various bits below, we rely on the fact that uses are reachable from + // defs. When there are basic blocks unreachable from the entry, dominance + // and reachability queries return nonsensical results. Thus, we preprocess + // the function to ensure these properties hold.
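// A sketch (not part of the patch) of what removeUnreachableBlocks(F),
// called just below, boils down to: graph reachability from the entry
// block, followed by deletion of anything never visited. Toy version over
// an adjacency list:
#include <cstdio>
#include <vector>

static std::vector<bool> reachableFromEntry(
    const std::vector<std::vector<int>> &Succs, int Entry) {
  std::vector<bool> Seen(Succs.size(), false);
  std::vector<int> Worklist{Entry};
  Seen[Entry] = true;
  while (!Worklist.empty()) {
    int BB = Worklist.back();
    Worklist.pop_back();
    for (int S : Succs[BB])
      if (!Seen[S]) {
        Seen[S] = true;
        Worklist.push_back(S);
      }
  }
  return Seen; // blocks with Seen[i] == false would be erased
}

int main() {
  // Block 0 branches to 1; block 2 is unreachable and would be dropped.
  std::vector<std::vector<int>> Succs{{1}, {}, {1}};
  auto Seen = reachableFromEntry(Succs, 0);
  std::printf("%d %d %d\n", (int)Seen[0], (int)Seen[1], (int)Seen[2]); // 1 1 0
}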
+ modified |= removeUnreachableBlocks(F); + + // STEP 1 - Insert the safepoint polling locations. We do not need to + // actually insert parse points yet. That will be done for all polls and + // calls in a single pass. + + // Note: With the migration, we need to recompute this for each 'pass'. Once + // we merge these, we'll do it once before the analysis. + DominatorTree DT; + + std::vector<CallSite> ParsePointNeeded; + + if (enableBackedgeSafepoints(F)) { + // Construct a pass manager to run the LoopPass backedge logic. We + // need the pass manager to handle scheduling all the loop passes + // appropriately. Doing this by hand is painful and just not worth messing + // with for the moment. + legacy::FunctionPassManager FPM(F.getParent()); + bool CanAssumeCallSafepoints = enableCallSafepoints(F); + PlaceBackedgeSafepointsImpl *PBS = + new PlaceBackedgeSafepointsImpl(CanAssumeCallSafepoints); + FPM.add(PBS); + // Note: While the analysis pass itself won't modify the IR, LoopSimplify + // (which it depends on) may, i.e. the analysis must be recalculated after + // it runs. + FPM.run(F); + + // We preserve dominance information when inserting the poll, otherwise + // we'd have to recalculate this on every insert. + DT.recalculate(F); + + // Insert a poll at each point the analysis pass identified. + for (size_t i = 0; i < PBS->PollLocations.size(); i++) { + // We are inserting a poll, the function is modified. + modified = true; + + // The poll location must be the terminator of a loop latch block. + TerminatorInst *Term = PBS->PollLocations[i]; + + std::vector<CallSite> ParsePoints; + if (SplitBackedge) { + // Split the backedge of the loop and insert the poll within that new + // basic block. This creates a loop with two latches per original + // latch (which is non-ideal), but this appears to be easier to + // optimize in practice than inserting the poll immediately before the + // latch test. + + // Since this is a latch, at least one of the successors must dominate + // it. It's possible that we have a) duplicate edges to the same header + // and b) edges to distinct loop headers. We need to insert polls on + // each. (Note: This still relies on LoopSimplify.) + DenseSet<BasicBlock *> Headers; + for (unsigned i = 0; i < Term->getNumSuccessors(); i++) { + BasicBlock *Succ = Term->getSuccessor(i); + if (DT.dominates(Succ, Term->getParent())) { + Headers.insert(Succ); + } + } + assert(!Headers.empty() && "poll location is not a loop latch?"); + + // The split loop structure here is so that we only need to recalculate + // the dominator tree once. Alternatively, we could just keep it up to + // date and use a more natural merged loop. + DenseSet<BasicBlock *> SplitBackedges; + for (BasicBlock *Header : Headers) { + BasicBlock *NewBB = SplitEdge(Term->getParent(), Header, nullptr); + SplitBackedges.insert(NewBB); + } + DT.recalculate(F); + for (BasicBlock *NewBB : SplitBackedges) { + InsertSafepointPoll(DT, NewBB->getTerminator(), ParsePoints); + NumBackedgeSafepoints++; + } + + } else { + // Split the latch block itself, right before the terminator. + InsertSafepointPoll(DT, Term, ParsePoints); + NumBackedgeSafepoints++; + } + + // Record the parse points for later use. + ParsePointNeeded.insert(ParsePointNeeded.end(), ParsePoints.begin(), + ParsePoints.end()); + } + } + + if (enableEntrySafepoints(F)) { + DT.recalculate(F); + Instruction *term = findLocationForEntrySafepoint(F, DT); + if (!term) { + // policy choice not to insert?
+ } else { + std::vector<CallSite> RuntimeCalls; + InsertSafepointPoll(DT, term, RuntimeCalls); + modified = true; + NumEntrySafepoints++; + ParsePointNeeded.insert(ParsePointNeeded.end(), RuntimeCalls.begin(), + RuntimeCalls.end()); + } + } + + if (enableCallSafepoints(F)) { + DT.recalculate(F); + std::vector<CallSite> Calls; + findCallSafepoints(F, Calls); + NumCallSafepoints += Calls.size(); + ParsePointNeeded.insert(ParsePointNeeded.end(), Calls.begin(), Calls.end()); + } + + // Unique the vectors since we can end up with duplicates if we scan the call + // site for call safepoints after we add it for entry or backedge. The + // only reason we need tracking at all is that some functions might have + // polls but not call safepoints and thus we might miss marking the runtime + // calls for the polls. (This is useful in test cases!) + unique_unsorted(ParsePointNeeded); + + // Any parse point (no matter what source) will be handled here. + DT.recalculate(F); // Needed? + + // We're about to start modifying the function. + if (!ParsePointNeeded.empty()) + modified = true; + + // Now run through and insert the safepoints, but do _NOT_ update or remove + // any existing uses. We have references to live variables that need to + // survive to the last iteration of this loop. + std::vector<Value *> Results; + Results.reserve(ParsePointNeeded.size()); + for (size_t i = 0; i < ParsePointNeeded.size(); i++) { + CallSite &CS = ParsePointNeeded[i]; + Value *GCResult = ReplaceWithStatepoint(CS, nullptr); + Results.push_back(GCResult); + } + assert(Results.size() == ParsePointNeeded.size()); + + // Adjust all users of the old call sites to use the new ones instead. + for (size_t i = 0; i < ParsePointNeeded.size(); i++) { + CallSite &CS = ParsePointNeeded[i]; + Value *GCResult = Results[i]; + if (GCResult) { + // If we inserted the result in a different basic block than the + // original safepoint (this can happen for invokes), we need to be sure + // that the original result value was not used in any of the phi nodes + // at the beginning of the basic block with the gc result. Because we + // know that all such blocks will have a single predecessor, we can + // safely assume that all phi nodes have a single entry (because of + // normalizeBBForInvokeSafepoint). Just remove them all here. + if (CS.isInvoke()) { + FoldSingleEntryPHINodes(cast<Instruction>(GCResult)->getParent(), + nullptr); + assert( + !isa<PHINode>(cast<Instruction>(GCResult)->getParent()->begin())); + } + + // Replace all uses with the new call. + CS.getInstruction()->replaceAllUsesWith(GCResult); + } + + // Now that we've handled all uses, remove the original call itself. + // Note: The insert point can't be the deleted instruction!
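// A quick, self-contained demonstration of the unique_unsorted helper
// defined earlier in this file: unlike std::sort + std::unique, it keeps
// the first occurrence of each element in its original position, which is
// what keeps the test output stable. Lightly restructured, same behavior.
#include <cstdio>
#include <set>
#include <vector>

template <typename T> static void uniqueUnsorted(std::vector<T> &vec) {
  std::set<T> seen;
  std::vector<T> tmp;
  tmp.swap(vec);
  vec.reserve(tmp.size());
  for (const T &V : tmp)
    if (seen.insert(V).second) // first time we see V: keep it
      vec.push_back(V);
}

int main() {
  std::vector<int> v{3, 1, 3, 2, 1};
  uniqueUnsorted(v);
  for (int x : v)
    std::printf("%d ", x); // prints: 3 1 2
}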
+ CS.getInstruction()->eraseFromParent(); + } + return modified; +} + +char PlaceBackedgeSafepointsImpl::ID = 0; +char PlaceSafepoints::ID = 0; + +ModulePass *llvm::createPlaceSafepointsPass() { return new PlaceSafepoints(); } + +INITIALIZE_PASS_BEGIN(PlaceBackedgeSafepointsImpl, + "place-backedge-safepoints-impl", + "Place Backedge Safepoints", false, false) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_END(PlaceBackedgeSafepointsImpl, + "place-backedge-safepoints-impl", + "Place Backedge Safepoints", false, false) + +INITIALIZE_PASS_BEGIN(PlaceSafepoints, "place-safepoints", "Place Safepoints", + false, false) +INITIALIZE_PASS_END(PlaceSafepoints, "place-safepoints", "Place Safepoints", + false, false) + +static bool isGCLeafFunction(const CallSite &CS) { + Instruction *inst = CS.getInstruction(); + if (isa<IntrinsicInst>(inst)) { + // Most LLVM intrinsics are things which can never take a safepoint. + // As a result, we don't need to have the stack parsable at the + // callsite. This is a highly useful optimization since intrinsic + // calls are fairly prevelent, particularly in debug builds. + return true; + } + + // If this function is marked explicitly as a leaf call, we don't need to + // place a safepoint of it. In fact, for correctness we *can't* in many + // cases. Note: Indirect calls return Null for the called function, + // these obviously aren't runtime functions with attributes + // TODO: Support attributes on the call site as well. + const Function *F = CS.getCalledFunction(); + bool isLeaf = + F && + F->getFnAttribute("gc-leaf-function").getValueAsString().equals("true"); + if (isLeaf) { + return true; + } + return false; +} + +static void +InsertSafepointPoll(DominatorTree &DT, Instruction *term, + std::vector<CallSite> &ParsePointsNeeded /*rval*/) { + Module *M = term->getParent()->getParent()->getParent(); + assert(M); + + // Inline the safepoint poll implementation - this will get all the branch, + // control flow, etc.. Most importantly, it will introduce the actual slow + // path call - where we need to insert a safepoint (parsepoint). 
+ FunctionType *ftype = + FunctionType::get(Type::getVoidTy(M->getContext()), false); + assert(ftype && "null?"); + // Note: This cast can fail if there's a function of the same name with a + // different type inserted previously. + Function *F = + dyn_cast<Function>(M->getOrInsertFunction("gc.safepoint_poll", ftype)); + assert(F && "void @gc.safepoint_poll() must be defined"); + assert(!F->empty() && "gc.safepoint_poll must be a non-empty function"); + CallInst *poll = CallInst::Create(F, "", term); + + // Record some information about the call site we're replacing. + BasicBlock *OrigBB = term->getParent(); + BasicBlock::iterator before(poll), after(poll); + bool isBegin(false); + if (before == term->getParent()->begin()) { + isBegin = true; + } else { + before--; + } + after++; + assert(after != poll->getParent()->end() && "must have successor"); + assert(DT.dominates(before, after) && "trivially true"); + + // do the actual inlining + InlineFunctionInfo IFI; + bool inlineStatus = InlineFunction(poll, IFI); + assert(inlineStatus && "inline must succeed"); + (void)inlineStatus; // suppress warning in release-asserts + + // Check post conditions + assert(IFI.StaticAllocas.empty() && "can't have allocs"); + + std::vector<CallInst *> calls; // new calls + std::set<BasicBlock *> BBs; // new BBs + insertee + // Include only the newly inserted instructions. Note: begin may not be valid + // if we inserted to the beginning of the basic block. + BasicBlock::iterator start; + if (isBegin) { + start = OrigBB->begin(); + } else { + start = before; + start++; + } + + // If your poll function includes an unreachable at the end, that's not + // valid. Bugpoint likes to create this, so check for it. + assert(isPotentiallyReachable(&*start, &*after, nullptr, nullptr) && + "malformed poll function"); + + scanInlinedCode(&*(start), &*(after), calls, BBs); + + // Recompute since we've invalidated cached data. Conceptually we + // shouldn't need to do this, but implementation-wise we appear to. Needed + // so we can insert safepoints correctly. + // TODO: update more cheaply + DT.recalculate(*after->getParent()->getParent()); + + assert(!calls.empty() && "slow path not found for safepoint poll"); + + // Record the fact we need a parseable state at the runtime call contained in + // the poll function. This is required so that the runtime knows how to + // parse the last frame when we actually take the safepoint (i.e. execute + // the slow path). + assert(ParsePointsNeeded.empty()); + for (size_t i = 0; i < calls.size(); i++) { + + // No safepoint needed or wanted + if (!needsStatepoint(calls[i])) { + continue; + } + + // These are likely runtime calls. Should we assert that via calling + // convention or something? + ParsePointsNeeded.push_back(CallSite(calls[i])); + } + assert(ParsePointsNeeded.size() <= calls.size()); +} + +// Normalize the basic block to make it ready to be the target of an invoke +// statepoint. This means splitting it so that it has a single predecessor. +// Returns the newly created BB, ready to be the successor of the invoke +// statepoint. +static BasicBlock *normalizeBBForInvokeSafepoint(BasicBlock *BB, + BasicBlock *InvokeParent) { + BasicBlock *ret = BB; + + if (!BB->getUniquePredecessor()) { + ret = SplitBlockPredecessors(BB, InvokeParent, ""); + } + + // Another requirement for such basic blocks is to not have any phi nodes. + // Since we just ensured that the new BB will have a single predecessor, + // all phi nodes in it will have one value. Here it would be the natural + // place to remove them all.
But we cannot do this because we would risk removing + // one of the values stored in the live set of another statepoint. We will do + // it later, after placing all safepoints. + + return ret; +} + +/// Replaces the given call site (Call or Invoke) with a gc.statepoint +/// intrinsic with an empty deoptimization arguments list. This does +/// NOT do explicit relocation for GC support. +static Value *ReplaceWithStatepoint(const CallSite &CS, /* to replace */ + Pass *P) { + BasicBlock *BB = CS.getInstruction()->getParent(); + Function *F = BB->getParent(); + Module *M = F->getParent(); + assert(M && "must be set"); + + // TODO: technically, a pass is not allowed to get functions from within a + // function pass since it might trigger a new function addition. Refactor + // this logic out to the initialization of the pass. Doesn't appear to + // matter in practice. + + // Then go ahead and use the builder to actually do the inserts. We insert + // immediately before the previous instruction under the assumption that all + // arguments will be available here. We can't insert afterwards since we may + // be replacing a terminator. + Instruction *insertBefore = CS.getInstruction(); + IRBuilder<> Builder(insertBefore); + + // Note: The gc args are not filled in at this time, that's handled by + // RewriteStatepointsForGC (which is currently under review). + + // Create the statepoint given all the arguments. + Instruction *token = nullptr; + AttributeSet return_attributes; + if (CS.isCall()) { + CallInst *toReplace = cast<CallInst>(CS.getInstruction()); + CallInst *Call = Builder.CreateGCStatepoint( + CS.getCalledValue(), makeArrayRef(CS.arg_begin(), CS.arg_end()), None, + None, "safepoint_token"); + Call->setTailCall(toReplace->isTailCall()); + Call->setCallingConv(toReplace->getCallingConv()); + + // Before we have to worry about GC semantics, all attributes are legal. + AttributeSet new_attrs = toReplace->getAttributes(); + // In case we can handle this set of attributes, set up function attrs + // directly on the statepoint and return attrs later for the gc_result + // intrinsic. + Call->setAttributes(new_attrs.getFnAttributes()); + return_attributes = new_attrs.getRetAttributes(); + // TODO: handle param attributes + + token = Call; + + // Put the following gc_result and gc_relocate calls immediately after + // the old call (which we're about to delete). + BasicBlock::iterator next(toReplace); + assert(BB->end() != next && "not a terminator, must have next"); + next++; + Instruction *IP = &*(next); + Builder.SetInsertPoint(IP); + Builder.SetCurrentDebugLocation(IP->getDebugLoc()); + + } else if (CS.isInvoke()) { + // TODO: make CreateGCStatepoint return an Instruction that we can cast to a + // Call or Invoke, instead of doing this junk here. + + // Fill in the one generic typed argument (the function is also + // vararg). + std::vector<Type *> argTypes; + argTypes.push_back(CS.getCalledValue()->getType()); + + Function *gc_statepoint_decl = Intrinsic::getDeclaration( + M, Intrinsic::experimental_gc_statepoint, argTypes); + + // First, create the statepoint (with all live ptrs as arguments). + std::vector<llvm::Value *> args; + // target, #call args, unused, ... call parameters, #deopt args, ... deopt + // parameters, ...
gc parameters + Value *Target = CS.getCalledValue(); + args.push_back(Target); + int callArgSize = CS.arg_size(); + // #call args + args.push_back(Builder.getInt32(callArgSize)); + // unused + args.push_back(Builder.getInt32(0)); + // call parameters + args.insert(args.end(), CS.arg_begin(), CS.arg_end()); + // #deopt args: 0 + args.push_back(Builder.getInt32(0)); + + InvokeInst *toReplace = cast<InvokeInst>(CS.getInstruction()); + + // Insert the new invoke into the old block. We'll remove the old one in a + // moment at which point this will become the new terminator for the + // original block. + InvokeInst *invoke = InvokeInst::Create( + gc_statepoint_decl, toReplace->getNormalDest(), + toReplace->getUnwindDest(), args, "", toReplace->getParent()); + invoke->setCallingConv(toReplace->getCallingConv()); + + // Currently we will fail on parameter attributes and on certain + // function attributes. + AttributeSet new_attrs = toReplace->getAttributes(); + // In case we can handle this set of attributes, set up function attrs + // directly on the statepoint and return attrs later for the gc_result + // intrinsic. + invoke->setAttributes(new_attrs.getFnAttributes()); + return_attributes = new_attrs.getRetAttributes(); + + token = invoke; + + // We'll insert the gc.result into the normal block. + BasicBlock *normalDest = normalizeBBForInvokeSafepoint( + toReplace->getNormalDest(), invoke->getParent()); + Instruction *IP = &*(normalDest->getFirstInsertionPt()); + Builder.SetInsertPoint(IP); + } else { + llvm_unreachable("unexpected type of CallSite"); + } + assert(token); + + // Handle the return value of the original call - update all uses to use a + // gc_result hanging off the statepoint node we just inserted. + + // Only add the gc_result iff there is actually a used result. + if (!CS.getType()->isVoidTy() && !CS.getInstruction()->use_empty()) { + std::string takenName = + CS.getInstruction()->hasName() ? CS.getInstruction()->getName() : ""; + CallInst *gc_result = + Builder.CreateGCResult(token, CS.getType(), takenName); + gc_result->setAttributes(return_attributes); + return gc_result; + } else { + // No return value for the call. + return nullptr; + } +} diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp index 1bbaaf3..98016b4 100644 --- a/lib/Transforms/Scalar/Reassociate.cpp +++ b/lib/Transforms/Scalar/Reassociate.cpp @@ -917,10 +917,13 @@ void Reassociate::RewriteExprTree(BinaryOperator *I, /// version of the value is returned, and BI is left pointing at the instruction /// that should be processed next by the reassociation pass. static Value *NegateValue(Value *V, Instruction *BI) { - if (ConstantFP *C = dyn_cast<ConstantFP>(V)) - return ConstantExpr::getFNeg(C); - if (Constant *C = dyn_cast<Constant>(V)) + if (Constant *C = dyn_cast<Constant>(V)) { + if (C->getType()->isFPOrFPVectorTy()) { + return ConstantExpr::getFNeg(C); + } return ConstantExpr::getNeg(C); + } + // We are trying to expose opportunity for reassociation. One of the things // that we want to do to achieve this is to push a negation as deep into an @@ -1512,7 +1515,7 @@ Value *Reassociate::OptimizeAdd(Instruction *I, ++NumFound; } while (i != Ops.size() && Ops[i].Op == TheOp); - DEBUG(errs() << "\nFACTORING [" << NumFound << "]: " << *TheOp << '\n'); + DEBUG(dbgs() << "\nFACTORING [" << NumFound << "]: " << *TheOp << '\n'); ++NumFactor; // Insert a new multiply.
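// A toy model (illustrative only, not LLVM code) of the dispatch introduced
// by the NegateValue hunk above: floating-point (and FP vector) constants
// take the fneg path, everything else takes integer negation.
#include <cstdio>
#include <variant>

using Const = std::variant<long, double>; // integer-like or FP-like constant

static Const negateConstant(const Const &C) {
  if (std::holds_alternative<double>(C))
    return -std::get<double>(C); // ConstantExpr::getFNeg path
  return -std::get<long>(C);     // ConstantExpr::getNeg path
}

int main() {
  std::printf("%g %ld\n", std::get<double>(negateConstant(2.5)),
              std::get<long>(negateConstant(7L))); // prints: -2.5 -7
}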
@@ -1650,7 +1653,7 @@ Value *Reassociate::OptimizeAdd(Instruction *I, // If any factor occurred more than one time, we can pull it out. if (MaxOcc > 1) { - DEBUG(errs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal << '\n'); + DEBUG(dbgs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal << '\n'); ++NumFactor; // Create a new instruction that uses the MaxOccVal twice. If we don't do @@ -1988,7 +1991,7 @@ Instruction *Reassociate::canonicalizeNegConstExpr(Instruction *I) { Constant *C = C0 ? C0 : C1; unsigned ConstIdx = C0 ? 0 : 1; if (auto *CI = dyn_cast<ConstantInt>(C)) { - if (!CI->isNegative()) + if (!CI->isNegative() || CI->isMinValue(true)) return nullptr; } else if (auto *CF = dyn_cast<ConstantFP>(C)) { if (!CF->isNegative()) diff --git a/lib/Transforms/Scalar/Reg2Mem.cpp b/lib/Transforms/Scalar/Reg2Mem.cpp index b6023e2..1b46727 100644 --- a/lib/Transforms/Scalar/Reg2Mem.cpp +++ b/lib/Transforms/Scalar/Reg2Mem.cpp @@ -73,7 +73,7 @@ bool RegToMem::runOnFunction(Function &F) { // Insert all new allocas into entry block. BasicBlock *BBEntry = &F.getEntryBlock(); - assert(pred_begin(BBEntry) == pred_end(BBEntry) && + assert(pred_empty(BBEntry) && "Entry block to function must not have predecessors!"); // Find first non-alloca instruction and create insertion point. This is diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp new file mode 100644 index 0000000..ca9ab54 --- /dev/null +++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -0,0 +1,1897 @@ +//===- RewriteStatepointsForGC.cpp - Make GC relocations explicit ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Rewrite an existing set of gc.statepoints such that they make potential +// relocations performed by the garbage collector explicit in the IR. 
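+//
+// As an illustrative sketch (assumed shape, not text from this patch): a
+// statepoint
+//
+//   %tok = call i32 (...)* @llvm.experimental.gc.statepoint(...)
+//
+// with a gc pointer %obj live across it gets companion calls of the form
+//
+//   %obj.relocated = call coldcc i8 addrspace(1)*
+//       @llvm.experimental.gc.relocate.p1i8(i32 %tok, i32 %base, i32 %derived)
+//
+// so the collector's potential update of %obj becomes visible in the IR.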
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Pass.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+
+#define DEBUG_TYPE "rewrite-statepoints-for-gc"
+
+using namespace llvm;
+
+// Print tracing output
+static cl::opt<bool> TraceLSP("trace-rewrite-statepoints", cl::Hidden,
+                              cl::init(false));
+
+// Print the liveset found at the insert location
+static cl::opt<bool> PrintLiveSet("spp-print-liveset", cl::Hidden,
+                                  cl::init(false));
+static cl::opt<bool> PrintLiveSetSize("spp-print-liveset-size", cl::Hidden,
+                                      cl::init(false));
+// Print out the base pointers for debugging
+static cl::opt<bool> PrintBasePointers("spp-print-base-pointers", cl::Hidden,
+                                       cl::init(false));
+
+namespace {
+struct RewriteStatepointsForGC : public FunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+
+  RewriteStatepointsForGC() : FunctionPass(ID) {
+    initializeRewriteStatepointsForGCPass(*PassRegistry::getPassRegistry());
+  }
+  bool runOnFunction(Function &F) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    // We add and rewrite a bunch of instructions, but don't really do much
+    // else. We could in theory preserve a lot more analyses here.
+    AU.addRequired<DominatorTreeWrapperPass>();
+  }
+};
+} // namespace
+
+char RewriteStatepointsForGC::ID = 0;
+
+FunctionPass *llvm::createRewriteStatepointsForGCPass() {
+  return new RewriteStatepointsForGC();
+}
+
+INITIALIZE_PASS_BEGIN(RewriteStatepointsForGC, "rewrite-statepoints-for-gc",
+                      "Make relocations explicit at statepoints", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(RewriteStatepointsForGC, "rewrite-statepoints-for-gc",
+                    "Make relocations explicit at statepoints", false, false)
+
+namespace {
+// The type of the internal cache used inside the findBasePointers family
+// of functions. From the caller's perspective, this is an opaque type and
+// should not be inspected.
+//
+// In the actual implementation this caches two relations:
+// - The base relation itself (i.e. this pointer is based on that one)
+// - The base defining value relation (i.e. before base_phi insertion)
+// Generally, after the execution of a full findBasePointer call, only the
+// base relation will remain. Internally, we add entries of both types, then
+// later update all entries of the second type to the first type.
+typedef DenseMap<Value *, Value *> DefiningValueMapTy;
+typedef DenseSet<llvm::Value *> StatepointLiveSetTy;
+
+struct PartiallyConstructedSafepointRecord {
+  /// The set of values known to be live across this safepoint
+  StatepointLiveSetTy liveset;
+
+  /// Mapping from live pointers to a base-defining-value
+  DenseMap<llvm::Value *, llvm::Value *> PointerToBase;
+
+  /// Any new values which were added to the IR during base pointer analysis
+  /// for this safepoint
+  DenseSet<llvm::Value *> NewInsertedDefs;
+
+  /// The *new* gc.statepoint instruction itself. This produces the token
+  /// that normal path gc.relocates and the gc.result are tied to.
+  Instruction *StatepointToken;
+
+  /// Instruction to which exceptional gc relocates are attached.
+  /// Makes it easier to iterate through them during relocationViaAlloca.
+  Instruction *UnwindToken;
+};
+}
+
+// TODO: Once we can get to the GCStrategy, this becomes
+// Optional<bool> isGCManagedPointer(const Value *V) const override {
+
+static bool isGCPointerType(const Type *T) {
+  if (const PointerType *PT = dyn_cast<PointerType>(T))
+    // For the sake of this example GC, we arbitrarily pick addrspace(1) as our
+    // GC managed heap. We know that a pointer into this heap needs to be
+    // updated and that no other pointer does.
+    return (1 == PT->getAddressSpace());
+  return false;
+}
+
+/// Return true if the Value is a gc reference type which is potentially used
+/// after the instruction 'loc'. This is only used with the edge reachability
+/// liveness code. Note: It is assumed that V dominates loc.
+static bool isLiveGCReferenceAt(Value &V, Instruction *loc, DominatorTree &DT,
+                                LoopInfo *LI) {
+  if (!isGCPointerType(V.getType()))
+    return false;
+
+  if (V.use_empty())
+    return false;
+
+  // Given the assumption that V dominates loc, this may be live
+  return true;
+}
+
+#ifndef NDEBUG
+static bool isAggWhichContainsGCPtrType(Type *Ty) {
+  if (VectorType *VT = dyn_cast<VectorType>(Ty))
+    return isGCPointerType(VT->getScalarType());
+  if (ArrayType *AT = dyn_cast<ArrayType>(Ty))
+    return isGCPointerType(AT->getElementType()) ||
+           isAggWhichContainsGCPtrType(AT->getElementType());
+  if (StructType *ST = dyn_cast<StructType>(Ty))
+    return std::any_of(ST->subtypes().begin(), ST->subtypes().end(),
+                       [](Type *SubType) {
+                         return isGCPointerType(SubType) ||
+                                isAggWhichContainsGCPtrType(SubType);
+                       });
+  return false;
+}
+#endif
+
+// Conservatively identifies any definitions which might be live at the
+// given instruction. The analysis is performed immediately before the
+// given instruction. Values defined by that instruction are not considered
+// live. Values used by that instruction are considered live.
+//
+// preconditions: valid IR graph, term is either a terminator instruction or
+// a call instruction, pred is the basic block of term, DT, LI are valid
+//
+// side effects: none, does not mutate IR
+//
+// postconditions: populates liveValues as discussed above
+static void findLiveGCValuesAtInst(Instruction *term, BasicBlock *pred,
+                                   DominatorTree &DT, LoopInfo *LI,
+                                   StatepointLiveSetTy &liveValues) {
+  liveValues.clear();
+
+  assert(isa<CallInst>(term) || isa<InvokeInst>(term) || term->isTerminator());
+
+  Function *F = pred->getParent();
+
+  auto is_live_gc_reference =
+      [&](Value &V) { return isLiveGCReferenceAt(V, term, DT, LI); };
+
+  // Are there any gc pointer arguments live over this point? This needs to
+  // be special cased since arguments aren't defined in basic blocks.
+  for (Argument &arg : F->args()) {
+    assert(!isAggWhichContainsGCPtrType(arg.getType()) &&
+           "support for FCA unimplemented");
+
+    if (is_live_gc_reference(arg)) {
+      liveValues.insert(&arg);
+    }
+  }
+
+  // Walk through all dominating blocks - the ones which can contain
+  // definitions used in this block - and check to see if any of the values
+  // they define are used in locations potentially reachable from the
+  // interesting instruction.
+  BasicBlock *BBI = pred;
+  while (true) {
+    if (TraceLSP) {
+      errs() << "[LSP] Looking at dominating block " << BBI->getName() << "\n";
+    }
+    assert(DT.dominates(BBI, pred));
+    assert(isPotentiallyReachable(BBI, pred, &DT) &&
+           "dominated block must be reachable");
+
+    // Walk through the instructions in dominating blocks and keep any
+    // that have a use potentially reachable from the block we're
+    // considering putting the safepoint in
+    for (Instruction &inst : *BBI) {
+      if (TraceLSP) {
+        errs() << "[LSP] Looking at instruction ";
+        inst.dump();
+      }
+
+      if (pred == BBI && (&inst) == term) {
+        if (TraceLSP) {
+          errs() << "[LSP] stopped because we encountered the safepoint "
+                    "instruction.\n";
+        }
+
+        // If we're in the block which defines the interesting instruction,
+        // we don't want to include any values as live which are defined
+        // _after_ the interesting line or as part of the line itself,
+        // i.e. if "term" is the call instruction for a call safepoint, the
+        // results of the call should not be considered live in that stackmap
+        break;
+      }
+
+      assert(!isAggWhichContainsGCPtrType(inst.getType()) &&
+             "support for FCA unimplemented");
+
+      if (is_live_gc_reference(inst)) {
+        if (TraceLSP) {
+          errs() << "[LSP] found live value for this safepoint ";
+          inst.dump();
+          term->dump();
+        }
+        liveValues.insert(&inst);
+      }
+    }
+    if (!DT.getNode(BBI)->getIDom()) {
+      assert(BBI == &F->getEntryBlock() &&
+             "failed to find a dominator for something other than "
+             "the entry block");
+      break;
+    }
+    BBI = DT.getNode(BBI)->getIDom()->getBlock();
+  }
+}
+
+static bool order_by_name(llvm::Value *a, llvm::Value *b) {
+  if (a->hasName() && b->hasName()) {
+    return -1 == a->getName().compare(b->getName());
+  } else if (a->hasName() && !b->hasName()) {
+    return true;
+  } else if (!a->hasName() && b->hasName()) {
+    return false;
+  } else {
+    // Better than nothing, but not stable
+    return a < b;
+  }
+}
+
+/// Find the initial live set. Note that due to base pointer
+/// insertion, the live set may be incomplete.
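+///
+/// An illustrative sketch (an assumed scenario, not from the original
+/// patch): given
+///   %p = gep %obj, ...   ; gc pointer in addrspace(1)
+///   call void @foo()     ; the parse point
+///   %v = load %p         ; a use dominated by the definition of %p
+/// %p dominates the call and has a use, so it is conservatively placed in
+/// the live set, even when that use turns out not to be reachable from the
+/// call itself.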
+static void
+analyzeParsePointLiveness(DominatorTree &DT, const CallSite &CS,
+                          PartiallyConstructedSafepointRecord &result) {
+  Instruction *inst = CS.getInstruction();
+
+  BasicBlock *BB = inst->getParent();
+  StatepointLiveSetTy liveset;
+  findLiveGCValuesAtInst(inst, BB, DT, nullptr, liveset);
+
+  if (PrintLiveSet) {
+    // Note: This output is used by several of the test cases.
+    // The order of elements in a set is not stable; put them in a vec and
+    // sort by name.
+    SmallVector<Value *, 64> temp;
+    temp.insert(temp.end(), liveset.begin(), liveset.end());
+    std::sort(temp.begin(), temp.end(), order_by_name);
+    errs() << "Live Variables:\n";
+    for (Value *V : temp) {
+      errs() << " " << V->getName(); // no newline
+      V->dump();
+    }
+  }
+  if (PrintLiveSetSize) {
+    errs() << "Safepoint For: " << CS.getCalledValue()->getName() << "\n";
+    errs() << "Number live values: " << liveset.size() << "\n";
+  }
+  result.liveset = liveset;
+}
+
+/// True iff this value is the null pointer constant (of any pointer type)
+static bool LLVM_ATTRIBUTE_UNUSED isNullConstant(Value *V) {
+  return isa<Constant>(V) && isa<PointerType>(V->getType()) &&
+         cast<Constant>(V)->isNullValue();
+}
+
+/// Helper function for findBasePointer - Will return a value which either a)
+/// defines the base pointer for the input or b) blocks the simple search
+/// (i.e. a PHI or Select of two derived pointers)
+static Value *findBaseDefiningValue(Value *I) {
+  assert(I->getType()->isPointerTy() &&
+         "Illegal to ask for the base pointer of a non-pointer type");
+
+  // There are instructions which can never return gc pointer values. Sanity
+  // check that this is actually true.
+  assert(!isa<InsertElementInst>(I) && !isa<ExtractElementInst>(I) &&
+         !isa<ShuffleVectorInst>(I) && "Vector types are not gc pointers");
+  assert((!isa<Instruction>(I) || isa<InvokeInst>(I) ||
+          !cast<Instruction>(I)->isTerminator()) &&
+         "With the exception of invoke, terminators don't define values");
+  assert(!isa<StoreInst>(I) && !isa<FenceInst>(I) &&
+         "Can't be definitions to start with");
+  assert(!isa<ICmpInst>(I) && !isa<FCmpInst>(I) &&
+         "Comparisons don't give ops");
+  // There's a bunch of instructions which just don't make sense to apply to
+  // a pointer. The only valid reason for this would be pointer bit
+  // twiddling, which we're just not going to support.
+  assert((!isa<Instruction>(I) || !cast<Instruction>(I)->isBinaryOp()) &&
+         "Binary ops on pointer values are meaningless. Unless you're "
+         "bit-twiddling, which we don't support");
+
+  if (Argument *Arg = dyn_cast<Argument>(I)) {
+    // An incoming argument to the function is a base pointer.
+    // We should never have reached here if this argument isn't a gc value.
+    assert(Arg->getType()->isPointerTy() &&
+           "Base for pointer must be another pointer");
+    return Arg;
+  }
+
+  if (GlobalVariable *global = dyn_cast<GlobalVariable>(I)) {
+    // base case
+    assert(global->getType()->isPointerTy() &&
+           "Base for pointer must be another pointer");
+    return global;
+  }
+
+  // Inlining could possibly introduce a phi node that contains an undef if
+  // the callee has multiple returns.
+  if (UndefValue *undef = dyn_cast<UndefValue>(I)) {
+    assert(undef->getType()->isPointerTy() &&
+           "Base for pointer must be another pointer");
+    return undef; // utterly meaningless, but useful for dealing with
+                  // partially optimized code.
+  }
+
+  // Due to inheritance, this must be _after_ the global variable and undef
+  // checks
+  if (Constant *con = dyn_cast<Constant>(I)) {
+    assert(!isa<GlobalVariable>(I) && !isa<UndefValue>(I) &&
+           "order of checks wrong!");
+    // Note: Finding a constant base for something marked for relocation
+    // doesn't really make sense. The most likely case is either a) someone
+    // screwed up the address space usage or b) you're validating against
+    // compiled C++ code w/o the proper separation. The only real exception
+    // is a null pointer. You could have generic code written to index off
+    // a potentially null value and have proven it null. We also use
+    // null pointers in dead paths of relocation phis (which we might later
+    // want to find a base pointer for).
+    assert(con->getType()->isPointerTy() &&
+           "Base for pointer must be another pointer");
+    assert(con->isNullValue() && "null is the only case which makes sense");
+    return con;
+  }
+
+  if (CastInst *CI = dyn_cast<CastInst>(I)) {
+    Value *def = CI->stripPointerCasts();
+    assert(def->getType()->isPointerTy() &&
+           "Base for pointer must be another pointer");
+    // If we find a cast instruction here, it means we've found a cast which is
+    // not simply a pointer cast (i.e. an inttoptr). We don't know how to
+    // handle int->ptr conversion.
+    assert(!isa<CastInst>(def) && "shouldn't find another cast here");
+    return findBaseDefiningValue(def);
+  }
+
+  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+    if (LI->getType()->isPointerTy()) {
+      Value *Op = LI->getOperand(0);
+      (void)Op;
+      // Has to be a pointer to a gc object, or possibly an array of such?
+      assert(Op->getType()->isPointerTy());
+      return LI; // The value loaded is a gc base itself
+    }
+  }
+  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
+    Value *Op = GEP->getOperand(0);
+    if (Op->getType()->isPointerTy()) {
+      return findBaseDefiningValue(Op); // The base of this GEP is the base
+    }
+  }
+
+  if (AllocaInst *alloc = dyn_cast<AllocaInst>(I)) {
+    // An alloca represents a conceptual stack slot. It's the slot itself
+    // that the GC needs to know about, not the value in the slot.
+    assert(alloc->getType()->isPointerTy() &&
+           "Base for pointer must be another pointer");
+    return alloc;
+  }
+
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+    switch (II->getIntrinsicID()) {
+    default:
+      // fall through to general call handling
+      break;
+    case Intrinsic::experimental_gc_statepoint:
+    case Intrinsic::experimental_gc_result_float:
+    case Intrinsic::experimental_gc_result_int:
+      llvm_unreachable("these don't produce pointers");
+    case Intrinsic::experimental_gc_result_ptr:
+      // This is just a special case of the CallInst check below to handle a
+      // statepoint with deopt args which hasn't been rewritten for GC yet.
+      // TODO: Assert that the statepoint isn't rewritten yet.
+      return II;
+    case Intrinsic::experimental_gc_relocate: {
+      // Rerunning safepoint insertion after safepoints are already
+      // inserted is not supported. It could probably be made to work,
+      // but why are you doing this? There's no good reason.
+      llvm_unreachable("repeat safepoint insertion is not supported");
+    }
+    case Intrinsic::gcroot:
+      // Currently, this mechanism hasn't been extended to work with gcroot.
+      // There's no reason it couldn't be, but I haven't thought about the
+      // implications much.
+      llvm_unreachable(
+          "interaction with the gcroot mechanism is not supported");
+    }
+  }
+  // We assume that functions in the source language only return base
+  // pointers. This should probably be generalized via attributes to support
+  // both source language and internal functions.
+  if (CallInst *call = dyn_cast<CallInst>(I)) {
+    assert(call->getType()->isPointerTy() &&
+           "Base for pointer must be another pointer");
+    return call;
+  }
+  if (InvokeInst *invoke = dyn_cast<InvokeInst>(I)) {
+    assert(invoke->getType()->isPointerTy() &&
+           "Base for pointer must be another pointer");
+    return invoke;
+  }
+
+  // I have absolutely no idea how to implement this part yet. It's not
+  // necessarily hard, I just haven't really looked at it yet.
+  assert(!isa<LandingPadInst>(I) && "Landing Pad is unimplemented");
+
+  if (AtomicCmpXchgInst *cas = dyn_cast<AtomicCmpXchgInst>(I)) {
+    // A CAS is effectively an atomic store and load combined under a
+    // predicate. From the perspective of base pointers, we just treat it
+    // like a load. We loaded a pointer from an address in memory, that value
+    // had better be a valid base pointer.
+    return cas->getPointerOperand();
+  }
+  if (AtomicRMWInst *atomic = dyn_cast<AtomicRMWInst>(I)) {
+    assert(AtomicRMWInst::Xchg == atomic->getOperation() &&
+           "All others are binary ops which don't apply to base pointers");
+    // semantically, a load, store pair. Treat it the same as a standard load
+    return atomic->getPointerOperand();
+  }
+
+  // The aggregate ops. Aggregates can either be in the heap or on the
+  // stack, but in either case, this is simply a field load. As a result,
+  // this is a defining definition of the base just like a load is.
+  if (ExtractValueInst *ev = dyn_cast<ExtractValueInst>(I)) {
+    return ev;
+  }
+
+  // We should never see an insert vector since that would require we be
+  // tracing back a struct value not a pointer value.
+  assert(!isa<InsertValueInst>(I) &&
+         "Base pointer for a struct is meaningless");
+
+  // The last two cases here don't return a base pointer. Instead, they
+  // return a value which dynamically selects from among several base
+  // derived pointers (each with its own base potentially). It's the job of
+  // the caller to resolve these.
+  if (SelectInst *select = dyn_cast<SelectInst>(I)) {
+    return select;
+  }
+
+  return cast<PHINode>(I);
+}
+
+/// Returns the base defining value for this value.
+static Value *findBaseDefiningValueCached(Value *I, DefiningValueMapTy &cache) {
+  Value *&Cached = cache[I];
+  if (!Cached) {
+    Cached = findBaseDefiningValue(I);
+  }
+  assert(cache[I] != nullptr);
+
+  if (TraceLSP) {
+    errs() << "fBDV-cached: " << I->getName() << " -> " << Cached->getName()
+           << "\n";
+  }
+  return Cached;
+}
+
+/// Return a base pointer for this value if known. Otherwise, return its
+/// base defining value.
+static Value *findBaseOrBDV(Value *I, DefiningValueMapTy &cache) {
+  Value *def = findBaseDefiningValueCached(I, cache);
+  auto Found = cache.find(def);
+  if (Found != cache.end()) {
+    // Either a base-of relation, or a self reference. Caller must check.
+    return Found->second;
+  }
+  // Only a BDV available
+  return def;
+}
+
+/// Given the result of a call to findBaseDefiningValue, or findBaseOrBDV,
+/// is it known to be a base pointer? Or do we need to continue searching.
+static bool isKnownBaseResult(Value *v) {
+  if (!isa<PHINode>(v) && !isa<SelectInst>(v)) {
+    // no recursion possible
+    return true;
+  }
+  if (cast<Instruction>(v)->getMetadata("is_base_value")) {
+    // This is a previously inserted base phi or select. We know
+    // that this is a base value.
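+    // (Illustrative, not from the original patch: a "base_phi" created by
+    // findBasePointer below carries metadata of the shape
+    //   !is_base_value !{i32 1}
+    // which is what this lookup keys on, so reruns of the search don't
+    // reanalyze nodes this pass itself inserted.)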
+    return true;
+  }
+
+  // We need to keep searching
+  return false;
+}
+
+// TODO: find a better name for this
+namespace {
+class PhiState {
+public:
+  enum Status { Unknown, Base, Conflict };
+
+  PhiState(Status s, Value *b = nullptr) : status(s), base(b) {
+    assert(status != Base || b);
+  }
+  PhiState(Value *b) : status(Base), base(b) {}
+  PhiState() : status(Unknown), base(nullptr) {}
+  PhiState(const PhiState &other) : status(other.status), base(other.base) {
+    assert(status != Base || base);
+  }
+
+  Status getStatus() const { return status; }
+  Value *getBase() const { return base; }
+
+  bool isBase() const { return getStatus() == Base; }
+  bool isUnknown() const { return getStatus() == Unknown; }
+  bool isConflict() const { return getStatus() == Conflict; }
+
+  bool operator==(const PhiState &other) const {
+    return base == other.base && status == other.status;
+  }
+
+  bool operator!=(const PhiState &other) const { return !(*this == other); }
+
+  void dump() {
+    errs() << status << " (" << base << " - "
+           << (base ? base->getName() : "nullptr") << "): ";
+  }
+
+private:
+  Status status;
+  Value *base; // non-null only if status == Base
+};
+
+typedef DenseMap<Value *, PhiState> ConflictStateMapTy;
+// Values of type PhiState form a lattice, and this is a helper
+// class that implements the meet operation. The meat of the meet
+// operation is implemented in MeetPhiStates::pureMeet
+class MeetPhiStates {
+public:
+  // phiStates is a mapping from PHINodes and SelectInst's to PhiStates.
+  explicit MeetPhiStates(const ConflictStateMapTy &phiStates)
+      : phiStates(phiStates) {}
+
+  // Destructively meet the current result with the base V. V can
+  // either be a merge instruction (SelectInst / PHINode), in which
+  // case its status is looked up in the phiStates map; or a regular
+  // SSA value, in which case it is assumed to be a base.
+  void meetWith(Value *V) {
+    PhiState otherState = getStateForBDV(V);
+    assert((MeetPhiStates::pureMeet(otherState, currentResult) ==
+            MeetPhiStates::pureMeet(currentResult, otherState)) &&
+           "math is wrong: meet does not commute!");
+    currentResult = MeetPhiStates::pureMeet(otherState, currentResult);
+  }
+
+  PhiState getResult() const { return currentResult; }
+
+private:
+  const ConflictStateMapTy &phiStates;
+  PhiState currentResult;
+
+  /// Return a phi state for a base defining value. We'll generate a new
+  /// base state for known bases and expect to find a cached state otherwise
+  PhiState getStateForBDV(Value *baseValue) {
+    if (isKnownBaseResult(baseValue)) {
+      return PhiState(baseValue);
+    } else {
+      return lookupFromMap(baseValue);
+    }
+  }
+
+  PhiState lookupFromMap(Value *V) {
+    auto I = phiStates.find(V);
+    assert(I != phiStates.end() && "lookup failed!");
+    return I->second;
+  }
+
+  static PhiState pureMeet(const PhiState &stateA, const PhiState &stateB) {
+    switch (stateA.getStatus()) {
+    case PhiState::Unknown:
+      return stateB;
+
+    case PhiState::Base:
+      assert(stateA.getBase() && "can't be null");
+      if (stateB.isUnknown())
+        return stateA;
+
+      if (stateB.isBase()) {
+        if (stateA.getBase() == stateB.getBase()) {
+          assert(stateA == stateB && "equality broken!");
+          return stateA;
+        }
+        return PhiState(PhiState::Conflict);
+      }
+      assert(stateB.isConflict() && "only three states!");
+      return PhiState(PhiState::Conflict);
+
+    case PhiState::Conflict:
+      return stateA;
+    }
+    llvm_unreachable("only three states!");
+  }
+};
+}
+/// For a given value or instruction, figure out what base ptr it's derived
+/// from. For gc objects, this is simply itself. On success, returns a value
+/// which is the base pointer. (This is reliable and can be used for
+/// relocation.) On failure, returns nullptr.
+static Value *findBasePointer(Value *I, DefiningValueMapTy &cache,
+                              DenseSet<llvm::Value *> &NewInsertedDefs) {
+  Value *def = findBaseOrBDV(I, cache);
+
+  if (isKnownBaseResult(def)) {
+    return def;
+  }
+
+  // Here's the rough algorithm:
+  // - For every SSA value, construct a mapping to either an actual base
+  //   pointer or a PHI which obscures the base pointer.
+  // - Construct a mapping from PHI to unknown TOP state. Use an
+  //   optimistic algorithm to propagate base pointer information. Lattice
+  //   looks like:
+  //     UNKNOWN
+  //     b1 b2 b3 b4
+  //     CONFLICT
+  //   When the algorithm terminates, all PHIs will either have a single
+  //   concrete base or be in a conflict state.
+  // - For every conflict, insert a dummy PHI node without arguments. Add
+  //   these to the base[Instruction] = BasePtr mapping. For every
+  //   non-conflict, add the actual base.
+  // - For every conflict, add arguments for the base[a] of each input
+  //   argument.
+  //
+  // Note: A simpler form of this would be to add the conflict form of all
+  // PHIs without running the optimistic algorithm. This would be
+  // analogous to pessimistic data flow and would likely lead to an
+  // overall worse solution.
+
+  ConflictStateMapTy states;
+  states[def] = PhiState();
+  // Recursively fill in all phis & selects reachable from the initial one
+  // for which we don't already know a definite base value.
+  // PERF: Yes, this is as horribly inefficient as it looks.
+  bool done = false;
+  while (!done) {
+    done = true;
+    for (auto Pair : states) {
+      Value *v = Pair.first;
+      assert(!isKnownBaseResult(v) && "why did it get added?");
+      if (PHINode *phi = dyn_cast<PHINode>(v)) {
+        assert(phi->getNumIncomingValues() > 0 &&
+               "zero input phis are illegal");
+        for (Value *InVal : phi->incoming_values()) {
+          Value *local = findBaseOrBDV(InVal, cache);
+          if (!isKnownBaseResult(local) && states.find(local) == states.end()) {
+            states[local] = PhiState();
+            done = false;
+          }
+        }
+      } else if (SelectInst *sel = dyn_cast<SelectInst>(v)) {
+        Value *local = findBaseOrBDV(sel->getTrueValue(), cache);
+        if (!isKnownBaseResult(local) && states.find(local) == states.end()) {
+          states[local] = PhiState();
+          done = false;
+        }
+        local = findBaseOrBDV(sel->getFalseValue(), cache);
+        if (!isKnownBaseResult(local) && states.find(local) == states.end()) {
+          states[local] = PhiState();
+          done = false;
+        }
+      }
+    }
+  }
+
+  if (TraceLSP) {
+    errs() << "States after initialization:\n";
+    for (auto Pair : states) {
+      Instruction *v = cast<Instruction>(Pair.first);
+      PhiState state = Pair.second;
+      state.dump();
+      v->dump();
+    }
+  }
+
+  // TODO: come back and revisit the state transitions around inputs which
+  // have reached conflict state. The current version seems too conservative.
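+
+  // Illustrative worked example (an assumed scenario, not from the original
+  // patch): given
+  //   phi1 = phi [ base_a, BB1 ], [ phi2, BB2 ]
+  //   phi2 = phi [ base_a, BB3 ], [ base_b, BB4 ]
+  // successive passes of the iteration below first meet base_a with base_b
+  // and drive phi2 to Conflict, then meet base_a with Conflict and drive
+  // phi1 to Conflict as well, so both get new base phis inserted further
+  // down.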
+ + bool progress = true; + size_t oldSize = 0; + while (progress) { + oldSize = states.size(); + progress = false; + for (auto Pair : states) { + MeetPhiStates calculateMeet(states); + Value *v = Pair.first; + assert(!isKnownBaseResult(v) && "why did it get added?"); + if (SelectInst *select = dyn_cast<SelectInst>(v)) { + calculateMeet.meetWith(findBaseOrBDV(select->getTrueValue(), cache)); + calculateMeet.meetWith(findBaseOrBDV(select->getFalseValue(), cache)); + } else + for (Value *Val : cast<PHINode>(v)->incoming_values()) + calculateMeet.meetWith(findBaseOrBDV(Val, cache)); + + PhiState oldState = states[v]; + PhiState newState = calculateMeet.getResult(); + if (oldState != newState) { + progress = true; + states[v] = newState; + } + } + + assert(oldSize <= states.size()); + assert(oldSize == states.size() || progress); + } + + if (TraceLSP) { + errs() << "States after meet iteration:\n"; + for (auto Pair : states) { + Instruction *v = cast<Instruction>(Pair.first); + PhiState state = Pair.second; + state.dump(); + v->dump(); + } + } + + // Insert Phis for all conflicts + for (auto Pair : states) { + Instruction *v = cast<Instruction>(Pair.first); + PhiState state = Pair.second; + assert(!isKnownBaseResult(v) && "why did it get added?"); + assert(!state.isUnknown() && "Optimistic algorithm didn't complete!"); + if (state.isConflict()) { + if (isa<PHINode>(v)) { + int num_preds = + std::distance(pred_begin(v->getParent()), pred_end(v->getParent())); + assert(num_preds > 0 && "how did we reach here"); + PHINode *phi = PHINode::Create(v->getType(), num_preds, "base_phi", v); + NewInsertedDefs.insert(phi); + // Add metadata marking this as a base value + auto *const_1 = ConstantInt::get( + Type::getInt32Ty( + v->getParent()->getParent()->getParent()->getContext()), + 1); + auto MDConst = ConstantAsMetadata::get(const_1); + MDNode *md = MDNode::get( + v->getParent()->getParent()->getParent()->getContext(), MDConst); + phi->setMetadata("is_base_value", md); + states[v] = PhiState(PhiState::Conflict, phi); + } else if (SelectInst *sel = dyn_cast<SelectInst>(v)) { + // The undef will be replaced later + UndefValue *undef = UndefValue::get(sel->getType()); + SelectInst *basesel = SelectInst::Create(sel->getCondition(), undef, + undef, "base_select", sel); + NewInsertedDefs.insert(basesel); + // Add metadata marking this as a base value + auto *const_1 = ConstantInt::get( + Type::getInt32Ty( + v->getParent()->getParent()->getParent()->getContext()), + 1); + auto MDConst = ConstantAsMetadata::get(const_1); + MDNode *md = MDNode::get( + v->getParent()->getParent()->getParent()->getContext(), MDConst); + basesel->setMetadata("is_base_value", md); + states[v] = PhiState(PhiState::Conflict, basesel); + } else + llvm_unreachable("unknown conflict type"); + } + } + + // Fixup all the inputs of the new PHIs + for (auto Pair : states) { + Instruction *v = cast<Instruction>(Pair.first); + PhiState state = Pair.second; + + assert(!isKnownBaseResult(v) && "why did it get added?"); + assert(!state.isUnknown() && "Optimistic algorithm didn't complete!"); + if (state.isConflict()) { + if (PHINode *basephi = dyn_cast<PHINode>(state.getBase())) { + PHINode *phi = cast<PHINode>(v); + unsigned NumPHIValues = phi->getNumIncomingValues(); + for (unsigned i = 0; i < NumPHIValues; i++) { + Value *InVal = phi->getIncomingValue(i); + BasicBlock *InBB = phi->getIncomingBlock(i); + + // If we've already seen InBB, add the same incoming value + // we added for it earlier. 
The IR verifier requires phi
+          // nodes with multiple entries from the same basic block
+          // to have the same incoming value for each of those
+          // entries. If we don't do this check here and basephi
+          // has a different type than base, we'll end up adding two
+          // bitcasts (and hence two distinct values) as incoming
+          // values for the same basic block.
+
+          int blockIndex = basephi->getBasicBlockIndex(InBB);
+          if (blockIndex != -1) {
+            Value *oldBase = basephi->getIncomingValue(blockIndex);
+            basephi->addIncoming(oldBase, InBB);
+#ifndef NDEBUG
+            Value *base = findBaseOrBDV(InVal, cache);
+            if (!isKnownBaseResult(base)) {
+              // Either conflict or base.
+              assert(states.count(base));
+              base = states[base].getBase();
+              assert(base != nullptr && "unknown PhiState!");
+              assert(NewInsertedDefs.count(base) &&
+                     "should have already added this in a prev. iteration!");
+            }
+
+            // In essence this assert states: the only way two
+            // values incoming from the same basic block may be
+            // different is by being different bitcasts of the same
+            // value. A cleanup that remains TODO is changing
+            // findBaseOrBDV to return an llvm::Value of the correct
+            // type (and still remain pure). This will remove the
+            // need to add bitcasts.
+            assert(base->stripPointerCasts() == oldBase->stripPointerCasts() &&
+                   "sanity -- findBaseOrBDV should be pure!");
+#endif
+            continue;
+          }
+
+          // Find either the defining value for the PHI or the normal base for
+          // a non-phi node
+          Value *base = findBaseOrBDV(InVal, cache);
+          if (!isKnownBaseResult(base)) {
+            // Either conflict or base.
+            assert(states.count(base));
+            base = states[base].getBase();
+            assert(base != nullptr && "unknown PhiState!");
+          }
+          assert(base && "can't be null");
+          // Must use original input BB since base may not be Instruction
+          // The cast is needed since base traversal may strip away bitcasts
+          if (base->getType() != basephi->getType()) {
+            base = new BitCastInst(base, basephi->getType(), "cast",
+                                   InBB->getTerminator());
+            NewInsertedDefs.insert(base);
+          }
+          basephi->addIncoming(base, InBB);
+        }
+        assert(basephi->getNumIncomingValues() == NumPHIValues);
+      } else if (SelectInst *basesel = dyn_cast<SelectInst>(state.getBase())) {
+        SelectInst *sel = cast<SelectInst>(v);
+        // Operand 1 & 2 are true, false path respectively. TODO: refactor to
+        // something more safe and less hacky.
+        for (int i = 1; i <= 2; i++) {
+          Value *InVal = sel->getOperand(i);
+          // Find either the defining value for the PHI or the normal base for
+          // a non-phi node
+          Value *base = findBaseOrBDV(InVal, cache);
+          if (!isKnownBaseResult(base)) {
+            // Either conflict or base.
+            assert(states.count(base));
+            base = states[base].getBase();
+            assert(base != nullptr && "unknown PhiState!");
+          }
+          assert(base && "can't be null");
+          // Must use original input BB since base may not be Instruction
+          // The cast is needed since base traversal may strip away bitcasts
+          if (base->getType() != basesel->getType()) {
+            base = new BitCastInst(base, basesel->getType(), "cast", basesel);
+            NewInsertedDefs.insert(base);
+          }
+          basesel->setOperand(i, base);
+        }
+      } else
+        llvm_unreachable("unexpected conflict type");
+    }
+  }
+
+  // Cache all of our results so we can cheaply reuse them
+  // NOTE: This is actually two caches: one of the base defining value
+  // relation and one of the base pointer relation! FIXME
+  for (auto item : states) {
+    Value *v = item.first;
+    Value *base = item.second.getBase();
+    assert(v && base);
+    assert(!isKnownBaseResult(v) && "why did it get added?");
+
+    if (TraceLSP) {
+      std::string fromstr =
+          cache.count(v) ? (cache[v]->hasName() ? cache[v]->getName() : "")
+                         : "none";
+      errs() << "Updating base value cache"
+             << " for: " << (v->hasName() ? v->getName() : "")
+             << " from: " << fromstr
+             << " to: " << (base->hasName() ? base->getName() : "") << "\n";
+    }
+
+    assert(isKnownBaseResult(base) &&
+           "must be something we 'know' is a base pointer");
+    if (cache.count(v)) {
+      // Once we transition from the BDV relation being stored in the cache
+      // to the base relation being stored, it must be stable
+      assert((!isKnownBaseResult(cache[v]) || cache[v] == base) &&
+             "base relation should be stable");
+    }
+    cache[v] = base;
+  }
+  assert(cache.find(def) != cache.end());
+  return cache[def];
+}
+
+// For a set of live pointers (base and/or derived), identify the base
+// pointer of the object which they are derived from. This routine will
+// mutate the IR graph as needed to make the 'base' pointer live at the
+// definition site of 'derived'. This ensures that any use of 'derived' can
+// also use 'base'. This may involve the insertion of a number of
+// additional PHI nodes.
+//
+// preconditions: live is a set of pointer type Values
+//
+// side effects: may insert PHI nodes into the existing CFG, will preserve
+// CFG, will not remove or mutate any existing nodes
+//
+// post condition: PointerToBase contains one (derived, base) pair for every
+// pointer in live. Note that derived can be equal to base if the original
+// pointer was a base pointer.
+static void
+findBasePointers(const StatepointLiveSetTy &live,
+                 DenseMap<llvm::Value *, llvm::Value *> &PointerToBase,
+                 DominatorTree *DT, DefiningValueMapTy &DVCache,
+                 DenseSet<llvm::Value *> &NewInsertedDefs) {
+  for (Value *ptr : live) {
+    Value *base = findBasePointer(ptr, DVCache, NewInsertedDefs);
+    assert(base && "failed to find base pointer");
+    PointerToBase[ptr] = base;
+    assert((!isa<Instruction>(base) || !isa<Instruction>(ptr) ||
+            DT->dominates(cast<Instruction>(base)->getParent(),
+                          cast<Instruction>(ptr)->getParent())) &&
+           "The base we found better dominate the derived pointer");
+
+    // If you see this trip and like to live really dangerously, the code
+    // should be correct, just with idioms the verifier can't handle. You
+    // can try disabling the verifier at your own substantial risk.
+    assert(!isNullConstant(base) && "the relocation code needs adjustment to "
+                                    "handle the relocation of a null pointer "
+                                    "constant without causing false positives "
+                                    "in the safepoint ir verifier.");
+  }
+}
+
+/// Find the required base pointers (and adjust the live set) for the given
+/// parse point.
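+///
+/// As an illustrative sketch (assumed, not from the original patch): if the
+/// live set is { %d } where
+///   %d = getelementptr i8 addrspace(1)* %b, i64 16
+/// the resulting map is PointerToBase[%d] = %b, and %b itself may be added
+/// to the live set later by addBasesAsLiveValues so that follow-on
+/// safepoints can relocate it too.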
+static void findBasePointers(DominatorTree &DT, DefiningValueMapTy &DVCache,
+                             const CallSite &CS,
+                             PartiallyConstructedSafepointRecord &result) {
+  DenseMap<llvm::Value *, llvm::Value *> PointerToBase;
+  DenseSet<llvm::Value *> NewInsertedDefs;
+  findBasePointers(result.liveset, PointerToBase, &DT, DVCache, NewInsertedDefs);
+
+  if (PrintBasePointers) {
+    errs() << "Base Pairs (w/o Relocation):\n";
+    for (auto Pair : PointerToBase) {
+      errs() << " derived %" << Pair.first->getName() << " base %"
+             << Pair.second->getName() << "\n";
+    }
+  }
+
+  result.PointerToBase = PointerToBase;
+  result.NewInsertedDefs = NewInsertedDefs;
+}
+
+/// Check for liveness of items in the inserted defs and add them to the
+/// live and base pointer sets
+static void fixupLiveness(DominatorTree &DT, const CallSite &CS,
+                          const DenseSet<Value *> &allInsertedDefs,
+                          PartiallyConstructedSafepointRecord &result) {
+  Instruction *inst = CS.getInstruction();
+
+  auto liveset = result.liveset;
+  auto PointerToBase = result.PointerToBase;
+
+  auto is_live_gc_reference =
+      [&](Value &V) { return isLiveGCReferenceAt(V, inst, DT, nullptr); };
+
+  // For each new definition, check to see if a) the definition dominates the
+  // instruction we're interested in, and b) one of the uses of that definition
+  // is edge-reachable from the instruction we're interested in. This is the
+  // same definition of liveness we used in the initial liveness analysis
+  for (Value *newDef : allInsertedDefs) {
+    if (liveset.count(newDef)) {
+      // already live, no action needed
+      continue;
+    }
+
+    // PERF: Using DT to check instruction domination might not be good for
+    // compile time; we could change to an optimal solution if this turns
+    // out to be an issue
+    if (!DT.dominates(cast<Instruction>(newDef), inst)) {
+      // can't possibly be live at inst
+      continue;
+    }
+
+    if (is_live_gc_reference(*newDef)) {
+      // Add the live new defs into liveset and PointerToBase
+      liveset.insert(newDef);
+      PointerToBase[newDef] = newDef;
+    }
+  }
+
+  result.liveset = liveset;
+  result.PointerToBase = PointerToBase;
+}
+
+static void fixupLiveReferences(
+    Function &F, DominatorTree &DT, Pass *P,
+    const DenseSet<llvm::Value *> &allInsertedDefs,
+    ArrayRef<CallSite> toUpdate,
+    MutableArrayRef<struct PartiallyConstructedSafepointRecord> records) {
+  for (size_t i = 0; i < records.size(); i++) {
+    struct PartiallyConstructedSafepointRecord &info = records[i];
+    const CallSite &CS = toUpdate[i];
+    fixupLiveness(DT, CS, allInsertedDefs, info);
+  }
+}
+
+// Normalize the basic block to make it ready to be the target of an invoke
+// statepoint. That means splitting it so that it has a single predecessor.
+// Returns the newly created BB, ready to be the successor of the invoke
+// statepoint.
+static BasicBlock *normalizeBBForInvokeSafepoint(BasicBlock *BB,
+                                                 BasicBlock *InvokeParent,
+                                                 Pass *P) {
+  BasicBlock *ret = BB;
+
+  if (!BB->getUniquePredecessor()) {
+    ret = SplitBlockPredecessors(BB, InvokeParent, "");
+  }
+
+  // Another requirement for such basic blocks is to not have any phi nodes.
+  // Since we just ensured that the new BB will have a single predecessor,
+  // all phi nodes in it will have one value. This would be the natural place
+  // to remove them all, but we cannot do that, because we would risk removing
+  // one of the values stored in the live set of another statepoint. We will
+  // do it later, after placing all safepoints.
+
+  return ret;
+}
+
+static int find_index(ArrayRef<Value *> livevec, Value *val) {
+  auto itr = std::find(livevec.begin(), livevec.end(), val);
+  assert(livevec.end() != itr);
+  size_t index = std::distance(livevec.begin(), itr);
+  assert(index < livevec.size());
+  return index;
+}
+
+// Create a new attribute set containing only the attributes which can be
+// transferred from the original call to the safepoint.
+static AttributeSet legalizeCallAttributes(AttributeSet AS) {
+  AttributeSet ret;
+
+  for (unsigned Slot = 0; Slot < AS.getNumSlots(); Slot++) {
+    unsigned index = AS.getSlotIndex(Slot);
+
+    if (index == AttributeSet::ReturnIndex ||
+        index == AttributeSet::FunctionIndex) {
+
+      for (auto it = AS.begin(Slot), it_end = AS.end(Slot); it != it_end;
+           ++it) {
+        Attribute attr = *it;
+
+        // Do not allow certain attributes - just skip them.
+        // A safepoint cannot be read-only or read-none.
+        if (attr.hasAttribute(Attribute::ReadNone) ||
+            attr.hasAttribute(Attribute::ReadOnly))
+          continue;
+
+        ret = ret.addAttributes(
+            AS.getContext(), index,
+            AttributeSet::get(AS.getContext(), index, AttrBuilder(attr)));
+      }
+    }
+
+    // Just skip parameter attributes for now
+  }
+
+  return ret;
+}
+
+/// Helper function to place all gc relocates necessary for the given
+/// statepoint.
+/// Inputs:
+///   liveVariables - list of variables to be relocated.
+///   liveStart - index of the first live variable.
+///   basePtrs - base pointers.
+///   statepointToken - statepoint instruction to which relocates should be
+///   bound.
+///   Builder - LLVM IR builder to be used to construct new calls.
+void CreateGCRelocates(ArrayRef<llvm::Value *> liveVariables,
+                       const int liveStart,
+                       ArrayRef<llvm::Value *> basePtrs,
+                       Instruction *statepointToken, IRBuilder<> Builder) {
+
+  SmallVector<Instruction *, 64> NewDefs;
+  NewDefs.reserve(liveVariables.size());
+
+  Module *M = statepointToken->getParent()->getParent()->getParent();
+
+  for (unsigned i = 0; i < liveVariables.size(); i++) {
+    // We generate a (potentially) unique declaration for every pointer type
+    // combination. This results in some blowup in the function declarations
+    // in the IR, but removes the need for argument bitcasts which shrinks
+    // the IR greatly and makes it much more readable.
+    SmallVector<Type *, 1> types;                 // one per 'any' type
+    types.push_back(liveVariables[i]->getType()); // result type
+    Value *gc_relocate_decl = Intrinsic::getDeclaration(
+        M, Intrinsic::experimental_gc_relocate, types);
+
+    // Generate the gc.relocate call and save the result
+    Value *baseIdx =
+        ConstantInt::get(Type::getInt32Ty(M->getContext()),
+                         liveStart + find_index(liveVariables, basePtrs[i]));
+    Value *liveIdx = ConstantInt::get(
+        Type::getInt32Ty(M->getContext()),
+        liveStart + find_index(liveVariables, liveVariables[i]));
+
+    // only specify a debug name if we can give a useful one
+    Value *reloc = Builder.CreateCall3(
+        gc_relocate_decl, statepointToken, baseIdx, liveIdx,
+        liveVariables[i]->hasName() ? liveVariables[i]->getName() + ".relocated"
+                                    : "");
+    // Trick CodeGen into thinking there are lots of free registers at this
+    // fake call.
+    cast<CallInst>(reloc)->setCallingConv(CallingConv::Cold);
+
+    NewDefs.push_back(cast<Instruction>(reloc));
+  }
+  assert(NewDefs.size() == liveVariables.size() &&
+         "missing or extra redefinition at safepoint");
+}
+
+static void
+makeStatepointExplicitImpl(const CallSite &CS, /* to replace */
+                           const SmallVectorImpl<llvm::Value *> &basePtrs,
+                           const SmallVectorImpl<llvm::Value *> &liveVariables,
+                           Pass *P,
+                           PartiallyConstructedSafepointRecord &result) {
+  assert(basePtrs.size() == liveVariables.size());
+  assert(isStatepoint(CS) &&
+         "This method expects to be rewriting a statepoint");
+
+  BasicBlock *BB = CS.getInstruction()->getParent();
+  assert(BB);
+  Function *F = BB->getParent();
+  assert(F && "must be set");
+  Module *M = F->getParent();
+  (void)M;
+  assert(M && "must be set");
+
+  // We're not changing the function signature of the statepoint since the gc
+  // arguments go into the var args section.
+  Function *gc_statepoint_decl = CS.getCalledFunction();
+
+  // Then go ahead and use the builder to actually do the inserts. We insert
+  // immediately before the previous instruction under the assumption that all
+  // arguments will be available here. We can't insert afterwards since we may
+  // be replacing a terminator.
+  Instruction *insertBefore = CS.getInstruction();
+  IRBuilder<> Builder(insertBefore);
+  // Copy all of the arguments from the original statepoint - this includes
+  // the target, call args, and deopt args
+  SmallVector<llvm::Value *, 64> args;
+  args.insert(args.end(), CS.arg_begin(), CS.arg_end());
+  // TODO: Clear the 'needs rewrite' flag
+
+  // add all the pointers to be relocated (gc arguments)
+  // Capture the start of the live variable list for use in the gc_relocates
+  const int live_start = args.size();
+  args.insert(args.end(), liveVariables.begin(), liveVariables.end());
+
+  // Create the statepoint given all the arguments
+  Instruction *token = nullptr;
+  AttributeSet return_attributes;
+  if (CS.isCall()) {
+    CallInst *toReplace = cast<CallInst>(CS.getInstruction());
+    CallInst *call =
+        Builder.CreateCall(gc_statepoint_decl, args, "safepoint_token");
+    call->setTailCall(toReplace->isTailCall());
+    call->setCallingConv(toReplace->getCallingConv());
+
+    // Currently we will fail on parameter attributes and on certain
+    // function attributes.
+    AttributeSet new_attrs = legalizeCallAttributes(toReplace->getAttributes());
+    // If we can handle this set of attributes, set up the function attrs
+    // directly on the statepoint and return attrs later for the gc_result
+    // intrinsic.
+    call->setAttributes(new_attrs.getFnAttributes());
+    return_attributes = new_attrs.getRetAttributes();
+
+    token = call;
+
+    // Put the following gc_result and gc_relocate calls immediately after
+    // the old call (which we're about to delete)
+    BasicBlock::iterator next(toReplace);
+    assert(BB->end() != next && "not a terminator, must have next");
+    next++;
+    Instruction *IP = &*(next);
+    Builder.SetInsertPoint(IP);
+    Builder.SetCurrentDebugLocation(IP->getDebugLoc());
+
+  } else {
+    InvokeInst *toReplace = cast<InvokeInst>(CS.getInstruction());
+
+    // Insert the new invoke into the old block. We'll remove the old one in a
+    // moment at which point this will become the new terminator for the
+    // original block.
+    InvokeInst *invoke = InvokeInst::Create(
+        gc_statepoint_decl, toReplace->getNormalDest(),
+        toReplace->getUnwindDest(), args, "", toReplace->getParent());
+    invoke->setCallingConv(toReplace->getCallingConv());
+
+    // Currently we will fail on parameter attributes and on certain
+    // function attributes.
+    AttributeSet new_attrs = legalizeCallAttributes(toReplace->getAttributes());
+    // If we can handle this set of attributes, set up the function attrs
+    // directly on the statepoint and return attrs later for the gc_result
+    // intrinsic.
+    invoke->setAttributes(new_attrs.getFnAttributes());
+    return_attributes = new_attrs.getRetAttributes();
+
+    token = invoke;
+
+    // Generate gc relocates in exceptional path
+    BasicBlock *unwindBlock = normalizeBBForInvokeSafepoint(
+        toReplace->getUnwindDest(), invoke->getParent(), P);
+
+    Instruction *IP = &*(unwindBlock->getFirstInsertionPt());
+    Builder.SetInsertPoint(IP);
+    Builder.SetCurrentDebugLocation(toReplace->getDebugLoc());
+
+    // Extract the second element from the landingpad return value. We will
+    // attach exceptional gc relocates to it.
+    const unsigned idx = 1;
+    Instruction *exceptional_token =
+        cast<Instruction>(Builder.CreateExtractValue(
+            unwindBlock->getLandingPadInst(), idx, "relocate_token"));
+    result.UnwindToken = exceptional_token;
+
+    // Just throw away the return value. We will use the one we got for the
+    // normal block.
+    (void)CreateGCRelocates(liveVariables, live_start, basePtrs,
+                            exceptional_token, Builder);
+
+    // Generate gc relocates and returns for normal block
+    BasicBlock *normalDest = normalizeBBForInvokeSafepoint(
+        toReplace->getNormalDest(), invoke->getParent(), P);
+
+    IP = &*(normalDest->getFirstInsertionPt());
+    Builder.SetInsertPoint(IP);
+
+    // gc relocates will be generated later as if it were a regular call
+    // statepoint
+  }
+  assert(token);
+
+  // Take the name of the original call if it had one.
+  token->takeName(CS.getInstruction());
+
+  // The GCResult is already inserted, we just need to find it
+#ifndef NDEBUG
+  Instruction *toReplace = CS.getInstruction();
+  assert((toReplace->hasNUses(0) || toReplace->hasNUses(1)) &&
+         "only valid use before rewrite is gc.result");
+  assert(!toReplace->hasOneUse() ||
+         isGCResult(cast<Instruction>(*toReplace->user_begin())));
+#endif
+
+  // Update the gc.result of the original statepoint (if any) to use the newly
+  // inserted statepoint. This is safe to do here since the token can't be
+  // considered a live reference.
+  CS.getInstruction()->replaceAllUsesWith(token);
+
+  result.StatepointToken = token;
+
+  // Second, create a gc.relocate for every live variable
+  CreateGCRelocates(liveVariables, live_start, basePtrs, token, Builder);
+}
+
+namespace {
+struct name_ordering {
+  Value *base;
+  Value *derived;
+  bool operator()(name_ordering const &a, name_ordering const &b) {
+    return -1 == a.derived->getName().compare(b.derived->getName());
+  }
+};
+}
+static void stabilize_order(SmallVectorImpl<Value *> &basevec,
+                            SmallVectorImpl<Value *> &livevec) {
+  assert(basevec.size() == livevec.size());
+
+  SmallVector<name_ordering, 64> temp;
+  for (size_t i = 0; i < basevec.size(); i++) {
+    name_ordering v;
+    v.base = basevec[i];
+    v.derived = livevec[i];
+    temp.push_back(v);
+  }
+  std::sort(temp.begin(), temp.end(), name_ordering());
+  for (size_t i = 0; i < basevec.size(); i++) {
+    basevec[i] = temp[i].base;
+    livevec[i] = temp[i].derived;
+  }
+}
+
+// Replace an existing gc.statepoint with a new one and a set of gc.relocates
+// which make the relocations happening at this safepoint explicit.
+//
+// WARNING: Does not do any fixup to adjust users of the original live
+// values. That's the caller's responsibility.
+static void
+makeStatepointExplicit(DominatorTree &DT, const CallSite &CS, Pass *P,
+                       PartiallyConstructedSafepointRecord &result) {
+  auto liveset = result.liveset;
+  auto PointerToBase = result.PointerToBase;
+
+  // Convert to vector for efficient cross referencing.
+  SmallVector<Value *, 64> basevec, livevec;
+  livevec.reserve(liveset.size());
+  basevec.reserve(liveset.size());
+  for (Value *L : liveset) {
+    livevec.push_back(L);
+
+    assert(PointerToBase.find(L) != PointerToBase.end());
+    Value *base = PointerToBase[L];
+    basevec.push_back(base);
+  }
+  assert(livevec.size() == basevec.size());
+
+  // To make the output IR slightly more stable (for use in diffs), ensure a
+  // fixed order of the values in the safepoint (by sorting the value name).
+  // The order is otherwise meaningless.
+  stabilize_order(basevec, livevec);
+
+  // Do the actual rewriting and delete the old statepoint
+  makeStatepointExplicitImpl(CS, basevec, livevec, P, result);
+  CS.getInstruction()->eraseFromParent();
+}
+
+// Helper function for relocationViaAlloca.
+// It receives an iterator range over a statepoint's gc relocates and emits a
+// store to the assigned location (via allocaMap) for each one of them.
+// Visited values are added to the visitedLiveValues set; we will later use
+// them for a sanity check.
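+//
+// For instance (an illustrative sketch, not from the original patch): given
+//   %p.relocated = call coldcc i8 addrspace(1)*
+//       @llvm.experimental.gc.relocate.p1i8(i32 %tok, i32 7, i32 7)
+// where allocaMap maps %p to %p.alloca, this emits
+//   store i8 addrspace(1)* %p.relocated, i8 addrspace(1)** %p.alloca
+// immediately after the relocate; mem2reg later folds these stores away.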
+static void +insertRelocationStores(iterator_range<Value::user_iterator> gcRelocs, + DenseMap<Value *, Value *> &allocaMap, + DenseSet<Value *> &visitedLiveValues) { + + for (User *U : gcRelocs) { + if (!isa<IntrinsicInst>(U)) + continue; + + IntrinsicInst *relocatedValue = cast<IntrinsicInst>(U); + + // We only care about relocates + if (relocatedValue->getIntrinsicID() != + Intrinsic::experimental_gc_relocate) { + continue; + } + + GCRelocateOperands relocateOperands(relocatedValue); + Value *originalValue = const_cast<Value *>(relocateOperands.derivedPtr()); + assert(allocaMap.count(originalValue)); + Value *alloca = allocaMap[originalValue]; + + // Emit store into the related alloca + StoreInst *store = new StoreInst(relocatedValue, alloca); + store->insertAfter(relocatedValue); + +#ifndef NDEBUG + visitedLiveValues.insert(originalValue); +#endif + } +} + +/// do all the relocation update via allocas and mem2reg +static void relocationViaAlloca( + Function &F, DominatorTree &DT, ArrayRef<Value *> live, + ArrayRef<struct PartiallyConstructedSafepointRecord> records) { +#ifndef NDEBUG + int initialAllocaNum = 0; + + // record initial number of allocas + for (inst_iterator itr = inst_begin(F), end = inst_end(F); itr != end; + itr++) { + if (isa<AllocaInst>(*itr)) + initialAllocaNum++; + } +#endif + + // TODO-PERF: change data structures, reserve + DenseMap<Value *, Value *> allocaMap; + SmallVector<AllocaInst *, 200> PromotableAllocas; + PromotableAllocas.reserve(live.size()); + + // emit alloca for each live gc pointer + for (unsigned i = 0; i < live.size(); i++) { + Value *liveValue = live[i]; + AllocaInst *alloca = new AllocaInst(liveValue->getType(), "", + F.getEntryBlock().getFirstNonPHI()); + allocaMap[liveValue] = alloca; + PromotableAllocas.push_back(alloca); + } + + // The next two loops are part of the same conceptual operation. We need to + // insert a store to the alloca after the original def and at each + // redefinition. We need to insert a load before each use. These are split + // into distinct loops for performance reasons. + + // update gc pointer after each statepoint + // either store a relocated value or null (if no relocated value found for + // this gc pointer and it is not a gc_result) + // this must happen before we update the statepoint with load of alloca + // otherwise we lose the link between statepoint and old def + for (size_t i = 0; i < records.size(); i++) { + const struct PartiallyConstructedSafepointRecord &info = records[i]; + Value *Statepoint = info.StatepointToken; + + // This will be used for consistency check + DenseSet<Value *> visitedLiveValues; + + // Insert stores for normal statepoint gc relocates + insertRelocationStores(Statepoint->users(), allocaMap, visitedLiveValues); + + // In case if it was invoke statepoint + // we will insert stores for exceptional path gc relocates. + if (isa<InvokeInst>(Statepoint)) { + insertRelocationStores(info.UnwindToken->users(), + allocaMap, visitedLiveValues); + } + +#ifndef NDEBUG + // As a debuging aid, pretend that an unrelocated pointer becomes null at + // the gc.statepoint. 
This will turn some subtle GC problems into slightly + // easier to debug SEGVs + SmallVector<AllocaInst *, 64> ToClobber; + for (auto Pair : allocaMap) { + Value *Def = Pair.first; + AllocaInst *Alloca = cast<AllocaInst>(Pair.second); + + // This value was relocated + if (visitedLiveValues.count(Def)) { + continue; + } + ToClobber.push_back(Alloca); + } + + auto InsertClobbersAt = [&](Instruction *IP) { + for (auto *AI : ToClobber) { + auto AIType = cast<PointerType>(AI->getType()); + auto PT = cast<PointerType>(AIType->getElementType()); + Constant *CPN = ConstantPointerNull::get(PT); + StoreInst *store = new StoreInst(CPN, AI); + store->insertBefore(IP); + } + }; + + // Insert the clobbering stores. These may get intermixed with the + // gc.results and gc.relocates, but that's fine. + if (auto II = dyn_cast<InvokeInst>(Statepoint)) { + InsertClobbersAt(II->getNormalDest()->getFirstInsertionPt()); + InsertClobbersAt(II->getUnwindDest()->getFirstInsertionPt()); + } else { + BasicBlock::iterator Next(cast<CallInst>(Statepoint)); + Next++; + InsertClobbersAt(Next); + } +#endif + } + // update use with load allocas and add store for gc_relocated + for (auto Pair : allocaMap) { + Value *def = Pair.first; + Value *alloca = Pair.second; + + // we pre-record the uses of allocas so that we dont have to worry about + // later update + // that change the user information. + SmallVector<Instruction *, 20> uses; + // PERF: trade a linear scan for repeated reallocation + uses.reserve(std::distance(def->user_begin(), def->user_end())); + for (User *U : def->users()) { + if (!isa<ConstantExpr>(U)) { + // If the def has a ConstantExpr use, then the def is either a + // ConstantExpr use itself or null. In either case + // (recursively in the first, directly in the second), the oop + // it is ultimately dependent on is null and this particular + // use does not need to be fixed up. 
+ uses.push_back(cast<Instruction>(U));
+ }
+ }
+
+ std::sort(uses.begin(), uses.end());
+ auto last = std::unique(uses.begin(), uses.end());
+ uses.erase(last, uses.end());
+
+ for (Instruction *use : uses) {
+ if (isa<PHINode>(use)) {
+ PHINode *phi = cast<PHINode>(use);
+ for (unsigned i = 0; i < phi->getNumIncomingValues(); i++) {
+ if (def == phi->getIncomingValue(i)) {
+ LoadInst *load = new LoadInst(
+ alloca, "", phi->getIncomingBlock(i)->getTerminator());
+ phi->setIncomingValue(i, load);
+ }
+ }
+ } else {
+ LoadInst *load = new LoadInst(alloca, "", use);
+ use->replaceUsesOfWith(def, load);
+ }
+ }
+
+ // Emit a store for the initial gc value. The store must be inserted after
+ // the load; otherwise the store would end up in the alloca's use list and
+ // an extra load would be inserted before it.
+ StoreInst *store = new StoreInst(def, alloca);
+ if (isa<Instruction>(def)) {
+ store->insertAfter(cast<Instruction>(def));
+ } else {
+ assert((isa<Argument>(def) || isa<GlobalVariable>(def) ||
+ (isa<Constant>(def) && cast<Constant>(def)->isNullValue())) &&
+ "Must be argument or global");
+ store->insertAfter(cast<Instruction>(alloca));
+ }
+ }
+
+ assert(PromotableAllocas.size() == live.size() &&
+ "we must have the same allocas with lives");
+ if (!PromotableAllocas.empty()) {
+ // apply mem2reg to promote each alloca back to SSA
+ PromoteMemToReg(PromotableAllocas, DT);
+ }
+
+#ifndef NDEBUG
+ for (inst_iterator itr = inst_begin(F), end = inst_end(F); itr != end;
+ itr++) {
+ if (isa<AllocaInst>(*itr))
+ initialAllocaNum--;
+ }
+ assert(initialAllocaNum == 0 && "We must not introduce any extra allocas");
+#endif
+}
+
+/// Implement a unique function which doesn't require us to sort the input
+/// vector. Sorting would change the output of a couple of tests in ways
+/// which make them less useful for testing fused safepoints.
+template <typename T> static void unique_unsorted(SmallVectorImpl<T> &Vec) {
+ DenseSet<T> Seen;
+ SmallVector<T, 128> TempVec;
+ TempVec.reserve(Vec.size());
+ for (auto Element : Vec)
+ TempVec.push_back(Element);
+ Vec.clear();
+ for (auto V : TempVec) {
+ if (Seen.insert(V).second) {
+ Vec.push_back(V);
+ }
+ }
+}
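
unique_unsorted above is the order-preserving counterpart of the sort-then-unique idiom used a few lines earlier in relocationViaAlloca. In plain STL terms (helper names hypothetical, T assumed hashable), the two variants look like this:

#include <algorithm>
#include <unordered_set>
#include <vector>

// Sort-based dedupe: cheapest, but reorders the elements.
template <typename T> void uniqueSorted(std::vector<T> &Vec) {
  std::sort(Vec.begin(), Vec.end());
  Vec.erase(std::unique(Vec.begin(), Vec.end()), Vec.end());
}

// Order-preserving dedupe: keep the first occurrence of each element, as
// unique_unsorted does. remove_if applies the predicate to each element in
// order, so Seen records first occurrences left to right.
template <typename T> void uniqueUnsorted(std::vector<T> &Vec) {
  std::unordered_set<T> Seen;
  auto NewEnd =
      std::remove_if(Vec.begin(), Vec.end(),
                     [&](const T &V) { return !Seen.insert(V).second; });
  Vec.erase(NewEnd, Vec.end());
}

Order preservation costs a hash set but keeps test output stable, which is the stated motivation.

+static Function *getUseHolder(Module &M) {
+ FunctionType *ftype =
+ FunctionType::get(Type::getVoidTy(M.getContext()), true);
+ Function *Func = cast<Function>(M.getOrInsertFunction("__tmp_use", ftype));
+ return Func;
+}
+
+/// Insert holders so that each Value is obviously live through the entire
+/// lifetime of the call. 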
+static void insertUseHolderAfter(CallSite &CS, const ArrayRef<Value *> Values,
+ SmallVectorImpl<CallInst *> &holders) {
+ Module *M = CS.getInstruction()->getParent()->getParent()->getParent();
+ Function *Func = getUseHolder(*M);
+ if (CS.isCall()) {
+ // For call safepoints, insert dummy calls right after the safepoint
+ BasicBlock::iterator next(CS.getInstruction());
+ next++;
+ CallInst *base_holder = CallInst::Create(Func, Values, "", next);
+ holders.push_back(base_holder);
+ } else if (CS.isInvoke()) {
+ // For invoke safepoints, insert dummy calls in both the normal and
+ // exceptional destination blocks
+ InvokeInst *invoke = cast<InvokeInst>(CS.getInstruction());
+ CallInst *normal_holder = CallInst::Create(
+ Func, Values, "", invoke->getNormalDest()->getFirstInsertionPt());
+ CallInst *unwind_holder = CallInst::Create(
+ Func, Values, "", invoke->getUnwindDest()->getFirstInsertionPt());
+ holders.push_back(normal_holder);
+ holders.push_back(unwind_holder);
+ } else
+ llvm_unreachable("unsupported call type");
+}
+
+static void findLiveReferences(
+ Function &F, DominatorTree &DT, Pass *P, ArrayRef<CallSite> toUpdate,
+ MutableArrayRef<struct PartiallyConstructedSafepointRecord> records) {
+ for (size_t i = 0; i < records.size(); i++) {
+ struct PartiallyConstructedSafepointRecord &info = records[i];
+ const CallSite &CS = toUpdate[i];
+ analyzeParsePointLiveness(DT, CS, info);
+ }
+}
+
+static void addBasesAsLiveValues(StatepointLiveSetTy &liveset,
+ DenseMap<Value *, Value *> &PointerToBase) {
+ // Identify any base pointers which are used in this safepoint, but not
+ // themselves relocated. We need to relocate them so that later inserted
+ // safepoints can get the properly relocated base register.
+ DenseSet<Value *> missing;
+ for (Value *L : liveset) {
+ assert(PointerToBase.find(L) != PointerToBase.end());
+ Value *base = PointerToBase[L];
+ assert(base);
+ if (liveset.find(base) == liveset.end()) {
+ assert(PointerToBase.find(base) == PointerToBase.end());
+ // uniqued by set insert
+ missing.insert(base);
+ }
+ }
+
+ // Note that we want these at the end of the list, otherwise
+ // register placement gets screwed up once we lower to STATEPOINT
+ // instructions. This is an utter hack, but there doesn't seem to be a
+ // better one.
+ for (Value *base : missing) {
+ assert(base);
+ liveset.insert(base);
+ PointerToBase[base] = base;
+ }
+ assert(liveset.size() == PointerToBase.size());
+}
+
+static bool insertParsePoints(Function &F, DominatorTree &DT, Pass *P,
+ SmallVectorImpl<CallSite> &toUpdate) {
+#ifndef NDEBUG
+ // sanity check the input
+ std::set<CallSite> uniqued;
+ uniqued.insert(toUpdate.begin(), toUpdate.end());
+ assert(uniqued.size() == toUpdate.size() && "no duplicates please!");
+
+ for (size_t i = 0; i < toUpdate.size(); i++) {
+ CallSite &CS = toUpdate[i];
+ assert(CS.getInstruction()->getParent()->getParent() == &F);
+ assert(isStatepoint(CS) && "expected to already be a deopt statepoint");
+ }
+#endif
+
+ // A list of dummy calls added to the IR to keep various values obviously
+ // live in the IR. We'll remove all of these when done.
+ SmallVector<CallInst *, 64> holders;
+
+ // Insert a dummy call with all of the arguments to the vm_state we'll need
+ // for the actual safepoint insertion. This ensures reference arguments in
+ // the deopt argument list are considered live through the safepoint (and
+ // thus makes sure they get relocated). 
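
The holders are strictly temporary: they exist only so that liveness queries made between insertion and deletion see the pinned values as live. A minimal usage sketch (the wrapper name is hypothetical; insertUseHolderAfter and eraseFromParent are the calls the patch itself uses and tears down later in insertParsePoints):

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Pin V live across CS while some analysis runs, then remove every trace of
// the dummy calls again. ArrayRef converts from a single Value* implicitly.
static void withValuePinned(CallSite &CS, Value *V) {
  SmallVector<CallInst *, 4> Holders;
  insertUseHolderAfter(CS, V, Holders);
  // ... any liveness/base-pointer analysis here sees V as live across CS ...
  while (!Holders.empty())
    Holders.pop_back_val()->eraseFromParent();
}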
+ for (size_t i = 0; i < toUpdate.size(); i++) {
+ CallSite &CS = toUpdate[i];
+ Statepoint StatepointCS(CS);
+
+ SmallVector<Value *, 64> DeoptValues;
+ for (Use &U : StatepointCS.vm_state_args()) {
+ Value *Arg = cast<Value>(&U);
+ if (isGCPointerType(Arg->getType()))
+ DeoptValues.push_back(Arg);
+ }
+ insertUseHolderAfter(CS, DeoptValues, holders);
+ }
+
+ SmallVector<struct PartiallyConstructedSafepointRecord, 64> records;
+ records.reserve(toUpdate.size());
+ for (size_t i = 0; i < toUpdate.size(); i++) {
+ struct PartiallyConstructedSafepointRecord info;
+ records.push_back(info);
+ }
+ assert(records.size() == toUpdate.size());
+
+ // A) Identify all gc pointers which are statically live at the given call
+ // site.
+ findLiveReferences(F, DT, P, toUpdate, records);
+
+ // B) Find the base pointers for each live pointer
+ /* scope for caching */ {
+ // Cache the 'defining value' relation used in the computation and
+ // insertion of base phis and selects. This ensures that we don't insert
+ // large numbers of duplicate base_phis.
+ DefiningValueMapTy DVCache;
+
+ for (size_t i = 0; i < records.size(); i++) {
+ struct PartiallyConstructedSafepointRecord &info = records[i];
+ CallSite &CS = toUpdate[i];
+ findBasePointers(DT, DVCache, CS, info);
+ }
+ } // end of cache scope
+
+ // The base phi insertion logic (for any safepoint) may have inserted new
+ // instructions which are now live at some safepoint. The simplest such
+ // example is:
+ // loop:
+ // phi a <-- will be a new base_phi here
+ // safepoint 1 <-- that needs to be live here
+ // gep a + 1
+ // safepoint 2
+ // br loop
+ DenseSet<llvm::Value *> allInsertedDefs;
+ for (size_t i = 0; i < records.size(); i++) {
+ struct PartiallyConstructedSafepointRecord &info = records[i];
+ allInsertedDefs.insert(info.NewInsertedDefs.begin(),
+ info.NewInsertedDefs.end());
+ }
+
+ // We insert some dummy calls after each safepoint to definitely hold live
+ // the base pointers which were identified for that safepoint. We'll then
+ // ask liveness for _every_ base inserted to see what is now live. Then we
+ // remove the dummy calls.
+ holders.reserve(holders.size() + records.size());
+ for (size_t i = 0; i < records.size(); i++) {
+ struct PartiallyConstructedSafepointRecord &info = records[i];
+ CallSite &CS = toUpdate[i];
+
+ SmallVector<Value *, 128> Bases;
+ for (auto Pair : info.PointerToBase) {
+ Bases.push_back(Pair.second);
+ }
+ insertUseHolderAfter(CS, Bases, holders);
+ }
+
+ // Add the bases explicitly to the live vector set. This may result in a few
+ // extra relocations, but the base has to be available whenever a pointer
+ // derived from it is used. Thus, we need it to be part of the statepoint's
+ // gc arguments list. TODO: Introduce an explicit notion (in the following
+ // code) of the GC argument list as separate from the live Values at a
+ // given statepoint.
+ for (size_t i = 0; i < records.size(); i++) {
+ struct PartiallyConstructedSafepointRecord &info = records[i];
+ addBasesAsLiveValues(info.liveset, info.PointerToBase);
+ }
+
+ // If we inserted any new values, we need to adjust our notion of what is
+ // live at a particular safepoint. 
+ if (!allInsertedDefs.empty()) {
+ fixupLiveReferences(F, DT, P, allInsertedDefs, toUpdate, records);
+ }
+ if (PrintBasePointers) {
+ for (size_t i = 0; i < records.size(); i++) {
+ struct PartiallyConstructedSafepointRecord &info = records[i];
+ errs() << "Base Pairs: (w/Relocation)\n";
+ for (auto Pair : info.PointerToBase) {
+ errs() << " derived %" << Pair.first->getName() << " base %"
+ << Pair.second->getName() << "\n";
+ }
+ }
+ }
+ for (size_t i = 0; i < holders.size(); i++) {
+ holders[i]->eraseFromParent();
+ holders[i] = nullptr;
+ }
+ holders.clear();
+
+ // Now run through and replace the existing statepoints with new ones with
+ // the live variables listed. We do not yet update uses of the values being
+ // relocated. We have references to live variables that need to
+ // survive to the last iteration of this loop. (By construction, the
+ // previous statepoint cannot be a live variable, thus we can and do remove
+ // the old statepoint calls as we go.)
+ for (size_t i = 0; i < records.size(); i++) {
+ struct PartiallyConstructedSafepointRecord &info = records[i];
+ CallSite &CS = toUpdate[i];
+ makeStatepointExplicit(DT, CS, P, info);
+ }
+ toUpdate.clear(); // prevent accidental use of invalid CallSites
+
+ // If we inserted relocates in a different basic block than the original
+ // safepoint (this can happen for invokes), we need to be sure that the
+ // original values were not used in any of the phi nodes at the beginning
+ // of the basic block containing them. Because we know that all such blocks
+ // will have a single predecessor, we can safely assume that all phi nodes
+ // have a single entry (because of normalizeBBForInvokeSafepoint).
+ // Just remove them all here.
+ for (size_t i = 0; i < records.size(); i++) {
+ Instruction *I = records[i].StatepointToken;
+
+ if (InvokeInst *invoke = dyn_cast<InvokeInst>(I)) {
+ FoldSingleEntryPHINodes(invoke->getNormalDest());
+ assert(!isa<PHINode>(invoke->getNormalDest()->begin()));
+
+ FoldSingleEntryPHINodes(invoke->getUnwindDest());
+ assert(!isa<PHINode>(invoke->getUnwindDest()->begin()));
+ }
+ }
+
+ // Do all the fixups of the original live variables to their relocated selves
+ SmallVector<Value *, 128> live;
+ for (size_t i = 0; i < records.size(); i++) {
+ struct PartiallyConstructedSafepointRecord &info = records[i];
+ // We can't simply save the live set from the original insertion. One of
+ // the live values might be the result of a call which needs a safepoint.
+ // That Value* no longer exists and we need to use the new gc_result.
+ // Thankfully, the liveset is embedded in the statepoint (and updated), so
+ // we just grab that.
+ Statepoint statepoint(info.StatepointToken);
+ live.insert(live.end(), statepoint.gc_args_begin(),
+ statepoint.gc_args_end());
+ }
+ unique_unsorted(live);
+
+#ifndef NDEBUG
+ // sanity check
+ for (auto ptr : live) {
+ assert(isGCPointerType(ptr->getType()) && "must be a gc pointer type");
+ }
+#endif
+
+ relocationViaAlloca(F, DT, live, records);
+ return !records.empty();
+}
+
+/// Returns true if this function should be rewritten by this pass. The main
+/// point of this function is as an extension point for custom logic.
+static bool shouldRewriteStatepointsIn(Function &F) {
+ // TODO: This should check the GCStrategy
+ if (F.hasGC()) {
+ const std::string StatepointExampleName("statepoint-example");
+ return StatepointExampleName == F.getGC();
+ } else
+ return false;
+}
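
For orientation, the opt-in side of that check: a frontend (or a test) marks a function with the matching GC strategy via Function::setGC, and only such functions reach the rewrite below. A minimal sketch (the helper name is hypothetical):

#include "llvm/IR/Function.h"
using namespace llvm;

// Opt a function into the statepoint rewrite. Functions without this gc
// attribute are skipped by shouldRewriteStatepointsIn above.
static void optIntoStatepointExample(Function &F) {
  F.setGC("statepoint-example");
}

+
+bool RewriteStatepointsForGC::runOnFunction(Function &F) {
+ // Nothing to do for declarations. 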
+ if (F.isDeclaration() || F.empty())
+ return false;
+
+ // Policy choice says not to rewrite - the most common reason is that we're
+ // compiling code without a GCStrategy.
+ if (!shouldRewriteStatepointsIn(F))
+ return false;
+
+ // Gather all the statepoints which need to be rewritten.
+ SmallVector<CallSite, 64> ParsePointNeeded;
+ for (Instruction &I : inst_range(F)) {
+ // TODO: only the ones with the flag set!
+ if (isStatepoint(I))
+ ParsePointNeeded.push_back(CallSite(&I));
+ }
+
+ // Return early if no work to do.
+ if (ParsePointNeeded.empty())
+ return false;
+
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ return insertParsePoints(F, DT, this, ParsePointNeeded);
+}
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index cfc9a8e..05b9608 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -35,7 +35,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include <algorithm>
@@ -1504,7 +1504,7 @@ namespace {
 ///
 struct SCCP : public FunctionPass {
 void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetLibraryInfo>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
 }
 static char ID; // Pass identification, replacement for typeid
 SCCP() : FunctionPass(ID) {
@@ -1563,7 +1563,8 @@ bool SCCP::runOnFunction(Function &F) {
 DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n");
 const DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
 const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
- const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
+ const TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
 SCCPSolver Solver(DL, TLI);
 // Mark the first block of the function as being executable.
@@ -1637,7 +1638,7 @@ namespace {
 ///
 struct IPSCCP : public ModulePass {
 void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetLibraryInfo>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
 }
 static char ID;
 IPSCCP() : ModulePass(ID) {
@@ -1651,7 +1652,7 @@ char IPSCCP::ID = 0;
 INITIALIZE_PASS_BEGIN(IPSCCP, "ipsccp",
 "Interprocedural Sparse Conditional Constant Propagation",
 false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_END(IPSCCP, "ipsccp",
 "Interprocedural Sparse Conditional Constant Propagation",
 false, false)
@@ -1692,7 +1693,8 @@ static bool AddressIsTaken(const GlobalValue *GV) {
 bool IPSCCP::runOnModule(Module &M) {
 DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
 const DataLayout *DL = DLP ? 
&DLP->getDataLayout() : nullptr; - const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); + const TargetLibraryInfo *TLI = + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); SCCPSolver Solver(DL, TLI); // AddressTakenFunctions - This set keeps track of the address-taken functions diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index 6135114..f69c750 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -28,7 +28,7 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AssumptionTracker.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/PtrUseVisitor.h" #include "llvm/Analysis/ValueTracking.h" @@ -79,8 +79,8 @@ STATISTIC(NumVectorized, "Number of vectorized aggregates"); /// Hidden option to force the pass to not use DomTree and mem2reg, instead /// forming SSA values through the SSAUpdater infrastructure. -static cl::opt<bool> -ForceSSAUpdater("force-ssa-updater", cl::init(false), cl::Hidden); +static cl::opt<bool> ForceSSAUpdater("force-ssa-updater", cl::init(false), + cl::Hidden); /// Hidden option to enable randomly shuffling the slices to help uncover /// instability in their order. @@ -89,15 +89,15 @@ static cl::opt<bool> SROARandomShuffleSlices("sroa-random-shuffle-slices", /// Hidden option to experiment with completely strict handling of inbounds /// GEPs. -static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", - cl::init(false), cl::Hidden); +static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", cl::init(false), + cl::Hidden); namespace { /// \brief A custom IRBuilder inserter which prefixes all names if they are /// preserved. template <bool preserveNames = true> -class IRBuilderPrefixedInserter : - public IRBuilderDefaultInserter<preserveNames> { +class IRBuilderPrefixedInserter + : public IRBuilderDefaultInserter<preserveNames> { std::string Prefix; public: @@ -113,19 +113,19 @@ protected: // Specialization for not preserving the name is trivial. template <> -class IRBuilderPrefixedInserter<false> : - public IRBuilderDefaultInserter<false> { +class IRBuilderPrefixedInserter<false> + : public IRBuilderDefaultInserter<false> { public: void SetNamePrefix(const Twine &P) {} }; /// \brief Provide a typedef for IRBuilder that drops names in release builds. #ifndef NDEBUG -typedef llvm::IRBuilder<true, ConstantFolder, - IRBuilderPrefixedInserter<true> > IRBuilderTy; +typedef llvm::IRBuilder<true, ConstantFolder, IRBuilderPrefixedInserter<true>> + IRBuilderTy; #else -typedef llvm::IRBuilder<false, ConstantFolder, - IRBuilderPrefixedInserter<false> > IRBuilderTy; +typedef llvm::IRBuilder<false, ConstantFolder, IRBuilderPrefixedInserter<false>> + IRBuilderTy; #endif } @@ -171,10 +171,14 @@ public: /// decreasing. Thus the spanning range comes first in a cluster with the /// same start position. 
bool operator<(const Slice &RHS) const {
- if (beginOffset() < RHS.beginOffset()) return true;
- if (beginOffset() > RHS.beginOffset()) return false;
- if (isSplittable() != RHS.isSplittable()) return !isSplittable();
- if (endOffset() > RHS.endOffset()) return true;
+ if (beginOffset() < RHS.beginOffset())
+ return true;
+ if (beginOffset() > RHS.beginOffset())
+ return false;
+ if (isSplittable() != RHS.isSplittable())
+ return !isSplittable();
+ if (endOffset() > RHS.endOffset())
+ return true;
 return false;
 }
@@ -198,9 +202,7 @@ public:
 namespace llvm {
 template <typename T> struct isPodLike;
-template <> struct isPodLike<Slice> {
- static const bool value = true;
-};
+template <> struct isPodLike<Slice> { static const bool value = true; };
 }
 namespace {
@@ -235,6 +237,298 @@ public:
 const_iterator end() const { return Slices.end(); }
 /// @}
+ /// \brief Erase a range of slices.
+ void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }
+
+ /// \brief Insert new slices for this alloca.
+ ///
+ /// This moves the slices into the alloca's slices collection, and re-sorts
+ /// everything so that the usual ordering properties of the alloca's slices
+ /// hold.
+ void insert(ArrayRef<Slice> NewSlices) {
+ int OldSize = Slices.size();
+ std::move(NewSlices.begin(), NewSlices.end(), std::back_inserter(Slices));
+ auto SliceI = Slices.begin() + OldSize;
+ std::sort(SliceI, Slices.end());
+ std::inplace_merge(Slices.begin(), SliceI, Slices.end());
+ }
+
+ // Forward declare an iterator to befriend it.
+ class partition_iterator;
+
+ /// \brief A partition of the slices.
+ ///
+ /// An ephemeral representation for a range of slices which can be viewed as
+ /// a partition of the alloca. This range represents a span of the alloca's
+ /// memory which cannot be split, and provides access to all of the slices
+ /// overlapping some part of the partition.
+ ///
+ /// Objects of this type are produced by traversing the alloca's slices, but
+ /// are only ephemeral and not persistent.
+ class Partition {
+ private:
+ friend class AllocaSlices;
+ friend class AllocaSlices::partition_iterator;
+
+ /// \brief The beginning and ending offsets of the alloca for this partition.
+ uint64_t BeginOffset, EndOffset;
+
+ /// \brief The start and end iterators of this partition.
+ iterator SI, SJ;
+
+ /// \brief A collection of split slice tails overlapping the partition.
+ SmallVector<Slice *, 4> SplitTails;
+
+ /// \brief Raw constructor builds an empty partition starting and ending at
+ /// the given iterator.
+ Partition(iterator SI) : SI(SI), SJ(SI) {}
+
+ public:
+ /// \brief The start offset of this partition.
+ ///
+ /// All of the contained slices start at or after this offset.
+ uint64_t beginOffset() const { return BeginOffset; }
+
+ /// \brief The end offset of this partition.
+ ///
+ /// All of the contained slices end at or before this offset.
+ uint64_t endOffset() const { return EndOffset; }
+
+ /// \brief The size of the partition.
+ ///
+ /// Note that this can never be zero.
+ uint64_t size() const {
+ assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
+ return EndOffset - BeginOffset;
+ }
+
+ /// \brief Test whether this partition contains no slices, and merely spans
+ /// a region occupied by split slices.
+ bool empty() const { return SI == SJ; }
+
+ /// \name Iterate slices that start within the partition.
+ /// These may be splittable or unsplittable. They have a begin offset >= the
+ /// partition begin offset. 
+ /// @{
+ // FIXME: We should probably define a "concat_iterator" helper and use that
+ // to stitch together pointee_iterators over the split tails and the
+ // contiguous iterators of the partition. That would give a much nicer
+ // interface here. We could then additionally expose filtered iterators for
+ // split, unsplit, and unsplittable slices based on the usage patterns.
+ iterator begin() const { return SI; }
+ iterator end() const { return SJ; }
+ /// @}
+
+ /// \brief Get the sequence of split slice tails.
+ ///
+ /// These tails are of slices which start before this partition but are
+ /// split and overlap into the partition. We accumulate these while forming
+ /// partitions.
+ ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
+ };
+
+ /// \brief An iterator over partitions of the alloca's slices.
+ ///
+ /// This iterator implements the core algorithm for partitioning the alloca's
+ /// slices. It is a forward iterator as we don't support backtracking for
+ /// efficiency reasons, and re-use a single storage area to maintain the
+ /// current set of split slices.
+ ///
+ /// It is templated on the slice iterator type to use so that it can operate
+ /// with either const or non-const slice iterators.
+ class partition_iterator
+ : public iterator_facade_base<partition_iterator,
+ std::forward_iterator_tag, Partition> {
+ friend class AllocaSlices;
+
+ /// \brief Most of the state for walking the partitions is held in a class
+ /// with a nice interface for examining them.
+ Partition P;
+
+ /// \brief We need to keep the end of the slices to know when to stop.
+ AllocaSlices::iterator SE;
+
+ /// \brief We also need to keep track of the maximum split end offset seen.
+ /// FIXME: Do we really?
+ uint64_t MaxSplitSliceEndOffset;
+
+ /// \brief Sets the partition to be empty at the given iterator, and sets
+ /// the end iterator.
+ partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
+ : P(SI), SE(SE), MaxSplitSliceEndOffset(0) {
+ // If not already at the end, advance our state to form the initial
+ // partition.
+ if (SI != SE)
+ advance();
+ }
+
+ /// \brief Advance the iterator to the next partition.
+ ///
+ /// Requires that the iterator not be at the end of the slices.
+ void advance() {
+ assert((P.SI != SE || !P.SplitTails.empty()) &&
+ "Cannot advance past the end of the slices!");
+
+ // Clear out any split uses which have ended.
+ if (!P.SplitTails.empty()) {
+ if (P.EndOffset >= MaxSplitSliceEndOffset) {
+ // If we've finished all splits, this is easy.
+ P.SplitTails.clear();
+ MaxSplitSliceEndOffset = 0;
+ } else {
+ // Remove the uses which have ended in the prior partition. This
+ // cannot change the max split slice end because we just checked that
+ // the prior partition ended prior to that max.
+ P.SplitTails.erase(
+ std::remove_if(
+ P.SplitTails.begin(), P.SplitTails.end(),
+ [&](Slice *S) { return S->endOffset() <= P.EndOffset; }),
+ P.SplitTails.end());
+ assert(std::any_of(P.SplitTails.begin(), P.SplitTails.end(),
+ [&](Slice *S) {
+ return S->endOffset() == MaxSplitSliceEndOffset;
+ }) &&
+ "Could not find the current max split slice offset!");
+ assert(std::all_of(P.SplitTails.begin(), P.SplitTails.end(),
+ [&](Slice *S) {
+ return S->endOffset() <= MaxSplitSliceEndOffset;
+ }) &&
+ "Max split slice end offset is not actually the max!");
+ }
+ }
+
+ // If P.SI is already at the end, then we've cleared the split tail and
+ // now have an end iterator. 
+ if (P.SI == SE) {
+ assert(P.SplitTails.empty() && "Failed to clear the split slices!");
+ return;
+ }
+
+ // If we had a non-empty partition previously, set up the state for
+ // subsequent partitions.
+ if (P.SI != P.SJ) {
+ // Accumulate all the splittable slices which started in the old
+ // partition into the split list.
+ for (Slice &S : P)
+ if (S.isSplittable() && S.endOffset() > P.EndOffset) {
+ P.SplitTails.push_back(&S);
+ MaxSplitSliceEndOffset =
+ std::max(S.endOffset(), MaxSplitSliceEndOffset);
+ }
+
+ // Start from the end of the previous partition.
+ P.SI = P.SJ;
+
+ // If P.SI is now at the end, we at most have a tail of split slices.
+ if (P.SI == SE) {
+ P.BeginOffset = P.EndOffset;
+ P.EndOffset = MaxSplitSliceEndOffset;
+ return;
+ }
+
+ // If we have split slices and the next slice is after a gap and is
+ // not splittable, immediately form an empty partition for the split
+ // slices up until the next slice begins.
+ if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
+ !P.SI->isSplittable()) {
+ P.BeginOffset = P.EndOffset;
+ P.EndOffset = P.SI->beginOffset();
+ return;
+ }
+ }
+
+ // OK, we need to consume new slices. Set the end offset based on the
+ // current slice, and step SJ past it. The beginning offset of the
+ // partition is the beginning offset of the next slice unless we have
+ // pre-existing split slices that are continuing, in which case we begin
+ // at the prior end offset.
+ P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
+ P.EndOffset = P.SI->endOffset();
+ ++P.SJ;
+
+ // There are two strategies to form a partition based on whether the
+ // partition starts with an unsplittable slice or a splittable slice.
+ if (!P.SI->isSplittable()) {
+ // When we're forming an unsplittable region, it must always start at
+ // the first slice and will extend through its end.
+ assert(P.BeginOffset == P.SI->beginOffset());
+
+ // Form a partition including all of the overlapping slices with this
+ // unsplittable slice.
+ while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
+ if (!P.SJ->isSplittable())
+ P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
+ ++P.SJ;
+ }
+
+ // We have a partition across a set of overlapping unsplittable
+ // slices.
+ return;
+ }
+
+ // If we're starting with a splittable slice, then we need to form
+ // a synthetic partition spanning it and any other overlapping splittable
+ // slices.
+ assert(P.SI->isSplittable() && "Forming a splittable partition!");
+
+ // Collect all of the overlapping splittable slices.
+ while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
+ P.SJ->isSplittable()) {
+ P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
+ ++P.SJ;
+ }
+
+ // Back up P.EndOffset if we ended the span early when encountering an
+ // unsplittable slice. This synthesizes the early end offset of
+ // a partition spanning only splittable slices.
+ if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
+ assert(!P.SJ->isSplittable());
+ P.EndOffset = P.SJ->beginOffset();
+ }
+ }
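
The bookkeeping in advance() is dense, so a much-reduced standalone model may help. If every slice were unsplittable, partitioning would reduce to merging overlapping intervals of a begin-sorted list; the split-tail threading, empty partitions over gaps, and the splittable/unsplittable clamping above all layer on top of this core sweep. A sketch (illustrative only; names hypothetical):

#include <algorithm>
#include <cstdint>
#include <vector>

struct Interval { uint64_t Begin, End; }; // analogous to an unsplittable Slice

// Given intervals sorted by Begin, emit maximal overlapping clusters. A new
// interval either strictly overlaps the current cluster (and may grow its
// end) or starts a fresh one; half-open ranges mean touching intervals do
// not merge, matching the partition semantics above.
std::vector<Interval> partitionIntervals(const std::vector<Interval> &Sorted) {
  std::vector<Interval> Parts;
  for (const Interval &I : Sorted) {
    if (!Parts.empty() && I.Begin < Parts.back().End)
      Parts.back().End = std::max(Parts.back().End, I.End); // overlaps: grow
    else
      Parts.push_back(I); // disjoint: start a new partition
  }
  return Parts;
}

+
+ public:
+ bool operator==(const partition_iterator &RHS) const {
+ assert(SE == RHS.SE &&
+ "End iterators don't match between compared partition iterators!");
+
+ // The observed positions of partitions are marked by the P.SI iterator and
+ // the emptiness of the split slices. The latter is only relevant when
+ // P.SI == SE, as the end iterator will additionally have an empty split
+ // slices list, but the prior may have the same P.SI and a tail of split
+ // slices. 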
+ if (P.SI == RHS.P.SI && + P.SplitTails.empty() == RHS.P.SplitTails.empty()) { + assert(P.SJ == RHS.P.SJ && + "Same set of slices formed two different sized partitions!"); + assert(P.SplitTails.size() == RHS.P.SplitTails.size() && + "Same slice position with differently sized non-empty split " + "slice tails!"); + return true; + } + return false; + } + + partition_iterator &operator++() { + advance(); + return *this; + } + + Partition &operator*() { return P; } + }; + + /// \brief A forward range over the partitions of the alloca's slices. + /// + /// This accesses an iterator range over the partitions of the alloca's + /// slices. It computes these partitions on the fly based on the overlapping + /// offsets of the slices and the ability to split them. It will visit "empty" + /// partitions to cover regions of the alloca only accessed via split + /// slices. + iterator_range<partition_iterator> partitions() { + return make_range(partition_iterator(begin(), end()), + partition_iterator(end(), end())); + } + /// \brief Access the dead users for this alloca. ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; } @@ -308,7 +602,7 @@ static Value *foldSelectInst(SelectInst &SI) { // being selected between, fold the select. Yes this does (rarely) happen // early on. if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition())) - return SI.getOperand(1+CI->isZero()); + return SI.getOperand(1 + CI->isZero()); if (SI.getOperand(1) == SI.getOperand(2)) return SI.getOperand(1); @@ -421,7 +715,8 @@ private: GEPOffset += APInt(Offset.getBitWidth(), SL->getElementOffset(ElementIdx)); } else { - // For array or vector indices, scale the index by the size of the type. + // For array or vector indices, scale the index by the size of the + // type. APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth()); GEPOffset += Index * APInt(Offset.getBitWidth(), DL.getTypeAllocSize(GTI.getIndexedType())); @@ -440,16 +735,10 @@ private: void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset, uint64_t Size, bool IsVolatile) { - // We allow splitting of loads and stores where the type is an integer type - // and cover the entire alloca. This prevents us from splitting over - // eagerly. - // FIXME: In the great blue eventually, we should eagerly split all integer - // loads and stores, and then have a separate step that merges adjacent - // alloca partitions into a single partition suitable for integer widening. - // Or we should skip the merge step and rely on GVN and other passes to - // merge adjacent loads and stores that survive mem2reg. - bool IsSplittable = - Ty->isIntegerTy() && !IsVolatile && Offset == 0 && Size >= AllocSize; + // We allow splitting of non-volatile loads and stores where the type is an + // integer type. These may be used to implement 'memcpy' or other "transfer + // of bits" patterns. + bool IsSplittable = Ty->isIntegerTy() && !IsVolatile; insertUse(I, Offset, Size, IsSplittable); } @@ -495,7 +784,6 @@ private: handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile()); } - void visitMemSetInst(MemSetInst &II) { assert(II.getRawDest() == *U && "Pointer use is not the destination?"); ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); @@ -507,9 +795,8 @@ private: if (!IsOffsetKnown) return PI.setAborted(&II); - insertUse(II, Offset, - Length ? Length->getLimitedValue() - : AllocSize - Offset.getLimitedValue(), + insertUse(II, Offset, Length ? 
Length->getLimitedValue() + : AllocSize - Offset.getLimitedValue(), (bool)Length); } @@ -533,15 +820,15 @@ private: // FIXME: Yet another place we really should bypass this when // instrumenting for ASan. if (Offset.uge(AllocSize)) { - SmallDenseMap<Instruction *, unsigned>::iterator MTPI = MemTransferSliceMap.find(&II); + SmallDenseMap<Instruction *, unsigned>::iterator MTPI = + MemTransferSliceMap.find(&II); if (MTPI != MemTransferSliceMap.end()) AS.Slices[MTPI->second].kill(); return markAsDead(II); } uint64_t RawOffset = Offset.getLimitedValue(); - uint64_t Size = Length ? Length->getLimitedValue() - : AllocSize - RawOffset; + uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset; // Check for the special case where the same exact value is used for both // source and dest. @@ -697,18 +984,12 @@ private: insertUse(I, Offset, Size); } - void visitPHINode(PHINode &PN) { - visitPHINodeOrSelectInst(PN); - } + void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(PN); } - void visitSelectInst(SelectInst &SI) { - visitPHINodeOrSelectInst(SI); - } + void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); } /// \brief Disable SROA entirely if there are unhandled users of the alloca. - void visitInstruction(Instruction &I) { - PI.setAborted(&I); - } + void visitInstruction(Instruction &I) { PI.setAborted(&I); } }; AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI) @@ -729,7 +1010,9 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI) } Slices.erase(std::remove_if(Slices.begin(), Slices.end(), - std::mem_fun_ref(&Slice::isDead)), + [](const Slice &S) { + return S.isDead(); + }), Slices.end()); #if __cplusplus >= 201103L && !defined(NDEBUG) @@ -749,6 +1032,7 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI) void AllocaSlices::print(raw_ostream &OS, const_iterator I, StringRef Indent) const { printSlice(OS, I, Indent); + OS << "\n"; printUse(OS, I, Indent); } @@ -756,7 +1040,7 @@ void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I, StringRef Indent) const { OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")" << " slice #" << (I - begin()) - << (I->isSplittable() ? " (splittable)" : "") << "\n"; + << (I->isSplittable() ? " (splittable)" : ""); } void AllocaSlices::printUse(raw_ostream &OS, const_iterator I, @@ -804,15 +1088,17 @@ public: AllocaInst &AI, DIBuilder &DIB) : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {} - void run(const SmallVectorImpl<Instruction*> &Insts) { + void run(const SmallVectorImpl<Instruction *> &Insts) { // Retain the debug information attached to the alloca for use when // rewriting loads and stores. 
- if (MDNode *DebugNode = MDNode::getIfExists(AI.getContext(), &AI)) {
- for (User *U : DebugNode->users())
- if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U))
- DDIs.push_back(DDI);
- else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U))
- DVIs.push_back(DVI);
+ if (auto *L = LocalAsMetadata::getIfExists(&AI)) {
+ if (auto *DebugNode = MetadataAsValue::getIfExists(AI.getContext(), L)) {
+ for (User *U : DebugNode->users())
+ if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U))
+ DDIs.push_back(DDI);
+ else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U))
+ DVIs.push_back(DVI);
+ }
 }
 LoadAndStorePromoter::run(Insts);
@@ -825,8 +1111,9 @@ public:
 DVIs.pop_back_val()->eraseFromParent();
 }
- bool isInstInList(Instruction *I,
- const SmallVectorImpl<Instruction*> &Insts) const override {
+ bool
+ isInstInList(Instruction *I,
+ const SmallVectorImpl<Instruction *> &Insts) const override {
 Value *Ptr;
 if (LoadInst *LI = dyn_cast<LoadInst>(I))
 Ptr = LI->getOperand(0);
@@ -884,7 +1171,6 @@ public:
 };
 } // end anon namespace
-
 namespace {
 /// \brief An optimization pass providing Scalar Replacement of Aggregates.
 ///
@@ -910,7 +1196,7 @@ class SROA : public FunctionPass {
 LLVMContext *C;
 const DataLayout *DL;
 DominatorTree *DT;
- AssumptionTracker *AT;
+ AssumptionCache *AC;
 /// \brief Worklist of alloca instructions to simplify.
 ///
@@ -919,12 +1205,12 @@ class SROA : public FunctionPass {
 /// directly promoted. Finally, each time we rewrite a use of an alloca other
 /// than the one being actively rewritten, we add it back onto the list if not
 /// already present to ensure it is re-visited.
- SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > Worklist;
+ SetVector<AllocaInst *, SmallVector<AllocaInst *, 16>> Worklist;
 /// \brief A collection of instructions to delete.
 /// We try to batch deletions to simplify code and make things a bit more
 /// efficient.
- SetVector<Instruction *, SmallVector<Instruction *, 8> > DeadInsts;
+ SetVector<Instruction *, SmallVector<Instruction *, 8>> DeadInsts;
 /// \brief Post-promotion worklist.
 ///
@@ -934,7 +1220,7 @@ class SROA : public FunctionPass {
 ///
 /// Note that we have to be very careful to clear allocas out of this list in
 /// the event they are deleted.
- SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > PostPromotionWorklist;
+ SetVector<AllocaInst *, SmallVector<AllocaInst *, 16>> PostPromotionWorklist;
 /// \brief A collection of alloca instructions we can directly promote.
 std::vector<AllocaInst *> PromotableAllocas;
@@ -944,7 +1230,7 @@ class SROA : public FunctionPass {
 /// All of these PHIs have been checked for the safety of speculation and by
 /// being speculated will allow promoting allocas currently in the promotable
 /// queue.
- SetVector<PHINode *, SmallVector<PHINode *, 2> > SpeculatablePHIs;
+ SetVector<PHINode *, SmallVector<PHINode *, 2>> SpeculatablePHIs;
 /// \brief A worklist of select instructions to speculate prior to promoting
 /// allocas.
 ///
 /// All of these select instructions have been checked for the safety of
 /// speculation and by being speculated will allow promoting allocas
 /// currently in the promotable queue. 
- SetVector<SelectInst *, SmallVector<SelectInst *, 2> > SpeculatableSelects; + SetVector<SelectInst *, SmallVector<SelectInst *, 2>> SpeculatableSelects; public: SROA(bool RequiresDomTree = true) - : FunctionPass(ID), RequiresDomTree(RequiresDomTree), - C(nullptr), DL(nullptr), DT(nullptr) { + : FunctionPass(ID), RequiresDomTree(RequiresDomTree), C(nullptr), + DL(nullptr), DT(nullptr) { initializeSROAPass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override; @@ -970,10 +1256,9 @@ private: friend class PHIOrSelectSpeculator; friend class AllocaSliceRewriter; - bool rewritePartition(AllocaInst &AI, AllocaSlices &AS, - AllocaSlices::iterator B, AllocaSlices::iterator E, - int64_t BeginOffset, int64_t EndOffset, - ArrayRef<AllocaSlices::iterator> SplitUses); + bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS); + AllocaInst *rewritePartition(AllocaInst &AI, AllocaSlices &AS, + AllocaSlices::Partition &P); bool splitAlloca(AllocaInst &AI, AllocaSlices &AS); bool runOnAlloca(AllocaInst &AI); void clobberUse(Use &U); @@ -988,12 +1273,12 @@ FunctionPass *llvm::createSROAPass(bool RequiresDomTree) { return new SROA(RequiresDomTree); } -INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates", - false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionTracker) +INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates", false, + false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates", - false, false) +INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates", false, + false) /// Walk the range of a partitioning looking for a common type to cover this /// sequence of slices. @@ -1064,8 +1349,7 @@ static Type *findCommonType(AllocaSlices::const_iterator B, /// /// FIXME: This should be hoisted into a generic utility, likely in /// Transforms/Util/Local.h -static bool isSafePHIToSpeculate(PHINode &PN, - const DataLayout *DL = nullptr) { +static bool isSafePHIToSpeculate(PHINode &PN, const DataLayout *DL = nullptr) { // For now, we can only do this promotion if the load is in the same block // as the PHI, and if there are no stores between the phi and load. // TODO: Allow recursive phi users. @@ -1325,7 +1609,8 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL, SmallVectorImpl<Value *> &Indices, Twine NamePrefix) { if (Offset == 0) - return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices, NamePrefix); + return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices, + NamePrefix); // We can't recurse through pointer types. if (Ty->isPointerTy()) @@ -1433,8 +1718,7 @@ static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL, /// a single GEP as possible, thus making each GEP more independent of the /// surrounding code. static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, - APInt Offset, Type *PointerTy, - Twine NamePrefix) { + APInt Offset, Type *PointerTy, Twine NamePrefix) { // Even though we don't look through PHI nodes, we could be called on an // instruction in an unreachable block, which may be on a cycle. SmallPtrSet<Value *, 4> Visited; @@ -1443,8 +1727,9 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, // We may end up computing an offset pointer that has the wrong type. 
If we
 // never are able to compute one directly that has the correct type, we'll
- // fall back to it, so keep it around here.
+ // fall back to it, so keep it and the base it was computed from around here.
 Value *OffsetPtr = nullptr;
+ Value *OffsetBasePtr = nullptr;
 // Remember any i8 pointer we come across to re-use if we need to do a raw
 // byte offset.
@@ -1469,16 +1754,19 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
 Indices.clear();
 if (Value *P = getNaturalGEPWithOffset(IRB, DL, Ptr, Offset, TargetTy,
 Indices, NamePrefix)) {
- if (P->getType() == PointerTy) {
- // Zap any offset pointer that we ended up computing in previous rounds.
- if (OffsetPtr && OffsetPtr->use_empty())
- if (Instruction *I = dyn_cast<Instruction>(OffsetPtr))
- I->eraseFromParent();
+ // If we have a new natural pointer at the offset, clear out any old
+ // offset pointer we computed. Unless it is the base pointer or
+ // a non-instruction, we built a GEP we don't need. Zap it.
+ if (OffsetPtr && OffsetPtr != OffsetBasePtr)
+ if (Instruction *I = dyn_cast<Instruction>(OffsetPtr)) {
+ assert(I->use_empty() && "Built a GEP with uses somehow!");
+ I->eraseFromParent();
+ }
+ OffsetPtr = P;
+ OffsetBasePtr = Ptr;
+ // If we also found a pointer of the right type, we're done.
+ if (P->getType() == PointerTy)
 return P;
- }
- if (!OffsetPtr) {
- OffsetPtr = P;
- }
 }
 // Stash this pointer if we've found an i8*.
@@ -1508,9 +1796,10 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
 Int8PtrOffset = Offset;
 }
- OffsetPtr = Int8PtrOffset == 0 ? Int8Ptr :
- IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset),
- NamePrefix + "sroa_raw_idx");
+ OffsetPtr = Int8PtrOffset == 0
+ ? Int8Ptr
+ : IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset),
+ NamePrefix + "sroa_raw_idx");
 }
 Ptr = OffsetPtr;
@@ -1521,6 +1810,27 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
 return Ptr;
 }
+/// \brief Compute the adjusted alignment for a load or store from an offset.
+static unsigned getAdjustedAlignment(Instruction *I, uint64_t Offset,
+ const DataLayout &DL) {
+ unsigned Alignment;
+ Type *Ty;
+ if (auto *LI = dyn_cast<LoadInst>(I)) {
+ Alignment = LI->getAlignment();
+ Ty = LI->getType();
+ } else if (auto *SI = dyn_cast<StoreInst>(I)) {
+ Alignment = SI->getAlignment();
+ Ty = SI->getValueOperand()->getType();
+ } else {
+ llvm_unreachable("Only loads and stores are allowed!");
+ }
+
+ if (!Alignment)
+ Alignment = DL.getABITypeAlignment(Ty);
+
+ return MinAlign(Alignment, Offset);
+}
+
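A worked example of the MinAlign fold on the last line above, using a standalone model of llvm::MinAlign from MathExtras.h (which returns the lowest set bit of A|B, the strongest alignment guaranteed for a pointer of alignment A advanced by B bytes):

#include <cassert>
#include <cstdint>

// Standalone model of llvm::MinAlign: lowest set bit of A|B.
static uint64_t minAlign(uint64_t A, uint64_t B) {
  return (A | B) & (1 + ~(A | B));
}

int main() {
  assert(minAlign(8, 0) == 8); // offset 0 keeps the full alignment
  assert(minAlign(8, 4) == 4); // 8-aligned base + 4 -> only 4-aligned
  assert(minAlign(8, 6) == 2); // 8-aligned base + 6 -> only 2-aligned
  return 0;
}

 /// \brief Test whether we can convert a value from the old to the new type.
 ///
 /// This predicate should be used to guard calls to convertValue in order to
@@ -1614,19 +1924,19 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
 ///
 /// This function is called to test each entry in a partitioning which is
 /// slated for a single slice.
-static bool
-isVectorPromotionViableForSlice(const DataLayout &DL, uint64_t SliceBeginOffset,
- uint64_t SliceEndOffset, VectorType *Ty,
- uint64_t ElementSize, const Slice &S) {
+static bool isVectorPromotionViableForSlice(AllocaSlices::Partition &P,
+ const Slice &S, VectorType *Ty,
+ uint64_t ElementSize,
+ const DataLayout &DL) {
 // First validate the slice offsets. 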
uint64_t BeginOffset = - std::max(S.beginOffset(), SliceBeginOffset) - SliceBeginOffset; + std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset(); uint64_t BeginIndex = BeginOffset / ElementSize; if (BeginIndex * ElementSize != BeginOffset || BeginIndex >= Ty->getNumElements()) return false; uint64_t EndOffset = - std::min(S.endOffset(), SliceEndOffset) - SliceBeginOffset; + std::min(S.endOffset(), P.endOffset()) - P.beginOffset(); uint64_t EndIndex = EndOffset / ElementSize; if (EndIndex * ElementSize != EndOffset || EndIndex > Ty->getNumElements()) return false; @@ -1658,7 +1968,7 @@ isVectorPromotionViableForSlice(const DataLayout &DL, uint64_t SliceBeginOffset, if (LI->isVolatile()) return false; Type *LTy = LI->getType(); - if (SliceBeginOffset > S.beginOffset() || SliceEndOffset < S.endOffset()) { + if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) { assert(LTy->isIntegerTy()); LTy = SplitIntTy; } @@ -1668,7 +1978,7 @@ isVectorPromotionViableForSlice(const DataLayout &DL, uint64_t SliceBeginOffset, if (SI->isVolatile()) return false; Type *STy = SI->getValueOperand()->getType(); - if (SliceBeginOffset > S.beginOffset() || SliceEndOffset < S.endOffset()) { + if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) { assert(STy->isIntegerTy()); STy = SplitIntTy; } @@ -1690,11 +2000,8 @@ isVectorPromotionViableForSlice(const DataLayout &DL, uint64_t SliceBeginOffset, /// SSA value. We only can ensure this for a limited set of operations, and we /// don't want to do the rewrites unless we are confident that the result will /// be promotable, so we have an early test here. -static VectorType * -isVectorPromotionViable(const DataLayout &DL, Type *AllocaTy, - uint64_t SliceBeginOffset, uint64_t SliceEndOffset, - AllocaSlices::const_range Slices, - ArrayRef<AllocaSlices::iterator> SplitUses) { +static VectorType *isVectorPromotionViable(AllocaSlices::Partition &P, + const DataLayout &DL) { // Collect the candidate types for vector-based promotion. Also track whether // we have different element types. SmallVector<VectorType *, 4> CandidateTys; @@ -1709,11 +2016,10 @@ isVectorPromotionViable(const DataLayout &DL, Type *AllocaTy, HaveCommonEltTy = false; } }; - CheckCandidateType(AllocaTy); // Consider any loads or stores that are the exact size of the slice. 
- for (const auto &S : Slices) - if (S.beginOffset() == SliceBeginOffset && - S.endOffset() == SliceEndOffset) { + for (const Slice &S : P) + if (S.beginOffset() == P.beginOffset() && + S.endOffset() == P.endOffset()) { if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser())) CheckCandidateType(LI->getType()); else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser())) @@ -1780,14 +2086,12 @@ isVectorPromotionViable(const DataLayout &DL, Type *AllocaTy, "vector size not a multiple of element size?"); ElementSize /= 8; - for (const auto &S : Slices) - if (!isVectorPromotionViableForSlice(DL, SliceBeginOffset, SliceEndOffset, - VTy, ElementSize, S)) + for (const Slice &S : P) + if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL)) return false; - for (const auto &SI : SplitUses) - if (!isVectorPromotionViableForSlice(DL, SliceBeginOffset, SliceEndOffset, - VTy, ElementSize, *SI)) + for (const Slice *S : P.splitSliceTails()) + if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL)) return false; return true; @@ -1803,12 +2107,13 @@ isVectorPromotionViable(const DataLayout &DL, Type *AllocaTy, /// /// This implements the necessary checking for the \c isIntegerWideningViable /// test below on a single slice of the alloca. -static bool isIntegerWideningViableForSlice(const DataLayout &DL, - Type *AllocaTy, +static bool isIntegerWideningViableForSlice(const Slice &S, uint64_t AllocBeginOffset, - uint64_t Size, - const Slice &S, + Type *AllocaTy, + const DataLayout &DL, bool &WholeAllocaOp) { + uint64_t Size = DL.getTypeStoreSize(AllocaTy); + uint64_t RelBegin = S.beginOffset() - AllocBeginOffset; uint64_t RelEnd = S.endOffset() - AllocBeginOffset; @@ -1876,11 +2181,8 @@ static bool isIntegerWideningViableForSlice(const DataLayout &DL, /// This is a quick test to check whether we can rewrite the integer loads and /// stores to a particular alloca into wider loads and stores and be able to /// promote the resulting alloca. -static bool -isIntegerWideningViable(const DataLayout &DL, Type *AllocaTy, - uint64_t AllocBeginOffset, - AllocaSlices::const_range Slices, - ArrayRef<AllocaSlices::iterator> SplitUses) { +static bool isIntegerWideningViable(AllocaSlices::Partition &P, Type *AllocaTy, + const DataLayout &DL) { uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy); // Don't create integer types larger than the maximum bitwidth. if (SizeInBits > IntegerType::MAX_INT_BITS) @@ -1898,24 +2200,24 @@ isIntegerWideningViable(const DataLayout &DL, Type *AllocaTy, !canConvertValue(DL, IntTy, AllocaTy)) return false; - uint64_t Size = DL.getTypeStoreSize(AllocaTy); - // While examining uses, we ensure that the alloca has a covering load or // store. We don't want to widen the integer operations only to fail to // promote due to some other unsplittable entry (which we may make splittable // later). However, if there are only splittable uses, go ahead and assume // that we cover the alloca. + // FIXME: We shouldn't consider split slices that happen to start in the + // partition here... bool WholeAllocaOp = - Slices.begin() != Slices.end() ? false : DL.isLegalInteger(SizeInBits); + P.begin() != P.end() ? 
false : DL.isLegalInteger(SizeInBits); - for (const auto &S : Slices) - if (!isIntegerWideningViableForSlice(DL, AllocaTy, AllocBeginOffset, Size, - S, WholeAllocaOp)) + for (const Slice &S : P) + if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL, + WholeAllocaOp)) return false; - for (const auto &SI : SplitUses) - if (!isIntegerWideningViableForSlice(DL, AllocaTy, AllocBeginOffset, Size, - *SI, WholeAllocaOp)) + for (const Slice *S : P.splitSliceTails()) + if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL, + WholeAllocaOp)) return false; return WholeAllocaOp; @@ -1928,9 +2230,9 @@ static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V, IntegerType *IntTy = cast<IntegerType>(V->getType()); assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) && "Element extends past full value"); - uint64_t ShAmt = 8*Offset; + uint64_t ShAmt = 8 * Offset; if (DL.isBigEndian()) - ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); + ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); if (ShAmt) { V = IRB.CreateLShr(V, ShAmt, Name + ".shift"); DEBUG(dbgs() << " shifted: " << *V << "\n"); @@ -1957,9 +2259,9 @@ static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old, } assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) && "Element store outside of alloca store"); - uint64_t ShAmt = 8*Offset; + uint64_t ShAmt = 8 * Offset; if (DL.isBigEndian()) - ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); + ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); if (ShAmt) { V = IRB.CreateShl(V, ShAmt, Name + ".shift"); DEBUG(dbgs() << " shifted: " << *V << "\n"); @@ -1975,9 +2277,8 @@ static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old, return V; } -static Value *extractVector(IRBuilderTy &IRB, Value *V, - unsigned BeginIndex, unsigned EndIndex, - const Twine &Name) { +static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex, + unsigned EndIndex, const Twine &Name) { VectorType *VecTy = cast<VectorType>(V->getType()); unsigned NumElements = EndIndex - BeginIndex; assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); @@ -1992,13 +2293,12 @@ static Value *extractVector(IRBuilderTy &IRB, Value *V, return V; } - SmallVector<Constant*, 8> Mask; + SmallVector<Constant *, 8> Mask; Mask.reserve(NumElements); for (unsigned i = BeginIndex; i != EndIndex; ++i) Mask.push_back(IRB.getInt32(i)); V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()), - ConstantVector::get(Mask), - Name + ".extract"); + ConstantVector::get(Mask), Name + ".extract"); DEBUG(dbgs() << " shuffle: " << *V << "\n"); return V; } @@ -2013,7 +2313,7 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V, // Single element to insert. V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex), Name + ".insert"); - DEBUG(dbgs() << " insert: " << *V << "\n"); + DEBUG(dbgs() << " insert: " << *V << "\n"); return V; } @@ -2029,7 +2329,7 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V, // use a shuffle vector to widen it with undef elements, and then // a second shuffle vector to select between the loaded vector and the // incoming vector. 
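
To make the strategy just described concrete, assume hypothetical sizes: widening a 2-lane value into position [1,3) of a 4-lane vector. The snippet below just prints what the mask-building loop in the hunk that follows computes, plus which source each final lane comes from:

#include <cstdio>

int main() {
  const unsigned NumLanes = 4, BeginIndex = 1, EndIndex = 3;
  printf("expand mask:");
  for (unsigned i = 0; i != NumLanes; ++i)
    if (i >= BeginIndex && i < EndIndex)
      printf(" %u", i - BeginIndex); // pull lane from the narrow value
    else
      printf(" undef");              // placeholder lane; the blend ignores it
  printf("\nlane sources:");
  for (unsigned i = 0; i != NumLanes; ++i)
    printf(" %s", (i >= BeginIndex && i < EndIndex) ? "new" : "old");
  printf("\n"); // prints: expand mask: undef 0 1 undef
                //         lane sources: old new new old
  return 0;
}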
- SmallVector<Constant*, 8> Mask; + SmallVector<Constant *, 8> Mask; Mask.reserve(VecTy->getNumElements()); for (unsigned i = 0; i != VecTy->getNumElements(); ++i) if (i >= BeginIndex && i < EndIndex) @@ -2037,8 +2337,7 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V, else Mask.push_back(UndefValue::get(IRB.getInt32Ty())); V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()), - ConstantVector::get(Mask), - Name + ".expand"); + ConstantVector::get(Mask), Name + ".expand"); DEBUG(dbgs() << " shuffle: " << *V << "\n"); Mask.clear(); @@ -2148,6 +2447,9 @@ public: IsSplittable = I->isSplittable(); IsSplit = BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset; + DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : "")); + DEBUG(AS.printSlice(dbgs(), I, "")); + DEBUG(dbgs() << "\n"); // Compute the intersecting offset range. assert(BeginOffset < NewAllocaEndOffset); @@ -2218,7 +2520,8 @@ private: ); } - /// \brief Compute suitable alignment to access this slice of the *new* alloca. + /// \brief Compute suitable alignment to access this slice of the *new* + /// alloca. /// /// You can optionally pass a type to this routine and if that type's ABI /// alignment is itself suitable, this will return zero. @@ -2226,7 +2529,8 @@ private: unsigned NewAIAlign = NewAI.getAlignment(); if (!NewAIAlign) NewAIAlign = DL.getABITypeAlignment(NewAI.getAllocatedType()); - unsigned Align = MinAlign(NewAIAlign, NewBeginOffset - NewAllocaBeginOffset); + unsigned Align = + MinAlign(NewAIAlign, NewBeginOffset - NewAllocaBeginOffset); return (Ty && Align == DL.getABITypeAlignment(Ty)) ? 0 : Align; } @@ -2250,16 +2554,14 @@ private: unsigned EndIndex = getIndex(NewEndOffset); assert(EndIndex > BeginIndex && "Empty vector!"); - Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - "load"); + Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load"); return extractVector(IRB, V, BeginIndex, EndIndex, "vec"); } Value *rewriteIntegerLoad(LoadInst &LI) { assert(IntTy && "We cannot insert an integer to the alloca"); assert(!LI.isVolatile()); - Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - "load"); + Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load"); V = convertValue(DL, IRB, V, IntTy); assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; @@ -2284,8 +2586,8 @@ private: V = rewriteIntegerLoad(LI); } else if (NewBeginOffset == NewAllocaBeginOffset && canConvertValue(DL, NewAllocaTy, LI.getType())) { - V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - LI.isVolatile(), LI.getName()); + V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), LI.isVolatile(), + LI.getName()); } else { Type *LTy = TargetTy->getPointerTo(); V = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy), @@ -2302,7 +2604,7 @@ private: assert(SliceSize < DL.getTypeStoreSize(LI.getType()) && "Split load isn't smaller than original load"); assert(LI.getType()->getIntegerBitWidth() == - DL.getTypeStoreSizeInBits(LI.getType()) && + DL.getTypeStoreSizeInBits(LI.getType()) && "Non-byte-multiple bit width"); // Move the insertion point just past the load so that we can refer to it. IRB.SetInsertPoint(std::next(BasicBlock::iterator(&LI))); @@ -2310,9 +2612,9 @@ private: // basis for the new value. This allows us to replace the uses of LI with // the computed value, and then replace the placeholder with LI, leaving // LI only used for this computation. 
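
The offset change in the hunk below (NewBeginOffset - BeginOffset rather than NewBeginOffset) matters: insertInteger positions bits within the value being reassembled, so the offset must be relative to the original wide load, not absolute within the alloca. With hypothetical numbers:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t BeginOffset = 8;     // where the original wide load starts
  uint64_t NewBeginOffset = 12; // where the rewritten slice starts
  uint64_t RelOffset = NewBeginOffset - BeginOffset;
  assert(RelOffset == 4);       // byte 4 of the wide value, not byte 12
  return 0;
}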
- Value *Placeholder - = new LoadInst(UndefValue::get(LI.getType()->getPointerTo())); - V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset, + Value *Placeholder = + new LoadInst(UndefValue::get(LI.getType()->getPointerTo())); + V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset, "insert"); LI.replaceAllUsesWith(V); Placeholder->replaceAllUsesWith(&LI); @@ -2334,15 +2636,14 @@ private: assert(EndIndex > BeginIndex && "Empty vector!"); unsigned NumElements = EndIndex - BeginIndex; assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); - Type *SliceTy = - (NumElements == 1) ? ElementTy - : VectorType::get(ElementTy, NumElements); + Type *SliceTy = (NumElements == 1) + ? ElementTy + : VectorType::get(ElementTy, NumElements); if (V->getType() != SliceTy) V = convertValue(DL, IRB, V, SliceTy); // Mix in the existing elements. - Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - "load"); + Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load"); V = insertVector(IRB, Old, V, BeginIndex, "vec"); } StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); @@ -2357,13 +2658,12 @@ private: assert(IntTy && "We cannot extract an integer from the alloca"); assert(!SI.isVolatile()); if (DL.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) { - Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - "oldload"); + Value *Old = + IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload"); Old = convertValue(DL, IRB, Old, IntTy); assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); uint64_t Offset = BeginOffset - NewAllocaBeginOffset; - V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, - "insert"); + V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert"); } V = convertValue(DL, IRB, V, NewAllocaTy); StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); @@ -2391,10 +2691,10 @@ private: assert(V->getType()->isIntegerTy() && "Only integer type loads and stores are split"); assert(V->getType()->getIntegerBitWidth() == - DL.getTypeStoreSizeInBits(V->getType()) && + DL.getTypeStoreSizeInBits(V->getType()) && "Non-byte-multiple bit width"); IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8); - V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset, + V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset, "extract"); } @@ -2439,14 +2739,14 @@ private: if (Size == 1) return V; - Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size*8); - V = IRB.CreateMul(IRB.CreateZExt(V, SplatIntTy, "zext"), - ConstantExpr::getUDiv( - Constant::getAllOnesValue(SplatIntTy), - ConstantExpr::getZExt( - Constant::getAllOnesValue(V->getType()), - SplatIntTy)), - "isplat"); + Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8); + V = IRB.CreateMul( + IRB.CreateZExt(V, SplatIntTy, "zext"), + ConstantExpr::getUDiv( + Constant::getAllOnesValue(SplatIntTy), + ConstantExpr::getZExt(Constant::getAllOnesValue(V->getType()), + SplatIntTy)), + "isplat"); return V; } @@ -2483,12 +2783,11 @@ private: // If this doesn't map cleanly onto the alloca type, and that type isn't // a single value type, just emit a memset. 
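    // For instance (illustrative): a memset covering only bytes [0, 3) of an
    // i32 alloca, when neither vector nor integer widening applies, falls
    // through to this path and is rewritten as a narrower memset at the
    // adjusted pointer.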
     if (!VecTy && !IntTy &&
-        (BeginOffset > NewAllocaBeginOffset ||
-         EndOffset < NewAllocaEndOffset ||
+        (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
          SliceSize != DL.getTypeStoreSize(AllocaTy) ||
          !AllocaTy->isSingleValueType() ||
          !DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy)) ||
-         DL.getTypeSizeInBits(ScalarTy)%8 != 0)) {
+         DL.getTypeSizeInBits(ScalarTy) % 8 != 0)) {
       Type *SizeTy = II.getLength()->getType();
       Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
       CallInst *New = IRB.CreateMemSet(
@@ -2522,8 +2821,8 @@ private:
       if (NumElements > 1)
         Splat = getVectorSplat(Splat, NumElements);
 
-      Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
-                                         "oldload");
+      Value *Old =
+          IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
       V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
     } else if (IntTy) {
       // If this is a memset on an alloca where we can widen stores, insert the
       // set integer.
@@ -2535,8 +2834,8 @@ private:
       if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
                     EndOffset != NewAllocaEndOffset)) {
-        Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
-                                           "oldload");
+        Value *Old =
+            IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
         Old = convertValue(DL, IRB, Old, IntTy);
         uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
         V = insertInteger(DL, IRB, Old, V, Offset, "insert");
@@ -2633,8 +2932,8 @@ private:
     // Strip all inbounds GEPs and pointer casts to try to dig out any root
     // alloca that should be re-examined after rewriting this instruction.
     Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
-    if (AllocaInst *AI
-          = dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) {
+    if (AllocaInst *AI =
+            dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) {
       assert(AI != &OldAI && AI != &NewAI &&
              "Splittable transfers cannot reach the same alloca on both ends.");
       Pass.Worklist.insert(AI);
@@ -2673,8 +2972,8 @@ private:
     unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0;
     unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0;
     unsigned NumElements = EndIndex - BeginIndex;
-    IntegerType *SubIntTy
-      = IntTy ? Type::getIntNTy(IntTy->getContext(), Size*8) : nullptr;
+    IntegerType *SubIntTy =
+        IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr;
 
     // Reset the other pointer type to match the register type we're going to
     // use, but using the address space of the original other pointer.
@@ -2703,27 +3002,25 @@ private: Value *Src; if (VecTy && !IsWholeAlloca && !IsDest) { - Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - "load"); + Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load"); Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec"); } else if (IntTy && !IsWholeAlloca && !IsDest) { - Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - "load"); + Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load"); Src = convertValue(DL, IRB, Src, IntTy); uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract"); } else { - Src = IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(), - "copyload"); + Src = + IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(), "copyload"); } if (VecTy && !IsWholeAlloca && IsDest) { - Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - "oldload"); + Value *Old = + IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload"); Src = insertVector(IRB, Old, Src, BeginIndex, "vec"); } else if (IntTy && !IsWholeAlloca && IsDest) { - Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - "oldload"); + Value *Old = + IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload"); Old = convertValue(DL, IRB, Old, IntTy); uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; Src = insertInteger(DL, IRB, Old, Src, Offset, "insert"); @@ -2746,8 +3043,8 @@ private: // Record this instruction for deletion. Pass.DeadInsts.insert(&II); - ConstantInt *Size - = ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()), + ConstantInt *Size = + ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()), NewEndOffset - NewBeginOffset); Value *Ptr = getNewAllocaSlicePtr(IRB, OldPtr->getType()); Value *New; @@ -2814,7 +3111,6 @@ private: SelectUsers.insert(&SI); return true; } - }; } @@ -2869,8 +3165,7 @@ private: bool visitInstruction(Instruction &I) { return false; } /// \brief Generic recursive split emission class. - template <typename Derived> - class OpSplitter { + template <typename Derived> class OpSplitter { protected: /// The builder used to form new instructions. IRBuilderTy IRB; @@ -2887,7 +3182,7 @@ private: /// Initialize the splitter with an insertion point, Ptr and start with a /// single zero GEP index. OpSplitter(Instruction *InsertionPoint, Value *Ptr) - : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {} + : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {} public: /// \brief Generic recursive split emission routine. @@ -2943,7 +3238,7 @@ private: struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> { LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr) - : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {} + : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {} /// Emit a leaf load of a single value. This is called at the leaves of the /// recursive emission to actually load values. @@ -2974,7 +3269,7 @@ private: struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> { StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr) - : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {} + : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {} /// Emit a leaf store of a single value. This is called at the leaves of the /// recursive emission to actually produce stores. @@ -2982,8 +3277,8 @@ private: assert(Ty->isSingleValueType()); // Extract the single value and store it using the indices. 
Value *Store = IRB.CreateStore( - IRB.CreateExtractValue(Agg, Indices, Name + ".extract"), - IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep")); + IRB.CreateExtractValue(Agg, Indices, Name + ".extract"), + IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep")); (void)Store; DEBUG(dbgs() << " to: " << *Store << "\n"); } @@ -3069,8 +3364,8 @@ static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) { /// when the size or offset cause either end of type-based partition to be off. /// Also, this is a best-effort routine. It is reasonable to give up and not /// return a type if necessary. -static Type *getTypePartition(const DataLayout &DL, Type *Ty, - uint64_t Offset, uint64_t Size) { +static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset, + uint64_t Size) { if (Offset == 0 && DL.getTypeAllocSize(Ty) == Size) return stripAggregateTypeWrapping(DL, Ty); if (Offset > DL.getTypeAllocSize(Ty) || @@ -3162,8 +3457,8 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, } // Try to build up a sub-structure. - StructType *SubTy = StructType::get(STy->getContext(), makeArrayRef(EI, EE), - STy->isPacked()); + StructType *SubTy = + StructType::get(STy->getContext(), makeArrayRef(EI, EE), STy->isPacked()); const StructLayout *SubSL = DL.getStructLayout(SubTy); if (Size != SubSL->getSizeInBytes()) return nullptr; // The sub-struct doesn't have quite the size needed. @@ -3171,6 +3466,494 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, return SubTy; } +/// \brief Pre-split loads and stores to simplify rewriting. +/// +/// We want to break up the splittable load+store pairs as much as +/// possible. This is important to do as a preprocessing step, as once we +/// start rewriting the accesses to partitions of the alloca we lose the +/// necessary information to correctly split apart paired loads and stores +/// which both point into this alloca. The case to consider is something like +/// the following: +/// +/// %a = alloca [12 x i8] +/// %gep1 = getelementptr [12 x i8]* %a, i32 0, i32 0 +/// %gep2 = getelementptr [12 x i8]* %a, i32 0, i32 4 +/// %gep3 = getelementptr [12 x i8]* %a, i32 0, i32 8 +/// %iptr1 = bitcast i8* %gep1 to i64* +/// %iptr2 = bitcast i8* %gep2 to i64* +/// %fptr1 = bitcast i8* %gep1 to float* +/// %fptr2 = bitcast i8* %gep2 to float* +/// %fptr3 = bitcast i8* %gep3 to float* +/// store float 0.0, float* %fptr1 +/// store float 1.0, float* %fptr2 +/// %v = load i64* %iptr1 +/// store i64 %v, i64* %iptr2 +/// %f1 = load float* %fptr2 +/// %f2 = load float* %fptr3 +/// +/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and +/// promote everything so we recover the 2 SSA values that should have been +/// there all along. +/// +/// \returns true if any changes are made. +bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { + DEBUG(dbgs() << "Pre-splitting loads and stores\n"); + + // Track the loads and stores which are candidates for pre-splitting here, in + // the order they first appear during the partition scan. These give stable + // iteration order and a basis for tracking which loads and stores we + // actually split. + SmallVector<LoadInst *, 4> Loads; + SmallVector<StoreInst *, 4> Stores; + + // We need to accumulate the splits required of each load or store where we + // can find them via a direct lookup. This is important to cross-check loads + // and stores against each other. We also track the slice so that we can kill + // all the slices that end up split. 
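+  // For the [12 x i8] example above, the partitions end up being [0,4),
+  // [4,8) and [8,12): the i64 load over [0,8) records Splits == {4}, and the
+  // i64 store over [4,12) also records Splits == {4}. Because the relative
+  // split offsets match, both can be pre-split into two 4-byte accesses.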
+  struct SplitOffsets {
+    Slice *S;
+    std::vector<uint64_t> Splits;
+  };
+  SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
+
+  // Track loads out of this alloca which cannot, for any reason, be pre-split.
+  // This is important as we also cannot pre-split stores of those loads!
+  // FIXME: This is all pretty gross. It means that we can be more aggressive
+  // in pre-splitting when the load feeding the store happens to come from
+  // a separate alloca. Put another way, the effectiveness of SROA would be
+  // decreased by a frontend which just concatenated all of its local allocas
+  // into one big flat alloca. But defeating such patterns is exactly the job
+  // SROA is tasked with! Sadly, to not have this discrepancy we would have to
+  // change store pre-splitting to actually force pre-splitting of the load
+  // that feeds it *and all stores*. That makes pre-splitting much harder, but
+  // maybe it would make it more principled?
+  SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
+
+  DEBUG(dbgs() << "  Searching for candidate loads and stores\n");
+  for (auto &P : AS.partitions()) {
+    for (Slice &S : P) {
+      Instruction *I = cast<Instruction>(S.getUse()->getUser());
+      if (!S.isSplittable() || S.endOffset() <= P.endOffset()) {
+        // If this was a load we have to track that it can't participate in any
+        // pre-splitting!
+        if (auto *LI = dyn_cast<LoadInst>(I))
+          UnsplittableLoads.insert(LI);
+        continue;
+      }
+      assert(P.endOffset() > S.beginOffset() &&
+             "Empty or backwards partition!");
+
+      // Determine if this is a pre-splittable slice.
+      if (auto *LI = dyn_cast<LoadInst>(I)) {
+        assert(!LI->isVolatile() && "Cannot split volatile loads!");
+
+        // The load must be used exclusively to store into other pointers for
+        // us to be able to arbitrarily pre-split it. The stores must also be
+        // simple to avoid changing semantics.
+        auto IsLoadSimplyStored = [](LoadInst *LI) {
+          for (User *LU : LI->users()) {
+            auto *SI = dyn_cast<StoreInst>(LU);
+            if (!SI || !SI->isSimple())
+              return false;
+          }
+          return true;
+        };
+        if (!IsLoadSimplyStored(LI)) {
+          UnsplittableLoads.insert(LI);
+          continue;
+        }
+
+        Loads.push_back(LI);
+      } else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser())) {
+        if (S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
+          continue;
+        auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand());
+        if (!StoredLoad || !StoredLoad->isSimple())
+          continue;
+        assert(!SI->isVolatile() && "Cannot split volatile stores!");
+
+        Stores.push_back(SI);
+      } else {
+        // Other uses cannot be pre-split.
+        continue;
+      }
+
+      // Record the initial split.
+      DEBUG(dbgs() << "    Candidate: " << *I << "\n");
+      auto &Offsets = SplitOffsetsMap[I];
+      assert(Offsets.Splits.empty() &&
+             "Should not have splits the first time we see an instruction!");
+      Offsets.S = &S;
+      Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
+    }
+
+    // Now scan the already split slices, and add a split for any of them which
+    // we're going to pre-split.
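+    // Note: P.splitSliceTails() visits the slices that begin in an earlier
+    // partition but extend into this one.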
+    for (Slice *S : P.splitSliceTails()) {
+      auto SplitOffsetsMapI =
+          SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser()));
+      if (SplitOffsetsMapI == SplitOffsetsMap.end())
+        continue;
+      auto &Offsets = SplitOffsetsMapI->second;
+
+      assert(Offsets.S == S && "Found a mismatched slice!");
+      assert(!Offsets.Splits.empty() &&
+             "Cannot have an empty set of splits on the second partition!");
+      assert(Offsets.Splits.back() ==
+                 P.beginOffset() - Offsets.S->beginOffset() &&
+             "Previous split does not end where this one begins!");
+
+      // Record each split. The last partition's end isn't needed as the size
+      // of the slice dictates that.
+      if (S->endOffset() > P.endOffset())
+        Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
+    }
+  }
+
+  // We may have split loads where some of their stores are split stores. For
+  // such loads and stores, we can only pre-split them if their splits exactly
+  // match relative to their starting offset. We have to verify this prior to
+  // any rewriting.
+  Stores.erase(
+      std::remove_if(Stores.begin(), Stores.end(),
+                     [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
+                       // Lookup the load we are storing in our map of split
+                       // offsets.
+                       auto *LI = cast<LoadInst>(SI->getValueOperand());
+                       // If it was completely unsplittable, then we're done,
+                       // and this store can't be pre-split.
+                       if (UnsplittableLoads.count(LI))
+                         return true;
+
+                       auto LoadOffsetsI = SplitOffsetsMap.find(LI);
+                       if (LoadOffsetsI == SplitOffsetsMap.end())
+                         return false; // Unrelated loads are definitely safe.
+                       auto &LoadOffsets = LoadOffsetsI->second;
+
+                       // Now lookup the store's offsets.
+                       auto &StoreOffsets = SplitOffsetsMap[SI];
+
+                       // If the relative offsets of each split in the load and
+                       // store match exactly, then we can split them and we
+                       // don't need to remove them here.
+                       if (LoadOffsets.Splits == StoreOffsets.Splits)
+                         return false;
+
+                       DEBUG(dbgs()
+                             << "    Mismatched splits for load and store:\n"
+                             << "      " << *LI << "\n"
+                             << "      " << *SI << "\n");
+
+                       // We've found a store and load that we need to split
+                       // with mismatched relative splits. Just give up on them
+                       // and remove both instructions from our list of
+                       // candidates.
+                       UnsplittableLoads.insert(LI);
+                       return true;
+                     }),
+      Stores.end());
+  // Now we have to go *back* through all the stores, because a later store may
+  // have caused an earlier store's load to become unsplittable and if it is
+  // unsplittable for the later store, then we can't rely on it being split in
+  // the earlier store either.
+  Stores.erase(std::remove_if(Stores.begin(), Stores.end(),
+                              [&UnsplittableLoads](StoreInst *SI) {
+                                auto *LI =
+                                    cast<LoadInst>(SI->getValueOperand());
+                                return UnsplittableLoads.count(LI);
+                              }),
+               Stores.end());
+  // Once we've established all the loads that can't be split for some reason,
+  // filter out any that made it into our list.
+  Loads.erase(std::remove_if(Loads.begin(), Loads.end(),
+                             [&UnsplittableLoads](LoadInst *LI) {
+                               return UnsplittableLoads.count(LI);
+                             }),
+              Loads.end());
+
+  // If no loads or stores are left, there is no pre-splitting to be done for
+  // this alloca.
+  if (Loads.empty() && Stores.empty())
+    return false;
+
+  // From here on, we can't fail and will be building new accesses, so rig up
+  // an IR builder.
+  IRBuilderTy IRB(&AI);
+
+  // Collect the new slices which we will merge into the alloca slices.
+  SmallVector<Slice, 4> NewSlices;
+
+  // Track any allocas we end up splitting loads and stores for so we iterate
+  // on them.
+  SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
+
+  // At this point, we have collected all of the loads and stores we can
+  // pre-split, and the specific splits needed for them. We actually do the
+  // splitting in a specific order in order to handle when one of the loads is
+  // the value operand to one of the stores.
+  //
+  // First, we rewrite all of the split loads, and just accumulate each split
+  // load in a parallel structure. We also build the slices for them and append
+  // them to the alloca slices.
+  SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
+  std::vector<LoadInst *> SplitLoads;
+  for (LoadInst *LI : Loads) {
+    SplitLoads.clear();
+
+    IntegerType *Ty = cast<IntegerType>(LI->getType());
+    uint64_t LoadSize = Ty->getBitWidth() / 8;
+    assert(LoadSize > 0 && "Cannot have a zero-sized integer load!");
+
+    auto &Offsets = SplitOffsetsMap[LI];
+    assert(LoadSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
+           "Slice size should always match load size exactly!");
+    uint64_t BaseOffset = Offsets.S->beginOffset();
+    assert(BaseOffset + LoadSize > BaseOffset &&
+           "Cannot represent alloca access size using 64-bit integers!");
+
+    Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand());
+    IRB.SetInsertPoint(BasicBlock::iterator(LI));
+
+    DEBUG(dbgs() << "  Splitting load: " << *LI << "\n");
+
+    uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
+    int Idx = 0, Size = Offsets.Splits.size();
+    for (;;) {
+      auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
+      auto *PartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace());
+      LoadInst *PLoad = IRB.CreateAlignedLoad(
+          getAdjustedPtr(IRB, *DL, BasePtr,
+                         APInt(DL->getPointerSizeInBits(), PartOffset),
+                         PartPtrTy, BasePtr->getName() + "."),
+          getAdjustedAlignment(LI, PartOffset, *DL), /*IsVolatile*/ false,
+          LI->getName());
+
+      // Append this load onto the list of split loads so we can find it later
+      // to rewrite the stores.
+      SplitLoads.push_back(PLoad);
+
+      // Now build a new slice for the alloca.
+      NewSlices.push_back(
+          Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
+                &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
+                /*IsSplittable*/ false));
+      DEBUG(dbgs() << "    new slice [" << NewSlices.back().beginOffset()
+                   << ", " << NewSlices.back().endOffset() << "): " << *PLoad
+                   << "\n");
+
+      // See if we've handled all the splits.
+      if (Idx >= Size)
+        break;
+
+      // Setup the next partition.
+      PartOffset = Offsets.Splits[Idx];
+      ++Idx;
+      PartSize = (Idx < Size ? Offsets.Splits[Idx] : LoadSize) - PartOffset;
+    }
+
+    // Now that we have the split loads, do the slow walk over all uses of the
+    // load and rewrite them as split stores, or save the split loads to use
+    // below if the store is going to be split there anyways.
+    bool DeferredStores = false;
+    for (User *LU : LI->users()) {
+      StoreInst *SI = cast<StoreInst>(LU);
+      if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
+        DeferredStores = true;
+        DEBUG(dbgs() << "    Deferred splitting of store: " << *SI << "\n");
+        continue;
+      }
+
+      Value *StoreBasePtr = SI->getPointerOperand();
+      IRB.SetInsertPoint(BasicBlock::iterator(SI));
+
+      DEBUG(dbgs() << "    Splitting store of load: " << *SI << "\n");
+
+      for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
+        LoadInst *PLoad = SplitLoads[Idx];
+        uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
+        auto *PartPtrTy =
+            PLoad->getType()->getPointerTo(SI->getPointerAddressSpace());
+
+        StoreInst *PStore = IRB.CreateAlignedStore(
+            PLoad, getAdjustedPtr(IRB, *DL, StoreBasePtr,
+                                  APInt(DL->getPointerSizeInBits(), PartOffset),
+                                  PartPtrTy, StoreBasePtr->getName() + "."),
+            getAdjustedAlignment(SI, PartOffset, *DL), /*IsVolatile*/ false);
+        (void)PStore;
+        DEBUG(dbgs() << "      +" << PartOffset << ":" << *PStore << "\n");
+      }
+
+      // We want to immediately iterate on any allocas impacted by splitting
+      // this store, and we have to track any promotable alloca (indicated by
+      // a direct store) as needing to be resplit because it is no longer
+      // promotable.
+      if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) {
+        ResplitPromotableAllocas.insert(OtherAI);
+        Worklist.insert(OtherAI);
+      } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
+                     StoreBasePtr->stripInBoundsOffsets())) {
+        Worklist.insert(OtherAI);
+      }
+
+      // Mark the original store as dead.
+      DeadInsts.insert(SI);
+    }
+
+    // Save the split loads if there are deferred stores among the users.
+    if (DeferredStores)
+      SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads)));
+
+    // Mark the original load as dead and kill the original slice.
+    DeadInsts.insert(LI);
+    Offsets.S->kill();
+  }
+
+  // Second, we rewrite all of the split stores. At this point, we know that
+  // all loads from this alloca have been split already. For stores of such
+  // loads, we can simply look up the pre-existing split loads. For stores of
+  // other loads, we split those loads first and then write split stores of
+  // them.
+  for (StoreInst *SI : Stores) {
+    auto *LI = cast<LoadInst>(SI->getValueOperand());
+    IntegerType *Ty = cast<IntegerType>(LI->getType());
+    uint64_t StoreSize = Ty->getBitWidth() / 8;
+    assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");
+
+    auto &Offsets = SplitOffsetsMap[SI];
+    assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
+           "Slice size should always match store size exactly!");
+    uint64_t BaseOffset = Offsets.S->beginOffset();
+    assert(BaseOffset + StoreSize > BaseOffset &&
+           "Cannot represent alloca access size using 64-bit integers!");
+
+    Value *LoadBasePtr = LI->getPointerOperand();
+    Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
+
+    DEBUG(dbgs() << "  Splitting store: " << *SI << "\n");
+
+    // Check whether we have an already split load.
+    auto SplitLoadsMapI = SplitLoadsMap.find(LI);
+    std::vector<LoadInst *> *SplitLoads = nullptr;
+    if (SplitLoadsMapI != SplitLoadsMap.end()) {
+      SplitLoads = &SplitLoadsMapI->second;
+      assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
+             "Too few split loads for the number of splits in the store!");
+    } else {
+      DEBUG(dbgs() << "          of load: " << *LI << "\n");
+    }
+
+    uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
+    int Idx = 0, Size = Offsets.Splits.size();
+    for (;;) {
+      auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
+      auto *PartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace());
+
+      // Either lookup a split load or create one.
+      LoadInst *PLoad;
+      if (SplitLoads) {
+        PLoad = (*SplitLoads)[Idx];
+      } else {
+        IRB.SetInsertPoint(BasicBlock::iterator(LI));
+        PLoad = IRB.CreateAlignedLoad(
+            getAdjustedPtr(IRB, *DL, LoadBasePtr,
+                           APInt(DL->getPointerSizeInBits(), PartOffset),
+                           PartPtrTy, LoadBasePtr->getName() + "."),
+            getAdjustedAlignment(LI, PartOffset, *DL), /*IsVolatile*/ false,
+            LI->getName());
+      }
+
+      // And store this partition.
+      IRB.SetInsertPoint(BasicBlock::iterator(SI));
+      StoreInst *PStore = IRB.CreateAlignedStore(
+          PLoad, getAdjustedPtr(IRB, *DL, StoreBasePtr,
+                                APInt(DL->getPointerSizeInBits(), PartOffset),
+                                PartPtrTy, StoreBasePtr->getName() + "."),
+          getAdjustedAlignment(SI, PartOffset, *DL), /*IsVolatile*/ false);
+
+      // Now build a new slice for the alloca.
+      NewSlices.push_back(
+          Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
+                &PStore->getOperandUse(PStore->getPointerOperandIndex()),
+                /*IsSplittable*/ false));
+      DEBUG(dbgs() << "    new slice [" << NewSlices.back().beginOffset()
+                   << ", " << NewSlices.back().endOffset() << "): " << *PStore
+                   << "\n");
+      if (!SplitLoads) {
+        DEBUG(dbgs() << "      of split load: " << *PLoad << "\n");
+      }
+
+      // See if we've finished all the splits.
+      if (Idx >= Size)
+        break;
+
+      // Setup the next partition.
+      PartOffset = Offsets.Splits[Idx];
+      ++Idx;
+      PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
+    }
+
+    // We want to immediately iterate on any allocas impacted by splitting
+    // this load, which is only relevant if it isn't a load of this alloca and
+    // thus we didn't already split the loads above. We also have to keep track
+    // of any promotable allocas we split loads on as they can no longer be
+    // promoted.
+    if (!SplitLoads) {
+      if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) {
+        assert(OtherAI != &AI && "We can't re-split our own alloca!");
+        ResplitPromotableAllocas.insert(OtherAI);
+        Worklist.insert(OtherAI);
+      } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
+                     LoadBasePtr->stripInBoundsOffsets())) {
+        assert(OtherAI != &AI && "We can't re-split our own alloca!");
+        Worklist.insert(OtherAI);
+      }
+    }
+
+    // Mark the original store as dead now that we've split it up and kill its
+    // slice. Note that we leave the original load in place unless this store
+    // was its only use. It may in turn be split up if it is an alloca load
+    // for some other alloca, but it may be a normal load. This may introduce
+    // redundant loads, but where those can be merged the rest of the optimizer
+    // should handle the merging, and this uncovers SSA splits, which is more
+    // important. In practice, the original loads will almost always be fully
+    // split and removed eventually, and the splits will be merged by any
+    // trivial CSE, including instcombine.
+    if (LI->hasOneUse()) {
+      assert(*LI->user_begin() == SI && "Single use isn't this store!");
+      DeadInsts.insert(LI);
+    }
+    DeadInsts.insert(SI);
+    Offsets.S->kill();
+  }
+
+  // Remove the killed slices that have been pre-split.
+  AS.erase(std::remove_if(AS.begin(), AS.end(), [](const Slice &S) {
+             return S.isDead();
+           }), AS.end());
+
+  // Insert our new slices. This will sort and merge them into the sorted
+  // sequence.
+  AS.insert(NewSlices);
+
+  DEBUG(dbgs() << "  Pre-split slices:\n");
+#ifndef NDEBUG
+  for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
+    DEBUG(AS.print(dbgs(), I, "  "));
+#endif
+
+  // Finally, don't try to promote any allocas that now require re-splitting.
+  // They have already been added to the worklist above.
+  PromotableAllocas.erase(
+      std::remove_if(
+          PromotableAllocas.begin(), PromotableAllocas.end(),
+          [&](AllocaInst *AI) { return ResplitPromotableAllocas.count(AI); }),
+      PromotableAllocas.end());
+
+  return true;
+}
+
 /// \brief Rewrite an alloca partition's users.
 ///
 /// This routine drives both of the rewriting goals of the SROA pass.
It tries @@ -3181,40 +3964,31 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, /// appropriate new offsets. It also evaluates how successful the rewrite was /// at enabling promotion and if it was successful queues the alloca to be /// promoted. -bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, - AllocaSlices::iterator B, AllocaSlices::iterator E, - int64_t BeginOffset, int64_t EndOffset, - ArrayRef<AllocaSlices::iterator> SplitUses) { - assert(BeginOffset < EndOffset); - uint64_t SliceSize = EndOffset - BeginOffset; - +AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, + AllocaSlices::Partition &P) { // Try to compute a friendly type for this partition of the alloca. This // won't always succeed, in which case we fall back to a legal integer type // or an i8 array of an appropriate size. Type *SliceTy = nullptr; - if (Type *CommonUseTy = findCommonType(B, E, EndOffset)) - if (DL->getTypeAllocSize(CommonUseTy) >= SliceSize) + if (Type *CommonUseTy = findCommonType(P.begin(), P.end(), P.endOffset())) + if (DL->getTypeAllocSize(CommonUseTy) >= P.size()) SliceTy = CommonUseTy; if (!SliceTy) if (Type *TypePartitionTy = getTypePartition(*DL, AI.getAllocatedType(), - BeginOffset, SliceSize)) + P.beginOffset(), P.size())) SliceTy = TypePartitionTy; if ((!SliceTy || (SliceTy->isArrayTy() && SliceTy->getArrayElementType()->isIntegerTy())) && - DL->isLegalInteger(SliceSize * 8)) - SliceTy = Type::getIntNTy(*C, SliceSize * 8); + DL->isLegalInteger(P.size() * 8)) + SliceTy = Type::getIntNTy(*C, P.size() * 8); if (!SliceTy) - SliceTy = ArrayType::get(Type::getInt8Ty(*C), SliceSize); - assert(DL->getTypeAllocSize(SliceTy) >= SliceSize); + SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size()); + assert(DL->getTypeAllocSize(SliceTy) >= P.size()); - bool IsIntegerPromotable = isIntegerWideningViable( - *DL, SliceTy, BeginOffset, AllocaSlices::const_range(B, E), SplitUses); + bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, *DL); VectorType *VecTy = - IsIntegerPromotable - ? nullptr - : isVectorPromotionViable(*DL, SliceTy, BeginOffset, EndOffset, - AllocaSlices::const_range(B, E), SplitUses); + IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, *DL); if (VecTy) SliceTy = VecTy; @@ -3224,11 +3998,12 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, // perform phi and select speculation. AllocaInst *NewAI; if (SliceTy == AI.getAllocatedType()) { - assert(BeginOffset == 0 && + assert(P.beginOffset() == 0 && "Non-zero begin offset but same alloca type"); NewAI = &AI; // FIXME: We should be able to bail at this point with "nothing changed". // FIXME: We might want to defer PHI speculation until after here. + // FIXME: return nullptr; } else { unsigned Alignment = AI.getAlignment(); if (!Alignment) { @@ -3237,20 +4012,20 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, // type. Alignment = DL->getABITypeAlignment(AI.getAllocatedType()); } - Alignment = MinAlign(Alignment, BeginOffset); + Alignment = MinAlign(Alignment, P.beginOffset()); // If we will get at least this much alignment from the type alone, leave // the alloca's alignment unconstrained. if (Alignment <= DL->getABITypeAlignment(SliceTy)) Alignment = 0; - NewAI = - new AllocaInst(SliceTy, nullptr, Alignment, - AI.getName() + ".sroa." + Twine(B - AS.begin()), &AI); + NewAI = new AllocaInst( + SliceTy, nullptr, Alignment, + AI.getName() + ".sroa." 
+ Twine(P.begin() - AS.begin()), &AI); ++NumNewAllocas; } DEBUG(dbgs() << "Rewriting alloca partition " - << "[" << BeginOffset << "," << EndOffset << ") to: " << *NewAI - << "\n"); + << "[" << P.beginOffset() << "," << P.endOffset() + << ") to: " << *NewAI << "\n"); // Track the high watermark on the worklist as it is only relevant for // promoted allocas. We will reset it to this point if the alloca is not in @@ -3260,20 +4035,16 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, SmallPtrSet<PHINode *, 8> PHIUsers; SmallPtrSet<SelectInst *, 8> SelectUsers; - AllocaSliceRewriter Rewriter(*DL, AS, *this, AI, *NewAI, BeginOffset, - EndOffset, IsIntegerPromotable, VecTy, PHIUsers, - SelectUsers); + AllocaSliceRewriter Rewriter(*DL, AS, *this, AI, *NewAI, P.beginOffset(), + P.endOffset(), IsIntegerPromotable, VecTy, + PHIUsers, SelectUsers); bool Promotable = true; - for (auto & SplitUse : SplitUses) { - DEBUG(dbgs() << " rewriting split "); - DEBUG(AS.printSlice(dbgs(), SplitUse, "")); - Promotable &= Rewriter.visit(SplitUse); + for (Slice *S : P.splitSliceTails()) { + Promotable &= Rewriter.visit(S); ++NumUses; } - for (AllocaSlices::iterator I = B; I != E; ++I) { - DEBUG(dbgs() << " rewriting "); - DEBUG(AS.printSlice(dbgs(), I, "")); - Promotable &= Rewriter.visit(I); + for (Slice &S : P) { + Promotable &= Rewriter.visit(&S); ++NumUses; } @@ -3328,32 +4099,7 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, PostPromotionWorklist.pop_back(); } - return true; -} - -static void -removeFinishedSplitUses(SmallVectorImpl<AllocaSlices::iterator> &SplitUses, - uint64_t &MaxSplitUseEndOffset, uint64_t Offset) { - if (Offset >= MaxSplitUseEndOffset) { - SplitUses.clear(); - MaxSplitUseEndOffset = 0; - return; - } - - size_t SplitUsesOldSize = SplitUses.size(); - SplitUses.erase(std::remove_if(SplitUses.begin(), SplitUses.end(), - [Offset](const AllocaSlices::iterator &I) { - return I->endOffset() <= Offset; - }), - SplitUses.end()); - if (SplitUsesOldSize == SplitUses.size()) - return; - - // Recompute the max. While this is linear, so is remove_if. - MaxSplitUseEndOffset = 0; - for (AllocaSlices::iterator SplitUse : SplitUses) - MaxSplitUseEndOffset = - std::max(SplitUse->endOffset(), MaxSplitUseEndOffset); + return NewAI; } /// \brief Walks the slices of an alloca and form partitions based on them, @@ -3364,108 +4110,100 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { unsigned NumPartitions = 0; bool Changed = false; - SmallVector<AllocaSlices::iterator, 4> SplitUses; - uint64_t MaxSplitUseEndOffset = 0; - - uint64_t BeginOffset = AS.begin()->beginOffset(); - - for (AllocaSlices::iterator SI = AS.begin(), SJ = std::next(SI), - SE = AS.end(); - SI != SE; SI = SJ) { - uint64_t MaxEndOffset = SI->endOffset(); - - if (!SI->isSplittable()) { - // When we're forming an unsplittable region, it must always start at the - // first slice and will extend through its end. - assert(BeginOffset == SI->beginOffset()); - - // Form a partition including all of the overlapping slices with this - // unsplittable slice. - while (SJ != SE && SJ->beginOffset() < MaxEndOffset) { - if (!SJ->isSplittable()) - MaxEndOffset = std::max(MaxEndOffset, SJ->endOffset()); - ++SJ; - } - } else { - assert(SI->isSplittable()); // Established above. - - // Collect all of the overlapping splittable slices. 
-      while (SJ != SE && SJ->beginOffset() < MaxEndOffset &&
-             SJ->isSplittable()) {
-        MaxEndOffset = std::max(MaxEndOffset, SJ->endOffset());
-        ++SJ;
-      }
-
-      // Back up MaxEndOffset and SJ if we ended the span early when
-      // encountering an unsplittable slice.
-      if (SJ != SE && SJ->beginOffset() < MaxEndOffset) {
-        assert(!SJ->isSplittable());
-        MaxEndOffset = SJ->beginOffset();
-      }
-    }
-
-    // Check if we have managed to move the end offset forward yet. If so,
-    // we'll have to rewrite uses and erase old split uses.
-    if (BeginOffset < MaxEndOffset) {
-      // Rewrite a sequence of overlapping slices.
-      Changed |= rewritePartition(AI, AS, SI, SJ, BeginOffset, MaxEndOffset,
-                                  SplitUses);
-      ++NumPartitions;
-
-      removeFinishedSplitUses(SplitUses, MaxSplitUseEndOffset, MaxEndOffset);
-    }
-    // Accumulate all the splittable slices from the [SI,SJ) region which
-    // overlap going forward.
-    for (AllocaSlices::iterator SK = SI; SK != SJ; ++SK)
-      if (SK->isSplittable() && SK->endOffset() > MaxEndOffset) {
-        SplitUses.push_back(SK);
-        MaxSplitUseEndOffset = std::max(SK->endOffset(), MaxSplitUseEndOffset);
-      }
-
-    // If we're already at the end and we have no split uses, we're done.
-    if (SJ == SE && SplitUses.empty())
-      break;
+  // First try to pre-split loads and stores.
+  Changed |= presplitLoadsAndStores(AI, AS);
 
-    // If we have no split uses or no gap in offsets, we're ready to move to
-    // the next slice.
-    if (SplitUses.empty() || (SJ != SE && MaxEndOffset == SJ->beginOffset())) {
-      BeginOffset = SJ->beginOffset();
+  // Now that we have identified any pre-splitting opportunities, mark any
+  // splittable (non-whole-alloca) loads and stores as unsplittable. If we fail
+  // to split these during pre-splitting, we want to force them to be
+  // rewritten into a partition.
+  bool IsSorted = true;
+  for (Slice &S : AS) {
+    if (!S.isSplittable())
       continue;
-    }
-
-    // Even if we have split slices, if the next slice is splittable and the
-    // split slices reach it, we can simply set up the beginning offset of the
-    // next iteration to bridge between them.
-    if (SJ != SE && SJ->isSplittable() &&
-        MaxSplitUseEndOffset > SJ->beginOffset()) {
-      BeginOffset = MaxEndOffset;
+    // FIXME: We currently leave whole-alloca splittable loads and stores.
+    // These used to be the only splittable loads and stores and we need to be
+    // confident that the above handling of splittable loads and stores is
+    // completely sufficient before we forcibly disable the remaining handling.
+    if (S.beginOffset() == 0 &&
+        S.endOffset() >= DL->getTypeAllocSize(AI.getAllocatedType()))
      continue;
+    if (isa<LoadInst>(S.getUse()->getUser()) ||
+        isa<StoreInst>(S.getUse()->getUser())) {
+      S.makeUnsplittable();
+      IsSorted = false;
+    }
+  }
+  if (!IsSorted)
+    std::sort(AS.begin(), AS.end());
+
+  /// \brief Describes the allocas introduced by rewritePartition
+  /// in order to migrate the debug info.
+  struct Piece {
+    AllocaInst *Alloca;
+    uint64_t Offset;
+    uint64_t Size;
+    Piece(AllocaInst *AI, uint64_t O, uint64_t S)
+        : Alloca(AI), Offset(O), Size(S) {}
+  };
+  SmallVector<Piece, 4> Pieces;
+
+  // Rewrite each partition.
+  for (auto &P : AS.partitions()) {
+    if (AllocaInst *NewAI = rewritePartition(AI, AS, P)) {
+      Changed = true;
+      if (NewAI != &AI) {
+        uint64_t SizeOfByte = 8;
+        uint64_t AllocaSize = DL->getTypeSizeInBits(NewAI->getAllocatedType());
+        // Don't include any padding.
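+        // (For example, a partition holding an x86_fp80 can span 16 bytes of
+        // storage while the value itself is only 80 bits wide; the min()
+        // below keeps the debug piece at the 80 value bits.)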
+        uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte);
+        Pieces.push_back(Piece(NewAI, P.beginOffset() * SizeOfByte, Size));
+      }
     }
-
-    // Otherwise, we have a tail of split slices. Rewrite them with an empty
-    // range of slices.
-    uint64_t PostSplitEndOffset =
-        SJ == SE ? MaxSplitUseEndOffset : SJ->beginOffset();
-
-    Changed |= rewritePartition(AI, AS, SJ, SJ, MaxEndOffset,
-                                PostSplitEndOffset, SplitUses);
     ++NumPartitions;
-
-    if (SJ == SE)
-      break; // Skip the rest, we don't need to do any cleanup.
-
-    removeFinishedSplitUses(SplitUses, MaxSplitUseEndOffset,
-                            PostSplitEndOffset);
-
-    // Now just reset the begin offset for the next iteration.
-    BeginOffset = SJ->beginOffset();
   }
 
   NumAllocaPartitions += NumPartitions;
   MaxPartitionsPerAlloca = std::max<unsigned>(NumPartitions, MaxPartitionsPerAlloca);
 
+  // Migrate debug information from the old alloca to the new alloca(s)
+  // and the individual partitions.
+  if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(&AI)) {
+    DIVariable Var(DbgDecl->getVariable());
+    DIExpression Expr(DbgDecl->getExpression());
+    DIBuilder DIB(*AI.getParent()->getParent()->getParent(),
+                  /*AllowUnresolved*/ false);
+    bool IsSplit = Pieces.size() > 1;
+    for (auto Piece : Pieces) {
+      // Create a piece expression describing the new partition or reuse AI's
+      // expression if there is only one partition.
+      DIExpression PieceExpr = Expr;
+      if (IsSplit || Expr.isBitPiece()) {
+        // If this alloca is already a scalar replacement of a larger aggregate,
+        // Piece.Offset describes the offset inside the scalar.
+        uint64_t Offset = Expr.isBitPiece() ? Expr.getBitPieceOffset() : 0;
+        uint64_t Start = Offset + Piece.Offset;
+        uint64_t Size = Piece.Size;
+        if (Expr.isBitPiece()) {
+          uint64_t AbsEnd = Expr.getBitPieceOffset() + Expr.getBitPieceSize();
+          if (Start >= AbsEnd)
+            // No need to describe a SROAed padding.
+            continue;
+          Size = std::min(Size, AbsEnd - Start);
+        }
+        PieceExpr = DIB.createBitPieceExpression(Start, Size);
+      }
+
+      // Remove any existing dbg.declare intrinsic describing the same alloca.
+      if (DbgDeclareInst *OldDDI = FindAllocaDbgDeclare(Piece.Alloca))
+        OldDDI->eraseFromParent();
+
+      auto *NewDDI = DIB.insertDeclare(Piece.Alloca, Var, PieceExpr, &AI);
+      NewDDI->setDebugLoc(DbgDecl->getDebugLoc());
+    }
+  }
   return Changed;
 }
@@ -3561,7 +4299,8 @@ bool SROA::runOnAlloca(AllocaInst &AI) {
 ///
 /// We also record the alloca instructions deleted here so that they aren't
 /// subsequently handed to mem2reg to promote.
-void SROA::deleteDeadInstructions(SmallPtrSetImpl<AllocaInst*> &DeletedAllocas) { +void SROA::deleteDeadInstructions( + SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) { while (!DeadInsts.empty()) { Instruction *I = DeadInsts.pop_back_val(); DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n"); @@ -3576,8 +4315,11 @@ void SROA::deleteDeadInstructions(SmallPtrSetImpl<AllocaInst*> &DeletedAllocas) DeadInsts.insert(U); } - if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) + if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) { DeletedAllocas.insert(AI); + if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(AI)) + DbgDecl->eraseFromParent(); + } ++NumDeleted; I->eraseFromParent(); @@ -3608,14 +4350,14 @@ bool SROA::promoteAllocas(Function &F) { if (DT && !ForceSSAUpdater) { DEBUG(dbgs() << "Promoting allocas with mem2reg...\n"); - PromoteMemToReg(PromotableAllocas, *DT, nullptr, AT); + PromoteMemToReg(PromotableAllocas, *DT, nullptr, AC); PromotableAllocas.clear(); return true; } DEBUG(dbgs() << "Promoting allocas with SSAUpdater...\n"); SSAUpdater SSA; - DIBuilder DIB(*F.getParent()); + DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false); SmallVector<Instruction *, 64> Insts; // We need a worklist to walk the uses of each alloca. @@ -3690,13 +4432,14 @@ bool SROA::runOnFunction(Function &F) { DominatorTreeWrapperPass *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); DT = DTWP ? &DTWP->getDomTree() : nullptr; - AT = &getAnalysis<AssumptionTracker>(); + AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); BasicBlock &EntryBB = F.getEntryBlock(); for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end()); - I != E; ++I) + I != E; ++I) { if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) Worklist.insert(AI); + } bool Changed = false; // A set of deleted alloca instruction pointers which should be removed from @@ -3711,9 +4454,7 @@ bool SROA::runOnFunction(Function &F) { // Remove the deleted allocas from various lists so that we don't try to // continue processing them. 
if (!DeletedAllocas.empty()) { - auto IsInSet = [&](AllocaInst *AI) { - return DeletedAllocas.count(AI); - }; + auto IsInSet = [&](AllocaInst *AI) { return DeletedAllocas.count(AI); }; Worklist.remove_if(IsInSet); PostPromotionWorklist.remove_if(IsInSet); PromotableAllocas.erase(std::remove_if(PromotableAllocas.begin(), @@ -3734,7 +4475,7 @@ bool SROA::runOnFunction(Function &F) { } void SROA::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<AssumptionTracker>(); + AU.addRequired<AssumptionCacheTracker>(); if (RequiresDomTree) AU.addRequired<DominatorTreeWrapperPass>(); AU.setPreservesCFG(); diff --git a/lib/Transforms/Scalar/SampleProfile.cpp b/lib/Transforms/Scalar/SampleProfile.cpp index 179bbf7..c7232a9 100644 --- a/lib/Transforms/Scalar/SampleProfile.cpp +++ b/lib/Transforms/Scalar/SampleProfile.cpp @@ -95,7 +95,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<LoopInfo>(); + AU.addRequired<LoopInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<PostDominatorTree>(); } @@ -731,7 +731,7 @@ INITIALIZE_PASS_BEGIN(SampleProfileLoader, "sample-profile", "Sample Profile loader", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(PostDominatorTree) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(AddDiscriminators) INITIALIZE_PASS_END(SampleProfileLoader, "sample-profile", "Sample Profile loader", false, false) @@ -762,7 +762,7 @@ bool SampleProfileLoader::runOnFunction(Function &F) { DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); PDT = &getAnalysis<PostDominatorTree>(); - LI = &getAnalysis<LoopInfo>(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); Ctx = &F.getParent()->getContext(); Samples = Reader->getSamplesFor(F); if (!Samples->empty()) diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index a16e9e2..621633b 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -20,7 +20,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/Verifier.h" #include "llvm/InitializePasses.h" -#include "llvm/PassManager.h" +#include "llvm/IR/LegacyPassManager.h" using namespace llvm; @@ -28,6 +28,7 @@ using namespace llvm; /// ScalarOpts library. 
void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeADCEPass(Registry); + initializeBDCEPass(Registry); initializeAlignmentFromAssumptionsPass(Registry); initializeSampleProfileLoaderPass(Registry); initializeConstantHoistingPass(Registry); @@ -38,12 +39,14 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeScalarizerPass(Registry); initializeDSEPass(Registry); initializeGVNPass(Registry); - initializeEarlyCSEPass(Registry); + initializeEarlyCSELegacyPassPass(Registry); initializeFlattenCFGPassPass(Registry); + initializeInductiveRangeCheckEliminationPass(Registry); initializeIndVarSimplifyPass(Registry); initializeJumpThreadingPass(Registry); initializeLICMPass(Registry); initializeLoopDeletionPass(Registry); + initializeLoopAccessAnalysisPass(Registry); initializeLoopInstSimplifyPass(Registry); initializeLoopRotatePass(Registry); initializeLoopStrengthReducePass(Registry); @@ -58,6 +61,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializePartiallyInlineLibCallsPass(Registry); initializeReassociatePass(Registry); initializeRegToMemPass(Registry); + initializeRewriteStatepointsForGCPass(Registry); initializeSCCPPass(Registry); initializeIPSCCPPass(Registry); initializeSROAPass(Registry); @@ -68,7 +72,10 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeSinkingPass(Registry); initializeTailCallElimPass(Registry); initializeSeparateConstOffsetFromGEPPass(Registry); + initializeStraightLineStrengthReducePass(Registry); initializeLoadCombinePass(Registry); + initializePlaceBackedgeSafepointsImplPass(Registry); + initializePlaceSafepointsPass(Registry); } void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) { @@ -79,6 +86,10 @@ void LLVMAddAggressiveDCEPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createAggressiveDCEPass()); } +void LLVMAddBitTrackingDCEPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createBitTrackingDCEPass()); +} + void LLVMAddAlignmentFromAssumptionsPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createAlignmentFromAssumptionsPass()); } diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp index f7fa917..5c49a55 100644 --- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -23,7 +23,7 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AssumptionTracker.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CallSite.h" @@ -198,7 +198,7 @@ namespace { // getAnalysisUsage - This pass does not require any passes, but we know it // will not alter the CFG, so say so. void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionTracker>(); + AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.setPreservesCFG(); } @@ -216,7 +216,7 @@ namespace { // getAnalysisUsage - This pass does not require any passes, but we know it // will not alter the CFG, so say so. 
void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionTracker>(); + AU.addRequired<AssumptionCacheTracker>(); AU.setPreservesCFG(); } }; @@ -228,14 +228,14 @@ char SROA_SSAUp::ID = 0; INITIALIZE_PASS_BEGIN(SROA_DT, "scalarrepl", "Scalar Replacement of Aggregates (DT)", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionTracker) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(SROA_DT, "scalarrepl", "Scalar Replacement of Aggregates (DT)", false, false) INITIALIZE_PASS_BEGIN(SROA_SSAUp, "scalarrepl-ssa", "Scalar Replacement of Aggregates (SSAUp)", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionTracker) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_END(SROA_SSAUp, "scalarrepl-ssa", "Scalar Replacement of Aggregates (SSAUp)", false, false) @@ -1068,12 +1068,14 @@ public: void run(AllocaInst *AI, const SmallVectorImpl<Instruction*> &Insts) { // Remember which alloca we're promoting (for isInstInList). this->AI = AI; - if (MDNode *DebugNode = MDNode::getIfExists(AI->getContext(), AI)) { - for (User *U : DebugNode->users()) - if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U)) - DDIs.push_back(DDI); - else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U)) - DVIs.push_back(DVI); + if (auto *L = LocalAsMetadata::getIfExists(AI)) { + if (auto *DebugNode = MetadataAsValue::getIfExists(AI->getContext(), L)) { + for (User *U : DebugNode->users()) + if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U)) + DDIs.push_back(DDI); + else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U)) + DVIs.push_back(DVI); + } } LoadAndStorePromoter::run(Insts); @@ -1417,10 +1419,11 @@ bool SROA::performPromotion(Function &F) { DominatorTree *DT = nullptr; if (HasDomTree) DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - AssumptionTracker *AT = &getAnalysis<AssumptionTracker>(); + AssumptionCache &AC = + getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); BasicBlock &BB = F.getEntryBlock(); // Get the entry node for the function - DIBuilder DIB(*F.getParent()); + DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false); bool Changed = false; SmallVector<Instruction*, 64> Insts; while (1) { @@ -1436,7 +1439,7 @@ bool SROA::performPromotion(Function &F) { if (Allocas.empty()) break; if (HasDomTree) - PromoteMemToReg(Allocas, *DT, nullptr, AT); + PromoteMemToReg(Allocas, *DT, nullptr, &AC); else { SSAUpdater SSA; for (unsigned i = 0, e = Allocas.size(); i != e; ++i) { diff --git a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 6157746..bffe8df 100644 --- a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -313,7 +313,8 @@ class SeparateConstOffsetFromGEP : public FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<DataLayoutPass>(); - AU.addRequired<TargetTransformInfo>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.setPreservesCFG(); } bool doInitialization(Module &M) override { @@ -384,7 +385,7 @@ INITIALIZE_PASS_BEGIN( SeparateConstOffsetFromGEP, "separate-const-offset-from-gep", "Split GEPs to a variadic base and a constant offset for better CSE", false, false) -INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DataLayoutPass) INITIALIZE_PASS_END( SeparateConstOffsetFromGEP, 
"separate-const-offset-from-gep", @@ -857,7 +858,9 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { // of variable indices. Therefore, we don't check for addressing modes in that // case. if (!LowerGEP) { - TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>(); + TargetTransformInfo &TTI = + getAnalysis<TargetTransformInfoWrapperPass>().getTTI( + *GEP->getParent()->getParent()); if (!TTI.isLegalAddressingMode(GEP->getType()->getElementType(), /*BaseGV=*/nullptr, AccumulativeByteOffset, /*HasBaseReg=*/true, /*Scale=*/0)) { @@ -910,7 +913,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { if (LowerGEP) { // As currently BasicAA does not analyze ptrtoint/inttoptr, do not lower to // arithmetic operations if the target uses alias analysis in codegen. - if (TM && TM->getSubtarget<TargetSubtargetInfo>().useAA()) + if (TM && TM->getSubtargetImpl(*GEP->getParent()->getParent())->useAA()) lowerToSingleIndexGEPs(GEP, AccumulativeByteOffset); else lowerToArithmetics(GEP, AccumulativeByteOffset); @@ -996,6 +999,9 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { } bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) { + if (skipOptnoneFunction(F)) + return false; + if (DisableSeparateConstOffsetFromGEP) return false; diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp index 046a7cb..fb8fe38 100644 --- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -21,11 +21,11 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/SimplifyCFG.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AssumptionTracker.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/CFG.h" @@ -37,6 +37,7 @@ #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Scalar.h" using namespace llvm; #define DEBUG_TYPE "simplifycfg" @@ -47,36 +48,6 @@ UserBonusInstThreshold("bonus-inst-threshold", cl::Hidden, cl::init(1), STATISTIC(NumSimpl, "Number of blocks simplified"); -namespace { -struct CFGSimplifyPass : public FunctionPass { - static char ID; // Pass identification, replacement for typeid - unsigned BonusInstThreshold; - CFGSimplifyPass(int T = -1) : FunctionPass(ID) { - BonusInstThreshold = (T == -1) ? UserBonusInstThreshold : unsigned(T); - initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry()); - } - bool runOnFunction(Function &F) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionTracker>(); - AU.addRequired<TargetTransformInfo>(); - } -}; -} - -char CFGSimplifyPass::ID = 0; -INITIALIZE_PASS_BEGIN(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false, - false) -INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) -INITIALIZE_PASS_DEPENDENCY(AssumptionTracker) -INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false, - false) - -// Public interface to the CFGSimplification pass -FunctionPass *llvm::createCFGSimplificationPass(int Threshold) { - return new CFGSimplifyPass(Threshold); -} - /// mergeEmptyReturnBlocks - If we have more than one empty (other than phi /// node) return blocks, merge them together to promote recursive block merging. 
diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 046a7cb..fb8fe38 100644
--- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -21,11 +21,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/SimplifyCFG.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/CFG.h"
@@ -37,6 +37,7 @@
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Scalar.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "simplifycfg"
@@ -47,36 +48,6 @@ UserBonusInstThreshold("bonus-inst-threshold", cl::Hidden, cl::init(1),
 
 STATISTIC(NumSimpl, "Number of blocks simplified");
 
-namespace {
-struct CFGSimplifyPass : public FunctionPass {
-  static char ID; // Pass identification, replacement for typeid
-  unsigned BonusInstThreshold;
-  CFGSimplifyPass(int T = -1) : FunctionPass(ID) {
-    BonusInstThreshold = (T == -1) ? UserBonusInstThreshold : unsigned(T);
-    initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
-  }
-  bool runOnFunction(Function &F) override;
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<AssumptionTracker>();
-    AU.addRequired<TargetTransformInfo>();
-  }
-};
-}
-
-char CFGSimplifyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
-                      false)
-INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
-INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
-                    false)
-
-// Public interface to the CFGSimplification pass
-FunctionPass *llvm::createCFGSimplificationPass(int Threshold) {
-  return new CFGSimplifyPass(Threshold);
-}
-
 /// mergeEmptyReturnBlocks - If we have more than one empty (other than phi
 /// node) return blocks, merge them together to promote recursive block merging.
 static bool mergeEmptyReturnBlocks(Function &F) {
@@ -156,8 +127,7 @@ static bool mergeEmptyReturnBlocks(Function &F) {
 /// iterativelySimplifyCFG - Call SimplifyCFG on all the blocks in the function,
 /// iterating until no more changes are made.
 static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
-                                   const DataLayout *DL,
-                                   AssumptionTracker *AT,
+                                   const DataLayout *DL, AssumptionCache *AC,
                                    unsigned BonusInstThreshold) {
   bool Changed = false;
   bool LocalChange = true;
@@ -167,7 +137,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
     // Loop over all of the basic blocks and remove them if they are unneeded...
     //
     for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) {
-      if (SimplifyCFG(BBIt++, TTI, BonusInstThreshold, DL, AT)) {
+      if (SimplifyCFG(BBIt++, TTI, BonusInstThreshold, DL, AC)) {
         LocalChange = true;
         ++NumSimpl;
       }
@@ -177,20 +147,12 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
   return Changed;
 }
 
-// It is possible that we may require multiple passes over the code to fully
-// simplify the CFG.
-//
-bool CFGSimplifyPass::runOnFunction(Function &F) {
-  if (skipOptnoneFunction(F))
-    return false;
-
-  AssumptionTracker *AT = &getAnalysis<AssumptionTracker>();
-  const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>();
-  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
-  const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
+static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI,
+                                const DataLayout *DL, AssumptionCache *AC,
+                                int BonusInstThreshold) {
   bool EverChanged = removeUnreachableBlocks(F);
   EverChanged |= mergeEmptyReturnBlocks(F);
-  EverChanged |= iterativelySimplifyCFG(F, TTI, DL, AT, BonusInstThreshold);
+  EverChanged |= iterativelySimplifyCFG(F, TTI, DL, AC, BonusInstThreshold);
 
   // If neither pass changed anything, we're done.
   if (!EverChanged) return false;
@@ -204,9 +166,69 @@ bool CFGSimplifyPass::runOnFunction(Function &F) {
     return true;
 
   do {
-    EverChanged = iterativelySimplifyCFG(F, TTI, DL, AT, BonusInstThreshold);
+    EverChanged = iterativelySimplifyCFG(F, TTI, DL, AC, BonusInstThreshold);
     EverChanged |= removeUnreachableBlocks(F);
   } while (EverChanged);
 
   return true;
 }
+
+SimplifyCFGPass::SimplifyCFGPass()
+    : BonusInstThreshold(UserBonusInstThreshold) {}
+
+SimplifyCFGPass::SimplifyCFGPass(int BonusInstThreshold)
+    : BonusInstThreshold(BonusInstThreshold) {}
+
+PreservedAnalyses SimplifyCFGPass::run(Function &F,
+                                       AnalysisManager<Function> *AM) {
+  auto *DL = F.getParent()->getDataLayout();
+  auto &TTI = AM->getResult<TargetIRAnalysis>(F);
+  auto &AC = AM->getResult<AssumptionAnalysis>(F);
+
+  if (!simplifyFunctionCFG(F, TTI, DL, &AC, BonusInstThreshold))
+    return PreservedAnalyses::none();
+
+  return PreservedAnalyses::all();
+}
+
+namespace {
+struct CFGSimplifyPass : public FunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+  unsigned BonusInstThreshold;
+  CFGSimplifyPass(int T = -1) : FunctionPass(ID) {
+    BonusInstThreshold = (T == -1) ? UserBonusInstThreshold : unsigned(T);
+    initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
+  }
+  bool runOnFunction(Function &F) override {
+    if (skipOptnoneFunction(F))
+      return false;
+
+    AssumptionCache *AC =
+        &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+    const TargetTransformInfo &TTI =
+        getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+    DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+    const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
+    return simplifyFunctionCFG(F, TTI, DL, AC, BonusInstThreshold);
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+  }
+};
+}
+
+char CFGSimplifyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
+                      false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
+                    false)
+
+// Public interface to the CFGSimplification pass
+FunctionPass *llvm::createCFGSimplificationPass(int Threshold) {
+  return new CFGSimplifyPass(Threshold);
+}
+
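
With this change SimplifyCFG has two entry points: the new pass manager class SimplifyCFGPass, whose run method pulls TargetIRAnalysis and AssumptionAnalysis from the analysis manager, and the legacy CFGSimplifyPass wrapper behind createCFGSimplificationPass. A sketch of driving the legacy wrapper over a module, assuming M is an existing llvm::Module and the usual pass initialization has run:

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Scalar.h"

void runSimplifyCFG(llvm::Module &M) {
  llvm::legacy::FunctionPassManager FPM(&M);
  // Threshold = -1 keeps the -bonus-inst-threshold command-line default.
  FPM.add(llvm::createCFGSimplificationPass(/*Threshold=*/-1));
  FPM.doInitialization();
  for (llvm::Function &F : M)
    if (!F.isDeclaration())
      FPM.run(F);
  FPM.doFinalization();
}
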
diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp
index 903b675..d0ee0a6 100644
--- a/lib/Transforms/Scalar/Sink.cpp
+++ b/lib/Transforms/Scalar/Sink.cpp
@@ -50,9 +50,9 @@ namespace {
       FunctionPass::getAnalysisUsage(AU);
       AU.addRequired<AliasAnalysis>();
       AU.addRequired<DominatorTreeWrapperPass>();
-      AU.addRequired<LoopInfo>();
+      AU.addRequired<LoopInfoWrapperPass>();
       AU.addPreserved<DominatorTreeWrapperPass>();
-      AU.addPreserved<LoopInfo>();
+      AU.addPreserved<LoopInfoWrapperPass>();
     }
   private:
     bool ProcessBlock(BasicBlock &BB);
@@ -64,7 +64,7 @@ namespace {
 
 char Sinking::ID = 0;
 INITIALIZE_PASS_BEGIN(Sinking, "sink", "Code sinking", false, false)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
 INITIALIZE_PASS_END(Sinking, "sink", "Code sinking", false, false)
@@ -98,7 +98,7 @@ bool Sinking::AllUsesDominatedByBlock(Instruction *Inst,
 
 bool Sinking::runOnFunction(Function &F) {
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-  LI = &getAnalysis<LoopInfo>();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   AA = &getAnalysis<AliasAnalysis>();
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
   DL = DLP ? &DLP->getDataLayout() : nullptr;
diff --git a/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
new file mode 100644
index 0000000..4edc86c
--- /dev/null
+++ b/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -0,0 +1,274 @@
+//===-- StraightLineStrengthReduce.cpp - ------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements straight-line strength reduction (SLSR). Unlike loop
+// strength reduction, this algorithm is designed to reduce arithmetic
+// redundancy in straight-line code instead of loops. It has proven to be
+// effective in simplifying arithmetic statements derived from an unrolled
+// loop. It can also simplify the logic of SeparateConstOffsetFromGEP.
+//
+// There are many optimizations we can perform in the domain of SLSR. This
+// file for now contains only an initial step. Specifically, we look for
+// strength reduction candidates in the form of
+//
+//   (B + i) * S
+//
+// where B and S are integer constants or variables, and i is a constant
+// integer. If we find two such candidates
+//
+//   S1: X = (B + i)  * S
+//   S2: Y = (B + i') * S
+//
+// and S1 dominates S2, we call S1 a basis of S2, and can replace S2 with
+//
+//   Y = X + (i' - i) * S
+//
+// where (i' - i) * S is folded to the extent possible. When S2 has multiple
+// bases, we pick the one that is closest to S2, or S2's "immediate" basis.
+//
+// TODO:
+//
+// - Handle candidates in the form of B + i * S
+//
+// - Handle candidates in the form of pointer arithmetic, e.g., B[i * S]
+//
+// - Floating point arithmetic when fast math is enabled.
+//
+// - SLSR may decrease ILP at the architecture level. Targets that are very
+//   sensitive to ILP may want to disable it. Having SLSR consider ILP is
+//   left as future work.
+#include <vector>
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+namespace {
+
+class StraightLineStrengthReduce : public FunctionPass {
+public:
+  // SLSR candidate. Such a candidate must be in the form of
+  //   (Base + Index) * Stride
+  struct Candidate : public ilist_node<Candidate> {
+    Candidate(Value *B = nullptr, ConstantInt *Idx = nullptr,
+              Value *S = nullptr, Instruction *I = nullptr)
+        : Base(B), Index(Idx), Stride(S), Ins(I), Basis(nullptr) {}
+    Value *Base;
+    ConstantInt *Index;
+    Value *Stride;
+    // The instruction this candidate corresponds to. It helps us to rewrite a
+    // candidate with respect to its immediate basis. Note that one instruction
+    // can correspond to multiple candidates depending on how you associate the
+    // expression. For instance,
+    //
+    //   (a + 1) * (b + 2)
+    //
+    // can be treated as
+    //
+    //   <Base: a, Index: 1, Stride: b + 2>
+    //
+    // or
+    //
+    //   <Base: b, Index: 2, Stride: a + 1>
+    Instruction *Ins;
+    // Points to the immediate basis of this candidate, or nullptr if we cannot
+    // find any basis for this candidate.
+    Candidate *Basis;
+  };
+
+  static char ID;
+
+  StraightLineStrengthReduce() : FunctionPass(ID), DT(nullptr) {
+    initializeStraightLineStrengthReducePass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<DominatorTreeWrapperPass>();
+    // We do not modify the shape of the CFG.
+    AU.setPreservesCFG();
+  }
+
+  bool runOnFunction(Function &F) override;
+
+private:
+  // Returns true if Basis is a basis for C, i.e., Basis dominates C and they
+  // share the same base and stride.
+  bool isBasisFor(const Candidate &Basis, const Candidate &C);
+  // Checks whether I is in a candidate form. If so, adds all the matching
+  // forms to Candidates, and tries to find the immediate basis for each of
+  // them.
+  void allocateCandidateAndFindBasis(Instruction *I);
+  // Given that I is in the form of "(B + Idx) * S", adds this form to
+  // Candidates, and finds its immediate basis.
+  void allocateCandidateAndFindBasis(Value *B, ConstantInt *Idx, Value *S,
+                                     Instruction *I);
+  // Rewrites candidate C with respect to Basis.
+  void rewriteCandidateWithBasis(const Candidate &C, const Candidate &Basis);
+
+  DominatorTree *DT;
+  ilist<Candidate> Candidates;
+  // Temporarily holds all instructions that are unlinked (but not deleted) by
+  // rewriteCandidateWithBasis. These instructions will be actually removed
+  // after all rewriting finishes.
+  DenseSet<Instruction *> UnlinkedInstructions;
+};
+}  // anonymous namespace
+
+char StraightLineStrengthReduce::ID = 0;
+INITIALIZE_PASS_BEGIN(StraightLineStrengthReduce, "slsr",
+                      "Straight line strength reduction", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(StraightLineStrengthReduce, "slsr",
+                    "Straight line strength reduction", false, false)
+
+FunctionPass *llvm::createStraightLineStrengthReducePass() {
+  return new StraightLineStrengthReduce();
+}
+
+bool StraightLineStrengthReduce::isBasisFor(const Candidate &Basis,
+                                            const Candidate &C) {
+  return (Basis.Ins != C.Ins &&  // skip the same instruction
+          // Basis must dominate C in order to rewrite C with respect to Basis.
+          DT->dominates(Basis.Ins->getParent(), C.Ins->getParent()) &&
+          // They share the same base and stride.
+          Basis.Base == C.Base && Basis.Stride == C.Stride);
+}
+
+// TODO: We currently implement an algorithm whose time complexity is linear in
+// the number of existing candidates. However, a better algorithm exists. We
+// could depth-first search the dominator tree, and maintain a hash table that
+// contains all candidates that dominate the node being traversed. This hash
+// table is indexed by the base and the stride of a candidate. Therefore,
+// finding the immediate basis of a candidate boils down to one hash-table
+// lookup.
+void StraightLineStrengthReduce::allocateCandidateAndFindBasis(Value *B,
+                                                               ConstantInt *Idx,
+                                                               Value *S,
+                                                               Instruction *I) {
+  Candidate C(B, Idx, S, I);
+  // Try to compute the immediate basis of C.
+  unsigned NumIterations = 0;
+  // Limit the scan radius to avoid running forever.
+  static const unsigned MaxNumIterations = 50;
+  for (auto Basis = Candidates.rbegin();
+       Basis != Candidates.rend() && NumIterations < MaxNumIterations;
+       ++Basis, ++NumIterations) {
+    if (isBasisFor(*Basis, C)) {
+      C.Basis = &(*Basis);
+      break;
+    }
+  }
+  // Regardless of whether we find a basis for C, we need to push C to the
+  // candidate list.
+  Candidates.push_back(C);
+}
+
+void StraightLineStrengthReduce::allocateCandidateAndFindBasis(Instruction *I) {
+  Value *B = nullptr;
+  ConstantInt *Idx = nullptr;
+  // "(Base + Index) * Stride" must first of all be a Mul instruction.
+  if (I->getOpcode() == Instruction::Mul) {
+    if (IntegerType *ITy = dyn_cast<IntegerType>(I->getType())) {
+      Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
+      for (unsigned Swapped = 0; Swapped < 2; ++Swapped) {
+        // Only handle the canonical operand ordering.
+        if (match(LHS, m_Add(m_Value(B), m_ConstantInt(Idx)))) {
+          // If LHS is in the form of "Base + Index", then I is in the form of
+          // "(Base + Index) * RHS".
+          allocateCandidateAndFindBasis(B, Idx, RHS, I);
+        } else {
+          // Otherwise, at least try the form (LHS + 0) * RHS.
+          allocateCandidateAndFindBasis(LHS, ConstantInt::get(ITy, 0), RHS, I);
+        }
+        // Swap LHS and RHS so that we also cover the cases where LHS is the
+        // stride.
+        if (LHS == RHS)
+          break;
+        std::swap(LHS, RHS);
+      }
+    }
+  }
+}
+
+void StraightLineStrengthReduce::rewriteCandidateWithBasis(
+    const Candidate &C, const Candidate &Basis) {
+  // An instruction can correspond to multiple candidates. Therefore, instead of
+  // simply deleting an instruction when we rewrite it, we mark its parent as
+  // nullptr (i.e. unlink it) so that we can skip the candidates whose
+  // instruction is already rewritten.
+  if (!C.Ins->getParent())
+    return;
+  assert(C.Base == Basis.Base && C.Stride == Basis.Stride);
+  // Basis = (B + i)  * S
+  // C     = (B + i') * S
+  //   ==>
+  // C     = Basis + (i' - i) * S
+  IRBuilder<> Builder(C.Ins);
+  ConstantInt *IndexOffset = ConstantInt::get(
+      C.Ins->getContext(), C.Index->getValue() - Basis.Index->getValue());
+  Value *Reduced;
+  // TODO: preserve nsw/nuw in some cases.
+  if (IndexOffset->isOne()) {
+    // If (i' - i) is 1, fold C into Basis + S.
+    Reduced = Builder.CreateAdd(Basis.Ins, C.Stride);
+  } else if (IndexOffset->isMinusOne()) {
+    // If (i' - i) is -1, fold C into Basis - S.
+    Reduced = Builder.CreateSub(Basis.Ins, C.Stride);
+  } else {
+    Value *Bump = Builder.CreateMul(C.Stride, IndexOffset);
+    Reduced = Builder.CreateAdd(Basis.Ins, Bump);
+  }
+  Reduced->takeName(C.Ins);
+  C.Ins->replaceAllUsesWith(Reduced);
+  C.Ins->dropAllReferences();
+  // Unlink C.Ins so that we can skip other candidates also corresponding to
+  // C.Ins. The actual deletion is postponed to the end of runOnFunction.
+  C.Ins->removeFromParent();
+  UnlinkedInstructions.insert(C.Ins);
+}
+
+bool StraightLineStrengthReduce::runOnFunction(Function &F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  // Traverse the dominator tree in depth-first order. This order makes sure
+  // all bases of a candidate are in Candidates when we process it.
+  for (auto node = GraphTraits<DominatorTree *>::nodes_begin(DT);
+       node != GraphTraits<DominatorTree *>::nodes_end(DT); ++node) {
+    BasicBlock *B = node->getBlock();
+    for (auto I = B->begin(); I != B->end(); ++I) {
+      allocateCandidateAndFindBasis(I);
+    }
+  }
+
+  // Rewrite candidates in the reverse depth-first order. This order makes sure
+  // a candidate being rewritten is not a basis for any other candidate.
+  while (!Candidates.empty()) {
+    const Candidate &C = Candidates.back();
+    if (C.Basis != nullptr) {
+      rewriteCandidateWithBasis(C, *C.Basis);
+    }
+    Candidates.pop_back();
+  }
+
+  // Delete all unlinked instructions.
+  for (auto I : UnlinkedInstructions) {
+    delete I;
+  }
+  bool Ret = !UnlinkedInstructions.empty();
+  UnlinkedInstructions.clear();
+  return Ret;
+}
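
A worked example of the rewrite this new pass performs, written at the source level for clarity (the pass itself operates on LLVM IR). With basis S1 = (b + 1) * s and candidate S2 = (b + 3) * s, the index offset is 3 - 1 = 2, so S2 is traded for an add plus a multiply by a small constant that later folding can often simplify further:

// Before SLSR: two full multiplies.
long before(long b, long s) {
  long x = (b + 1) * s; // S1: Base = b, Index = 1, Stride = s
  long y = (b + 3) * s; // S2: same base and stride, Index = 3
  return x + y;
}

// After SLSR: S2 is rewritten against its basis; y == x + (3 - 1) * s.
long after(long b, long s) {
  long x = (b + 1) * s; // the basis is kept as-is
  long y = x + 2 * s;   // equals (b + 3) * s
  return x + y;
}
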
diff --git a/lib/Transforms/Scalar/StructurizeCFG.cpp b/lib/Transforms/Scalar/StructurizeCFG.cpp
index b9673ed..aaf6f9a 100644
--- a/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -10,11 +10,14 @@
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/RegionInfo.h"
 #include "llvm/Analysis/RegionIterator.h"
 #include "llvm/Analysis/RegionPass.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
 
 using namespace llvm;
@@ -166,6 +169,7 @@ class StructurizeCFG : public RegionPass {
   Region *ParentRegion;
 
   DominatorTree *DT;
+  LoopInfo *LI;
 
   RNVector Order;
   BBSet Visited;
@@ -247,6 +251,7 @@ public:
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequiredID(LowerSwitchID);
     AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
     AU.addPreserved<DominatorTreeWrapperPass>();
     RegionPass::getAnalysisUsage(AU);
   }
@@ -278,11 +283,65 @@ bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) {
 
 /// \brief Build up the general order of nodes
 void StructurizeCFG::orderNodes() {
-  scc_iterator<Region *> I = scc_begin(ParentRegion);
-  for (Order.clear(); !I.isAtEnd(); ++I) {
-    const std::vector<RegionNode *> &Nodes = *I;
-    Order.append(Nodes.begin(), Nodes.end());
+  RNVector TempOrder;
+  ReversePostOrderTraversal<Region*> RPOT(ParentRegion);
+  TempOrder.append(RPOT.begin(), RPOT.end());
+
+  std::map<Loop*, unsigned> LoopBlocks;
+
+  // The reverse post-order traversal of the list gives us an ordering close
+  // to what we want. The only problem with it is that sometimes backedges
+  // for outer loops will be visited before backedges for inner loops.
+  for (RegionNode *RN : TempOrder) {
+    BasicBlock *BB = RN->getEntry();
+    Loop *Loop = LI->getLoopFor(BB);
+    if (!LoopBlocks.count(Loop)) {
+      LoopBlocks[Loop] = 1;
+      continue;
+    }
+    LoopBlocks[Loop]++;
   }
+
+  unsigned CurrentLoopDepth = 0;
+  Loop *CurrentLoop = nullptr;
+  BBSet TempVisited;
+  for (RNVector::iterator I = TempOrder.begin(), E = TempOrder.end();
+       I != E; ++I) {
+    BasicBlock *BB = (*I)->getEntry();
+    unsigned LoopDepth = LI->getLoopDepth(BB);
+
+    if (std::find(Order.begin(), Order.end(), *I) != Order.end())
+      continue;
+
+    if (LoopDepth < CurrentLoopDepth) {
+      // Make sure we have visited all blocks in this loop before moving back
+      // to the outer loop.
+      RNVector::iterator LoopI = I;
+      while (LoopBlocks[CurrentLoop]) {
+        LoopI++;
+        BasicBlock *LoopBB = (*LoopI)->getEntry();
+        if (LI->getLoopFor(LoopBB) == CurrentLoop) {
+          LoopBlocks[CurrentLoop]--;
+          Order.push_back(*LoopI);
+        }
+      }
+    }
+
+    CurrentLoop = LI->getLoopFor(BB);
+    if (CurrentLoop) {
+      LoopBlocks[CurrentLoop]--;
+    }
+
+    CurrentLoopDepth = LoopDepth;
+    Order.push_back(*I);
+  }
+
+  // This pass originally used a post-order traversal and then operated on
+  // the list in reverse. Now that we are using a reverse post-order traversal
+  // rather than re-working the whole pass to operate on the list in order,
+  // we just reverse the list and continue to operate on it in reverse.
+  std::reverse(Order.begin(), Order.end());
 }
 
 /// \brief Determine the end of the loops
@@ -301,8 +360,9 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) {
     for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
       BasicBlock *Succ = Term->getSuccessor(i);
 
-      if (Visited.count(Succ))
+      if (Visited.count(Succ)) {
         Loops[Succ] = BB;
+      }
     }
   }
 }
@@ -437,6 +497,10 @@ void StructurizeCFG::collectInfos() {
   for (RNVector::reverse_iterator OI = Order.rbegin(), OE = Order.rend();
        OI != OE; ++OI) {
 
+    DEBUG(dbgs() << "Visiting: "
+                 << ((*OI)->isSubRegion() ? "SubRegion with entry: " : "")
+                 << (*OI)->getEntry()->getName() << " Loop Depth: "
+                 << LI->getLoopDepth((*OI)->getEntry()) << "\n");
+
     // Analyze all the conditions leading to a node
     gatherPredicates(*OI);
 
@@ -862,6 +926,7 @@ bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
   ParentRegion = R;
 
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
 
   orderNodes();
   collectInfos();
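
The rewritten orderNodes above keeps the pass's reverse traversal but switches the underlying walk from SCC iteration to a reverse post-order traversal, then patches the one case where plain RPO misbehaves: outer-loop blocks can be interleaved between inner-loop blocks, so the LoopBlocks counters drain each loop completely before the walk resumes outside it. A minimal sketch of the RPOT idiom on an ordinary function CFG, assuming only the ADT and IR headers shown:

#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Function.h"

void visitInReversePostOrder(llvm::Function &F) {
  llvm::ReversePostOrderTraversal<llvm::Function *> RPOT(&F);
  for (llvm::BasicBlock *BB : RPOT) {
    // Outside of cycles, every predecessor of BB has already been visited.
    (void)BB;
  }
}
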
diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp
index f3c3e30..715ddeb 100644
--- a/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -126,7 +126,7 @@ namespace {
 char TailCallElim::ID = 0;
 INITIALIZE_PASS_BEGIN(TailCallElim, "tailcallelim",
                       "Tail Call Elimination", false, false)
-INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_END(TailCallElim, "tailcallelim",
                     "Tail Call Elimination", false, false)
 
@@ -136,7 +136,7 @@ FunctionPass *llvm::createTailCallEliminationPass() {
 }
 
 void TailCallElim::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.addRequired<TargetTransformInfo>();
+  AU.addRequired<TargetTransformInfoWrapperPass>();
 }
 
 /// \brief Scan the specified function for alloca instructions.
@@ -386,7 +386,7 @@ bool TailCallElim::runTRE(Function &F) {
   // right, so don't even try to convert it...
   if (F.getFunctionType()->isVarArg()) return false;
 
-  TTI = &getAnalysis<TargetTransformInfo>();
+  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
   BasicBlock *OldEntry = nullptr;
   bool TailCallsAreMarkedTail = false;
   SmallVector<PHINode*, 8> ArgumentPHIs;