Diffstat (limited to 'lib/Transforms/Scalar')
-rw-r--r--  lib/Transforms/Scalar/CodeGenPrepare.cpp      |   7
-rw-r--r--  lib/Transforms/Scalar/GVN.cpp                 | 443
-rw-r--r--  lib/Transforms/Scalar/IndVarSimplify.cpp      |   2
-rw-r--r--  lib/Transforms/Scalar/LoopRotation.cpp        |  43
-rw-r--r--  lib/Transforms/Scalar/LoopStrengthReduce.cpp  |  99
-rw-r--r--  lib/Transforms/Scalar/MemCpyOptimizer.cpp     |   8
6 files changed, 363 insertions, 239 deletions
diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp index 615c517..f0d29c8 100644 --- a/lib/Transforms/Scalar/CodeGenPrepare.cpp +++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/ValueMap.h" #include "llvm/Analysis/DominatorInternals.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" @@ -88,7 +89,7 @@ namespace { /// Keeps track of non-local addresses that have been sunk into a block. /// This allows us to avoid inserting duplicate code for blocks with /// multiple load/stores of the same address. - DenseMap<Value*, Value*> SunkAddrs; + ValueMap<Value*, Value*> SunkAddrs; /// ModifiedDT - If CFG is modified in anyway, dominator tree may need to /// be updated. @@ -1653,10 +1654,6 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // start of the block. CurInstIterator = BB->begin(); SunkAddrs.clear(); - } else { - // This address is now available for reassignment, so erase the table - // entry; we don't want to match some completely different instruction. - SunkAddrs[Addr] = 0; } } ++NumMemoryInsts; diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 129af8d..996996d 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -45,6 +45,7 @@ #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" +#include <vector> using namespace llvm; using namespace PatternMatch; @@ -498,6 +499,75 @@ void ValueTable::verifyRemoved(const Value *V) const { //===----------------------------------------------------------------------===// namespace { + class GVN; + struct AvailableValueInBlock { + /// BB - The basic block in question. + BasicBlock *BB; + enum ValType { + SimpleVal, // A simple offsetted value that is accessed. + LoadVal, // A value produced by a load. + MemIntrin // A memory intrinsic which is loaded from. + }; + + /// V - The value that is live out of the block. + PointerIntPair<Value *, 2, ValType> Val; + + /// Offset - The byte offset in Val that is interesting for the load query. 
+ unsigned Offset; + + static AvailableValueInBlock get(BasicBlock *BB, Value *V, + unsigned Offset = 0) { + AvailableValueInBlock Res; + Res.BB = BB; + Res.Val.setPointer(V); + Res.Val.setInt(SimpleVal); + Res.Offset = Offset; + return Res; + } + + static AvailableValueInBlock getMI(BasicBlock *BB, MemIntrinsic *MI, + unsigned Offset = 0) { + AvailableValueInBlock Res; + Res.BB = BB; + Res.Val.setPointer(MI); + Res.Val.setInt(MemIntrin); + Res.Offset = Offset; + return Res; + } + + static AvailableValueInBlock getLoad(BasicBlock *BB, LoadInst *LI, + unsigned Offset = 0) { + AvailableValueInBlock Res; + Res.BB = BB; + Res.Val.setPointer(LI); + Res.Val.setInt(LoadVal); + Res.Offset = Offset; + return Res; + } + + bool isSimpleValue() const { return Val.getInt() == SimpleVal; } + bool isCoercedLoadValue() const { return Val.getInt() == LoadVal; } + bool isMemIntrinValue() const { return Val.getInt() == MemIntrin; } + + Value *getSimpleValue() const { + assert(isSimpleValue() && "Wrong accessor"); + return Val.getPointer(); + } + + LoadInst *getCoercedLoadValue() const { + assert(isCoercedLoadValue() && "Wrong accessor"); + return cast<LoadInst>(Val.getPointer()); + } + + MemIntrinsic *getMemIntrinValue() const { + assert(isMemIntrinValue() && "Wrong accessor"); + return cast<MemIntrinsic>(Val.getPointer()); + } + + /// MaterializeAdjustedValue - Emit code into this block to adjust the value + /// defined here to the specified type. This handles various coercion cases. + Value *MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) const; + }; class GVN : public FunctionPass { bool NoLoads; @@ -519,6 +589,11 @@ namespace { BumpPtrAllocator TableAllocator; SmallVector<Instruction*, 8> InstrsToErase; + + typedef SmallVector<NonLocalDepResult, 64> LoadDepVect; + typedef SmallVector<AvailableValueInBlock, 64> AvailValInBlkVect; + typedef SmallVector<BasicBlock*, 64> UnavailBlkVect; + public: static char ID; // Pass identification, replacement for typeid explicit GVN(bool noloads = false) @@ -599,11 +674,17 @@ namespace { } - // Helper fuctions - // FIXME: eliminate or document these better + // Helper fuctions of redundant load elimination bool processLoad(LoadInst *L); - bool processInstruction(Instruction *I); bool processNonLocalLoad(LoadInst *L); + void AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, + AvailValInBlkVect &ValuesPerBlock, + UnavailBlkVect &UnavailableBlocks); + bool PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, + UnavailBlkVect &UnavailableBlocks); + + // Other helper routines + bool processInstruction(Instruction *I); bool processBlock(BasicBlock *BB); void dump(DenseMap<uint32_t, Value*> &d); bool iterateOnFunction(Function &F); @@ -612,6 +693,7 @@ namespace { void cleanupGlobalSets(); void verifyRemoved(const Instruction *I) const; bool splitCriticalEdges(); + BasicBlock *splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ); unsigned replaceAllDominatedUsesWith(Value *From, Value *To, const BasicBlockEdge &Root); bool propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root); @@ -1159,114 +1241,6 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, return ConstantFoldLoadFromConstPtr(Src, &TD); } -namespace { - -struct AvailableValueInBlock { - /// BB - The basic block in question. - BasicBlock *BB; - enum ValType { - SimpleVal, // A simple offsetted value that is accessed. - LoadVal, // A value produced by a load. - MemIntrin // A memory intrinsic which is loaded from. 
- }; - - /// V - The value that is live out of the block. - PointerIntPair<Value *, 2, ValType> Val; - - /// Offset - The byte offset in Val that is interesting for the load query. - unsigned Offset; - - static AvailableValueInBlock get(BasicBlock *BB, Value *V, - unsigned Offset = 0) { - AvailableValueInBlock Res; - Res.BB = BB; - Res.Val.setPointer(V); - Res.Val.setInt(SimpleVal); - Res.Offset = Offset; - return Res; - } - - static AvailableValueInBlock getMI(BasicBlock *BB, MemIntrinsic *MI, - unsigned Offset = 0) { - AvailableValueInBlock Res; - Res.BB = BB; - Res.Val.setPointer(MI); - Res.Val.setInt(MemIntrin); - Res.Offset = Offset; - return Res; - } - - static AvailableValueInBlock getLoad(BasicBlock *BB, LoadInst *LI, - unsigned Offset = 0) { - AvailableValueInBlock Res; - Res.BB = BB; - Res.Val.setPointer(LI); - Res.Val.setInt(LoadVal); - Res.Offset = Offset; - return Res; - } - - bool isSimpleValue() const { return Val.getInt() == SimpleVal; } - bool isCoercedLoadValue() const { return Val.getInt() == LoadVal; } - bool isMemIntrinValue() const { return Val.getInt() == MemIntrin; } - - Value *getSimpleValue() const { - assert(isSimpleValue() && "Wrong accessor"); - return Val.getPointer(); - } - - LoadInst *getCoercedLoadValue() const { - assert(isCoercedLoadValue() && "Wrong accessor"); - return cast<LoadInst>(Val.getPointer()); - } - - MemIntrinsic *getMemIntrinValue() const { - assert(isMemIntrinValue() && "Wrong accessor"); - return cast<MemIntrinsic>(Val.getPointer()); - } - - /// MaterializeAdjustedValue - Emit code into this block to adjust the value - /// defined here to the specified type. This handles various coercion cases. - Value *MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) const { - Value *Res; - if (isSimpleValue()) { - Res = getSimpleValue(); - if (Res->getType() != LoadTy) { - const DataLayout *TD = gvn.getDataLayout(); - assert(TD && "Need target data to handle type mismatch case"); - Res = GetStoreValueForLoad(Res, Offset, LoadTy, BB->getTerminator(), - *TD); - - DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << " " - << *getSimpleValue() << '\n' - << *Res << '\n' << "\n\n\n"); - } - } else if (isCoercedLoadValue()) { - LoadInst *Load = getCoercedLoadValue(); - if (Load->getType() == LoadTy && Offset == 0) { - Res = Load; - } else { - Res = GetLoadValueForLoad(Load, Offset, LoadTy, BB->getTerminator(), - gvn); - - DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset << " " - << *getCoercedLoadValue() << '\n' - << *Res << '\n' << "\n\n\n"); - } - } else { - const DataLayout *TD = gvn.getDataLayout(); - assert(TD && "Need target data to handle type mismatch case"); - Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset, - LoadTy, BB->getTerminator(), *TD); - DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset - << " " << *getMemIntrinValue() << '\n' - << *Res << '\n' << "\n\n\n"); - } - return Res; - } -}; - -} // end anonymous namespace /// ConstructSSAForLoadSet - Given a set of loads specified by ValuesPerBlock, /// construct SSA form, allowing us to eliminate LI. 
This returns the value @@ -1323,48 +1297,59 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, return V; } +Value *AvailableValueInBlock::MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) const { + Value *Res; + if (isSimpleValue()) { + Res = getSimpleValue(); + if (Res->getType() != LoadTy) { + const DataLayout *TD = gvn.getDataLayout(); + assert(TD && "Need target data to handle type mismatch case"); + Res = GetStoreValueForLoad(Res, Offset, LoadTy, BB->getTerminator(), + *TD); + + DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << " " + << *getSimpleValue() << '\n' + << *Res << '\n' << "\n\n\n"); + } + } else if (isCoercedLoadValue()) { + LoadInst *Load = getCoercedLoadValue(); + if (Load->getType() == LoadTy && Offset == 0) { + Res = Load; + } else { + Res = GetLoadValueForLoad(Load, Offset, LoadTy, BB->getTerminator(), + gvn); + + DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset << " " + << *getCoercedLoadValue() << '\n' + << *Res << '\n' << "\n\n\n"); + } + } else { + const DataLayout *TD = gvn.getDataLayout(); + assert(TD && "Need target data to handle type mismatch case"); + Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset, + LoadTy, BB->getTerminator(), *TD); + DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset + << " " << *getMemIntrinValue() << '\n' + << *Res << '\n' << "\n\n\n"); + } + return Res; +} + static bool isLifetimeStart(const Instruction *Inst) { if (const IntrinsicInst* II = dyn_cast<IntrinsicInst>(Inst)) return II->getIntrinsicID() == Intrinsic::lifetime_start; return false; } -/// processNonLocalLoad - Attempt to eliminate a load whose dependencies are -/// non-local by performing PHI construction. -bool GVN::processNonLocalLoad(LoadInst *LI) { - // Find the non-local dependencies of the load. - SmallVector<NonLocalDepResult, 64> Deps; - AliasAnalysis::Location Loc = VN.getAliasAnalysis()->getLocation(LI); - MD->getNonLocalPointerDependency(Loc, true, LI->getParent(), Deps); - //DEBUG(dbgs() << "INVESTIGATING NONLOCAL LOAD: " - // << Deps.size() << *LI << '\n'); - - // If we had to process more than one hundred blocks to find the - // dependencies, this load isn't worth worrying about. Optimizing - // it will be too expensive. - unsigned NumDeps = Deps.size(); - if (NumDeps > 100) - return false; - - // If we had a phi translation failure, we'll have a single entry which is a - // clobber in the current block. Reject this early. - if (NumDeps == 1 && - !Deps[0].getResult().isDef() && !Deps[0].getResult().isClobber()) { - DEBUG( - dbgs() << "GVN: non-local load "; - WriteAsOperand(dbgs(), LI); - dbgs() << " has unknown dependencies\n"; - ); - return false; - } +void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, + AvailValInBlkVect &ValuesPerBlock, + UnavailBlkVect &UnavailableBlocks) { // Filter out useless results (non-locals, etc). Keep track of the blocks // where we have a value available in repl, also keep track of whether we see // dependencies that produce an unknown value for the load (such as a call // that could potentially clobber the load). 
- SmallVector<AvailableValueInBlock, 64> ValuesPerBlock; - SmallVector<BasicBlock*, 64> UnavailableBlocks; - + unsigned NumDeps = Deps.size(); for (unsigned i = 0, e = NumDeps; i != e; ++i) { BasicBlock *DepBB = Deps[i].getBB(); MemDepResult DepInfo = Deps[i].getResult(); @@ -1480,35 +1465,11 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { } UnavailableBlocks.push_back(DepBB); - continue; } +} - // If we have no predecessors that produce a known value for this load, exit - // early. - if (ValuesPerBlock.empty()) return false; - - // If all of the instructions we depend on produce a known value for this - // load, then it is fully redundant and we can use PHI insertion to compute - // its value. Insert PHIs and remove the fully redundant value now. - if (UnavailableBlocks.empty()) { - DEBUG(dbgs() << "GVN REMOVING NONLOCAL LOAD: " << *LI << '\n'); - - // Perform PHI construction. - Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this); - LI->replaceAllUsesWith(V); - - if (isa<PHINode>(V)) - V->takeName(LI); - if (V->getType()->getScalarType()->isPointerTy()) - MD->invalidateCachedPointerInfo(V); - markInstructionForDeletion(LI); - ++NumGVNLoad; - return true; - } - - if (!EnablePRE || !EnableLoadPRE) - return false; - +bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, + UnavailBlkVect &UnavailableBlocks) { // Okay, we have *some* definitions of the value. This means that the value // is available in some of our (transitive) predecessors. Lets think about // doing PRE of this load. This will involve inserting a new load into the @@ -1526,7 +1487,6 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { BasicBlock *LoadBB = LI->getParent(); BasicBlock *TmpBB = LoadBB; - bool allSingleSucc = true; while (TmpBB->getSinglePredecessor()) { TmpBB = TmpBB->getSinglePredecessor(); if (TmpBB == LoadBB) // Infinite (unreachable) loop. @@ -1555,7 +1515,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { for (unsigned i = 0, e = UnavailableBlocks.size(); i != e; ++i) FullyAvailableBlocks[UnavailableBlocks[i]] = false; - SmallVector<std::pair<TerminatorInst*, unsigned>, 4> NeedToSplit; + SmallVector<BasicBlock *, 4> CriticalEdgePred; for (pred_iterator PI = pred_begin(LoadBB), E = pred_end(LoadBB); PI != E; ++PI) { BasicBlock *Pred = *PI; @@ -1578,20 +1538,14 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { return false; } - unsigned SuccNum = GetSuccessorNumber(Pred, LoadBB); - NeedToSplit.push_back(std::make_pair(Pred->getTerminator(), SuccNum)); + CriticalEdgePred.push_back(Pred); } } - if (!NeedToSplit.empty()) { - toSplit.append(NeedToSplit.begin(), NeedToSplit.end()); - return false; - } - // Decide whether PRE is profitable for this load. unsigned NumUnavailablePreds = PredLoads.size(); assert(NumUnavailablePreds != 0 && - "Fully available value should be eliminated above!"); + "Fully available value should already be eliminated!"); // If this load is unavailable in multiple predecessors, reject it. // FIXME: If we could restructure the CFG, we could make a common pred with @@ -1600,6 +1554,17 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { if (NumUnavailablePreds != 1) return false; + // Split critical edges, and update the unavailable predecessors accordingly. 
+ for (SmallVector<BasicBlock *, 4>::iterator I = CriticalEdgePred.begin(), + E = CriticalEdgePred.end(); I != E; I++) { + BasicBlock *OrigPred = *I; + BasicBlock *NewPred = splitCriticalEdges(OrigPred, LoadBB); + PredLoads.erase(OrigPred); + PredLoads[NewPred] = 0; + DEBUG(dbgs() << "Split critical edge " << OrigPred->getName() << "->" + << LoadBB->getName() << '\n'); + } + // Check if the load can safely be moved to all the unavailable predecessors. bool CanDoPRE = true; SmallVector<Instruction*, 8> NewInsts; @@ -1615,13 +1580,8 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { // pointer if it is not available. PHITransAddr Address(LI->getPointerOperand(), TD); Value *LoadPtr = 0; - if (allSingleSucc) { - LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred, - *DT, NewInsts); - } else { - Address.PHITranslateValue(LoadBB, UnavailablePred, DT); - LoadPtr = Address.getAddr(); - } + LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred, + *DT, NewInsts); // If we couldn't find or insert a computation of this phi translated value, // we fail PRE. @@ -1632,24 +1592,6 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { break; } - // Make sure it is valid to move this load here. We have to watch out for: - // @1 = getelementptr (i8* p, ... - // test p and branch if == 0 - // load @1 - // It is valid to have the getelementptr before the test, even if p can - // be 0, as getelementptr only does address arithmetic. - // If we are not pushing the value through any multiple-successor blocks - // we do not have this case. Otherwise, check that the load is safe to - // put anywhere; this can be improved, but should be conservatively safe. - if (!allSingleSucc && - // FIXME: REEVALUTE THIS. - !isSafeToLoadUnconditionally(LoadPtr, - UnavailablePred->getTerminator(), - LI->getAlignment(), TD)) { - CanDoPRE = false; - break; - } - I->second = LoadPtr; } @@ -1659,7 +1601,9 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { if (MD) MD->removeInstruction(I); I->eraseFromParent(); } - return false; + // HINT:Don't revert the edge-splitting as following transformation may + // also need to split these critial edges. + return !CriticalEdgePred.empty(); } // Okay, we can eliminate this load by inserting a reload in the predecessor @@ -1714,6 +1658,72 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { return true; } +/// processNonLocalLoad - Attempt to eliminate a load whose dependencies are +/// non-local by performing PHI construction. +bool GVN::processNonLocalLoad(LoadInst *LI) { + // Step 1: Find the non-local dependencies of the load. + LoadDepVect Deps; + AliasAnalysis::Location Loc = VN.getAliasAnalysis()->getLocation(LI); + MD->getNonLocalPointerDependency(Loc, true, LI->getParent(), Deps); + + // If we had to process more than one hundred blocks to find the + // dependencies, this load isn't worth worrying about. Optimizing + // it will be too expensive. + unsigned NumDeps = Deps.size(); + if (NumDeps > 100) + return false; + + // If we had a phi translation failure, we'll have a single entry which is a + // clobber in the current block. Reject this early. 
+ if (NumDeps == 1 && + !Deps[0].getResult().isDef() && !Deps[0].getResult().isClobber()) { + DEBUG( + dbgs() << "GVN: non-local load "; + WriteAsOperand(dbgs(), LI); + dbgs() << " has unknown dependencies\n"; + ); + return false; + } + + // Step 2: Analyze the availability of the load + AvailValInBlkVect ValuesPerBlock; + UnavailBlkVect UnavailableBlocks; + AnalyzeLoadAvailability(LI, Deps, ValuesPerBlock, UnavailableBlocks); + + // If we have no predecessors that produce a known value for this load, exit + // early. + if (ValuesPerBlock.empty()) + return false; + + // Step 3: Eliminate fully redundancy. + // + // If all of the instructions we depend on produce a known value for this + // load, then it is fully redundant and we can use PHI insertion to compute + // its value. Insert PHIs and remove the fully redundant value now. + if (UnavailableBlocks.empty()) { + DEBUG(dbgs() << "GVN REMOVING NONLOCAL LOAD: " << *LI << '\n'); + + // Perform PHI construction. + Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this); + LI->replaceAllUsesWith(V); + + if (isa<PHINode>(V)) + V->takeName(LI); + if (V->getType()->getScalarType()->isPointerTy()) + MD->invalidateCachedPointerInfo(V); + markInstructionForDeletion(LI); + ++NumGVNLoad; + return true; + } + + // Step 4: Eliminate partial redundancy. + if (!EnablePRE || !EnableLoadPRE) + return false; + + return PerformLoadPRE(LI, ValuesPerBlock, UnavailableBlocks); +} + + static void patchReplacementInstruction(Instruction *I, Value *Repl) { // Patch the replacement so that it is not more restrictive than the value // being replaced. @@ -2296,8 +2306,6 @@ bool GVN::runOnFunction(Function& F) { while (ShouldContinue) { DEBUG(dbgs() << "GVN iteration: " << Iteration << "\n"); ShouldContinue = iterateOnFunction(F); - if (splitCriticalEdges()) - ShouldContinue = true; Changed |= ShouldContinue; ++Iteration; } @@ -2309,6 +2317,7 @@ bool GVN::runOnFunction(Function& F) { Changed |= PREChanged; } } + // FIXME: Should perform GVN again after PRE does something. PRE can move // computations into blocks where they become fully redundant. Note that // we can't do this until PRE's critical edge splitting updates memdep. @@ -2542,6 +2551,15 @@ bool GVN::performPRE(Function &F) { return Changed; } +/// Split the critical edge connecting the given two blocks, and return +/// the block inserted to the critical edge. +BasicBlock *GVN::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) { + BasicBlock *BB = SplitCriticalEdge(Pred, Succ, this); + if (MD) + MD->invalidateCachedPredecessors(); + return BB; +} + /// splitCriticalEdges - Split critical edges found during the previous /// iteration that may enable further optimization. bool GVN::splitCriticalEdges() { @@ -2568,9 +2586,18 @@ bool GVN::iterateOnFunction(Function &F) { RE = RPOT.end(); RI != RE; ++RI) Changed |= processBlock(*RI); #else + // Save the blocks this function have before transformation begins. GVN may + // split critical edge, and hence may invalidate the RPO/DT iterator. 
+ // + std::vector<BasicBlock *> BBVect; + BBVect.reserve(256); for (df_iterator<DomTreeNode*> DI = df_begin(DT->getRootNode()), DE = df_end(DT->getRootNode()); DI != DE; ++DI) - Changed |= processBlock(DI->getBlock()); + BBVect.push_back(DI->getBlock()); + + for (std::vector<BasicBlock *>::iterator I = BBVect.begin(), E = BBVect.end(); + I != E; I++) + Changed |= processBlock(*I); #endif return Changed; diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index 8e76c78..df11e92 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -532,7 +532,7 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { // and varies predictably *inside* the loop. Evaluate the value it // contains when the loop exits, if possible. const SCEV *ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop()); - if (!SE->isLoopInvariant(ExitValue, L)) + if (!SE->isLoopInvariant(ExitValue, L) || !isSafeToExpand(ExitValue)) continue; // Computing the value outside of the loop brings no benefit if : diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp index e98ae95..14c5655 100644 --- a/lib/Transforms/Scalar/LoopRotation.cpp +++ b/lib/Transforms/Scalar/LoopRotation.cpp @@ -56,8 +56,8 @@ namespace { } bool runOnLoop(Loop *L, LPPassManager &LPM); - void simplifyLoopLatch(Loop *L); - bool rotateLoop(Loop *L); + bool simplifyLoopLatch(Loop *L); + bool rotateLoop(Loop *L, bool SimplifiedLatch); private: LoopInfo *LI; @@ -84,13 +84,14 @@ bool LoopRotate::runOnLoop(Loop *L, LPPassManager &LPM) { // Simplify the loop latch before attempting to rotate the header // upward. Rotation may not be needed if the loop tail can be folded into the // loop exit. - simplifyLoopLatch(L); + bool SimplifiedLatch = simplifyLoopLatch(L); // One loop can be rotated multiple times. bool MadeChange = false; - while (rotateLoop(L)) + while (rotateLoop(L, SimplifiedLatch)) { MadeChange = true; - + SimplifiedLatch = false; + } return MadeChange; } @@ -212,25 +213,25 @@ static bool shouldSpeculateInstrs(BasicBlock::iterator Begin, /// canonical form so downstream passes can handle it. /// /// I don't believe this invalidates SCEV. -void LoopRotate::simplifyLoopLatch(Loop *L) { +bool LoopRotate::simplifyLoopLatch(Loop *L) { BasicBlock *Latch = L->getLoopLatch(); if (!Latch || Latch->hasAddressTaken()) - return; + return false; BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator()); if (!Jmp || !Jmp->isUnconditional()) - return; + return false; BasicBlock *LastExit = Latch->getSinglePredecessor(); if (!LastExit || !L->isLoopExiting(LastExit)) - return; + return false; BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator()); if (!BI) - return; + return false; if (!shouldSpeculateInstrs(Latch->begin(), Jmp)) - return; + return false; DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into " << LastExit->getName() << "\n"); @@ -253,10 +254,20 @@ void LoopRotate::simplifyLoopLatch(Loop *L) { if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>()) DT->eraseNode(Latch); Latch->eraseFromParent(); + return true; } /// Rotate loop LP. Return true if the loop is rotated. -bool LoopRotate::rotateLoop(Loop *L) { +/// +/// \param SimplifiedLatch is true if the latch was just folded into the final +/// loop exit. In this case we may want to rotate even though the new latch is +/// now an exiting branch. This rotation would have happened had the latch not +/// been simplified. 
However, if SimplifiedLatch is false, then we avoid +/// rotating loops in which the latch exits to avoid excessive or endless +/// rotation. LoopRotate should be repeatable and converge to a canonical +/// form. This property is satisfied because simplifying the loop latch can only +/// happen once across multiple invocations of the LoopRotate pass. +bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // If the loop has only one block then there is not much to rotate. if (L->getBlocks().size() == 1) return false; @@ -276,7 +287,12 @@ bool LoopRotate::rotateLoop(Loop *L) { // If the loop latch already contains a branch that leaves the loop then the // loop is already rotated. - if (OrigLatch == 0 || L->isLoopExiting(OrigLatch)) + if (OrigLatch == 0) + return false; + + // Rotate if either the loop latch does *not* exit the loop, or if the loop + // latch was just simplified. + if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch) return false; // Check size of original header and reject loop if it is very big or we can't @@ -505,4 +521,3 @@ bool LoopRotate::rotateLoop(Loop *L) { ++NumRotated; return true; } - diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 73e44d7..b107fef 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -774,6 +774,16 @@ DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) { } namespace { +class LSRUse; +} +// Check if it is legal to fold 2 base registers. +static bool isLegal2RegAMUse(const TargetTransformInfo &TTI, const LSRUse &LU, + const Formula &F); +// Get the cost of the scaling factor used in F for LU. +static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, + const LSRUse &LU, const Formula &F); + +namespace { /// Cost - This class is used to measure and compare candidate formulae. class Cost { @@ -785,11 +795,12 @@ class Cost { unsigned NumBaseAdds; unsigned ImmCost; unsigned SetupCost; + unsigned ScaleCost; public: Cost() : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0), - SetupCost(0) {} + SetupCost(0), ScaleCost(0) {} bool operator<(const Cost &Other) const; @@ -799,9 +810,9 @@ public: // Once any of the metrics loses, they must all remain losers. bool isValid() { return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds - | ImmCost | SetupCost) != ~0u) + | ImmCost | SetupCost | ScaleCost) != ~0u) || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds - & ImmCost & SetupCost) == ~0u); + & ImmCost & SetupCost & ScaleCost) == ~0u); } #endif @@ -810,12 +821,14 @@ public: return NumRegs == ~0u; } - void RateFormula(const Formula &F, + void RateFormula(const TargetTransformInfo &TTI, + const Formula &F, SmallPtrSet<const SCEV *, 16> &Regs, const DenseSet<const SCEV *> &VisitedRegs, const Loop *L, const SmallVectorImpl<int64_t> &Offsets, ScalarEvolution &SE, DominatorTree &DT, + const LSRUse &LU, SmallPtrSet<const SCEV *, 16> *LoserRegs = 0); void print(raw_ostream &OS) const; @@ -900,12 +913,14 @@ void Cost::RatePrimaryRegister(const SCEV *Reg, } } -void Cost::RateFormula(const Formula &F, +void Cost::RateFormula(const TargetTransformInfo &TTI, + const Formula &F, SmallPtrSet<const SCEV *, 16> &Regs, const DenseSet<const SCEV *> &VisitedRegs, const Loop *L, const SmallVectorImpl<int64_t> &Offsets, ScalarEvolution &SE, DominatorTree &DT, + const LSRUse &LU, SmallPtrSet<const SCEV *, 16> *LoserRegs) { // Tally up the registers. 
if (const SCEV *ScaledReg = F.ScaledReg) { @@ -932,7 +947,12 @@ void Cost::RateFormula(const Formula &F, // Determine how many (unfolded) adds we'll need inside the loop. size_t NumBaseParts = F.BaseRegs.size() + (F.UnfoldedOffset != 0); if (NumBaseParts > 1) - NumBaseAdds += NumBaseParts - 1; + // Do not count the base and a possible second register if the target + // allows to fold 2 registers. + NumBaseAdds += NumBaseParts - (1 + isLegal2RegAMUse(TTI, LU, F)); + + // Accumulate non-free scaling amounts. + ScaleCost += getScalingFactorCost(TTI, LU, F); // Tally up the non-zero immediates. for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(), @@ -955,6 +975,7 @@ void Cost::Loose() { NumBaseAdds = ~0u; ImmCost = ~0u; SetupCost = ~0u; + ScaleCost = ~0u; } /// operator< - Choose the lower cost. @@ -967,6 +988,8 @@ bool Cost::operator<(const Cost &Other) const { return NumIVMuls < Other.NumIVMuls; if (NumBaseAdds != Other.NumBaseAdds) return NumBaseAdds < Other.NumBaseAdds; + if (ScaleCost != Other.ScaleCost) + return ScaleCost < Other.ScaleCost; if (ImmCost != Other.ImmCost) return ImmCost < Other.ImmCost; if (SetupCost != Other.SetupCost) @@ -983,6 +1006,8 @@ void Cost::print(raw_ostream &OS) const { if (NumBaseAdds != 0) OS << ", plus " << NumBaseAdds << " base add" << (NumBaseAdds == 1 ? "" : "s"); + if (ScaleCost != 0) + OS << ", plus " << ScaleCost << " scale cost"; if (ImmCost != 0) OS << ", plus " << ImmCost << " imm cost"; if (SetupCost != 0) @@ -1359,6 +1384,58 @@ static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, F.BaseOffset, F.HasBaseReg, F.Scale); } +static bool isLegal2RegAMUse(const TargetTransformInfo &TTI, const LSRUse &LU, + const Formula &F) { + // If F is used as an Addressing Mode, it may fold one Base plus one + // scaled register. If the scaled register is nil, do as if another + // element of the base regs is a 1-scaled register. + // This is possible if BaseRegs has at least 2 registers. + + // If this is not an address calculation, this is not an addressing mode + // use. + if (LU.Kind != LSRUse::Address) + return false; + + // F is already scaled. + if (F.Scale != 0) + return false; + + // We need to keep one register for the base and one to scale. + if (F.BaseRegs.size() < 2) + return false; + + return isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, + F.BaseGV, F.BaseOffset, F.HasBaseReg, 1); + } + +static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, + const LSRUse &LU, const Formula &F) { + if (!F.Scale) + return 0; + assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, + LU.AccessTy, F) && "Illegal formula in use."); + + switch (LU.Kind) { + case LSRUse::Address: { + int CurScaleCost = TTI.getScalingFactorCost(LU.AccessTy, F.BaseGV, + F.BaseOffset, F.HasBaseReg, + F.Scale); + assert(CurScaleCost >= 0 && "Legal addressing mode has an illegal cost!"); + return CurScaleCost; + } + case LSRUse::ICmpZero: + // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg. + // Therefore, return 0 in case F.Scale == -1. 
+ return F.Scale != -1; + + case LSRUse::Basic: + case LSRUse::Special: + return 0; + } + + llvm_unreachable("Invalid LSRUse Kind!"); +} + static bool isAlwaysFoldable(const TargetTransformInfo &TTI, LSRUse::KindType Kind, Type *AccessTy, GlobalValue *BaseGV, int64_t BaseOffset, @@ -3607,7 +3684,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { abs64(NewF.BaseOffset)) && (C->getValue()->getValue() + NewF.BaseOffset).countTrailingZeros() >= - CountTrailingZeros_64(NewF.BaseOffset)) + countTrailingZeros<uint64_t>(NewF.BaseOffset)) goto skip_formula; // Ok, looks good. @@ -3690,7 +3767,7 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { // the corresponding bad register from the Regs set. Cost CostF; Regs.clear(); - CostF.RateFormula(F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, + CostF.RateFormula(TTI, F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, LU, &LoserRegs); if (CostF.isLoser()) { // During initial formula generation, undesirable formulae are generated @@ -3726,7 +3803,8 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { Cost CostBest; Regs.clear(); - CostBest.RateFormula(Best, Regs, VisitedRegs, L, LU.Offsets, SE, DT); + CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, LU.Offsets, SE, + DT, LU); if (CostF < CostBest) std::swap(F, Best); DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs()); @@ -4079,7 +4157,8 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution, // the current best, prune the search at that point. NewCost = CurCost; NewRegs = CurRegs; - NewCost.RateFormula(F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT); + NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT, + LU); if (NewCost < SolutionCost) { Workspace.push_back(&F); if (Workspace.size() != Uses.size()) { diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index be0f0e8..c325925 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -626,8 +626,14 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, return false; Type *StructTy = cast<PointerType>(A->getType())->getElementType(); - uint64_t destSize = TD->getTypeAllocSize(StructTy); + if (!StructTy->isSized()) { + // The call may never return and hence the copy-instruction may never + // be executed, and therefore it's not safe to say "the destination + // has at least <cpyLen> bytes, as implied by the copy-instruction", + return false; + } + uint64_t destSize = TD->getTypeAllocSize(StructTy); if (destSize < srcSize) return false; } else { |