Diffstat (limited to 'lib/Transforms')
34 files changed, 1445 insertions, 1279 deletions
diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp index a32e550..1522aa4 100644 --- a/lib/Transforms/IPO/GlobalOpt.cpp +++ b/lib/Transforms/IPO/GlobalOpt.cpp @@ -341,6 +341,12 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init, dyn_cast_or_null<ConstantExpr>(ConstantFoldInstruction(GEP, TD, TLI)); if (Init && CE && CE->getOpcode() == Instruction::GetElementPtr) SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE); + + // If the initializer is an all-null value and we have an inbounds GEP, + // we already know what the result of any load from that GEP is. + // TODO: Handle splats. + if (Init && isa<ConstantAggregateZero>(Init) && GEP->isInBounds()) + SubInit = Constant::getNullValue(GEP->getType()->getElementType()); } Changed |= CleanupConstantGlobalUsers(GEP, SubInit, TD, TLI); diff --git a/lib/Transforms/IPO/InlineAlways.cpp b/lib/Transforms/IPO/InlineAlways.cpp index 3c7fac6..664ddf6 100644 --- a/lib/Transforms/IPO/InlineAlways.cpp +++ b/lib/Transforms/IPO/InlineAlways.cpp @@ -32,7 +32,6 @@ namespace { // AlwaysInliner only inlines functions that are mark as "always inline". class AlwaysInliner : public Inliner { - InlineCostAnalyzer CA; public: // Use extremely low threshold. AlwaysInliner() : Inliner(ID, -2000000000, /*InsertLifetime*/true) { @@ -43,40 +42,11 @@ namespace { initializeAlwaysInlinerPass(*PassRegistry::getPassRegistry()); } static char ID; // Pass identification, replacement for typeid - InlineCost getInlineCost(CallSite CS) { - Function *Callee = CS.getCalledFunction(); - // We assume indirect calls aren't calling an always-inline function. - if (!Callee) return InlineCost::getNever(); - - // We can't inline calls to external functions. - // FIXME: We shouldn't even get here. - if (Callee->isDeclaration()) return InlineCost::getNever(); - - // Return never for anything not marked as always inline. - if (!Callee->hasFnAttr(Attribute::AlwaysInline)) - return InlineCost::getNever(); - - // We still have to check the inline cost in case there are reasons to - // not inline which trump the always-inline attribute such as setjmp and - // indirectbr. - return CA.getInlineCost(CS); - } - float getInlineFudgeFactor(CallSite CS) { - return CA.getInlineFudgeFactor(CS); - } - void resetCachedCostInfo(Function *Caller) { - CA.resetCachedCostInfo(Caller); - } - void growCachedCostInfo(Function* Caller, Function* Callee) { - CA.growCachedCostInfo(Caller, Callee); - } + virtual InlineCost getInlineCost(CallSite CS); virtual bool doFinalization(CallGraph &CG) { return removeDeadFunctions(CG, /*AlwaysInlineOnly=*/true); } virtual bool doInitialization(CallGraph &CG); - void releaseMemory() { - CA.clear(); - } }; } @@ -93,9 +63,70 @@ Pass *llvm::createAlwaysInlinerPass(bool InsertLifetime) { return new AlwaysInliner(InsertLifetime); } +/// \brief Minimal filter to detect invalid constructs for inlining. +static bool isInlineViable(Function &F) { + bool ReturnsTwice = F.hasFnAttr(Attribute::ReturnsTwice); + for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) { + // Disallow inlining of functions which contain an indirect branch. + if (isa<IndirectBrInst>(BI->getTerminator())) + return false; + + for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE; + ++II) { + CallSite CS(II); + if (!CS) + continue; + + // Disallow recursive calls. + if (&F == CS.getCalledFunction()) + return false; + + // Disallow calls which expose returns-twice to a function not previously + // attributed as such. 
+ if (!ReturnsTwice && CS.isCall() && + cast<CallInst>(CS.getInstruction())->canReturnTwice()) + return false; + } + } + + return true; +} + +/// \brief Get the inline cost for the always-inliner. +/// +/// The always inliner *only* handles functions which are marked with the +/// attribute to force inlining. As such, it is dramatically simpler and avoids +/// using the powerful (but expensive) inline cost analysis. Instead it uses +/// a very simple and boring direct walk of the instructions looking for +/// impossible-to-inline constructs. +/// +/// Note, it would be possible to go to some lengths to cache the information +/// computed here, but as we only expect to do this for relatively few and +/// small functions which have the explicit attribute to force inlining, it is +/// likely not worth it in practice. +InlineCost AlwaysInliner::getInlineCost(CallSite CS) { + Function *Callee = CS.getCalledFunction(); + // We assume indirect calls aren't calling an always-inline function. + if (!Callee) return InlineCost::getNever(); + + // We can't inline calls to external functions. + // FIXME: We shouldn't even get here. + if (Callee->isDeclaration()) return InlineCost::getNever(); + + // Return never for anything not marked as always inline. + if (!Callee->hasFnAttr(Attribute::AlwaysInline)) + return InlineCost::getNever(); + + // Do some minimal analysis to preclude non-viable functions. + if (!isInlineViable(*Callee)) + return InlineCost::getNever(); + + // Otherwise, force inlining. + return InlineCost::getAlways(); +} + // doInitialization - Initializes the vector of functions that have not // been annotated with the "always inline" attribute. bool AlwaysInliner::doInitialization(CallGraph &CG) { - CA.setTargetData(getAnalysisIfAvailable<TargetData>()); return false; } diff --git a/lib/Transforms/IPO/InlineSimple.cpp b/lib/Transforms/IPO/InlineSimple.cpp index 03032e6..50038d8 100644 --- a/lib/Transforms/IPO/InlineSimple.cpp +++ b/lib/Transforms/IPO/InlineSimple.cpp @@ -40,21 +40,9 @@ namespace { } static char ID; // Pass identification, replacement for typeid InlineCost getInlineCost(CallSite CS) { - return CA.getInlineCost(CS); - } - float getInlineFudgeFactor(CallSite CS) { - return CA.getInlineFudgeFactor(CS); - } - void resetCachedCostInfo(Function *Caller) { - CA.resetCachedCostInfo(Caller); - } - void growCachedCostInfo(Function* Caller, Function* Callee) { - CA.growCachedCostInfo(Caller, Callee); + return CA.getInlineCost(CS, getInlineThreshold(CS)); } virtual bool doInitialization(CallGraph &CG); - void releaseMemory() { - CA.clear(); - } }; } diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp index 9975333..dc9cbfb 100644 --- a/lib/Transforms/IPO/Inliner.cpp +++ b/lib/Transforms/IPO/Inliner.cpp @@ -19,7 +19,6 @@ #include "llvm/IntrinsicInst.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/InlineCost.h" -#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Target/TargetData.h" #include "llvm/Transforms/IPO/InlinerPass.h" #include "llvm/Transforms/Utils/Cloning.h" @@ -37,6 +36,11 @@ STATISTIC(NumCallsDeleted, "Number of call sites deleted, not inlined"); STATISTIC(NumDeleted, "Number of functions deleted because all callers found"); STATISTIC(NumMergedAllocas, "Number of allocas merged together"); +// This weirdly named statistic tracks the number of times that, when attemting +// to inline a function A into B, we analyze the callers of B in order to see +// if those would be more profitable and blocked inline steps. 
+STATISTIC(NumCallerCallersAnalyzed, "Number of caller-callers analyzed"); + static cl::opt<int> InlineLimit("inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore, cl::desc("Control the amount of inlining to perform (default = 225)")); @@ -232,14 +236,10 @@ bool Inliner::shouldInline(CallSite CS) { return false; } - int Cost = IC.getValue(); Function *Caller = CS.getCaller(); - int CurrentThreshold = getInlineThreshold(CS); - float FudgeFactor = getInlineFudgeFactor(CS); - int AdjThreshold = (int)(CurrentThreshold * FudgeFactor); - if (Cost >= AdjThreshold) { - DEBUG(dbgs() << " NOT Inlining: cost=" << Cost - << ", thres=" << AdjThreshold + if (!IC) { + DEBUG(dbgs() << " NOT Inlining: cost=" << IC.getCost() + << ", thres=" << (IC.getCostDelta() + IC.getCost()) << ", Call: " << *CS.getInstruction() << "\n"); return false; } @@ -256,12 +256,17 @@ bool Inliner::shouldInline(CallSite CS) { // are used. Thus we will always have the opportunity to make local inlining // decisions. Importantly the linkonce-ODR linkage covers inline functions // and templates in C++. + // + // FIXME: All of this logic should be sunk into getInlineCost. It relies on + // the internal implementation of the inline cost metrics rather than + // treating them as truly abstract units etc. if (Caller->hasLocalLinkage() || Caller->getLinkage() == GlobalValue::LinkOnceODRLinkage) { int TotalSecondaryCost = 0; - bool outerCallsFound = false; + // The candidate cost to be imposed upon the current function. + int CandidateCost = IC.getCost() - (InlineConstants::CallPenalty + 1); // This bool tracks what happens if we do NOT inline C into B. - bool callerWillBeRemoved = true; + bool callerWillBeRemoved = Caller->hasLocalLinkage(); // This bool tracks what happens if we DO inline C into B. bool inliningPreventsSomeOuterInline = false; for (Value::use_iterator I = Caller->use_begin(), E =Caller->use_end(); @@ -277,26 +282,20 @@ bool Inliner::shouldInline(CallSite CS) { } InlineCost IC2 = getInlineCost(CS2); - if (IC2.isNever()) + ++NumCallerCallersAnalyzed; + if (!IC2) { callerWillBeRemoved = false; - if (IC2.isAlways() || IC2.isNever()) + continue; + } + if (IC2.isAlways()) continue; - outerCallsFound = true; - int Cost2 = IC2.getValue(); - int CurrentThreshold2 = getInlineThreshold(CS2); - float FudgeFactor2 = getInlineFudgeFactor(CS2); - - if (Cost2 >= (int)(CurrentThreshold2 * FudgeFactor2)) - callerWillBeRemoved = false; - - // See if we have this case. We subtract off the penalty - // for the call instruction, which we would be deleting. - if (Cost2 < (int)(CurrentThreshold2 * FudgeFactor2) && - Cost2 + Cost - (InlineConstants::CallPenalty + 1) >= - (int)(CurrentThreshold2 * FudgeFactor2)) { + // See if inlining or original callsite would erase the cost delta of + // this callsite. We subtract off the penalty for the call instruction, + // which we would be deleting. 
+ if (IC2.getCostDelta() <= CandidateCost) { inliningPreventsSomeOuterInline = true; - TotalSecondaryCost += Cost2; + TotalSecondaryCost += IC2.getCost(); } } // If all outer calls to Caller would get inlined, the cost for the last @@ -306,17 +305,16 @@ bool Inliner::shouldInline(CallSite CS) { if (callerWillBeRemoved && Caller->use_begin() != Caller->use_end()) TotalSecondaryCost += InlineConstants::LastCallToStaticBonus; - if (outerCallsFound && inliningPreventsSomeOuterInline && - TotalSecondaryCost < Cost) { - DEBUG(dbgs() << " NOT Inlining: " << *CS.getInstruction() << - " Cost = " << Cost << + if (inliningPreventsSomeOuterInline && TotalSecondaryCost < IC.getCost()) { + DEBUG(dbgs() << " NOT Inlining: " << *CS.getInstruction() << + " Cost = " << IC.getCost() << ", outer Cost = " << TotalSecondaryCost << '\n'); return false; } } - DEBUG(dbgs() << " Inlining: cost=" << Cost - << ", thres=" << AdjThreshold + DEBUG(dbgs() << " Inlining: cost=" << IC.getCost() + << ", thres=" << (IC.getCostDelta() + IC.getCost()) << ", Call: " << *CS.getInstruction() << '\n'); return true; } @@ -335,38 +333,6 @@ static bool InlineHistoryIncludes(Function *F, int InlineHistoryID, return false; } -/// \brief Simplify arguments going into a particular callsite. -/// -/// This is important to do each time we add a callsite due to inlining so that -/// constants and other entities which feed into inline cost estimation are -/// properly recognized when analyzing the new callsite. Consider: -/// void outer(int x) { -/// if (x < 42) -/// return inner(42 - x); -/// ... -/// } -/// void inner(int x) { -/// ... -/// } -/// -/// The inliner gives calls to 'outer' with a constant argument a bonus because -/// it will delete one side of a branch. But the resulting call to 'inner' -/// will, after inlining, also have a constant operand. We need to do just -/// enough constant folding to expose this for callsite arguments. The rest -/// will be taken care of after the inliner finishes running. -static void simplifyCallSiteArguments(const TargetData *TD, CallSite CS) { - // FIXME: It would be nice to avoid this smallvector if RAUW doesn't - // invalidate operand iterators in any cases. - SmallVector<std::pair<Value *, Value*>, 4> SimplifiedArgs; - for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); - I != E; ++I) - if (Instruction *Inst = dyn_cast<Instruction>(*I)) - if (Value *SimpleArg = SimplifyInstruction(Inst, TD)) - SimplifiedArgs.push_back(std::make_pair(Inst, SimpleArg)); - for (unsigned Idx = 0, Size = SimplifiedArgs.size(); Idx != Size; ++Idx) - SimplifiedArgs[Idx].first->replaceAllUsesWith(SimplifiedArgs[Idx].second); -} - bool Inliner::runOnSCC(CallGraphSCC &SCC) { CallGraph &CG = getAnalysis<CallGraph>(); const TargetData *TD = getAnalysisIfAvailable<TargetData>(); @@ -455,8 +421,6 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) { CG[Caller]->removeCallEdgeFor(CS); CS.getInstruction()->eraseFromParent(); ++NumCallsDeleted; - // Update the cached cost info with the missing call - growCachedCostInfo(Caller, NULL); } else { // We can only inline direct calls to non-declarations. 
if (Callee == 0 || Callee->isDeclaration()) continue; @@ -494,14 +458,9 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) { for (unsigned i = 0, e = InlineInfo.InlinedCalls.size(); i != e; ++i) { Value *Ptr = InlineInfo.InlinedCalls[i]; - CallSite NewCS = Ptr; - simplifyCallSiteArguments(TD, NewCS); - CallSites.push_back(std::make_pair(NewCS, NewHistoryID)); + CallSites.push_back(std::make_pair(CallSite(Ptr), NewHistoryID)); } } - - // Update the cached cost info with the inlined call. - growCachedCostInfo(Caller, Callee); } // If we inlined or deleted the last possible call site to the function, @@ -521,8 +480,6 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) { // Remove any call graph edges from the callee to its callees. CalleeNode->removeAllCalledFunctions(); - resetCachedCostInfo(Callee); - // Removing the node for callee from the call graph and delete it. delete CG.removeFunctionFromModule(CalleeNode); ++NumDeleted; @@ -601,14 +558,13 @@ bool Inliner::removeDeadFunctions(CallGraph &CG, bool AlwaysInlineOnly) { // Note that it doesn't matter that we are iterating over a non-stable order // here to do this, it doesn't matter which order the functions are deleted // in. - std::sort(FunctionsToRemove.begin(), FunctionsToRemove.end()); + array_pod_sort(FunctionsToRemove.begin(), FunctionsToRemove.end()); FunctionsToRemove.erase(std::unique(FunctionsToRemove.begin(), FunctionsToRemove.end()), FunctionsToRemove.end()); for (SmallVectorImpl<CallGraphNode *>::iterator I = FunctionsToRemove.begin(), E = FunctionsToRemove.end(); I != E; ++I) { - resetCachedCostInfo((*I)->getFunction()); delete CG.removeFunctionFromModule(*I); ++NumDeleted; } diff --git a/lib/Transforms/IPO/Internalize.cpp b/lib/Transforms/IPO/Internalize.cpp index 7cb1d18..fb5869e 100644 --- a/lib/Transforms/IPO/Internalize.cpp +++ b/lib/Transforms/IPO/Internalize.cpp @@ -122,6 +122,11 @@ bool InternalizePass::runOnModule(Module &M) { bool Changed = false; + // Never internalize functions which code-gen might insert. + // FIXME: We should probably add this (and the __stack_chk_guard) via some + // type of call-back in CodeGen. + ExternalNames.insert("__stack_chk_fail"); + // Mark all functions not in the api as internal. // FIXME: maybe use private linkage? for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) @@ -148,9 +153,11 @@ bool InternalizePass::runOnModule(Module &M) { // won't find them. (see MachineModuleInfo.) ExternalNames.insert("llvm.global_ctors"); ExternalNames.insert("llvm.global_dtors"); - ExternalNames.insert("llvm.noinline"); ExternalNames.insert("llvm.global.annotations"); + // Never internalize symbols code-gen inserts. + ExternalNames.insert("__stack_chk_guard"); + // Mark all global variables with initializers that are not in the api as // internal as well. // FIXME: maybe use private linkage? 
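The Inliner.cpp change above stops comparing a raw cost against a fudge-factor-adjusted threshold and instead queries the InlineCost object directly: its boolean conversion answers "inline or not", and the threshold shown in the debug output is reconstructed as cost plus cost delta. A minimal sketch of that relationship, assuming a simplified, hypothetical cost record (the struct and its fields below are illustrative, not the LLVM API):

#include <iostream>

// Hypothetical, simplified stand-in for the patch's InlineCost semantics: the
// decision carries a cost and the threshold it was measured against, and
// converts to true exactly when the cost is under the threshold.
struct SimpleInlineCost {
  int Cost;
  int Threshold;
  explicit operator bool() const { return Cost < Threshold; }
  int getCost() const { return Cost; }
  // Mirrors the debug output in the patch, where the printed threshold is
  // reconstructed as getCostDelta() + getCost().
  int getCostDelta() const { return Threshold - Cost; }
};

int main() {
  SimpleInlineCost Costs[] = {{120, 225},   // under threshold: would be inlined
                              {400, 225}};  // over threshold: rejected
  for (const SimpleInlineCost &IC : Costs)
    std::cout << (IC ? "    Inlining" : "    NOT Inlining")
              << ": cost=" << IC.getCost()
              << ", thres=" << (IC.getCostDelta() + IC.getCost()) << "\n";
  return 0;
}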
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index 8408437..43b4ab5 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -35,6 +35,11 @@ using namespace llvm; static cl::opt<bool> RunVectorization("vectorize", cl::desc("Run vectorization passes")); +static cl::opt<bool> +UseGVNAfterVectorization("use-gvn-after-vectorization", + cl::init(false), cl::Hidden, + cl::desc("Run GVN instead of Early CSE after vectorization passes")); + PassManagerBuilder::PassManagerBuilder() { OptLevel = 2; SizeLevel = 0; @@ -182,8 +187,10 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { if (Vectorize) { MPM.add(createBBVectorizePass()); MPM.add(createInstructionCombiningPass()); - if (OptLevel > 1) - MPM.add(createGVNPass()); // Remove redundancies + if (OptLevel > 1 && UseGVNAfterVectorization) + MPM.add(createGVNPass()); // Remove redundancies + else + MPM.add(createEarlyCSEPass()); // Catch trivial redundancies } MPM.add(createAggressiveDCEPass()); // Delete dead instructions @@ -202,11 +209,13 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { if (OptLevel > 1) MPM.add(createConstantMergePass()); // Merge dup global constants } + addExtensionsToPM(EP_OptimizerLast, MPM); } void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM, bool Internalize, - bool RunInliner) { + bool RunInliner, + bool DisableGVNLoadPRE) { // Provide AliasAnalysis services for optimizations. addInitialAliasAnalysisPasses(PM); @@ -262,9 +271,9 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM, PM.add(createFunctionAttrsPass()); // Add nocapture. PM.add(createGlobalsModRefPass()); // IP alias analysis. - PM.add(createLICMPass()); // Hoist loop invariants. - PM.add(createGVNPass()); // Remove redundancies. - PM.add(createMemCpyOptPass()); // Remove dead memcpys. + PM.add(createLICMPass()); // Hoist loop invariants. + PM.add(createGVNPass(DisableGVNLoadPRE)); // Remove redundancies. + PM.add(createMemCpyOptPass()); // Remove dead memcpys. // Nuke dead stores. PM.add(createDeadStoreEliminationPass()); diff --git a/lib/Transforms/InstCombine/InstCombine.h b/lib/Transforms/InstCombine/InstCombine.h index 464e9d0..199df51 100644 --- a/lib/Transforms/InstCombine/InstCombine.h +++ b/lib/Transforms/InstCombine/InstCombine.h @@ -291,9 +291,9 @@ public: return 0; // Don't do anything with FI } - void ComputeMaskedBits(Value *V, const APInt &Mask, APInt &KnownZero, + void ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne, unsigned Depth = 0) const { - return llvm::ComputeMaskedBits(V, Mask, KnownZero, KnownOne, TD, Depth); + return llvm::ComputeMaskedBits(V, KnownZero, KnownOne, TD, Depth); } bool MaskedValueIsZero(Value *V, const APInt &Mask, diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 908038b..05e702f 100644 --- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -141,10 +141,9 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { // a sub and fuse this add with it. 
if (LHS->hasOneUse() && (XorRHS->getValue()+1).isPowerOf2()) { IntegerType *IT = cast<IntegerType>(I.getType()); - APInt Mask = APInt::getAllOnesValue(IT->getBitWidth()); APInt LHSKnownOne(IT->getBitWidth(), 0); APInt LHSKnownZero(IT->getBitWidth(), 0); - ComputeMaskedBits(XorLHS, Mask, LHSKnownZero, LHSKnownOne); + ComputeMaskedBits(XorLHS, LHSKnownZero, LHSKnownOne); if ((XorRHS->getValue() | LHSKnownZero).isAllOnesValue()) return BinaryOperator::CreateSub(ConstantExpr::getAdd(XorRHS, CI), XorLHS); @@ -202,14 +201,13 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { // A+B --> A|B iff A and B have no bits set in common. if (IntegerType *IT = dyn_cast<IntegerType>(I.getType())) { - APInt Mask = APInt::getAllOnesValue(IT->getBitWidth()); APInt LHSKnownOne(IT->getBitWidth(), 0); APInt LHSKnownZero(IT->getBitWidth(), 0); - ComputeMaskedBits(LHS, Mask, LHSKnownZero, LHSKnownOne); + ComputeMaskedBits(LHS, LHSKnownZero, LHSKnownOne); if (LHSKnownZero != 0) { APInt RHSKnownOne(IT->getBitWidth(), 0); APInt RHSKnownZero(IT->getBitWidth(), 0); - ComputeMaskedBits(RHS, Mask, RHSKnownZero, RHSKnownOne); + ComputeMaskedBits(RHS, RHSKnownZero, RHSKnownOne); // No bits in common -> bitwise or. if ((LHSKnownZero|RHSKnownZero).isAllOnesValue()) diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 1165660..0dbe11d 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1362,13 +1362,8 @@ static bool CollectBSwapParts(Value *V, int OverallLeftShift, uint32_t ByteMask, // part of the value (e.g. byte 3) then it must be shifted right. If from the // low part, it must be shifted left. unsigned DestByteNo = InputByteNo + OverallLeftShift; - if (InputByteNo < ByteValues.size()/2) { - if (ByteValues.size()-1-DestByteNo != InputByteNo) - return true; - } else { - if (ByteValues.size()-1-DestByteNo != InputByteNo) - return true; - } + if (ByteValues.size()-1-DestByteNo != InputByteNo) + return true; // If the destination byte value is already defined, the values are or'd // together, which isn't a bswap (unless it's an or of the same bits). 
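The add-to-or rewrite in the InstCombineAddSub hunk above relies on a simple bit-level fact: if two integers have no set bits in common, no bit position of the addition generates a carry, so A + B equals A | B. The patch proves "no common bits" conservatively from known-zero bits ((LHSKnownZero | RHSKnownZero) is all ones); a self-contained sketch of the underlying property, independent of the APInt machinery:

#include <cassert>
#include <cstdint>

// Conservative form of the check: the transform fires only when the operands
// provably share no set bits, because then no carry can occur and A + B is
// exactly A | B.
static bool haveNoCommonBits(uint32_t A, uint32_t B) {
  return (A & B) == 0;
}

int main() {
  uint32_t A = 0xF0F0F0F0u, B = 0x0F0F0F0Fu;  // disjoint bit patterns
  assert(haveNoCommonBits(A, B));
  assert(A + B == (A | B));                   // rewrite is exact

  uint32_t C = 0x3u, D = 0x1u;                // bit 0 overlaps: carry occurs
  assert(!haveNoCommonBits(C, D));
  assert(C + D != (C | D));                   // 4 vs 3: add != or here
  return 0;
}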
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index b550fe8..77e4727 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -361,8 +361,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { uint32_t BitWidth = IT->getBitWidth(); APInt KnownZero(BitWidth, 0); APInt KnownOne(BitWidth, 0); - ComputeMaskedBits(II->getArgOperand(0), APInt::getAllOnesValue(BitWidth), - KnownZero, KnownOne); + ComputeMaskedBits(II->getArgOperand(0), KnownZero, KnownOne); unsigned TrailingZeros = KnownOne.countTrailingZeros(); APInt Mask(APInt::getLowBitsSet(BitWidth, TrailingZeros)); if ((Mask & KnownZero) == Mask) @@ -380,8 +379,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { uint32_t BitWidth = IT->getBitWidth(); APInt KnownZero(BitWidth, 0); APInt KnownOne(BitWidth, 0); - ComputeMaskedBits(II->getArgOperand(0), APInt::getAllOnesValue(BitWidth), - KnownZero, KnownOne); + ComputeMaskedBits(II->getArgOperand(0), KnownZero, KnownOne); unsigned LeadingZeros = KnownOne.countLeadingZeros(); APInt Mask(APInt::getHighBitsSet(BitWidth, LeadingZeros)); if ((Mask & KnownZero) == Mask) @@ -394,17 +392,16 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1); IntegerType *IT = cast<IntegerType>(II->getArgOperand(0)->getType()); uint32_t BitWidth = IT->getBitWidth(); - APInt Mask = APInt::getSignBit(BitWidth); APInt LHSKnownZero(BitWidth, 0); APInt LHSKnownOne(BitWidth, 0); - ComputeMaskedBits(LHS, Mask, LHSKnownZero, LHSKnownOne); + ComputeMaskedBits(LHS, LHSKnownZero, LHSKnownOne); bool LHSKnownNegative = LHSKnownOne[BitWidth - 1]; bool LHSKnownPositive = LHSKnownZero[BitWidth - 1]; if (LHSKnownNegative || LHSKnownPositive) { APInt RHSKnownZero(BitWidth, 0); APInt RHSKnownOne(BitWidth, 0); - ComputeMaskedBits(RHS, Mask, RHSKnownZero, RHSKnownOne); + ComputeMaskedBits(RHS, RHSKnownZero, RHSKnownOne); bool RHSKnownNegative = RHSKnownOne[BitWidth - 1]; bool RHSKnownPositive = RHSKnownZero[BitWidth - 1]; if (LHSKnownNegative && RHSKnownNegative) { @@ -488,14 +485,13 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::umul_with_overflow: { Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1); unsigned BitWidth = cast<IntegerType>(LHS->getType())->getBitWidth(); - APInt Mask = APInt::getAllOnesValue(BitWidth); APInt LHSKnownZero(BitWidth, 0); APInt LHSKnownOne(BitWidth, 0); - ComputeMaskedBits(LHS, Mask, LHSKnownZero, LHSKnownOne); + ComputeMaskedBits(LHS, LHSKnownZero, LHSKnownOne); APInt RHSKnownZero(BitWidth, 0); APInt RHSKnownOne(BitWidth, 0); - ComputeMaskedBits(RHS, Mask, RHSKnownZero, RHSKnownOne); + ComputeMaskedBits(RHS, RHSKnownZero, RHSKnownOne); // Get the largest possible values for each operand. 
APInt LHSMax = ~LHSKnownZero; diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp index c5ddb75..39279f4 100644 --- a/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -541,8 +541,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, // If Op1C some other power of two, convert: uint32_t BitWidth = Op1C->getType()->getBitWidth(); APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); - APInt TypeMask(APInt::getAllOnesValue(BitWidth)); - ComputeMaskedBits(ICI->getOperand(0), TypeMask, KnownZero, KnownOne); + ComputeMaskedBits(ICI->getOperand(0), KnownZero, KnownOne); APInt KnownZeroMask(~KnownZero); if (KnownZeroMask.isPowerOf2()) { // Exactly 1 possible 1? @@ -590,9 +589,8 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, APInt KnownZeroLHS(BitWidth, 0), KnownOneLHS(BitWidth, 0); APInt KnownZeroRHS(BitWidth, 0), KnownOneRHS(BitWidth, 0); - APInt TypeMask(APInt::getAllOnesValue(BitWidth)); - ComputeMaskedBits(LHS, TypeMask, KnownZeroLHS, KnownOneLHS); - ComputeMaskedBits(RHS, TypeMask, KnownZeroRHS, KnownOneRHS); + ComputeMaskedBits(LHS, KnownZeroLHS, KnownOneLHS); + ComputeMaskedBits(RHS, KnownZeroRHS, KnownOneRHS); if (KnownZeroLHS == KnownZeroRHS && KnownOneLHS == KnownOneRHS) { APInt KnownBits = KnownZeroLHS | KnownOneLHS; @@ -911,8 +909,7 @@ Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) { ICI->isEquality() && (Op1C->isZero() || Op1C->getValue().isPowerOf2())){ unsigned BitWidth = Op1C->getType()->getBitWidth(); APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); - APInt TypeMask(APInt::getAllOnesValue(BitWidth)); - ComputeMaskedBits(Op0, TypeMask, KnownZero, KnownOne); + ComputeMaskedBits(Op0, KnownZero, KnownOne); APInt KnownZeroMask(~KnownZero); if (KnownZeroMask.isPowerOf2()) { diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 5308992..ab2987f 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -1028,9 +1028,8 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, // of the high bits truncated out of x are known. unsigned DstBits = LHSI->getType()->getPrimitiveSizeInBits(), SrcBits = LHSI->getOperand(0)->getType()->getPrimitiveSizeInBits(); - APInt Mask(APInt::getHighBitsSet(SrcBits, SrcBits-DstBits)); APInt KnownZero(SrcBits, 0), KnownOne(SrcBits, 0); - ComputeMaskedBits(LHSI->getOperand(0), Mask, KnownZero, KnownOne); + ComputeMaskedBits(LHSI->getOperand(0), KnownZero, KnownOne); // If all the high bits are known, we can do this xform. if ((KnownZero|KnownOne).countLeadingOnes() >= SrcBits-DstBits) { diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 7446a51..b2f2e24 100644 --- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -22,6 +22,72 @@ using namespace llvm; STATISTIC(NumDeadStore, "Number of dead stores eliminated"); +// Try to kill dead allocas by walking through its uses until we see some use +// that could escape. This is a conservative analysis which tries to handle +// GEPs, bitcasts, stores, and no-op intrinsics. These tend to be the things +// left after inlining and SROA finish chewing on an alloca. 
+static Instruction *removeDeadAlloca(InstCombiner &IC, AllocaInst &AI) { + SmallVector<Instruction *, 4> Worklist, DeadStores; + Worklist.push_back(&AI); + do { + Instruction *PI = Worklist.pop_back_val(); + for (Value::use_iterator UI = PI->use_begin(), UE = PI->use_end(); + UI != UE; ++UI) { + Instruction *I = cast<Instruction>(*UI); + switch (I->getOpcode()) { + default: + // Give up the moment we see something we can't handle. + return 0; + + case Instruction::GetElementPtr: + case Instruction::BitCast: + Worklist.push_back(I); + continue; + + case Instruction::Call: + // We can handle a limited subset of calls to no-op intrinsics. + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + switch (II->getIntrinsicID()) { + case Intrinsic::dbg_declare: + case Intrinsic::dbg_value: + case Intrinsic::invariant_start: + case Intrinsic::invariant_end: + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + continue; + default: + return 0; + } + } + // Reject everything else. + return 0; + + case Instruction::Store: { + // Stores into the alloca are only live if the alloca is live. + StoreInst *SI = cast<StoreInst>(I); + // We can eliminate atomic stores, but not volatile. + if (SI->isVolatile()) + return 0; + // The store is only trivially safe if the poniter is the destination + // as opposed to the value. We're conservative here and don't check for + // the case where we store the address of a dead alloca into a dead + // alloca. + if (SI->getPointerOperand() != PI) + return 0; + DeadStores.push_back(I); + continue; + } + } + } + } while (!Worklist.empty()); + + // The alloca is dead. Kill off all the stores to it, and then replace it + // with undef. + while (!DeadStores.empty()) + IC.EraseInstFromFunction(*DeadStores.pop_back_val()); + return IC.ReplaceInstUsesWith(AI, UndefValue::get(AI.getType())); +} + Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { // Ensure that the alloca array size argument has type intptr_t, so that // any casting is exposed early. @@ -81,7 +147,10 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { AI.setAlignment(TD->getPrefTypeAlignment(AI.getAllocatedType())); } - return 0; + // Try to aggressively remove allocas which are only used for GEPs, lifetime + // markers, and stores. This happens when SROA iteratively promotes stores + // out of the alloca, and we need to cleanup after it. + return removeDeadAlloca(*this, AI); } diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 61a8e5b..125c74a 100644 --- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -142,7 +142,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, Instruction *I = dyn_cast<Instruction>(V); if (!I) { - ComputeMaskedBits(V, DemandedMask, KnownZero, KnownOne, Depth); + ComputeMaskedBits(V, KnownZero, KnownOne, Depth); return 0; // Only analyze instructions. } @@ -156,10 +156,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // this instruction has a simpler value in that context. if (I->getOpcode() == Instruction::And) { // If either the LHS or the RHS are Zero, the result is zero. 
- ComputeMaskedBits(I->getOperand(1), DemandedMask, - RHSKnownZero, RHSKnownOne, Depth+1); - ComputeMaskedBits(I->getOperand(0), DemandedMask & ~RHSKnownZero, - LHSKnownZero, LHSKnownOne, Depth+1); + ComputeMaskedBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1); + ComputeMaskedBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1); // If all of the demanded bits are known 1 on one side, return the other. // These bits cannot contribute to the result of the 'and' in this @@ -180,10 +178,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // only bits from X or Y are demanded. // If either the LHS or the RHS are One, the result is One. - ComputeMaskedBits(I->getOperand(1), DemandedMask, - RHSKnownZero, RHSKnownOne, Depth+1); - ComputeMaskedBits(I->getOperand(0), DemandedMask & ~RHSKnownOne, - LHSKnownZero, LHSKnownOne, Depth+1); + ComputeMaskedBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1); + ComputeMaskedBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1); // If all of the demanded bits are known zero on one side, return the // other. These bits cannot contribute to the result of the 'or' in this @@ -206,7 +202,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, } // Compute the KnownZero/KnownOne bits to simplify things downstream. - ComputeMaskedBits(I, DemandedMask, KnownZero, KnownOne, Depth); + ComputeMaskedBits(I, KnownZero, KnownOne, Depth); return 0; } @@ -219,7 +215,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, switch (I->getOpcode()) { default: - ComputeMaskedBits(I, DemandedMask, KnownZero, KnownOne, Depth); + ComputeMaskedBits(I, KnownZero, KnownOne, Depth); break; case Instruction::And: // If either the LHS or the RHS are Zero, the result is zero. @@ -570,7 +566,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // Otherwise just hand the sub off to ComputeMaskedBits to fill in // the known zeros and ones. - ComputeMaskedBits(V, DemandedMask, KnownZero, KnownOne, Depth); + ComputeMaskedBits(V, KnownZero, KnownOne, Depth); // Turn this into a xor if LHS is 2^n-1 and the remaining bits are known // zero. @@ -729,10 +725,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // The sign bit is the LHS's sign bit, except when the result of the // remainder is zero. if (DemandedMask.isNegative() && KnownZero.isNonNegative()) { - APInt Mask2 = APInt::getSignBit(BitWidth); APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0); - ComputeMaskedBits(I->getOperand(0), Mask2, LHSKnownZero, LHSKnownOne, - Depth+1); + ComputeMaskedBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1); // If it's known zero, our sign bit is also zero. if (LHSKnownZero.isNegative()) KnownZero |= LHSKnownZero; @@ -795,7 +789,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, return 0; } } - ComputeMaskedBits(V, DemandedMask, KnownZero, KnownOne, Depth); + ComputeMaskedBits(V, KnownZero, KnownOne, Depth); break; } diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 349ba83..066b2ec 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -916,8 +916,9 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // Handle gep(bitcast x) and gep(gep x, 0, 0, 0). 
Value *StrippedPtr = PtrOp->stripPointerCasts(); PointerType *StrippedPtrTy = dyn_cast<PointerType>(StrippedPtr->getType()); - // We do not handle pointer-vector geps here - if (!StrippedPtr) + + // We do not handle pointer-vector geps here. + if (!StrippedPtrTy) return 0; if (StrippedPtr != PtrOp && diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index 85fda30..8bb337e 100644 --- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -22,16 +22,20 @@ #define DEBUG_TYPE "tsan" #include "FunctionBlackList.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Intrinsics.h" #include "llvm/Function.h" +#include "llvm/LLVMContext.h" +#include "llvm/Metadata.h" #include "llvm/Module.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/IRBuilder.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetData.h" #include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Utils/ModuleUtils.h" @@ -42,16 +46,36 @@ using namespace llvm; static cl::opt<std::string> ClBlackListFile("tsan-blacklist", cl::desc("Blacklist file"), cl::Hidden); +static cl::opt<bool> ClPrintStats("tsan-print-stats", + cl::desc("Print ThreadSanitizer instrumentation stats"), cl::Hidden); + namespace { + +// Stats counters for ThreadSanitizer instrumentation. +struct ThreadSanitizerStats { + size_t NumInstrumentedReads; + size_t NumInstrumentedWrites; + size_t NumOmittedReadsBeforeWrite; + size_t NumAccessesWithBadSize; + size_t NumInstrumentedVtableWrites; + size_t NumOmittedReadsFromConstantGlobals; + size_t NumOmittedReadsFromVtable; +}; + /// ThreadSanitizer: instrument the code in module to find races. struct ThreadSanitizer : public FunctionPass { ThreadSanitizer(); bool runOnFunction(Function &F); bool doInitialization(Module &M); + bool doFinalization(Module &M); bool instrumentLoadOrStore(Instruction *I); static char ID; // Pass identification, replacement for typeid. private: + void choseInstructionsToInstrument(SmallVectorImpl<Instruction*> &Local, + SmallVectorImpl<Instruction*> &All); + bool addrPointsToConstantData(Value *Addr); + TargetData *TD; OwningPtr<FunctionBlackList> BL; // Callbacks to run-time library are computed in doInitialization. @@ -61,6 +85,10 @@ struct ThreadSanitizer : public FunctionPass { static const size_t kNumberOfAccessSizes = 5; Value *TsanRead[kNumberOfAccessSizes]; Value *TsanWrite[kNumberOfAccessSizes]; + Value *TsanVptrUpdate; + + // Stats are modified w/o synchronization. + ThreadSanitizerStats stats; }; } // namespace @@ -83,6 +111,7 @@ bool ThreadSanitizer::doInitialization(Module &M) { if (!TD) return false; BL.reset(new FunctionBlackList(ClBlackListFile)); + memset(&stats, 0, sizeof(stats)); // Always insert a call to __tsan_init into the module's CTORs. 
IRBuilder<> IRB(M.getContext()); @@ -105,14 +134,103 @@ bool ThreadSanitizer::doInitialization(Module &M) { TsanWrite[i] = M.getOrInsertFunction(WriteName, IRB.getVoidTy(), IRB.getInt8PtrTy(), NULL); } + TsanVptrUpdate = M.getOrInsertFunction("__tsan_vptr_update", IRB.getVoidTy(), + IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), + NULL); + return true; +} + +bool ThreadSanitizer::doFinalization(Module &M) { + if (ClPrintStats) { + errs() << "ThreadSanitizerStats " << M.getModuleIdentifier() + << ": wr " << stats.NumInstrumentedWrites + << "; rd " << stats.NumInstrumentedReads + << "; vt " << stats.NumInstrumentedVtableWrites + << "; bs " << stats.NumAccessesWithBadSize + << "; rbw " << stats.NumOmittedReadsBeforeWrite + << "; rcg " << stats.NumOmittedReadsFromConstantGlobals + << "; rvt " << stats.NumOmittedReadsFromVtable + << "\n"; + } return true; } +static bool isVtableAccess(Instruction *I) { + if (MDNode *Tag = I->getMetadata(LLVMContext::MD_tbaa)) { + if (Tag->getNumOperands() < 1) return false; + if (MDString *Tag1 = dyn_cast<MDString>(Tag->getOperand(0))) { + if (Tag1->getString() == "vtable pointer") return true; + } + } + return false; +} + +bool ThreadSanitizer::addrPointsToConstantData(Value *Addr) { + // If this is a GEP, just analyze its pointer operand. + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Addr)) + Addr = GEP->getPointerOperand(); + + if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) { + if (GV->isConstant()) { + // Reads from constant globals can not race with any writes. + stats.NumOmittedReadsFromConstantGlobals++; + return true; + } + } else if(LoadInst *L = dyn_cast<LoadInst>(Addr)) { + if (isVtableAccess(L)) { + // Reads from a vtable pointer can not race with any writes. + stats.NumOmittedReadsFromVtable++; + return true; + } + } + return false; +} + +// Instrumenting some of the accesses may be proven redundant. +// Currently handled: +// - read-before-write (within same BB, no calls between) +// +// We do not handle some of the patterns that should not survive +// after the classic compiler optimizations. +// E.g. two reads from the same temp should be eliminated by CSE, +// two writes should be eliminated by DSE, etc. +// +// 'Local' is a vector of insns within the same BB (no calls between). +// 'All' is a vector of insns that will be instrumented. +void ThreadSanitizer::choseInstructionsToInstrument( + SmallVectorImpl<Instruction*> &Local, + SmallVectorImpl<Instruction*> &All) { + SmallSet<Value*, 8> WriteTargets; + // Iterate from the end. + for (SmallVectorImpl<Instruction*>::reverse_iterator It = Local.rbegin(), + E = Local.rend(); It != E; ++It) { + Instruction *I = *It; + if (StoreInst *Store = dyn_cast<StoreInst>(I)) { + WriteTargets.insert(Store->getPointerOperand()); + } else { + LoadInst *Load = cast<LoadInst>(I); + Value *Addr = Load->getPointerOperand(); + if (WriteTargets.count(Addr)) { + // We will write to this temp, so no reason to analyze the read. + stats.NumOmittedReadsBeforeWrite++; + continue; + } + if (addrPointsToConstantData(Addr)) { + // Addr points to some constant data -- it can not race with any writes. 
+ continue; + } + } + All.push_back(I); + } + Local.clear(); +} + bool ThreadSanitizer::runOnFunction(Function &F) { if (!TD) return false; if (BL->isIn(F)) return false; SmallVector<Instruction*, 8> RetVec; - SmallVector<Instruction*, 8> LoadsAndStores; + SmallVector<Instruction*, 8> AllLoadsAndStores; + SmallVector<Instruction*, 8> LocalLoadsAndStores; bool Res = false; bool HasCalls = false; @@ -123,12 +241,15 @@ bool ThreadSanitizer::runOnFunction(Function &F) { for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE; ++BI) { if (isa<LoadInst>(BI) || isa<StoreInst>(BI)) - LoadsAndStores.push_back(BI); + LocalLoadsAndStores.push_back(BI); else if (isa<ReturnInst>(BI)) RetVec.push_back(BI); - else if (isa<CallInst>(BI) || isa<InvokeInst>(BI)) + else if (isa<CallInst>(BI) || isa<InvokeInst>(BI)) { HasCalls = true; + choseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores); + } } + choseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores); } // We have collected all loads and stores. @@ -136,8 +257,8 @@ bool ThreadSanitizer::runOnFunction(Function &F) { // (e.g. variables that do not escape, etc). // Instrument memory accesses. - for (size_t i = 0, n = LoadsAndStores.size(); i < n; ++i) { - Res |= instrumentLoadOrStore(LoadsAndStores[i]); + for (size_t i = 0, n = AllLoadsAndStores.size(); i < n; ++i) { + Res |= instrumentLoadOrStore(AllLoadsAndStores[i]); } // Instrument function entry/exit points if there were instrumented accesses. @@ -151,6 +272,7 @@ bool ThreadSanitizer::runOnFunction(Function &F) { IRBuilder<> IRBRet(RetVec[i]); IRBRet.CreateCall(TsanFuncExit); } + Res = true; } return Res; } @@ -167,12 +289,23 @@ bool ThreadSanitizer::instrumentLoadOrStore(Instruction *I) { uint32_t TypeSize = TD->getTypeStoreSizeInBits(OrigTy); if (TypeSize != 8 && TypeSize != 16 && TypeSize != 32 && TypeSize != 64 && TypeSize != 128) { + stats.NumAccessesWithBadSize++; // Ignore all unusual sizes. return false; } + if (IsWrite && isVtableAccess(I)) { + Value *StoredValue = cast<StoreInst>(I)->getValueOperand(); + IRB.CreateCall2(TsanVptrUpdate, + IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()), + IRB.CreatePointerCast(StoredValue, IRB.getInt8PtrTy())); + stats.NumInstrumentedVtableWrites++; + return true; + } size_t Idx = CountTrailingZeros_32(TypeSize / 8); assert(Idx < kNumberOfAccessSizes); Value *OnAccessFunc = IsWrite ? TsanWrite[Idx] : TsanRead[Idx]; IRB.CreateCall(OnAccessFunc, IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy())); + if (IsWrite) stats.NumInstrumentedWrites++; + else stats.NumInstrumentedReads++; return true; } diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp index 020ec57..9a5423f 100644 --- a/lib/Transforms/Scalar/CodeGenPrepare.cpp +++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp @@ -567,8 +567,8 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { // happens. WeakVH IterHandle(CurInstIterator); - ReplaceAndSimplifyAllUses(CI, RetVal, TLI ? TLI->getTargetData() : 0, - TLInfo, ModifiedDT ? 0 : DT); + replaceAndRecursivelySimplify(CI, RetVal, TLI ? TLI->getTargetData() : 0, + TLInfo, ModifiedDT ? 0 : DT); // If the iterator instruction was recursively deleted, start over at the // start of the block. diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index ac80c48..fb733ad 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -1974,109 +1974,119 @@ unsigned GVN::replaceAllDominatedUsesWith(Value *From, Value *To, /// dominated by 'Root'. 
Exploit this, for example by replacing 'LHS' with /// 'RHS' everywhere in the scope. Returns whether a change was made. bool GVN::propagateEquality(Value *LHS, Value *RHS, BasicBlock *Root) { - if (LHS == RHS) return false; - assert(LHS->getType() == RHS->getType() && "Equal but types differ!"); + SmallVector<std::pair<Value*, Value*>, 4> Worklist; + Worklist.push_back(std::make_pair(LHS, RHS)); + bool Changed = false; - // Don't try to propagate equalities between constants. - if (isa<Constant>(LHS) && isa<Constant>(RHS)) - return false; + while (!Worklist.empty()) { + std::pair<Value*, Value*> Item = Worklist.pop_back_val(); + LHS = Item.first; RHS = Item.second; + + if (LHS == RHS) continue; + assert(LHS->getType() == RHS->getType() && "Equality but unequal types!"); + + // Don't try to propagate equalities between constants. + if (isa<Constant>(LHS) && isa<Constant>(RHS)) continue; - // Prefer a constant on the right-hand side, or an Argument if no constants. - if (isa<Constant>(LHS) || (isa<Argument>(LHS) && !isa<Constant>(RHS))) - std::swap(LHS, RHS); - assert((isa<Argument>(LHS) || isa<Instruction>(LHS)) && "Unexpected value!"); - - // If there is no obvious reason to prefer the left-hand side over the right- - // hand side, ensure the longest lived term is on the right-hand side, so the - // shortest lived term will be replaced by the longest lived. This tends to - // expose more simplifications. - uint32_t LVN = VN.lookup_or_add(LHS); - if ((isa<Argument>(LHS) && isa<Argument>(RHS)) || - (isa<Instruction>(LHS) && isa<Instruction>(RHS))) { - // Move the 'oldest' value to the right-hand side, using the value number as - // a proxy for age. - uint32_t RVN = VN.lookup_or_add(RHS); - if (LVN < RVN) { + // Prefer a constant on the right-hand side, or an Argument if no constants. + if (isa<Constant>(LHS) || (isa<Argument>(LHS) && !isa<Constant>(RHS))) std::swap(LHS, RHS); - LVN = RVN; + assert((isa<Argument>(LHS) || isa<Instruction>(LHS)) && "Unexpected value!"); + + // If there is no obvious reason to prefer the left-hand side over the right- + // hand side, ensure the longest lived term is on the right-hand side, so the + // shortest lived term will be replaced by the longest lived. This tends to + // expose more simplifications. + uint32_t LVN = VN.lookup_or_add(LHS); + if ((isa<Argument>(LHS) && isa<Argument>(RHS)) || + (isa<Instruction>(LHS) && isa<Instruction>(RHS))) { + // Move the 'oldest' value to the right-hand side, using the value number as + // a proxy for age. + uint32_t RVN = VN.lookup_or_add(RHS); + if (LVN < RVN) { + std::swap(LHS, RHS); + LVN = RVN; + } + } + assert((!isa<Instruction>(RHS) || + DT->properlyDominates(cast<Instruction>(RHS)->getParent(), Root)) && + "Instruction doesn't dominate scope!"); + + // If value numbering later deduces that an instruction in the scope is equal + // to 'LHS' then ensure it will be turned into 'RHS'. + addToLeaderTable(LVN, RHS, Root); + + // Replace all occurrences of 'LHS' with 'RHS' everywhere in the scope. As + // LHS always has at least one use that is not dominated by Root, this will + // never do anything if LHS has only one use. + if (!LHS->hasOneUse()) { + unsigned NumReplacements = replaceAllDominatedUsesWith(LHS, RHS, Root); + Changed |= NumReplacements > 0; + NumGVNEqProp += NumReplacements; } - } - - // If value numbering later deduces that an instruction in the scope is equal - // to 'LHS' then ensure it will be turned into 'RHS'. 
- addToLeaderTable(LVN, RHS, Root); - // Replace all occurrences of 'LHS' with 'RHS' everywhere in the scope. As - // LHS always has at least one use that is not dominated by Root, this will - // never do anything if LHS has only one use. - bool Changed = false; - if (!LHS->hasOneUse()) { - unsigned NumReplacements = replaceAllDominatedUsesWith(LHS, RHS, Root); - Changed |= NumReplacements > 0; - NumGVNEqProp += NumReplacements; - } - - // Now try to deduce additional equalities from this one. For example, if the - // known equality was "(A != B)" == "false" then it follows that A and B are - // equal in the scope. Only boolean equalities with an explicit true or false - // RHS are currently supported. - if (!RHS->getType()->isIntegerTy(1)) - // Not a boolean equality - bail out. - return Changed; - ConstantInt *CI = dyn_cast<ConstantInt>(RHS); - if (!CI) - // RHS neither 'true' nor 'false' - bail out. - return Changed; - // Whether RHS equals 'true'. Otherwise it equals 'false'. - bool isKnownTrue = CI->isAllOnesValue(); - bool isKnownFalse = !isKnownTrue; - - // If "A && B" is known true then both A and B are known true. If "A || B" - // is known false then both A and B are known false. - Value *A, *B; - if ((isKnownTrue && match(LHS, m_And(m_Value(A), m_Value(B)))) || - (isKnownFalse && match(LHS, m_Or(m_Value(A), m_Value(B))))) { - Changed |= propagateEquality(A, RHS, Root); - Changed |= propagateEquality(B, RHS, Root); - return Changed; - } + // Now try to deduce additional equalities from this one. For example, if the + // known equality was "(A != B)" == "false" then it follows that A and B are + // equal in the scope. Only boolean equalities with an explicit true or false + // RHS are currently supported. + if (!RHS->getType()->isIntegerTy(1)) + // Not a boolean equality - bail out. + continue; + ConstantInt *CI = dyn_cast<ConstantInt>(RHS); + if (!CI) + // RHS neither 'true' nor 'false' - bail out. + continue; + // Whether RHS equals 'true'. Otherwise it equals 'false'. + bool isKnownTrue = CI->isAllOnesValue(); + bool isKnownFalse = !isKnownTrue; + + // If "A && B" is known true then both A and B are known true. If "A || B" + // is known false then both A and B are known false. + Value *A, *B; + if ((isKnownTrue && match(LHS, m_And(m_Value(A), m_Value(B)))) || + (isKnownFalse && match(LHS, m_Or(m_Value(A), m_Value(B))))) { + Worklist.push_back(std::make_pair(A, RHS)); + Worklist.push_back(std::make_pair(B, RHS)); + continue; + } - // If we are propagating an equality like "(A == B)" == "true" then also - // propagate the equality A == B. When propagating a comparison such as - // "(A >= B)" == "true", replace all instances of "A < B" with "false". - if (ICmpInst *Cmp = dyn_cast<ICmpInst>(LHS)) { - Value *Op0 = Cmp->getOperand(0), *Op1 = Cmp->getOperand(1); - - // If "A == B" is known true, or "A != B" is known false, then replace - // A with B everywhere in the scope. - if ((isKnownTrue && Cmp->getPredicate() == CmpInst::ICMP_EQ) || - (isKnownFalse && Cmp->getPredicate() == CmpInst::ICMP_NE)) - Changed |= propagateEquality(Op0, Op1, Root); - - // If "A >= B" is known true, replace "A < B" with false everywhere. - CmpInst::Predicate NotPred = Cmp->getInversePredicate(); - Constant *NotVal = ConstantInt::get(Cmp->getType(), isKnownFalse); - // Since we don't have the instruction "A < B" immediately to hand, work out - // the value number that it would have and use that to find an appropriate - // instruction (if any). 
- uint32_t NextNum = VN.getNextUnusedValueNumber(); - uint32_t Num = VN.lookup_or_add_cmp(Cmp->getOpcode(), NotPred, Op0, Op1); - // If the number we were assigned was brand new then there is no point in - // looking for an instruction realizing it: there cannot be one! - if (Num < NextNum) { - Value *NotCmp = findLeader(Root, Num); - if (NotCmp && isa<Instruction>(NotCmp)) { - unsigned NumReplacements = - replaceAllDominatedUsesWith(NotCmp, NotVal, Root); - Changed |= NumReplacements > 0; - NumGVNEqProp += NumReplacements; + // If we are propagating an equality like "(A == B)" == "true" then also + // propagate the equality A == B. When propagating a comparison such as + // "(A >= B)" == "true", replace all instances of "A < B" with "false". + if (ICmpInst *Cmp = dyn_cast<ICmpInst>(LHS)) { + Value *Op0 = Cmp->getOperand(0), *Op1 = Cmp->getOperand(1); + + // If "A == B" is known true, or "A != B" is known false, then replace + // A with B everywhere in the scope. + if ((isKnownTrue && Cmp->getPredicate() == CmpInst::ICMP_EQ) || + (isKnownFalse && Cmp->getPredicate() == CmpInst::ICMP_NE)) + Worklist.push_back(std::make_pair(Op0, Op1)); + + // If "A >= B" is known true, replace "A < B" with false everywhere. + CmpInst::Predicate NotPred = Cmp->getInversePredicate(); + Constant *NotVal = ConstantInt::get(Cmp->getType(), isKnownFalse); + // Since we don't have the instruction "A < B" immediately to hand, work out + // the value number that it would have and use that to find an appropriate + // instruction (if any). + uint32_t NextNum = VN.getNextUnusedValueNumber(); + uint32_t Num = VN.lookup_or_add_cmp(Cmp->getOpcode(), NotPred, Op0, Op1); + // If the number we were assigned was brand new then there is no point in + // looking for an instruction realizing it: there cannot be one! + if (Num < NextNum) { + Value *NotCmp = findLeader(Root, Num); + if (NotCmp && isa<Instruction>(NotCmp)) { + unsigned NumReplacements = + replaceAllDominatedUsesWith(NotCmp, NotVal, Root); + Changed |= NumReplacements > 0; + NumGVNEqProp += NumReplacements; + } } - } - // Ensure that any instruction in scope that gets the "A < B" value number - // is replaced with false. - addToLeaderTable(Num, NotVal, Root); + // Ensure that any instruction in scope that gets the "A < B" value number + // is replaced with false. + addToLeaderTable(Num, NotVal, Root); - return Changed; + continue; + } } return Changed; @@ -2325,7 +2335,14 @@ bool GVN::performPRE(Function &F) { CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() || isa<DbgInfoIntrinsic>(CurInst)) continue; - + + // Don't do PRE on compares. The PHI would prevent CodeGenPrepare from + // sinking the compare again, and it would force the code generator to + // move the i1 from processor flags or predicate registers into a general + // purpose register. + if (isa<CmpInst>(CurInst)) + continue; + // We don't currently value number ANY inline asm calls. 
if (CallInst *CallI = dyn_cast<CallInst>(CurInst)) if (CallI->isInlineAsm()) diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index 490617a..a9ba657 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -33,7 +33,6 @@ #include "llvm/LLVMContext.h" #include "llvm/Type.h" #include "llvm/Analysis/Dominators.h" -#include "llvm/Analysis/IVUsers.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" @@ -50,18 +49,12 @@ #include "llvm/ADT/Statistic.h" using namespace llvm; -STATISTIC(NumRemoved , "Number of aux indvars removed"); STATISTIC(NumWidened , "Number of indvars widened"); -STATISTIC(NumInserted , "Number of canonical indvars added"); STATISTIC(NumReplaced , "Number of exit values replaced"); STATISTIC(NumLFTR , "Number of loop exit tests replaced"); STATISTIC(NumElimExt , "Number of IV sign/zero extends eliminated"); STATISTIC(NumElimIV , "Number of congruent IVs eliminated"); -static cl::opt<bool> EnableIVRewrite( - "enable-iv-rewrite", cl::Hidden, - cl::desc("Enable canonical induction variable rewriting")); - // Trip count verification can be enabled by default under NDEBUG if we // implement a strong expression equivalence checker in SCEV. Until then, we // use the verify-indvars flag, which may assert in some cases. @@ -71,7 +64,6 @@ static cl::opt<bool> VerifyIndvars( namespace { class IndVarSimplify : public LoopPass { - IVUsers *IU; LoopInfo *LI; ScalarEvolution *SE; DominatorTree *DT; @@ -82,7 +74,7 @@ namespace { public: static char ID; // Pass identification, replacement for typeid - IndVarSimplify() : LoopPass(ID), IU(0), LI(0), SE(0), DT(0), TD(0), + IndVarSimplify() : LoopPass(ID), LI(0), SE(0), DT(0), TD(0), Changed(false) { initializeIndVarSimplifyPass(*PassRegistry::getPassRegistry()); } @@ -95,13 +87,9 @@ namespace { AU.addRequired<ScalarEvolution>(); AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); - if (EnableIVRewrite) - AU.addRequired<IVUsers>(); AU.addPreserved<ScalarEvolution>(); AU.addPreservedID(LoopSimplifyID); AU.addPreservedID(LCSSAID); - if (EnableIVRewrite) - AU.addPreserved<IVUsers>(); AU.setPreservesCFG(); } @@ -119,8 +107,6 @@ namespace { void RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter); - void RewriteIVExpressions(Loop *L, SCEVExpander &Rewriter); - Value *LinearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount, PHINode *IndVar, SCEVExpander &Rewriter); @@ -136,7 +122,6 @@ INITIALIZE_PASS_DEPENDENCY(LoopInfo) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_DEPENDENCY(IVUsers) INITIALIZE_PASS_END(IndVarSimplify, "indvars", "Induction Variable Simplification", false, false) @@ -448,13 +433,6 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { PN->replaceAllUsesWith(Conv); RecursivelyDeleteTriviallyDeadInstructions(PN); } - - // Add a new IVUsers entry for the newly-created integer PHI. - if (IU) { - SmallPtrSet<Loop*, 16> SimplifiedLoopNests; - IU->AddUsersIfInteresting(NewPHI, SimplifiedLoopNests); - } - Changed = true; } @@ -600,124 +578,6 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { } //===----------------------------------------------------------------------===// -// Rewrite IV users based on a canonical IV. -// Only for use with -enable-iv-rewrite. 
-//===----------------------------------------------------------------------===// - -/// FIXME: It is an extremely bad idea to indvar substitute anything more -/// complex than affine induction variables. Doing so will put expensive -/// polynomial evaluations inside of the loop, and the str reduction pass -/// currently can only reduce affine polynomials. For now just disable -/// indvar subst on anything more complex than an affine addrec, unless -/// it can be expanded to a trivial value. -static bool isSafe(const SCEV *S, const Loop *L, ScalarEvolution *SE) { - // Loop-invariant values are safe. - if (SE->isLoopInvariant(S, L)) return true; - - // Affine addrecs are safe. Non-affine are not, because LSR doesn't know how - // to transform them into efficient code. - if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) - return AR->isAffine(); - - // An add is safe it all its operands are safe. - if (const SCEVCommutativeExpr *Commutative - = dyn_cast<SCEVCommutativeExpr>(S)) { - for (SCEVCommutativeExpr::op_iterator I = Commutative->op_begin(), - E = Commutative->op_end(); I != E; ++I) - if (!isSafe(*I, L, SE)) return false; - return true; - } - - // A cast is safe if its operand is. - if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S)) - return isSafe(C->getOperand(), L, SE); - - // A udiv is safe if its operands are. - if (const SCEVUDivExpr *UD = dyn_cast<SCEVUDivExpr>(S)) - return isSafe(UD->getLHS(), L, SE) && - isSafe(UD->getRHS(), L, SE); - - // SCEVUnknown is always safe. - if (isa<SCEVUnknown>(S)) - return true; - - // Nothing else is safe. - return false; -} - -void IndVarSimplify::RewriteIVExpressions(Loop *L, SCEVExpander &Rewriter) { - // Rewrite all induction variable expressions in terms of the canonical - // induction variable. - // - // If there were induction variables of other sizes or offsets, manually - // add the offsets to the primary induction variable and cast, avoiding - // the need for the code evaluation methods to insert induction variables - // of different sizes. - for (IVUsers::iterator UI = IU->begin(), E = IU->end(); UI != E; ++UI) { - Value *Op = UI->getOperandValToReplace(); - Type *UseTy = Op->getType(); - Instruction *User = UI->getUser(); - - // Compute the final addrec to expand into code. - const SCEV *AR = IU->getReplacementExpr(*UI); - - // Evaluate the expression out of the loop, if possible. - if (!L->contains(UI->getUser())) { - const SCEV *ExitVal = SE->getSCEVAtScope(AR, L->getParentLoop()); - if (SE->isLoopInvariant(ExitVal, L)) - AR = ExitVal; - } - - // FIXME: It is an extremely bad idea to indvar substitute anything more - // complex than affine induction variables. Doing so will put expensive - // polynomial evaluations inside of the loop, and the str reduction pass - // currently can only reduce affine polynomials. For now just disable - // indvar subst on anything more complex than an affine addrec, unless - // it can be expanded to a trivial value. - if (!isSafe(AR, L, SE)) - continue; - - // Determine the insertion point for this user. By default, insert - // immediately before the user. The SCEVExpander class will automatically - // hoist loop invariants out of the loop. For PHI nodes, there may be - // multiple uses, so compute the nearest common dominator for the - // incoming blocks. - Instruction *InsertPt = getInsertPointForUses(User, Op, DT); - - // Now expand it into actual Instructions and patch it into place. 
- Value *NewVal = Rewriter.expandCodeFor(AR, UseTy, InsertPt); - - DEBUG(dbgs() << "INDVARS: Rewrote IV '" << *AR << "' " << *Op << '\n' - << " into = " << *NewVal << "\n"); - - if (!isValidRewrite(Op, NewVal)) { - DeadInsts.push_back(NewVal); - continue; - } - // Inform ScalarEvolution that this value is changing. The change doesn't - // affect its value, but it does potentially affect which use lists the - // value will be on after the replacement, which affects ScalarEvolution's - // ability to walk use lists and drop dangling pointers when a value is - // deleted. - SE->forgetValue(User); - - // Patch the new value into place. - if (Op->hasName()) - NewVal->takeName(Op); - if (Instruction *NewValI = dyn_cast<Instruction>(NewVal)) - NewValI->setDebugLoc(User->getDebugLoc()); - User->replaceUsesOfWith(Op, NewVal); - UI->setOperandValToReplace(NewVal); - - ++NumRemoved; - Changed = true; - - // The old value may be dead now. - DeadInsts.push_back(Op); - } -} - -//===----------------------------------------------------------------------===// // IV Widening - Extend the width of an IV to cover its widest uses. //===----------------------------------------------------------------------===// @@ -1262,9 +1122,6 @@ static bool isHighCostExpansion(const SCEV *S, BranchInst *BI, } } - if (EnableIVRewrite) - return false; - // Recurse past add expressions, which commonly occur in the // BackedgeTakenCount. They may already exist in program code, and if not, // they are not too expensive rematerialize. @@ -1321,36 +1178,6 @@ static bool canExpandBackedgeTakenCount(Loop *L, ScalarEvolution *SE) { return true; } -/// getBackedgeIVType - Get the widest type used by the loop test after peeking -/// through Truncs. -/// -/// TODO: Unnecessary when ForceLFTR is removed. -static Type *getBackedgeIVType(Loop *L) { - if (!L->getExitingBlock()) - return 0; - - // Can't rewrite non-branch yet. - BranchInst *BI = dyn_cast<BranchInst>(L->getExitingBlock()->getTerminator()); - if (!BI) - return 0; - - ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition()); - if (!Cond) - return 0; - - Type *Ty = 0; - for(User::op_iterator OI = Cond->op_begin(), OE = Cond->op_end(); - OI != OE; ++OI) { - assert((!Ty || Ty == (*OI)->getType()) && "bad icmp operand types"); - TruncInst *Trunc = dyn_cast<TruncInst>(*OI); - if (!Trunc) - continue; - - return Trunc->getSrcTy(); - } - return Ty; -} - /// getLoopPhiForCounter - Return the loop header phi IFF IncV adds a loop /// invariant value to the phi. static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L, DominatorTree *DT) { @@ -1619,8 +1446,7 @@ LinearFunctionTestReplace(Loop *L, // LFTR can ignore IV overflow and truncate to the width of // BECount. This avoids materializing the add(zext(add)) expression. - Type *CntTy = !EnableIVRewrite ? - BackedgeTakenCount->getType() : IndVar->getType(); + Type *CntTy = BackedgeTakenCount->getType(); const SCEV *IVCount = BackedgeTakenCount; @@ -1805,8 +1631,6 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { if (!L->isLoopSimplifyForm()) return false; - if (EnableIVRewrite) - IU = &getAnalysis<IVUsers>(); LI = &getAnalysis<LoopInfo>(); SE = &getAnalysis<ScalarEvolution>(); DT = &getAnalysis<DominatorTree>(); @@ -1833,10 +1657,8 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // attempt to avoid evaluating SCEVs for sign/zero extend operations until // other expressions involving loop IVs have been evaluated. This helps SCEV // set no-wrap flags before normalizing sign/zero extension. 
- if (!EnableIVRewrite) { - Rewriter.disableCanonicalMode(); - SimplifyAndExtend(L, Rewriter, LPM); - } + Rewriter.disableCanonicalMode(); + SimplifyAndExtend(L, Rewriter, LPM); // Check to see if this loop has a computable loop-invariant execution count. // If so, this means that we can compute the final value of any expressions @@ -1847,106 +1669,28 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { if (!isa<SCEVCouldNotCompute>(BackedgeTakenCount)) RewriteLoopExitValues(L, Rewriter); - // Eliminate redundant IV users. - if (EnableIVRewrite) - Changed |= simplifyIVUsers(IU, SE, &LPM, DeadInsts); - // Eliminate redundant IV cycles. - if (!EnableIVRewrite) - NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts); - - // Compute the type of the largest recurrence expression, and decide whether - // a canonical induction variable should be inserted. - Type *LargestType = 0; - bool NeedCannIV = false; - bool ExpandBECount = canExpandBackedgeTakenCount(L, SE); - if (EnableIVRewrite && ExpandBECount) { - // If we have a known trip count and a single exit block, we'll be - // rewriting the loop exit test condition below, which requires a - // canonical induction variable. - NeedCannIV = true; - Type *Ty = BackedgeTakenCount->getType(); - if (!EnableIVRewrite) { - // In this mode, SimplifyIVUsers may have already widened the IV used by - // the backedge test and inserted a Trunc on the compare's operand. Get - // the wider type to avoid creating a redundant narrow IV only used by the - // loop test. - LargestType = getBackedgeIVType(L); - } - if (!LargestType || - SE->getTypeSizeInBits(Ty) > - SE->getTypeSizeInBits(LargestType)) - LargestType = SE->getEffectiveSCEVType(Ty); - } - if (EnableIVRewrite) { - for (IVUsers::const_iterator I = IU->begin(), E = IU->end(); I != E; ++I) { - NeedCannIV = true; - Type *Ty = - SE->getEffectiveSCEVType(I->getOperandValToReplace()->getType()); - if (!LargestType || - SE->getTypeSizeInBits(Ty) > - SE->getTypeSizeInBits(LargestType)) - LargestType = Ty; - } - } - - // Now that we know the largest of the induction variable expressions - // in this loop, insert a canonical induction variable of the largest size. - PHINode *IndVar = 0; - if (NeedCannIV) { - // Check to see if the loop already has any canonical-looking induction - // variables. If any are present and wider than the planned canonical - // induction variable, temporarily remove them, so that the Rewriter - // doesn't attempt to reuse them. - SmallVector<PHINode *, 2> OldCannIVs; - while (PHINode *OldCannIV = L->getCanonicalInductionVariable()) { - if (SE->getTypeSizeInBits(OldCannIV->getType()) > - SE->getTypeSizeInBits(LargestType)) - OldCannIV->removeFromParent(); - else - break; - OldCannIVs.push_back(OldCannIV); - } - - IndVar = Rewriter.getOrInsertCanonicalInductionVariable(L, LargestType); - - ++NumInserted; - Changed = true; - DEBUG(dbgs() << "INDVARS: New CanIV: " << *IndVar << '\n'); + NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts); - // Now that the official induction variable is established, reinsert - // any old canonical-looking variables after it so that the IR remains - // consistent. They will be deleted as part of the dead-PHI deletion at - // the end of the pass. 
- while (!OldCannIVs.empty()) { - PHINode *OldCannIV = OldCannIVs.pop_back_val(); - OldCannIV->insertBefore(L->getHeader()->getFirstInsertionPt()); - } - } - else if (!EnableIVRewrite && ExpandBECount && needsLFTR(L, DT)) { - IndVar = FindLoopCounter(L, BackedgeTakenCount, SE, DT, TD); - } // If we have a trip count expression, rewrite the loop's exit condition // using it. We can currently only handle loops with a single exit. - Value *NewICmp = 0; - if (ExpandBECount && IndVar) { - // Check preconditions for proper SCEVExpander operation. SCEV does not - // express SCEVExpander's dependencies, such as LoopSimplify. Instead any - // pass that uses the SCEVExpander must do it. This does not work well for - // loop passes because SCEVExpander makes assumptions about all loops, while - // LoopPassManager only forces the current loop to be simplified. - // - // FIXME: SCEV expansion has no way to bail out, so the caller must - // explicitly check any assumptions made by SCEV. Brittle. - const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(BackedgeTakenCount); - if (!AR || AR->getLoop()->getLoopPreheader()) - NewICmp = - LinearFunctionTestReplace(L, BackedgeTakenCount, IndVar, Rewriter); + if (canExpandBackedgeTakenCount(L, SE) && needsLFTR(L, DT)) { + PHINode *IndVar = FindLoopCounter(L, BackedgeTakenCount, SE, DT, TD); + if (IndVar) { + // Check preconditions for proper SCEVExpander operation. SCEV does not + // express SCEVExpander's dependencies, such as LoopSimplify. Instead any + // pass that uses the SCEVExpander must do it. This does not work well for + // loop passes because SCEVExpander makes assumptions about all loops, while + // LoopPassManager only forces the current loop to be simplified. + // + // FIXME: SCEV expansion has no way to bail out, so the caller must + // explicitly check any assumptions made by SCEV. Brittle. + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(BackedgeTakenCount); + if (!AR || AR->getLoop()->getLoopPreheader()) + (void)LinearFunctionTestReplace(L, BackedgeTakenCount, IndVar, + Rewriter); + } } - // Rewrite IV-derived expressions. - if (EnableIVRewrite) - RewriteIVExpressions(L, Rewriter); - // Clear the rewriter cache, because values that are in the rewriter's cache // can be deleted in the loop below, causing the AssertingVH in the cache to // trigger. @@ -1965,16 +1709,6 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // loop may be sunk below the loop to reduce register pressure. SinkUnusedInvariants(L); - // For completeness, inform IVUsers of the IV use in the newly-created - // loop exit test instruction. - if (IU && NewICmp) { - ICmpInst *NewICmpInst = dyn_cast<ICmpInst>(NewICmp); - if (NewICmpInst) { - SmallPtrSet<Loop*, 16> SimplifiedLoopNests; - IU->AddUsersIfInteresting(cast<Instruction>(NewICmpInst->getOperand(0)), - SimplifiedLoopNests); - } - } // Clean up dead instructions. Changed |= DeleteDeadPHIs(L->getHeader()); // Check a post-condition. @@ -1984,8 +1718,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // Verify that LFTR, and any other change have not interfered with SCEV's // ability to compute trip count. 
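// Illustrative sketch, not part of this patch: LinearFunctionTestReplace
// rewrites the loop's exit test as a direct comparison against the trip
// count computed by ScalarEvolution, instead of whatever derived value the
// source happened to test. A rough source-level picture (function names are
// made up; the real transform rewrites the icmp feeding the backedge branch):
void lftr_example_before(char *dst, const char *src, int n) {
  const char *end = src + n;
  for (const char *q = src; q != end; ++q)   // exit test on a pointer IV
    *dst++ = *q;
}
void lftr_example_after(char *dst, const char *src, int n) {
  for (int i = 0; i != n; ++i)               // exit test on the counter IV,
    dst[i] = src[i];                         // compared against the trip count
}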
#ifndef NDEBUG - if (!EnableIVRewrite && VerifyIndvars && - !isa<SCEVCouldNotCompute>(BackedgeTakenCount)) { + if (VerifyIndvars && !isa<SCEVCouldNotCompute>(BackedgeTakenCount)) { SE->forgetLoop(L); const SCEV *NewBECount = SE->getBackedgeTakenCount(L); if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) < diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 82d918e..fe4700b 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -77,11 +77,11 @@ #include <algorithm> using namespace llvm; -static cl::opt<bool> EnableNested( - "enable-lsr-nested", cl::Hidden, cl::desc("Enable LSR on nested loops")); - -static cl::opt<bool> EnableRetry( - "enable-lsr-retry", cl::Hidden, cl::desc("Enable LSR retry")); +/// MaxIVUsers is an arbitrary threshold that provides an early opportunitiy for +/// bail out. This threshold is far beyond the number of users that LSR can +/// conceivably solve, so it should not affect generated code, but catches the +/// worst cases before LSR burns too much compile time and stack space. +static const unsigned MaxIVUsers = 200; // Temporary flag to cleanup congruent phis after LSR phi expansion. // It's currently disabled until we can determine whether it's truly useful or @@ -710,8 +710,9 @@ static bool isHighCostExpansion(const SCEV *S, Value *UVal = U->getValue(); for (Value::use_iterator UI = UVal->use_begin(), UE = UVal->use_end(); UI != UE; ++UI) { - Instruction *User = cast<Instruction>(*UI); - if (User->getOpcode() == Instruction::Mul + // If U is a constant, it may be used by a ConstantExpr. + Instruction *User = dyn_cast<Instruction>(*UI); + if (User && User->getOpcode() == Instruction::Mul && SE.isSCEVable(User->getType())) { return SE.getSCEV(User) == Mul; } @@ -824,36 +825,20 @@ void Cost::RateRegister(const SCEV *Reg, const Loop *L, ScalarEvolution &SE, DominatorTree &DT) { if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) { - if (AR->getLoop() == L) - AddRecCost += 1; /// TODO: This should be a function of the stride. - // If this is an addrec for another loop, don't second-guess its addrec phi // nodes. LSR isn't currently smart enough to reason about more than one - // loop at a time. LSR has either already run on inner loops, will not run - // on other loops, and cannot be expected to change sibling loops. If the - // AddRec exists, consider it's register free and leave it alone. Otherwise, - // do not consider this formula at all. - else if (!EnableNested || L->contains(AR->getLoop()) || - (!AR->getLoop()->contains(L) && - DT.dominates(L->getHeader(), AR->getLoop()->getHeader()))) { + // loop at a time. LSR has already run on inner loops, will not run on outer + // loops, and cannot be expected to change sibling loops. + if (AR->getLoop() != L) { + // If the AddRec exists, consider it's register free and leave it alone. if (isExistingPhi(AR, SE)) return; - // For !EnableNested, never rewrite IVs in other loops. - if (!EnableNested) { - Loose(); - return; - } - // If this isn't one of the addrecs that the loop already has, it - // would require a costly new phi and add. TODO: This isn't - // precisely modeled right now. - ++NumBaseAdds; - if (!Regs.count(AR->getStart())) { - RateRegister(AR->getStart(), Regs, L, SE, DT); - if (isLoser()) - return; - } + // Otherwise, do not consider this formula at all. + Loose(); + return; } + AddRecCost += 1; /// TODO: This should be a function of the stride. 
// Add the step value register, if it needs one. // TODO: The non-affine case isn't precisely modeled here. @@ -1303,10 +1288,19 @@ static bool isLegalUse(const TargetLowering::AddrMode &AM, // If we have low-level target information, ask the target if it can fold an // integer immediate on an icmp. if (AM.BaseOffs != 0) { - if (TLI) return TLI->isLegalICmpImmediate(-(uint64_t)AM.BaseOffs); - return false; + if (!TLI) + return false; + // We have one of: + // ICmpZero BaseReg + Offset => ICmp BaseReg, -Offset + // ICmpZero -1*ScaleReg + Offset => ICmp ScaleReg, Offset + // Offs is the ICmp immediate. + int64_t Offs = AM.BaseOffs; + if (AM.Scale == 0) + Offs = -(uint64_t)Offs; // The cast does the right thing with INT64_MIN. + return TLI->isLegalICmpImmediate(Offs); } + // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg return true; case LSRUse::Basic: @@ -2193,7 +2187,7 @@ void LSRInstance::CollectInterestingTypesAndFactors() { do { const SCEV *S = Worklist.pop_back_val(); if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { - if (EnableNested || AR->getLoop() == L) + if (AR->getLoop() == L) Strides.insert(AR->getStepRecurrence(SE)); Worklist.push_back(AR->getStart()); } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { @@ -2463,7 +2457,7 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper, if (!isCompatibleIVType(PrevIV, NextIV)) continue; - // A phi nodes terminates a chain. + // A phi node terminates a chain. if (isa<PHINode>(UserInst) && isa<PHINode>(IVChainVec[ChainIdx].back().UserInst)) continue; @@ -2519,13 +2513,14 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper, for (Value::use_iterator UseIter = IVOper->use_begin(), UseEnd = IVOper->use_end(); UseIter != UseEnd; ++UseIter) { Instruction *OtherUse = dyn_cast<Instruction>(*UseIter); + if (!OtherUse || OtherUse == UserInst) + continue; if (SE.isSCEVable(OtherUse->getType()) && !isa<SCEVUnknown>(SE.getSCEV(OtherUse)) && IU.isIVUserOrOperand(OtherUse)) { continue; } - if (OtherUse && OtherUse != UserInst) - NearUsers.insert(OtherUse); + NearUsers.insert(OtherUse); } // Since this user is part of the chain, it's no longer considered a use @@ -3986,24 +3981,29 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution, if (LU.Regs.count(*I)) ReqRegs.insert(*I); - bool AnySatisfiedReqRegs = false; SmallPtrSet<const SCEV *, 16> NewRegs; Cost NewCost; -retry: for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(), E = LU.Formulae.end(); I != E; ++I) { const Formula &F = *I; // Ignore formulae which do not use any of the required registers. + bool SatisfiedReqReg = true; for (SmallSetVector<const SCEV *, 4>::const_iterator J = ReqRegs.begin(), JE = ReqRegs.end(); J != JE; ++J) { const SCEV *Reg = *J; if ((!F.ScaledReg || F.ScaledReg != Reg) && std::find(F.BaseRegs.begin(), F.BaseRegs.end(), Reg) == - F.BaseRegs.end()) - goto skip; + F.BaseRegs.end()) { + SatisfiedReqReg = false; + break; + } + } + if (!SatisfiedReqReg) { + // If none of the formulae satisfied the required registers, then we could + // clear ReqRegs and try again. Currently, we simply give up in this case. + continue; } - AnySatisfiedReqRegs = true; // Evaluate the cost of the current formula. If it's already worse than // the current best, prune the search at that point. 
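// Minimal sketch of the ICmpZero immediate computation above, not part of
// this patch. legalICmpImm() is a made-up stand-in for the target hook
// TLI->isLegalICmpImmediate(); the 12-bit range is just an example value.
#include <cstdint>

static bool legalICmpImm(int64_t Imm) {        // hypothetical target rule
  return Imm >= -2048 && Imm <= 2047;
}

// "icmp (BaseReg + Offs), 0" is equivalent to "icmp BaseReg, -Offs", while
// "icmp (Offs + -1*ScaleReg), 0" is equivalent to "icmp ScaleReg, Offs", so
// the immediate the target must accept depends on the scale. Negating
// through uint64_t avoids signed-overflow UB when Offs == INT64_MIN, which
// is what the patch's comment means by "does the right thing".
static bool isLegalICmpZeroOffset(int64_t Offs, int64_t Scale) {
  if (Scale == 0)
    Offs = (int64_t)(-(uint64_t)Offs);
  return legalICmpImm(Offs);
}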
@@ -4030,18 +4030,6 @@ retry: } Workspace.pop_back(); } - skip:; - } - - if (!EnableRetry && !AnySatisfiedReqRegs) - return; - - // If none of the formulae had all of the required registers, relax the - // constraint so that we don't exclude all formulae. - if (!AnySatisfiedReqRegs) { - assert(!ReqRegs.empty() && "Solver failed even without required registers"); - ReqRegs.clear(); - goto retry; } } @@ -4537,6 +4525,17 @@ LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P) // If there's no interesting work to be done, bail early. if (IU.empty()) return; + // If there's too much analysis to be done, bail early. We won't be able to + // model the problem anyway. + unsigned NumUsers = 0; + for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) { + if (++NumUsers > MaxIVUsers) { + DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << *L + << "\n"); + return; + } + } + #ifndef NDEBUG // All dominating loops must have preheaders, or SCEVExpander may not be able // to materialize an AddRecExpr whose Start is an outer AddRecExpr. @@ -4566,7 +4565,7 @@ LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P) if (IU.empty()) return; // Skip nested loops until we can model them better with formulae. - if (!EnableNested && !L->empty()) { + if (!L->empty()) { DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n"); return; } diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index 22dbfe3..09a186f 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -197,13 +197,13 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { } if (TripCount) { // Reduce unroll count to be modulo of TripCount for partial unrolling - Count = CurrentThreshold / LoopSize; + Count = Threshold / LoopSize; while (Count != 0 && TripCount%Count != 0) Count--; } else if (UnrollRuntime) { // Reduce unroll count to be a lower power-of-two value - while (Count != 0 && Size > CurrentThreshold) { + while (Count != 0 && Size > Threshold) { Count >>= 1; Size = LoopSize*Count; } diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index 053eb0c..00ecc74 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -64,63 +64,63 @@ STATISTIC(TotalInsts, "Total number of instructions analyzed"); static cl::opt<unsigned> Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"), cl::init(100), cl::Hidden); - + namespace { - + class LUAnalysisCache { typedef DenseMap<const SwitchInst*, SmallPtrSet<const Value *, 8> > UnswitchedValsMap; - + typedef UnswitchedValsMap::iterator UnswitchedValsIt; - + struct LoopProperties { unsigned CanBeUnswitchedCount; unsigned SizeEstimation; UnswitchedValsMap UnswitchedVals; }; - - // Here we use std::map instead of DenseMap, since we need to keep valid + + // Here we use std::map instead of DenseMap, since we need to keep valid // LoopProperties pointer for current loop for better performance. typedef std::map<const Loop*, LoopProperties> LoopPropsMap; typedef LoopPropsMap::iterator LoopPropsMapIt; - + LoopPropsMap LoopsProperties; UnswitchedValsMap* CurLoopInstructions; LoopProperties* CurrentLoopProperties; - + // Max size of code we can produce on remained iterations. unsigned MaxSize; - + public: - + LUAnalysisCache() : CurLoopInstructions(NULL), CurrentLoopProperties(NULL), MaxSize(Threshold) {} - + // Analyze loop. 
Check its size, calculate is it possible to unswitch // it. Returns true if we can unswitch this loop. bool countLoop(const Loop* L); - + // Clean all data related to given loop. void forgetLoop(const Loop* L); - + // Mark case value as unswitched. // Since SI instruction can be partly unswitched, in order to avoid // extra unswitching in cloned loops keep track all unswitched values. void setUnswitched(const SwitchInst* SI, const Value* V); - + // Check was this case value unswitched before or not. bool isUnswitched(const SwitchInst* SI, const Value* V); - + // Clone all loop-unswitch related loop properties. // Redistribute unswitching quotas. // Note, that new loop data is stored inside the VMap. void cloneData(const Loop* NewLoop, const Loop* OldLoop, const ValueToValueMapTy& VMap); }; - + class LoopUnswitch : public LoopPass { LoopInfo *LI; // Loop information LPPassManager *LPM; @@ -130,7 +130,7 @@ namespace { std::vector<Loop*> LoopProcessWorklist; LUAnalysisCache BranchesInfo; - + bool OptimizeForSize; bool redoLoop; @@ -138,9 +138,9 @@ namespace { DominatorTree *DT; BasicBlock *loopHeader; BasicBlock *loopPreheader; - + // LoopBlocks contains all of the basic blocks of the loop, including the - // preheader of the loop, the body of the loop, and the exit blocks of the + // preheader of the loop, the body of the loop, and the exit blocks of the // loop, in that order. std::vector<BasicBlock*> LoopBlocks; // NewBlocks contained cloned copy of basic blocks from LoopBlocks. @@ -148,8 +148,8 @@ namespace { public: static char ID; // Pass ID, replacement for typeid - explicit LoopUnswitch(bool Os = false) : - LoopPass(ID), OptimizeForSize(Os), redoLoop(false), + explicit LoopUnswitch(bool Os = false) : + LoopPass(ID), OptimizeForSize(Os), redoLoop(false), currentLoop(NULL), DT(NULL), loopHeader(NULL), loopPreheader(NULL) { initializeLoopUnswitchPass(*PassRegistry::getPassRegistry()); @@ -186,7 +186,7 @@ namespace { if (I != LoopProcessWorklist.end()) LoopProcessWorklist.erase(I); } - + void initLoopData() { loopHeader = currentLoop->getHeader(); loopPreheader = currentLoop->getLoopPreheader(); @@ -205,7 +205,7 @@ namespace { Constant *Val, bool isEqual); void EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val, - BasicBlock *TrueDest, + BasicBlock *TrueDest, BasicBlock *FalseDest, Instruction *InsertPt); @@ -222,12 +222,12 @@ namespace { // Analyze loop. Check its size, calculate is it possible to unswitch // it. Returns true if we can unswitch this loop. bool LUAnalysisCache::countLoop(const Loop* L) { - + std::pair<LoopPropsMapIt, bool> InsertRes = LoopsProperties.insert(std::make_pair(L, LoopProperties())); - + LoopProperties& Props = InsertRes.first->second; - + if (InsertRes.second) { // New loop. @@ -235,39 +235,39 @@ bool LUAnalysisCache::countLoop(const Loop* L) { // expansion, and the number of basic blocks, to avoid loops with // large numbers of branches which cause loop unswitching to go crazy. // This is a very ad-hoc heuristic. - + // FIXME: This is overly conservative because it does not take into // consideration code simplification opportunities and code that can // be shared by the resultant unswitched loops. 
CodeMetrics Metrics; - for (Loop::block_iterator I = L->block_begin(), + for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E; ++I) - Metrics.analyzeBasicBlock(*I); + Metrics.analyzeBasicBlock(*I); Props.SizeEstimation = std::min(Metrics.NumInsts, Metrics.NumBlocks * 5); Props.CanBeUnswitchedCount = MaxSize / (Props.SizeEstimation); MaxSize -= Props.SizeEstimation * Props.CanBeUnswitchedCount; - } - + } + if (!Props.CanBeUnswitchedCount) { DEBUG(dbgs() << "NOT unswitching loop %" << L->getHeader()->getName() << ", cost too high: " << L->getBlocks().size() << "\n"); - + return false; } - + // Be careful. This links are good only before new loop addition. CurrentLoopProperties = &Props; CurLoopInstructions = &Props.UnswitchedVals; - + return true; } // Clean all data related to given loop. void LUAnalysisCache::forgetLoop(const Loop* L) { - + LoopPropsMapIt LIt = LoopsProperties.find(L); if (LIt != LoopsProperties.end()) { @@ -275,9 +275,9 @@ void LUAnalysisCache::forgetLoop(const Loop* L) { MaxSize += Props.CanBeUnswitchedCount * Props.SizeEstimation; LoopsProperties.erase(LIt); } - + CurrentLoopProperties = NULL; - CurLoopInstructions = NULL; + CurLoopInstructions = NULL; } // Mark case value as unswitched. @@ -289,7 +289,7 @@ void LUAnalysisCache::setUnswitched(const SwitchInst* SI, const Value* V) { // Check was this case value unswitched before or not. bool LUAnalysisCache::isUnswitched(const SwitchInst* SI, const Value* V) { - return (*CurLoopInstructions)[SI].count(V); + return (*CurLoopInstructions)[SI].count(V); } // Clone all loop-unswitch related loop properties. @@ -297,20 +297,20 @@ bool LUAnalysisCache::isUnswitched(const SwitchInst* SI, const Value* V) { // Note, that new loop data is stored inside the VMap. void LUAnalysisCache::cloneData(const Loop* NewLoop, const Loop* OldLoop, const ValueToValueMapTy& VMap) { - + LoopProperties& NewLoopProps = LoopsProperties[NewLoop]; LoopProperties& OldLoopProps = *CurrentLoopProperties; UnswitchedValsMap& Insts = OldLoopProps.UnswitchedVals; - + // Reallocate "can-be-unswitched quota" --OldLoopProps.CanBeUnswitchedCount; unsigned Quota = OldLoopProps.CanBeUnswitchedCount; NewLoopProps.CanBeUnswitchedCount = Quota / 2; OldLoopProps.CanBeUnswitchedCount = Quota - Quota / 2; - + NewLoopProps.SizeEstimation = OldLoopProps.SizeEstimation; - + // Clone unswitched values info: // for new loop switches we clone info about values that was // already unswitched and has redundant successors. @@ -319,7 +319,7 @@ void LUAnalysisCache::cloneData(const Loop* NewLoop, const Loop* OldLoop, Value* NewI = VMap.lookup(OldInst); const SwitchInst* NewInst = cast_or_null<SwitchInst>(NewI); assert(NewInst && "All instructions that are in SrcBB must be in VMap."); - + NewLoopProps.UnswitchedVals[NewInst] = OldLoopProps.UnswitchedVals[OldInst]; } } @@ -333,18 +333,18 @@ INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_END(LoopUnswitch, "loop-unswitch", "Unswitch loops", false, false) -Pass *llvm::createLoopUnswitchPass(bool Os) { - return new LoopUnswitch(Os); +Pass *llvm::createLoopUnswitchPass(bool Os) { + return new LoopUnswitch(Os); } /// FindLIVLoopCondition - Cond is a condition that occurs in L. If it is /// invariant in the loop, or has an invariant piece, return the invariant. /// Otherwise, return null. static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) { - + // We started analyze new instruction, increment scanned instructions counter. ++TotalInsts; - + // We can never unswitch on vector conditions. 
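// Worked example of the unswitching quota bookkeeping in countLoop() and
// cloneData() above, as a standalone sketch. The numbers assume the default
// -loop-unswitch-threshold of 100 and a loop whose size estimate is 30;
// they are not part of this patch.
#include <cstdio>

int main() {
  unsigned MaxSize = 100;                                    // Threshold default
  unsigned SizeEstimation = 30;                              // min(NumInsts, NumBlocks * 5)
  unsigned CanBeUnswitchedCount = MaxSize / SizeEstimation;  // 3 unswitches allowed
  MaxSize -= SizeEstimation * CanBeUnswitchedCount;          // 10 left for other loops

  // cloneData(): after one unswitch has been performed, the remaining quota
  // is split between the original loop and its clone.
  --CanBeUnswitchedCount;                                    // 2 remaining
  unsigned Quota = CanBeUnswitchedCount;
  unsigned NewLoopCount = Quota / 2;                         // clone gets 1
  unsigned OldLoopCount = Quota - Quota / 2;                 // original keeps 1
  std::printf("%u %u %u\n", MaxSize, NewLoopCount, OldLoopCount);  // "10 1 1"
}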
if (Cond->getType()->isVectorTy()) return 0; @@ -369,7 +369,7 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) { if (Value *RHS = FindLIVLoopCondition(BO->getOperand(1), L, Changed)) return RHS; } - + return 0; } @@ -394,19 +394,36 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) { return Changed; } -/// processCurrentLoop - Do actual work and unswitch loop if possible +/// processCurrentLoop - Do actual work and unswitch loop if possible /// and profitable. bool LoopUnswitch::processCurrentLoop() { bool Changed = false; initLoopData(); - + // If LoopSimplify was unable to form a preheader, don't do any unswitching. if (!loopPreheader) return false; - + + // Loops with indirectbr cannot be cloned. + if (!currentLoop->isSafeToClone()) + return false; + + // Loops with invokes, whose unwind edge escapes the loop, cannot be + // unswitched because splitting their edges are non-trivial and don't preserve + // loop simplify information. + for (Loop::block_iterator I = currentLoop->block_begin(), + E = currentLoop->block_end(); I != E; ++I) + if (const InvokeInst *II = dyn_cast<InvokeInst>((*I)->getTerminator())) + if (!currentLoop->contains(II->getUnwindDest())) + return false; + + // Without dedicated exits, splitting the exit edge may fail. + if (!currentLoop->hasDedicatedExits()) + return false; + LLVMContext &Context = loopHeader->getContext(); - + // Probably we reach the quota of branches for this loop. If so // stop unswitching. if (!BranchesInfo.countLoop(currentLoop)) @@ -415,7 +432,7 @@ bool LoopUnswitch::processCurrentLoop() { // Loop over all of the basic blocks in the loop. If we find an interior // block that is branching on a loop-invariant condition, we can unswitch this // loop. - for (Loop::block_iterator I = currentLoop->block_begin(), + for (Loop::block_iterator I = currentLoop->block_begin(), E = currentLoop->block_end(); I != E; ++I) { TerminatorInst *TI = (*I)->getTerminator(); if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { @@ -424,24 +441,24 @@ bool LoopUnswitch::processCurrentLoop() { if (BI->isConditional()) { // See if this, or some part of it, is loop invariant. If so, we can // unswitch on it if we desire. - Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), + Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), currentLoop, Changed); - if (LoopCond && UnswitchIfProfitable(LoopCond, + if (LoopCond && UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) { ++NumBranches; return true; } - } + } } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { - Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), + Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), currentLoop, Changed); - unsigned NumCases = SI->getNumCases(); + unsigned NumCases = SI->getNumCases(); if (LoopCond && NumCases) { // Find a value to unswitch on: // FIXME: this should chose the most expensive case! // FIXME: scan for a case with a non-critical edge? Constant *UnswitchVal = NULL; - + // Do not process same value again and again. // At this point we have some cases already unswitched and // some not yet unswitched. Let's find the first not yet unswitched one. @@ -453,7 +470,7 @@ bool LoopUnswitch::processCurrentLoop() { break; } } - + if (!UnswitchVal) continue; @@ -463,14 +480,14 @@ bool LoopUnswitch::processCurrentLoop() { } } } - + // Scan the instructions to check for unswitchable values. 
- for (BasicBlock::iterator BBI = (*I)->begin(), E = (*I)->end(); + for (BasicBlock::iterator BBI = (*I)->begin(), E = (*I)->end(); BBI != E; ++BBI) if (SelectInst *SI = dyn_cast<SelectInst>(BBI)) { - Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), + Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), currentLoop, Changed); - if (LoopCond && UnswitchIfProfitable(LoopCond, + if (LoopCond && UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) { ++NumSelects; return true; @@ -500,7 +517,7 @@ static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB, ExitBB = BB; return true; } - + // Otherwise, this is an unvisited intra-loop node. Check all successors. for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI) { // Check to see if the successor is a trivial loop exit. @@ -513,12 +530,12 @@ static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB, for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) if (I->mayHaveSideEffects()) return false; - + return true; } /// isTrivialLoopExitBlock - Return true if the specified block unconditionally -/// leads to an exit from the specified loop, and has no side-effects in the +/// leads to an exit from the specified loop, and has no side-effects in the /// process. If so, return the block that is exited to, otherwise return null. static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) { std::set<BasicBlock*> Visited; @@ -546,39 +563,39 @@ bool LoopUnswitch::IsTrivialUnswitchCondition(Value *Cond, Constant **Val, BasicBlock *Header = currentLoop->getHeader(); TerminatorInst *HeaderTerm = Header->getTerminator(); LLVMContext &Context = Header->getContext(); - + BasicBlock *LoopExitBB = 0; if (BranchInst *BI = dyn_cast<BranchInst>(HeaderTerm)) { // If the header block doesn't end with a conditional branch on Cond, we // can't handle it. if (!BI->isConditional() || BI->getCondition() != Cond) return false; - - // Check to see if a successor of the branch is guaranteed to - // exit through a unique exit block without having any + + // Check to see if a successor of the branch is guaranteed to + // exit through a unique exit block without having any // side-effects. If so, determine the value of Cond that causes it to do // this. - if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, + if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, BI->getSuccessor(0)))) { if (Val) *Val = ConstantInt::getTrue(Context); - } else if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, + } else if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, BI->getSuccessor(1)))) { if (Val) *Val = ConstantInt::getFalse(Context); } } else if (SwitchInst *SI = dyn_cast<SwitchInst>(HeaderTerm)) { // If this isn't a switch on Cond, we can't handle it. if (SI->getCondition() != Cond) return false; - + // Check to see if a successor of the switch is guaranteed to go to the - // latch block or exit through a one exit block without having any + // latch block or exit through a one exit block without having any // side-effects. If so, determine the value of Cond that causes it to do - // this. + // this. // Note that we can't trivially unswitch on the default case or // on already unswitched cases. 
for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i) { BasicBlock* LoopExitCandidate; - if ((LoopExitCandidate = isTrivialLoopExitBlock(currentLoop, + if ((LoopExitCandidate = isTrivialLoopExitBlock(currentLoop, i.getCaseSuccessor()))) { // Okay, we found a trivial case, remember the value that is trivial. ConstantInt* CaseVal = i.getCaseValue(); @@ -598,9 +615,9 @@ bool LoopUnswitch::IsTrivialUnswitchCondition(Value *Cond, Constant **Val, // contains phi nodes, this isn't trivial. if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin())) return false; // Can't handle this. - + if (LoopExit) *LoopExit = LoopExitBB; - + // We already know that nothing uses any scalar values defined inside of this // loop. As such, we just have to check to see if this loop will execute any // side-effecting instructions (e.g. stores, calls, volatile loads) in the @@ -686,17 +703,17 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val, /// UnswitchTrivialCondition - Given a loop that has a trivial unswitchable /// condition in it (a cond branch from its header block to its latch block, -/// where the path through the loop that doesn't execute its body has no +/// where the path through the loop that doesn't execute its body has no /// side-effects), unswitch it. This doesn't involve any code duplication, just /// moving the conditional branch outside of the loop and updating loop info. -void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, - Constant *Val, +void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, + Constant *Val, BasicBlock *ExitBlock) { DEBUG(dbgs() << "loop-unswitch: Trivial-Unswitch loop %" << loopHeader->getName() << " [" << L->getBlocks().size() << " blocks] in Function " << L->getHeader()->getParent()->getName() << " on cond: " << *Val << " == " << *Cond << "\n"); - + // First step, split the preheader, so that we know that there is a safe place // to insert the conditional branch. We will change loopPreheader to have a // conditional branch on Cond. @@ -705,24 +722,24 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, // Now that we have a place to insert the conditional branch, create a place // to branch to: this is the exit block out of the loop that we should // short-circuit to. - + // Split this block now, so that the loop maintains its exit block, and so // that the jump from the preheader can execute the contents of the exit block // without actually branching to it (the exit block should be dominated by the // loop header, not the preheader). assert(!L->contains(ExitBlock) && "Exit block is in the loop?"); BasicBlock *NewExit = SplitBlock(ExitBlock, ExitBlock->begin(), this); - - // Okay, now we have a position to branch from and a position to branch to, + + // Okay, now we have a position to branch from and a position to branch to, // insert the new conditional branch. - EmitPreheaderBranchOnCondition(Cond, Val, NewExit, NewPH, + EmitPreheaderBranchOnCondition(Cond, Val, NewExit, NewPH, loopPreheader->getTerminator()); LPM->deleteSimpleAnalysisValue(loopPreheader->getTerminator(), L); loopPreheader->getTerminator()->eraseFromParent(); // We need to reprocess this loop, it could be unswitched again. redoLoop = true; - + // Now that we know that the loop is never entered when this condition is a // particular value, rewrite the loop with this info. We know that this will // at least eliminate the old branch. 
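// Illustrative sketch, not part of this patch: a source-level picture of the
// trivial unswitching described above. The branch on the loop-invariant
// 'flag' leaves the loop without side effects, so it can be hoisted to the
// preheader with no code duplication. Function names are made up.
int unswitch_example_before(const int *a, int n, bool flag) {
  int sum = 0;
  for (int i = 0; i < n; ++i) {
    if (flag)        // loop-invariant condition whose taken path exits the loop
      return sum;
    sum += a[i];
  }
  return sum;
}
int unswitch_example_after(const int *a, int n, bool flag) {
  if (flag)          // tested once, before entering the loop
    return 0;
  int sum = 0;
  for (int i = 0; i < n; ++i)
    sum += a[i];
  return sum;
}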
@@ -732,7 +749,7 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, /// SplitExitEdges - Split all of the edges from inside the loop to their exit /// blocks. Update the appropriate Phi nodes as we do so. -void LoopUnswitch::SplitExitEdges(Loop *L, +void LoopUnswitch::SplitExitEdges(Loop *L, const SmallVector<BasicBlock *, 8> &ExitBlocks){ for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) { @@ -752,10 +769,10 @@ void LoopUnswitch::SplitExitEdges(Loop *L, } } -/// UnswitchNontrivialCondition - We determined that the loop is profitable -/// to unswitch when LIC equal Val. Split it into loop versions and test the +/// UnswitchNontrivialCondition - We determined that the loop is profitable +/// to unswitch when LIC equal Val. Split it into loop versions and test the /// condition outside of either loop. Return the loops created as Out1/Out2. -void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, +void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, Loop *L) { Function *F = loopHeader->getParent(); DEBUG(dbgs() << "loop-unswitch: Unswitching loop %" @@ -798,7 +815,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, ValueToValueMapTy VMap; for (unsigned i = 0, e = LoopBlocks.size(); i != e; ++i) { BasicBlock *NewBB = CloneBasicBlock(LoopBlocks[i], VMap, ".us", F); - + NewBlocks.push_back(NewBB); VMap[LoopBlocks[i]] = NewBB; // Keep the BB mapping. LPM->cloneBasicBlockSimpleAnalysis(LoopBlocks[i], NewBB, L); @@ -828,7 +845,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, // The new exit block should be in the same loop as the old one. if (Loop *ExitBBLoop = LI->getLoopFor(ExitBlocks[i])) ExitBBLoop->addBasicBlockToLoop(NewExit, LI->getBase()); - + assert(NewExit->getTerminator()->getNumSuccessors() == 1 && "Exit block should have been split to have one successor!"); BasicBlock *ExitSucc = NewExit->getTerminator()->getSuccessor(0); @@ -863,7 +880,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, for (BasicBlock::iterator I = NewBlocks[i]->begin(), E = NewBlocks[i]->end(); I != E; ++I) RemapInstruction(I, VMap,RF_NoModuleLevelChanges|RF_IgnoreMissingEntries); - + // Rewrite the original preheader to select between versions of the loop. BranchInst *OldBR = cast<BranchInst>(loopPreheader->getTerminator()); assert(OldBR->isUnconditional() && OldBR->getSuccessor(0) == LoopBlocks[0] && @@ -882,7 +899,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, // the condition that we're unswitching on), we don't rewrite the second // iteration. WeakVH LICHandle(LIC); - + // Now we rewrite the original code to know that the condition is true and the // new code to know that the condition is false. RewriteLoopBodyWithConditionConstant(L, LIC, Val, false); @@ -897,7 +914,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, /// RemoveFromWorklist - Remove all instances of I from the worklist vector /// specified. -static void RemoveFromWorklist(Instruction *I, +static void RemoveFromWorklist(Instruction *I, std::vector<Instruction*> &Worklist) { std::vector<Instruction*>::iterator WI = std::find(Worklist.begin(), Worklist.end(), I); @@ -910,7 +927,7 @@ static void RemoveFromWorklist(Instruction *I, /// ReplaceUsesOfWith - When we find that I really equals V, remove I from the /// program, replacing all uses with V and update the worklist. 
-static void ReplaceUsesOfWith(Instruction *I, Value *V, +static void ReplaceUsesOfWith(Instruction *I, Value *V, std::vector<Instruction*> &Worklist, Loop *L, LPPassManager *LPM) { DEBUG(dbgs() << "Replace with '" << *V << "': " << *I); @@ -943,10 +960,10 @@ void LoopUnswitch::RemoveBlockIfDead(BasicBlock *BB, if (BasicBlock *Pred = BB->getSinglePredecessor()) { // If it has one pred, fold phi nodes in BB. while (isa<PHINode>(BB->begin())) - ReplaceUsesOfWith(BB->begin(), - cast<PHINode>(BB->begin())->getIncomingValue(0), + ReplaceUsesOfWith(BB->begin(), + cast<PHINode>(BB->begin())->getIncomingValue(0), Worklist, L, LPM); - + // If this is the header of a loop and the only pred is the latch, we now // have an unreachable loop. if (Loop *L = LI->getLoopFor(BB)) @@ -957,15 +974,15 @@ void LoopUnswitch::RemoveBlockIfDead(BasicBlock *BB, LPM->deleteSimpleAnalysisValue(Pred->getTerminator(), L); Pred->getTerminator()->eraseFromParent(); new UnreachableInst(BB->getContext(), Pred); - + // The loop is now broken, remove it from LI. RemoveLoopFromHierarchy(L); - + // Reprocess the header, which now IS dead. RemoveBlockIfDead(BB, Worklist, L); return; } - + // If pred ends in a uncond branch, add uncond branch to worklist so that // the two blocks will get merged. if (BranchInst *BI = dyn_cast<BranchInst>(Pred->getTerminator())) @@ -976,11 +993,11 @@ void LoopUnswitch::RemoveBlockIfDead(BasicBlock *BB, } DEBUG(dbgs() << "Nuking dead block: " << *BB); - + // Remove the instructions in the basic block from the worklist. for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { RemoveFromWorklist(I, Worklist); - + // Anything that uses the instructions in this basic block should have their // uses replaced with undefs. // If I is not void type then replaceAllUsesWith undef. @@ -988,7 +1005,7 @@ void LoopUnswitch::RemoveBlockIfDead(BasicBlock *BB, if (!I->getType()->isVoidTy()) I->replaceAllUsesWith(UndefValue::get(I->getType())); } - + // If this is the edge to the header block for a loop, remove the loop and // promote all subloops. if (Loop *BBLoop = LI->getLoopFor(BB)) { @@ -1004,8 +1021,8 @@ void LoopUnswitch::RemoveBlockIfDead(BasicBlock *BB, // Remove the block from the loop info, which removes it from any loops it // was in. LI->removeBlock(BB); - - + + // Remove phi node entries in successors for this block. TerminatorInst *TI = BB->getTerminator(); SmallVector<BasicBlock*, 4> Succs; @@ -1013,13 +1030,13 @@ void LoopUnswitch::RemoveBlockIfDead(BasicBlock *BB, Succs.push_back(TI->getSuccessor(i)); TI->getSuccessor(i)->removePredecessor(BB); } - + // Unique the successors, remove anything with multiple uses. array_pod_sort(Succs.begin(), Succs.end()); Succs.erase(std::unique(Succs.begin(), Succs.end()), Succs.end()); - + // Remove the basic block, including all of the instructions contained in it. - LPM->deleteSimpleAnalysisValue(BB, L); + LPM->deleteSimpleAnalysisValue(BB, L); BB->eraseFromParent(); // Remove successor blocks here that are not dead, so that we know we only // have dead blocks in this list. 
Nondead blocks have a way of becoming dead, @@ -1037,7 +1054,7 @@ void LoopUnswitch::RemoveBlockIfDead(BasicBlock *BB, --i; } } - + for (unsigned i = 0, e = Succs.size(); i != e; ++i) RemoveBlockIfDead(Succs[i], Worklist, L); } @@ -1060,14 +1077,14 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, Constant *Val, bool IsEqual) { assert(!isa<Constant>(LIC) && "Why are we unswitching on a constant?"); - + // FIXME: Support correlated properties, like: // for (...) // if (li1 < li2) // ... // if (li1 > li2) // ... - + // FOLD boolean conditions (X|LIC), (X&LIC). Fold conditional branches, // selects, switches. std::vector<Instruction*> Worklist; @@ -1082,9 +1099,9 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, if (IsEqual) Replacement = Val; else - Replacement = ConstantInt::get(Type::getInt1Ty(Val->getContext()), + Replacement = ConstantInt::get(Type::getInt1Ty(Val->getContext()), !cast<ConstantInt>(Val)->getZExtValue()); - + for (Value::use_iterator UI = LIC->use_begin(), E = LIC->use_end(); UI != E; ++UI) { Instruction *U = dyn_cast<Instruction>(*UI); @@ -1092,15 +1109,15 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, continue; Worklist.push_back(U); } - + for (std::vector<Instruction*>::iterator UI = Worklist.begin(); UI != Worklist.end(); ++UI) - (*UI)->replaceUsesOfWith(LIC, Replacement); - + (*UI)->replaceUsesOfWith(LIC, Replacement); + SimplifyCode(Worklist, L); return; } - + // Otherwise, we don't know the precise value of LIC, but we do know that it // is certainly NOT "Val". As such, simplify any uses in the loop that we // can. This case occurs when we unswitch switch statements. @@ -1112,27 +1129,27 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, Worklist.push_back(U); - // TODO: We could do other simplifications, for example, turning + // TODO: We could do other simplifications, for example, turning // 'icmp eq LIC, Val' -> false. // If we know that LIC is not Val, use this info to simplify code. SwitchInst *SI = dyn_cast<SwitchInst>(U); if (SI == 0 || !isa<ConstantInt>(Val)) continue; - + SwitchInst::CaseIt DeadCase = SI->findCaseValue(cast<ConstantInt>(Val)); // Default case is live for multiple values. if (DeadCase == SI->case_default()) continue; - - // Found a dead case value. Don't remove PHI nodes in the + + // Found a dead case value. Don't remove PHI nodes in the // successor if they become single-entry, those PHI nodes may // be in the Users list. BasicBlock *Switch = SI->getParent(); BasicBlock *SISucc = DeadCase.getCaseSuccessor(); BasicBlock *Latch = L->getLoopLatch(); - + BranchesInfo.setUnswitched(SI, Val); - + if (!SI->findCaseDest(SISucc)) continue; // Edge is critical. // If the DeadCase successor dominates the loop latch, then the // transformation isn't safe since it will delete the sole predecessor edge @@ -1172,7 +1189,7 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, if (DT) DT->addNewBlock(Abort, NewSISucc); } - + SimplifyCode(Worklist, L); } @@ -1193,7 +1210,7 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { // Simple DCE. if (isInstructionTriviallyDead(I)) { DEBUG(dbgs() << "Remove dead instruction '" << *I); - + // Add uses to the worklist, which may be dead now. 
for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) if (Instruction *Use = dyn_cast<Instruction>(I->getOperand(i))) @@ -1225,24 +1242,24 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { if (!SinglePred) continue; // Nothing to do. assert(SinglePred == Pred && "CFG broken"); - DEBUG(dbgs() << "Merging blocks: " << Pred->getName() << " <- " + DEBUG(dbgs() << "Merging blocks: " << Pred->getName() << " <- " << Succ->getName() << "\n"); - + // Resolve any single entry PHI nodes in Succ. while (PHINode *PN = dyn_cast<PHINode>(Succ->begin())) ReplaceUsesOfWith(PN, PN->getIncomingValue(0), Worklist, L, LPM); - + // If Succ has any successors with PHI nodes, update them to have // entries coming from Pred instead of Succ. Succ->replaceAllUsesWith(Pred); - + // Move all of the successor contents from Succ to Pred. Pred->getInstList().splice(BI, Succ->getInstList(), Succ->begin(), Succ->end()); LPM->deleteSimpleAnalysisValue(BI, L); BI->eraseFromParent(); RemoveFromWorklist(BI, Worklist); - + // Remove Succ from the loop tree. LI->removeBlock(Succ); LPM->deleteSimpleAnalysisValue(Succ, L); @@ -1250,7 +1267,7 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { ++NumSimplify; continue; } - + if (ConstantInt *CB = dyn_cast<ConstantInt>(BI->getCondition())){ // Conditional branch. Turn it into an unconditional branch, then // remove dead blocks. diff --git a/lib/Transforms/Scalar/ObjCARC.cpp b/lib/Transforms/Scalar/ObjCARC.cpp index 9fdea8d..29234da 100644 --- a/lib/Transforms/Scalar/ObjCARC.cpp +++ b/lib/Transforms/Scalar/ObjCARC.cpp @@ -162,6 +162,7 @@ namespace { IC_MoveWeak, ///< objc_moveWeak (derived) IC_CopyWeak, ///< objc_copyWeak (derived) IC_DestroyWeak, ///< objc_destroyWeak (derived) + IC_StoreStrong, ///< objc_storeStrong (derived) IC_CallOrUser, ///< could call objc_release and/or "use" pointers IC_Call, ///< could call objc_release IC_User, ///< could "use" a pointer @@ -262,6 +263,7 @@ static InstructionClass GetFunctionClass(const Function *F) { return StringSwitch<InstructionClass>(F->getName()) .Case("objc_storeWeak", IC_StoreWeak) .Case("objc_initWeak", IC_InitWeak) + .Case("objc_storeStrong", IC_StoreStrong) .Default(IC_CallOrUser); // Second argument is i8**. if (PointerType *Pte1 = dyn_cast<PointerType>(ETy1)) @@ -618,22 +620,35 @@ static bool DoesObjCBlockEscape(const Value *BlockPtr) { const User *UUser = *UI; // Special - Use by a call (callee or argument) is not considered // to be an escape. - if (isa<CallInst>(UUser) || isa<InvokeInst>(UUser)) - continue; - // Use by an instruction which copies the value is an escape if the - // result is an escape. - if (isa<BitCastInst>(UUser) || isa<GetElementPtrInst>(UUser) || - isa<PHINode>(UUser) || isa<SelectInst>(UUser)) { - Worklist.push_back(UUser); + switch (GetBasicInstructionClass(UUser)) { + case IC_StoreWeak: + case IC_InitWeak: + case IC_StoreStrong: + case IC_Autorelease: + case IC_AutoreleaseRV: + // These special functions make copies of their pointer arguments. + return true; + case IC_User: + case IC_None: + // Use by an instruction which copies the value is an escape if the + // result is an escape. + if (isa<BitCastInst>(UUser) || isa<GetElementPtrInst>(UUser) || + isa<PHINode>(UUser) || isa<SelectInst>(UUser)) { + Worklist.push_back(UUser); + continue; + } + // Use by a load is not an escape. + if (isa<LoadInst>(UUser)) + continue; + // Use by a store is not an escape if the use is the address. 
+ if (const StoreInst *SI = dyn_cast<StoreInst>(UUser)) + if (V != SI->getValueOperand()) + continue; + break; + default: + // Regular calls and other stuff are not considered escapes. continue; } - // Use by a load is not an escape. - if (isa<LoadInst>(UUser)) - continue; - // Use by a store is not an escape if the use is the address. - if (const StoreInst *SI = dyn_cast<StoreInst>(UUser)) - if (V != SI->getValueOperand()) - continue; // Otherwise, conservatively assume an escape. return true; } @@ -883,7 +898,7 @@ bool ObjCARCExpand::runOnFunction(Function &F) { // These calls return their argument verbatim, as a low-level // optimization. However, this makes high-level optimizations // harder. Undo any uses of this optimization that the front-end - // emitted here. We'll redo them in a later pass. + // emitted here. We'll redo them in the contract pass. Changed = true; Inst->replaceAllUsesWith(cast<CallInst>(Inst)->getArgOperand(0)); break; @@ -997,7 +1012,11 @@ bool ObjCARCAPElim::runOnModule(Module &M) { return false; // Find the llvm.global_ctors variable, as the first step in - // identifying the global constructors. + // identifying the global constructors. In theory, unnecessary autorelease + // pools could occur anywhere, but in practice it's pretty rare. Global + // ctors are a place where autorelease pools get inserted automatically, + // so it's pretty common for them to be unnecessary, and it's pretty + // profitable to eliminate them. GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors"); if (!GV) return false; @@ -1014,7 +1033,11 @@ bool ObjCARCAPElim::runOnModule(Module &M) { Value *Op = *OI; // llvm.global_ctors is an array of pairs where the second members // are constructor functions. - Function *F = cast<Function>(cast<ConstantStruct>(Op)->getOperand(1)); + Function *F = dyn_cast<Function>(cast<ConstantStruct>(Op)->getOperand(1)); + // If the user used a constructor function with the wrong signature and + // it got bitcasted or whatever, look the other way. + if (!F) + continue; // Only look at function definitions. if (F->isDeclaration()) continue; @@ -1678,9 +1701,16 @@ namespace { void CheckForCFGHazards(const BasicBlock *BB, DenseMap<const BasicBlock *, BBState> &BBStates, BBState &MyStates) const; + bool VisitInstructionBottomUp(Instruction *Inst, + BasicBlock *BB, + MapVector<Value *, RRInfo> &Retains, + BBState &MyStates); bool VisitBottomUp(BasicBlock *BB, DenseMap<const BasicBlock *, BBState> &BBStates, MapVector<Value *, RRInfo> &Retains); + bool VisitInstructionTopDown(Instruction *Inst, + DenseMap<Value *, RRInfo> &Releases, + BBState &MyStates); bool VisitTopDown(BasicBlock *BB, DenseMap<const BasicBlock *, BBState> &BBStates, DenseMap<Value *, RRInfo> &Releases); @@ -1956,6 +1986,7 @@ namespace { /// use here. enum DependenceKind { NeedsPositiveRetainCount, + AutoreleasePoolBoundary, CanChangeRetainCount, RetainAutoreleaseDep, ///< Blocks objc_retainAutorelease. RetainAutoreleaseRVDep, ///< Blocks objc_retainAutoreleaseReturnValue. @@ -1985,6 +2016,19 @@ Depends(DependenceKind Flavor, Instruction *Inst, const Value *Arg, } } + case AutoreleasePoolBoundary: { + InstructionClass Class = GetInstructionClass(Inst); + switch (Class) { + case IC_AutoreleasepoolPop: + case IC_AutoreleasepoolPush: + // These mark the end and begin of an autorelease pool scope. + return true; + default: + // Nothing else does this. 
+ return false; + } + } + case CanChangeRetainCount: { InstructionClass Class = GetInstructionClass(Inst); switch (Class) { @@ -2002,6 +2046,7 @@ Depends(DependenceKind Flavor, Instruction *Inst, const Value *Arg, case RetainAutoreleaseDep: switch (GetBasicInstructionClass(Inst)) { case IC_AutoreleasepoolPop: + case IC_AutoreleasepoolPush: // Don't merge an objc_autorelease with an objc_retain inside a different // autoreleasepool scope. return true; @@ -2136,17 +2181,26 @@ ObjCARCOpt::OptimizeRetainCall(Function &F, Instruction *Retain) { /// return true. bool ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { - // Check for the argument being from an immediately preceding call. + // Check for the argument being from an immediately preceding call or invoke. Value *Arg = GetObjCArg(RetainRV); CallSite CS(Arg); - if (Instruction *Call = CS.getInstruction()) + if (Instruction *Call = CS.getInstruction()) { if (Call->getParent() == RetainRV->getParent()) { BasicBlock::iterator I = Call; ++I; while (isNoopInstruction(I)) ++I; if (&*I == RetainRV) return false; + } else if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) { + BasicBlock *RetainRVParent = RetainRV->getParent(); + if (II->getNormalDest() == RetainRVParent) { + BasicBlock::iterator I = RetainRVParent->begin(); + while (isNoopInstruction(I)) ++I; + if (&*I == RetainRV) + return false; + } } + } // Check for being preceded by an objc_autoreleaseReturnValue on the same // pointer. In this case, we can delete the pair. @@ -2232,6 +2286,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { case IC_DestroyWeak: { CallInst *CI = cast<CallInst>(Inst); if (isNullOrUndef(CI->getArgOperand(0))) { + Changed = true; Type *Ty = CI->getArgOperand(0)->getType(); new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), Constant::getNullValue(Ty), @@ -2247,6 +2302,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { CallInst *CI = cast<CallInst>(Inst); if (isNullOrUndef(CI->getArgOperand(0)) || isNullOrUndef(CI->getArgOperand(1))) { + Changed = true; Type *Ty = CI->getArgOperand(0)->getType(); new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), Constant::getNullValue(Ty), @@ -2360,9 +2416,34 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { // Check that there is nothing that cares about the reference // count between the call and the phi. - FindDependencies(NeedsPositiveRetainCount, Arg, - Inst->getParent(), Inst, - DependingInstructions, Visited, PA); + switch (Class) { + case IC_Retain: + case IC_RetainBlock: + // These can always be moved up. + break; + case IC_Release: + // These can't be moved across things that care about the retain count. + FindDependencies(NeedsPositiveRetainCount, Arg, + Inst->getParent(), Inst, + DependingInstructions, Visited, PA); + break; + case IC_Autorelease: + // These can't be moved across autorelease pool scope boundaries. + FindDependencies(AutoreleasePoolBoundary, Arg, + Inst->getParent(), Inst, + DependingInstructions, Visited, PA); + break; + case IC_RetainRV: + case IC_AutoreleaseRV: + // Don't move these; the RV optimization depends on the autoreleaseRV + // being tail called, and the retainRV being immediately after a call + // (which might still happen if we get lucky with codegen layout, but + // it's not worth taking the chance). 
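OptimizeRetainRVCall above now recognizes both shapes of "immediately preceding" producer: a call in the same block, and an invoke whose normal destination starts with the retainRV. A condensed sketch of that test (illustrative only; the real code also skips over no-op casts and debug intrinsics):

#include "llvm/ADT/STLExtras.h"
#include "llvm/BasicBlock.h"
#include "llvm/Instructions.h"
#include "llvm/Support/CallSite.h"
using namespace llvm;

static bool argProducedImmediatelyBefore(Instruction *RetainRV, Value *Arg) {
  CallSite CS(Arg);
  Instruction *Call = CS.getInstruction();
  if (!Call)
    return false;                                   // not a call/invoke result
  if (Call->getParent() == RetainRV->getParent())   // same block: next instruction?
    return &*llvm::next(BasicBlock::iterator(Call)) == RetainRV;
  if (InvokeInst *II = dyn_cast<InvokeInst>(Call))  // invoke: first in normal dest?
    return &II->getNormalDest()->front() == RetainRV;
  return false;
}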
+ continue; + default: + llvm_unreachable("Invalid dependence flavor"); + } + if (DependingInstructions.size() == 1 && *DependingInstructions.begin() == PN) { Changed = true; @@ -2516,6 +2597,164 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, } bool +ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, + BasicBlock *BB, + MapVector<Value *, RRInfo> &Retains, + BBState &MyStates) { + bool NestingDetected = false; + InstructionClass Class = GetInstructionClass(Inst); + const Value *Arg = 0; + + switch (Class) { + case IC_Release: { + Arg = GetObjCArg(Inst); + + PtrState &S = MyStates.getPtrBottomUpState(Arg); + + // If we see two releases in a row on the same pointer. If so, make + // a note, and we'll cicle back to revisit it after we've + // hopefully eliminated the second release, which may allow us to + // eliminate the first release too. + // Theoretically we could implement removal of nested retain+release + // pairs by making PtrState hold a stack of states, but this is + // simple and avoids adding overhead for the non-nested case. + if (S.GetSeq() == S_Release || S.GetSeq() == S_MovableRelease) + NestingDetected = true; + + S.RRI.clear(); + + MDNode *ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind); + S.SetSeq(ReleaseMetadata ? S_MovableRelease : S_Release); + S.RRI.ReleaseMetadata = ReleaseMetadata; + S.RRI.KnownSafe = S.IsKnownNested() || S.IsKnownIncremented(); + S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall(); + S.RRI.Calls.insert(Inst); + + S.IncrementRefCount(); + S.IncrementNestCount(); + break; + } + case IC_RetainBlock: + // An objc_retainBlock call with just a use may need to be kept, + // because it may be copying a block from the stack to the heap. + if (!IsRetainBlockOptimizable(Inst)) + break; + // FALLTHROUGH + case IC_Retain: + case IC_RetainRV: { + Arg = GetObjCArg(Inst); + + PtrState &S = MyStates.getPtrBottomUpState(Arg); + S.DecrementRefCount(); + S.SetAtLeastOneRefCount(); + S.DecrementNestCount(); + + switch (S.GetSeq()) { + case S_Stop: + case S_Release: + case S_MovableRelease: + case S_Use: + S.RRI.ReverseInsertPts.clear(); + // FALL THROUGH + case S_CanRelease: + // Don't do retain+release tracking for IC_RetainRV, because it's + // better to let it remain as the first instruction after a call. + if (Class != IC_RetainRV) { + S.RRI.IsRetainBlock = Class == IC_RetainBlock; + Retains[Inst] = S.RRI; + } + S.ClearSequenceProgress(); + break; + case S_None: + break; + case S_Retain: + llvm_unreachable("bottom-up pointer in retain state!"); + } + return NestingDetected; + } + case IC_AutoreleasepoolPop: + // Conservatively, clear MyStates for all known pointers. + MyStates.clearBottomUpPointers(); + return NestingDetected; + case IC_AutoreleasepoolPush: + case IC_None: + // These are irrelevant. + return NestingDetected; + default: + break; + } + + // Consider any other possible effects of this instruction on each + // pointer being tracked. + for (BBState::ptr_iterator MI = MyStates.bottom_up_ptr_begin(), + ME = MyStates.bottom_up_ptr_end(); MI != ME; ++MI) { + const Value *Ptr = MI->first; + if (Ptr == Arg) + continue; // Handled above. + PtrState &S = MI->second; + Sequence Seq = S.GetSeq(); + + // Check for possible releases. 
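VisitInstructionBottomUp drives a small per-pointer state machine: a release starts tracking (S_Release, or S_MovableRelease when tagged with clang.imprecise_release), a use of the object moves to S_Use, a possible ref-count decrement to S_CanRelease, and a matching retain completes the candidate pair. A reduced standalone model of those transitions (illustrative; the real PtrState also tracks insertion points, nesting, and known-safety):

// Hypothetical, reduced model of the bottom-up Sequence transitions.
enum Sequence { S_None, S_Release, S_MovableRelease, S_Use, S_CanRelease, S_Retain };
enum Event { SawRelease, SawUse, SawCanDecrement, SawRetain };

static Sequence stepBottomUp(Sequence S, Event E, bool &PairComplete) {
  PairComplete = false;
  switch (E) {
  case SawRelease:                  // start (or restart) tracking a pair
    return S_Release;
  case SawUse:                      // something read the object
    return (S == S_Release || S == S_MovableRelease) ? S_Use : S;
  case SawCanDecrement:             // something might drop the ref count
    return (S == S_Use) ? S_CanRelease : S;
  case SawRetain:                   // matching retain found: pair is complete
    if (S != S_None)
      PairComplete = true;
    return S_None;
  }
  return S;
}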
+ if (CanAlterRefCount(Inst, Ptr, PA, Class)) { + S.DecrementRefCount(); + switch (Seq) { + case S_Use: + S.SetSeq(S_CanRelease); + continue; + case S_CanRelease: + case S_Release: + case S_MovableRelease: + case S_Stop: + case S_None: + break; + case S_Retain: + llvm_unreachable("bottom-up pointer in retain state!"); + } + } + + // Check for possible direct uses. + switch (Seq) { + case S_Release: + case S_MovableRelease: + if (CanUse(Inst, Ptr, PA, Class)) { + assert(S.RRI.ReverseInsertPts.empty()); + // If this is an invoke instruction, we're scanning it as part of + // one of its successor blocks, since we can't insert code after it + // in its own block, and we don't want to split critical edges. + if (isa<InvokeInst>(Inst)) + S.RRI.ReverseInsertPts.insert(BB->getFirstInsertionPt()); + else + S.RRI.ReverseInsertPts.insert(llvm::next(BasicBlock::iterator(Inst))); + S.SetSeq(S_Use); + } else if (Seq == S_Release && + (Class == IC_User || Class == IC_CallOrUser)) { + // Non-movable releases depend on any possible objc pointer use. + S.SetSeq(S_Stop); + assert(S.RRI.ReverseInsertPts.empty()); + // As above; handle invoke specially. + if (isa<InvokeInst>(Inst)) + S.RRI.ReverseInsertPts.insert(BB->getFirstInsertionPt()); + else + S.RRI.ReverseInsertPts.insert(llvm::next(BasicBlock::iterator(Inst))); + } + break; + case S_Stop: + if (CanUse(Inst, Ptr, PA, Class)) + S.SetSeq(S_Use); + break; + case S_CanRelease: + case S_Use: + case S_None: + break; + case S_Retain: + llvm_unreachable("bottom-up pointer in retain state!"); + } + } + + return NestingDetected; +} + +bool ObjCARCOpt::VisitBottomUp(BasicBlock *BB, DenseMap<const BasicBlock *, BBState> &BBStates, MapVector<Value *, RRInfo> &Retains) { @@ -2560,144 +2799,164 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, // Visit all the instructions, bottom-up. for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; --I) { Instruction *Inst = llvm::prior(I); - InstructionClass Class = GetInstructionClass(Inst); - const Value *Arg = 0; - switch (Class) { - case IC_Release: { - Arg = GetObjCArg(Inst); + // Invoke instructions are visited as part of their successors (below). + if (isa<InvokeInst>(Inst)) + continue; + + NestingDetected |= VisitInstructionBottomUp(Inst, BB, Retains, MyStates); + } + + // If there's a predecessor with an invoke, visit the invoke as + // if it were part of this block, since we can't insert code after + // an invoke in its own block, and we don't want to split critical + // edges. + for (pred_iterator PI(BB), PE(BB, false); PI != PE; ++PI) { + BasicBlock *Pred = *PI; + TerminatorInst *PredTI = cast<TerminatorInst>(&Pred->back()); + if (isa<InvokeInst>(PredTI)) + NestingDetected |= VisitInstructionBottomUp(PredTI, BB, Retains, MyStates); + } + + return NestingDetected; +} + +bool +ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst, + DenseMap<Value *, RRInfo> &Releases, + BBState &MyStates) { + bool NestingDetected = false; + InstructionClass Class = GetInstructionClass(Inst); + const Value *Arg = 0; + + switch (Class) { + case IC_RetainBlock: + // An objc_retainBlock call with just a use may need to be kept, + // because it may be copying a block from the stack to the heap. + if (!IsRetainBlockOptimizable(Inst)) + break; + // FALLTHROUGH + case IC_Retain: + case IC_RetainRV: { + Arg = GetObjCArg(Inst); - PtrState &S = MyStates.getPtrBottomUpState(Arg); + PtrState &S = MyStates.getPtrTopDownState(Arg); - // If we see two releases in a row on the same pointer. 
If so, make + // Don't do retain+release tracking for IC_RetainRV, because it's + // better to let it remain as the first instruction after a call. + if (Class != IC_RetainRV) { + // If we see two retains in a row on the same pointer. If so, make // a note, and we'll cicle back to revisit it after we've - // hopefully eliminated the second release, which may allow us to - // eliminate the first release too. + // hopefully eliminated the second retain, which may allow us to + // eliminate the first retain too. // Theoretically we could implement removal of nested retain+release // pairs by making PtrState hold a stack of states, but this is // simple and avoids adding overhead for the non-nested case. - if (S.GetSeq() == S_Release || S.GetSeq() == S_MovableRelease) + if (S.GetSeq() == S_Retain) NestingDetected = true; + S.SetSeq(S_Retain); S.RRI.clear(); - - MDNode *ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind); - S.SetSeq(ReleaseMetadata ? S_MovableRelease : S_Release); - S.RRI.ReleaseMetadata = ReleaseMetadata; - S.RRI.KnownSafe = S.IsKnownNested() || S.IsKnownIncremented(); - S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall(); + S.RRI.IsRetainBlock = Class == IC_RetainBlock; + // Don't check S.IsKnownIncremented() here because it's not + // sufficient. + S.RRI.KnownSafe = S.IsKnownNested(); S.RRI.Calls.insert(Inst); + } - S.IncrementRefCount(); - S.IncrementNestCount(); + S.SetAtLeastOneRefCount(); + S.IncrementRefCount(); + S.IncrementNestCount(); + return NestingDetected; + } + case IC_Release: { + Arg = GetObjCArg(Inst); + + PtrState &S = MyStates.getPtrTopDownState(Arg); + S.DecrementRefCount(); + S.DecrementNestCount(); + + switch (S.GetSeq()) { + case S_Retain: + case S_CanRelease: + S.RRI.ReverseInsertPts.clear(); + // FALL THROUGH + case S_Use: + S.RRI.ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind); + S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall(); + Releases[Inst] = S.RRI; + S.ClearSequenceProgress(); + break; + case S_None: break; + case S_Stop: + case S_Release: + case S_MovableRelease: + llvm_unreachable("top-down pointer in release state!"); } - case IC_RetainBlock: - // An objc_retainBlock call with just a use may need to be kept, - // because it may be copying a block from the stack to the heap. - if (!IsRetainBlockOptimizable(Inst)) - break; - // FALLTHROUGH - case IC_Retain: - case IC_RetainRV: { - Arg = GetObjCArg(Inst); + break; + } + case IC_AutoreleasepoolPop: + // Conservatively, clear MyStates for all known pointers. + MyStates.clearTopDownPointers(); + return NestingDetected; + case IC_AutoreleasepoolPush: + case IC_None: + // These are irrelevant. + return NestingDetected; + default: + break; + } - PtrState &S = MyStates.getPtrBottomUpState(Arg); + // Consider any other possible effects of this instruction on each + // pointer being tracked. + for (BBState::ptr_iterator MI = MyStates.top_down_ptr_begin(), + ME = MyStates.top_down_ptr_end(); MI != ME; ++MI) { + const Value *Ptr = MI->first; + if (Ptr == Arg) + continue; // Handled above. + PtrState &S = MI->second; + Sequence Seq = S.GetSeq(); + + // Check for possible releases. 
+ if (CanAlterRefCount(Inst, Ptr, PA, Class)) { S.DecrementRefCount(); - S.SetAtLeastOneRefCount(); - S.DecrementNestCount(); + switch (Seq) { + case S_Retain: + S.SetSeq(S_CanRelease); + assert(S.RRI.ReverseInsertPts.empty()); + S.RRI.ReverseInsertPts.insert(Inst); - switch (S.GetSeq()) { - case S_Stop: - case S_Release: - case S_MovableRelease: + // One call can't cause a transition from S_Retain to S_CanRelease + // and S_CanRelease to S_Use. If we've made the first transition, + // we're done. + continue; case S_Use: - S.RRI.ReverseInsertPts.clear(); - // FALL THROUGH case S_CanRelease: - // Don't do retain+release tracking for IC_RetainRV, because it's - // better to let it remain as the first instruction after a call. - if (Class != IC_RetainRV) { - S.RRI.IsRetainBlock = Class == IC_RetainBlock; - Retains[Inst] = S.RRI; - } - S.ClearSequenceProgress(); - break; case S_None: break; - case S_Retain: - llvm_unreachable("bottom-up pointer in retain state!"); - } - continue; - } - case IC_AutoreleasepoolPop: - // Conservatively, clear MyStates for all known pointers. - MyStates.clearBottomUpPointers(); - continue; - case IC_AutoreleasepoolPush: - case IC_None: - // These are irrelevant. - continue; - default: - break; - } - - // Consider any other possible effects of this instruction on each - // pointer being tracked. - for (BBState::ptr_iterator MI = MyStates.bottom_up_ptr_begin(), - ME = MyStates.bottom_up_ptr_end(); MI != ME; ++MI) { - const Value *Ptr = MI->first; - if (Ptr == Arg) - continue; // Handled above. - PtrState &S = MI->second; - Sequence Seq = S.GetSeq(); - - // Check for possible releases. - if (CanAlterRefCount(Inst, Ptr, PA, Class)) { - S.DecrementRefCount(); - switch (Seq) { - case S_Use: - S.SetSeq(S_CanRelease); - continue; - case S_CanRelease: - case S_Release: - case S_MovableRelease: - case S_Stop: - case S_None: - break; - case S_Retain: - llvm_unreachable("bottom-up pointer in retain state!"); - } - } - - // Check for possible direct uses. - switch (Seq) { + case S_Stop: case S_Release: case S_MovableRelease: - if (CanUse(Inst, Ptr, PA, Class)) { - assert(S.RRI.ReverseInsertPts.empty()); - S.RRI.ReverseInsertPts.insert(Inst); - S.SetSeq(S_Use); - } else if (Seq == S_Release && - (Class == IC_User || Class == IC_CallOrUser)) { - // Non-movable releases depend on any possible objc pointer use. - S.SetSeq(S_Stop); - assert(S.RRI.ReverseInsertPts.empty()); - S.RRI.ReverseInsertPts.insert(Inst); - } - break; - case S_Stop: - if (CanUse(Inst, Ptr, PA, Class)) - S.SetSeq(S_Use); - break; - case S_CanRelease: - case S_Use: - case S_None: - break; - case S_Retain: - llvm_unreachable("bottom-up pointer in retain state!"); + llvm_unreachable("top-down pointer in release state!"); } } + + // Check for possible direct uses. + switch (Seq) { + case S_CanRelease: + if (CanUse(Inst, Ptr, PA, Class)) + S.SetSeq(S_Use); + break; + case S_Retain: + case S_Use: + case S_None: + break; + case S_Stop: + case S_Release: + case S_MovableRelease: + llvm_unreachable("top-down pointer in release state!"); + } } return NestingDetected; @@ -2751,138 +3010,7 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB, // Visit all the instructions, top-down. for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { Instruction *Inst = I; - InstructionClass Class = GetInstructionClass(Inst); - const Value *Arg = 0; - - switch (Class) { - case IC_RetainBlock: - // An objc_retainBlock call with just a use may need to be kept, - // because it may be copying a block from the stack to the heap. 
- if (!IsRetainBlockOptimizable(Inst)) - break; - // FALLTHROUGH - case IC_Retain: - case IC_RetainRV: { - Arg = GetObjCArg(Inst); - - PtrState &S = MyStates.getPtrTopDownState(Arg); - - // Don't do retain+release tracking for IC_RetainRV, because it's - // better to let it remain as the first instruction after a call. - if (Class != IC_RetainRV) { - // If we see two retains in a row on the same pointer. If so, make - // a note, and we'll cicle back to revisit it after we've - // hopefully eliminated the second retain, which may allow us to - // eliminate the first retain too. - // Theoretically we could implement removal of nested retain+release - // pairs by making PtrState hold a stack of states, but this is - // simple and avoids adding overhead for the non-nested case. - if (S.GetSeq() == S_Retain) - NestingDetected = true; - - S.SetSeq(S_Retain); - S.RRI.clear(); - S.RRI.IsRetainBlock = Class == IC_RetainBlock; - // Don't check S.IsKnownIncremented() here because it's not - // sufficient. - S.RRI.KnownSafe = S.IsKnownNested(); - S.RRI.Calls.insert(Inst); - } - - S.SetAtLeastOneRefCount(); - S.IncrementRefCount(); - S.IncrementNestCount(); - continue; - } - case IC_Release: { - Arg = GetObjCArg(Inst); - - PtrState &S = MyStates.getPtrTopDownState(Arg); - S.DecrementRefCount(); - S.DecrementNestCount(); - - switch (S.GetSeq()) { - case S_Retain: - case S_CanRelease: - S.RRI.ReverseInsertPts.clear(); - // FALL THROUGH - case S_Use: - S.RRI.ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind); - S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall(); - Releases[Inst] = S.RRI; - S.ClearSequenceProgress(); - break; - case S_None: - break; - case S_Stop: - case S_Release: - case S_MovableRelease: - llvm_unreachable("top-down pointer in release state!"); - } - break; - } - case IC_AutoreleasepoolPop: - // Conservatively, clear MyStates for all known pointers. - MyStates.clearTopDownPointers(); - continue; - case IC_AutoreleasepoolPush: - case IC_None: - // These are irrelevant. - continue; - default: - break; - } - - // Consider any other possible effects of this instruction on each - // pointer being tracked. - for (BBState::ptr_iterator MI = MyStates.top_down_ptr_begin(), - ME = MyStates.top_down_ptr_end(); MI != ME; ++MI) { - const Value *Ptr = MI->first; - if (Ptr == Arg) - continue; // Handled above. - PtrState &S = MI->second; - Sequence Seq = S.GetSeq(); - - // Check for possible releases. - if (CanAlterRefCount(Inst, Ptr, PA, Class)) { - S.DecrementRefCount(); - switch (Seq) { - case S_Retain: - S.SetSeq(S_CanRelease); - assert(S.RRI.ReverseInsertPts.empty()); - S.RRI.ReverseInsertPts.insert(Inst); - - // One call can't cause a transition from S_Retain to S_CanRelease - // and S_CanRelease to S_Use. If we've made the first transition, - // we're done. - continue; - case S_Use: - case S_CanRelease: - case S_None: - break; - case S_Stop: - case S_Release: - case S_MovableRelease: - llvm_unreachable("top-down pointer in release state!"); - } - } - - // Check for possible direct uses. 
- switch (Seq) { - case S_CanRelease: - if (CanUse(Inst, Ptr, PA, Class)) - S.SetSeq(S_Use); - break; - case S_Retain: - case S_Use: - case S_None: - break; - case S_Stop: - case S_Release: - case S_MovableRelease: - llvm_unreachable("top-down pointer in release state!"); - } - } + NestingDetected |= VisitInstructionTopDown(Inst, Releases, MyStates); } CheckForCFGHazards(BB, BBStates, MyStates); @@ -3032,35 +3160,17 @@ void ObjCARCOpt::MoveCalls(Value *Arg, for (SmallPtrSet<Instruction *, 2>::const_iterator PI = RetainsToMove.ReverseInsertPts.begin(), PE = RetainsToMove.ReverseInsertPts.end(); PI != PE; ++PI) { - Instruction *LastUse = *PI; - Instruction *InsertPts[] = { 0, 0, 0 }; - if (InvokeInst *II = dyn_cast<InvokeInst>(LastUse)) { - // We can't insert code immediately after an invoke instruction, so - // insert code at the beginning of both successor blocks instead. - // The invoke's return value isn't available in the unwind block, - // but our releases will never depend on it, because they must be - // paired with retains from before the invoke. - InsertPts[0] = II->getNormalDest()->getFirstInsertionPt(); - if (!II->getMetadata(NoObjCARCExceptionsMDKind)) - InsertPts[1] = II->getUnwindDest()->getFirstInsertionPt(); - } else { - // Insert code immediately after the last use. - InsertPts[0] = llvm::next(BasicBlock::iterator(LastUse)); - } - - for (Instruction **I = InsertPts; *I; ++I) { - Instruction *InsertPt = *I; - Value *MyArg = ArgTy == ParamTy ? Arg : - new BitCastInst(Arg, ParamTy, "", InsertPt); - CallInst *Call = CallInst::Create(getReleaseCallee(M), MyArg, - "", InsertPt); - // Attach a clang.imprecise_release metadata tag, if appropriate. - if (MDNode *M = ReleasesToMove.ReleaseMetadata) - Call->setMetadata(ImpreciseReleaseMDKind, M); - Call->setDoesNotThrow(); - if (ReleasesToMove.IsTailCallRelease) - Call->setTailCall(); - } + Instruction *InsertPt = *PI; + Value *MyArg = ArgTy == ParamTy ? Arg : + new BitCastInst(Arg, ParamTy, "", InsertPt); + CallInst *Call = CallInst::Create(getReleaseCallee(M), MyArg, + "", InsertPt); + // Attach a clang.imprecise_release metadata tag, if appropriate. + if (MDNode *M = ReleasesToMove.ReleaseMetadata) + Call->setMetadata(ImpreciseReleaseMDKind, M); + Call->setDoesNotThrow(); + if (ReleasesToMove.IsTailCallRelease) + Call->setTailCall(); } // Delete the original retain and release calls. @@ -3080,6 +3190,8 @@ void ObjCARCOpt::MoveCalls(Value *Arg, } } +/// PerformCodePlacement - Identify pairings between the retains and releases, +/// and delete and/or move them. bool ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> &BBStates, @@ -3093,6 +3205,7 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> SmallVector<Instruction *, 4> NewReleases; SmallVector<Instruction *, 8> DeadInsts; + // Visit each retain. for (MapVector<Value *, RRInfo>::const_iterator I = Retains.begin(), E = Retains.end(); I != E; ++I) { Value *V = I->first; @@ -3566,6 +3679,7 @@ bool ObjCARCOpt::doInitialization(Module &M) { if (!EnableARCOpts) return false; + // If nothing in the Module uses ARC, don't do anything. Run = ModuleHasARC(M); if (!Run) return false; @@ -3900,6 +4014,7 @@ void ObjCARCContract::ContractRelease(Instruction *Release, } bool ObjCARCContract::doInitialization(Module &M) { + // If nothing in the Module uses ARC, don't do anything. 
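Both doInitialization comments added here point at the same cheap gate: if the module never references the ARC runtime, the passes can skip all per-function work. A hypothetical, simplified version of such a check (the real ModuleHasARC looks up the specific objc_* entry points by name):

#include "llvm/Module.h"
using namespace llvm;

static bool moduleMentionsARC(const Module &M) {
  for (Module::const_iterator F = M.begin(), E = M.end(); F != E; ++F)
    if (F->getName().startswith("objc_"))
      return true;                  // an ARC runtime function is declared or defined
  return false;
}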
Run = ModuleHasARC(M); if (!Run) return false; @@ -3975,6 +4090,7 @@ bool ObjCARCContract::runOnFunction(Function &F) { --BBI; while (isNoopInstruction(BBI)) --BBI; if (&*BBI == GetObjCArg(Inst)) { + Changed = true; InlineAsm *IA = InlineAsm::get(FunctionType::get(Type::getVoidTy(Inst->getContext()), /*isVarArg=*/false), @@ -4024,16 +4140,19 @@ bool ObjCARCContract::runOnFunction(Function &F) { Use &U = UI.getUse(); unsigned OperandNo = UI.getOperandNo(); ++UI; // Increment UI now, because we may unlink its element. - Instruction *UserInst = dyn_cast<Instruction>(U.getUser()); - if (!UserInst) - continue; - // FIXME: dominates should return true for unreachable UserInst. - if (!DT->isReachableFromEntry(UserInst->getParent()) || - DT->dominates(Inst, UserInst)) { + + // If the call's return value dominates a use of the call's argument + // value, rewrite the use to use the return value. We check for + // reachability here because an unreachable call is considered to + // trivially dominate itself, which would lead us to rewriting its + // argument in terms of its return value, which would lead to + // infinite loops in GetObjCArg. + if (DT->isReachableFromEntry(U) && + DT->dominates(Inst, U)) { Changed = true; Instruction *Replacement = Inst; Type *UseTy = U.get()->getType(); - if (PHINode *PHI = dyn_cast<PHINode>(UserInst)) { + if (PHINode *PHI = dyn_cast<PHINode>(U.getUser())) { // For PHI nodes, insert the bitcast in the predecessor block. unsigned ValNo = PHINode::getIncomingValueNumForOperand(OperandNo); @@ -4042,6 +4161,9 @@ bool ObjCARCContract::runOnFunction(Function &F) { if (Replacement->getType() != UseTy) Replacement = new BitCastInst(Replacement, UseTy, "", &BB->back()); + // While we're here, rewrite all edges for this PHI, rather + // than just one use at a time, to minimize the number of + // bitcasts we emit. for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) if (PHI->getIncomingBlock(i) == BB) { @@ -4054,7 +4176,8 @@ bool ObjCARCContract::runOnFunction(Function &F) { } } else { if (Replacement->getType() != UseTy) - Replacement = new BitCastInst(Replacement, UseTy, "", UserInst); + Replacement = new BitCastInst(Replacement, UseTy, "", + cast<Instruction>(U.getUser())); U.set(Replacement); } } diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp index 8f98a5b..cb408a1 100644 --- a/lib/Transforms/Scalar/Reassociate.cpp +++ b/lib/Transforms/Scalar/Reassociate.cpp @@ -74,7 +74,7 @@ static void PrintOps(Instruction *I, const SmallVectorImpl<ValueEntry> &Ops) { namespace { class Reassociate : public FunctionPass { DenseMap<BasicBlock*, unsigned> RankMap; - DenseMap<AssertingVH<>, unsigned> ValueRankMap; + DenseMap<AssertingVH<Value>, unsigned> ValueRankMap; SmallVector<WeakVH, 8> RedoInsts; SmallVector<WeakVH, 8> DeadInsts; bool MadeChange; @@ -210,7 +210,7 @@ static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) { /// LowerNegateToMultiply - Replace 0-X with X*-1. /// static Instruction *LowerNegateToMultiply(Instruction *Neg, - DenseMap<AssertingVH<>, unsigned> &ValueRankMap) { + DenseMap<AssertingVH<Value>, unsigned> &ValueRankMap) { Constant *Cst = Constant::getAllOnesValue(Neg->getType()); Instruction *Res = BinaryOperator::CreateMul(Neg->getOperand(1), Cst, "",Neg); @@ -492,7 +492,7 @@ static bool ShouldBreakUpSubtract(Instruction *Sub) { /// only used by an add, transform this into (X+(0-Y)) to promote better /// reassociation. 
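The Reassociate helpers in this hunk canonicalize negation and subtraction so the reassociation machinery only ever sees adds and multiplies. A minimal sketch of the same rewrites with IRBuilder (illustrative; the pass itself mutates the existing instructions in place rather than building new ones):

#include "llvm/Constants.h"
#include "llvm/Support/IRBuilder.h"
using namespace llvm;

// 0 - X  ==>  X * -1      (LowerNegateToMultiply)
static Value *negateAsMultiply(IRBuilder<> &B, Value *X) {
  return B.CreateMul(X, Constant::getAllOnesValue(X->getType()));
}

// X - Y  ==>  X + (0 - Y) (BreakUpSubtract), so subtracts commute like adds
static Value *subtractAsAdd(IRBuilder<> &B, Value *X, Value *Y) {
  return B.CreateAdd(X, B.CreateNeg(Y));
}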
static Instruction *BreakUpSubtract(Instruction *Sub, - DenseMap<AssertingVH<>, unsigned> &ValueRankMap) { + DenseMap<AssertingVH<Value>, unsigned> &ValueRankMap) { // Convert a subtract into an add and a neg instruction. This allows sub // instructions to be commuted with other add instructions. // @@ -517,8 +517,8 @@ static Instruction *BreakUpSubtract(Instruction *Sub, /// ConvertShiftToMul - If this is a shift of a reassociable multiply or is used /// by one, change this into a multiply by a constant to assist with further /// reassociation. -static Instruction *ConvertShiftToMul(Instruction *Shl, - DenseMap<AssertingVH<>, unsigned> &ValueRankMap) { +static Instruction *ConvertShiftToMul(Instruction *Shl, + DenseMap<AssertingVH<Value>, unsigned> &ValueRankMap) { // If an operand of this shift is a reassociable multiply, or if the shift // is used by a reassociable multiply or add, turn into a multiply. if (isReassociableOp(Shl->getOperand(0), Instruction::Mul) || diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index 5ce82b9..16b64a5 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -1925,8 +1925,8 @@ bool IPSCCP::runOnModule(Module &M) { ReturnsToZap[i]->setOperand(0, UndefValue::get(F->getReturnType())); } - // If we inferred constant or undef values for globals variables, we can delete - // the global and any stores that remain to it. + // If we inferred constant or undef values for globals variables, we can + // delete the global and any stores that remain to it. const DenseMap<GlobalVariable*, LatticeVal> &TG = Solver.getTrackedGlobals(); for (DenseMap<GlobalVariable*, LatticeVal>::const_iterator I = TG.begin(), E = TG.end(); I != E; ++I) { diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp index d36a18f..026fea1 100644 --- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -13,7 +13,7 @@ // each member (if possible). Then, if possible, it transforms the individual // alloca instructions into nice clean scalar SSA form. // -// This combines a simple SRoA algorithm with the Mem2Reg algorithm because +// This combines a simple SRoA algorithm with the Mem2Reg algorithm because they // often interact, especially for C++ programs. As such, iterating between // SRoA, then Mem2Reg until we run out of things to promote works well. // @@ -574,8 +574,8 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, // transform it into a store of the expanded constant value. 
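The ScalarReplAggregates change that follows is a width fix: getSExtValue() returns a 64-bit count, and only positive memset lengths that also fit in 32 bits may be expanded into a store. The guard, restated with plain integers:

#include <stdint.h>

// Sketch of the new length check: reject non-positive or wider-than-32-bit sizes.
static bool memsetLengthFitsForExpansion(int64_t SNumBytes, unsigned &NumBytes) {
  if (SNumBytes <= 0 || (SNumBytes >> 32) != 0)
    return false;
  NumBytes = static_cast<unsigned>(SNumBytes);
  return true;
}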
if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) { assert(MSI->getRawDest() == Ptr && "Consistency error!"); - signed SNumBytes = cast<ConstantInt>(MSI->getLength())->getSExtValue(); - if (SNumBytes > 0) { + int64_t SNumBytes = cast<ConstantInt>(MSI->getLength())->getSExtValue(); + if (SNumBytes > 0 && (SNumBytes >> 32) == 0) { unsigned NumBytes = static_cast<unsigned>(SNumBytes); unsigned Val = cast<ConstantInt>(MSI->getValue())->getZExtValue(); diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp index 9c49ec1..f7b6941 100644 --- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp +++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp @@ -1583,21 +1583,16 @@ void SimplifyLibCalls::InitOptimizations() { Optimizations["llvm.exp2.f64"] = &Exp2; Optimizations["llvm.exp2.f32"] = &Exp2; -#ifdef HAVE_FLOORF - Optimizations["floor"] = &UnaryDoubleFP; -#endif -#ifdef HAVE_CEILF - Optimizations["ceil"] = &UnaryDoubleFP; -#endif -#ifdef HAVE_ROUNDF - Optimizations["round"] = &UnaryDoubleFP; -#endif -#ifdef HAVE_RINTF - Optimizations["rint"] = &UnaryDoubleFP; -#endif -#ifdef HAVE_NEARBYINTF - Optimizations["nearbyint"] = &UnaryDoubleFP; -#endif + if (TLI->has(LibFunc::floor) && TLI->has(LibFunc::floorf)) + Optimizations["floor"] = &UnaryDoubleFP; + if (TLI->has(LibFunc::ceil) && TLI->has(LibFunc::ceilf)) + Optimizations["ceil"] = &UnaryDoubleFP; + if (TLI->has(LibFunc::round) && TLI->has(LibFunc::roundf)) + Optimizations["round"] = &UnaryDoubleFP; + if (TLI->has(LibFunc::rint) && TLI->has(LibFunc::rintf)) + Optimizations["rint"] = &UnaryDoubleFP; + if (TLI->has(LibFunc::nearbyint) && TLI->has(LibFunc::nearbyintf)) + Optimizations["nearbyint"] = &UnaryDoubleFP; // Integer Optimizations Optimizations["ffs"] = &FFS; diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp index 1b28c35..20052a4 100644 --- a/lib/Transforms/Utils/CloneFunction.cpp +++ b/lib/Transforms/Utils/CloneFunction.cpp @@ -23,8 +23,11 @@ #include "llvm/LLVMContext.h" #include "llvm/Metadata.h" #include "llvm/Support/CFG.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/DebugInfo.h" #include "llvm/ADT/SmallVector.h" #include <map> @@ -197,7 +200,6 @@ namespace { const Function *OldFunc; ValueToValueMapTy &VMap; bool ModuleLevelChanges; - SmallVectorImpl<ReturnInst*> &Returns; const char *NameSuffix; ClonedCodeInfo *CodeInfo; const TargetData *TD; @@ -205,24 +207,18 @@ namespace { PruningFunctionCloner(Function *newFunc, const Function *oldFunc, ValueToValueMapTy &valueMap, bool moduleLevelChanges, - SmallVectorImpl<ReturnInst*> &returns, const char *nameSuffix, ClonedCodeInfo *codeInfo, const TargetData *td) : NewFunc(newFunc), OldFunc(oldFunc), VMap(valueMap), ModuleLevelChanges(moduleLevelChanges), - Returns(returns), NameSuffix(nameSuffix), CodeInfo(codeInfo), TD(td) { + NameSuffix(nameSuffix), CodeInfo(codeInfo), TD(td) { } /// CloneBlock - The specified block is found to be reachable, clone it and /// anything that it can reach. void CloneBlock(const BasicBlock *BB, std::vector<const BasicBlock*> &ToClone); - - public: - /// ConstantFoldMappedInstruction - Constant fold the specified instruction, - /// mapping its operands through VMap if they are available. 
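The SimplifyLibCalls hunk above swaps configure-time HAVE_*F macros for TargetLibraryInfo queries, so shrinking a double-precision libcall to its float variant now depends on the target's runtime library rather than on the host that built LLVM. A minimal sketch of the query (the pass obtains TLI with getAnalysis<TargetLibraryInfo>() after requiring it in getAnalysisUsage):

#include "llvm/Target/TargetLibraryInfo.h"
using namespace llvm;

// floor(double) can be shrunk to floorf(float) only if the target provides both.
static bool canShrinkFloorToFloat(const TargetLibraryInfo &TLI) {
  return TLI.has(LibFunc::floor) && TLI.has(LibFunc::floorf);
}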
- Constant *ConstantFoldMappedInstruction(const Instruction *I); }; } @@ -230,7 +226,7 @@ namespace { /// anything that it can reach. void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, std::vector<const BasicBlock*> &ToClone){ - TrackingVH<Value> &BBEntry = VMap[BB]; + WeakVH &BBEntry = VMap[BB]; // Have we already cloned this block? if (BBEntry) return; @@ -262,19 +258,33 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, // loop doesn't include the terminator. for (BasicBlock::const_iterator II = BB->begin(), IE = --BB->end(); II != IE; ++II) { - // If this instruction constant folds, don't bother cloning the instruction, - // instead, just add the constant to the value map. - if (Constant *C = ConstantFoldMappedInstruction(II)) { - VMap[II] = C; - continue; + Instruction *NewInst = II->clone(); + + // Eagerly remap operands to the newly cloned instruction, except for PHI + // nodes for which we defer processing until we update the CFG. + if (!isa<PHINode>(NewInst)) { + RemapInstruction(NewInst, VMap, + ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges); + + // If we can simplify this instruction to some other value, simply add + // a mapping to that value rather than inserting a new instruction into + // the basic block. + if (Value *V = SimplifyInstruction(NewInst, TD)) { + // On the off-chance that this simplifies to an instruction in the old + // function, map it back into the new function. + if (Value *MappedV = VMap.lookup(V)) + V = MappedV; + + VMap[II] = V; + delete NewInst; + continue; + } } - Instruction *NewInst = II->clone(); if (II->hasName()) NewInst->setName(II->getName()+NameSuffix); - NewBB->getInstList().push_back(NewInst); VMap[II] = NewInst; // Add instruction map to value. - + NewBB->getInstList().push_back(NewInst); hasCalls |= (isa<CallInst>(II) && !isa<DbgInfoIntrinsic>(II)); if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) { if (isa<ConstantInt>(AI->getArraySize())) @@ -340,33 +350,6 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, CodeInfo->ContainsDynamicAllocas |= hasStaticAllocas && BB != &BB->getParent()->front(); } - - if (ReturnInst *RI = dyn_cast<ReturnInst>(NewBB->getTerminator())) - Returns.push_back(RI); -} - -/// ConstantFoldMappedInstruction - Constant fold the specified instruction, -/// mapping its operands through VMap if they are available. -Constant *PruningFunctionCloner:: -ConstantFoldMappedInstruction(const Instruction *I) { - SmallVector<Constant*, 8> Ops; - for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) - if (Constant *Op = dyn_cast_or_null<Constant>(MapValue(I->getOperand(i), - VMap, - ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges))) - Ops.push_back(Op); - else - return 0; // All operands not constant! - - if (const CmpInst *CI = dyn_cast<CmpInst>(I)) - return ConstantFoldCompareInstOperands(CI->getPredicate(), Ops[0], Ops[1], - TD); - - if (const LoadInst *LI = dyn_cast<LoadInst>(I)) - if (!LI->isVolatile()) - return ConstantFoldLoadFromConstPtr(Ops[0], TD); - - return ConstantFoldInstOperands(I->getOpcode(), I->getType(), Ops, TD); } /// CloneAndPruneFunctionInto - This works exactly like CloneFunctionInto, @@ -393,7 +376,7 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, #endif PruningFunctionCloner PFC(NewFunc, OldFunc, VMap, ModuleLevelChanges, - Returns, NameSuffix, CodeInfo, TD); + NameSuffix, CodeInfo, TD); // Clone the entry block, and anything recursively reachable from it. 
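The rewritten CloneBlock above follows a clone / remap / simplify pattern for every non-PHI instruction, mapping to an existing value whenever simplification succeeds instead of emitting a new instruction. The core of that pattern as a standalone sketch (simplified; PHI nodes are deferred until the CFG exists, and a simplified value may still need to be re-mapped into the new function):

#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Instruction.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
using namespace llvm;

static Value *cloneAndSimplify(const Instruction *I, ValueToValueMapTy &VMap,
                               const TargetData *TD) {
  Instruction *NewInst = I->clone();                       // copy, operands still old
  RemapInstruction(NewInst, VMap, RF_NoModuleLevelChanges);
  if (Value *V = SimplifyInstruction(NewInst, TD)) {
    delete NewInst;                                        // folded away: reuse V instead
    return V;
  }
  return NewInst;  // caller names it, inserts it, and records VMap[I] = NewInst
}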
std::vector<const BasicBlock*> CloneWorklist; @@ -418,25 +401,19 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, // Add the new block to the new function. NewFunc->getBasicBlockList().push_back(NewBB); - - // Loop over all of the instructions in the block, fixing up operand - // references as we go. This uses VMap to do all the hard work. - // - BasicBlock::iterator I = NewBB->begin(); // Handle PHI nodes specially, as we have to remove references to dead // blocks. - if (PHINode *PN = dyn_cast<PHINode>(I)) { - // Skip over all PHI nodes, remembering them for later. - BasicBlock::const_iterator OldI = BI->begin(); - for (; (PN = dyn_cast<PHINode>(I)); ++I, ++OldI) - PHIToResolve.push_back(cast<PHINode>(OldI)); - } - - // Otherwise, remap the rest of the instructions normally. - for (; I != NewBB->end(); ++I) - RemapInstruction(I, VMap, - ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges); + for (BasicBlock::const_iterator I = BI->begin(), E = BI->end(); I != E; ++I) + if (const PHINode *PN = dyn_cast<PHINode>(I)) + PHIToResolve.push_back(PN); + else + break; + + // Finally, remap the terminator instructions, as those can't be remapped + // until all BBs are mapped. + RemapInstruction(NewBB->getTerminator(), VMap, + ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges); } // Defer PHI resolution until rest of function is resolved, PHI resolution @@ -518,31 +495,55 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, ++OldI; } } - // NOTE: We cannot eliminate single entry phi nodes here, because of - // VMap. Single entry phi nodes can have multiple VMap entries - // pointing at them. Thus, deleting one would require scanning the VMap - // to update any entries in it that would require that. This would be - // really slow. } - + + // Make a second pass over the PHINodes now that all of them have been + // remapped into the new function, simplifying the PHINode and performing any + // recursive simplifications exposed. This will transparently update the + // WeakVH in the VMap. Notably, we rely on that so that if we coalesce + // two PHINodes, the iteration over the old PHIs remains valid, and the + // mapping will just map us to the new node (which may not even be a PHI + // node). + for (unsigned Idx = 0, Size = PHIToResolve.size(); Idx != Size; ++Idx) + if (PHINode *PN = dyn_cast<PHINode>(VMap[PHIToResolve[Idx]])) + recursivelySimplifyInstruction(PN, TD); + // Now that the inlined function body has been fully constructed, go through // and zap unconditional fall-through branches. This happen all the time when // specializing code: code specialization turns conditional branches into // uncond branches, and this code folds them. - Function::iterator I = cast<BasicBlock>(VMap[&OldFunc->getEntryBlock()]); + Function::iterator Begin = cast<BasicBlock>(VMap[&OldFunc->getEntryBlock()]); + Function::iterator I = Begin; while (I != NewFunc->end()) { + // Check if this block has become dead during inlining or other + // simplifications. Note that the first block will appear dead, as it has + // not yet been wired up properly. + if (I != Begin && (pred_begin(I) == pred_end(I) || + I->getSinglePredecessor() == I)) { + BasicBlock *DeadBB = I++; + DeleteDeadBlock(DeadBB); + continue; + } + + // We need to simplify conditional branches and switches with a constant + // operand. 
We try to prune these out when cloning, but if the + // simplification required looking through PHI nodes, those are only + // available after forming the full basic block. That may leave some here, + // and we still want to prune the dead code as early as possible. + ConstantFoldTerminator(I); + BranchInst *BI = dyn_cast<BranchInst>(I->getTerminator()); if (!BI || BI->isConditional()) { ++I; continue; } - // Note that we can't eliminate uncond branches if the destination has - // single-entry PHI nodes. Eliminating the single-entry phi nodes would - // require scanning the VMap to update any entries that point to the phi - // node. BasicBlock *Dest = BI->getSuccessor(0); - if (!Dest->getSinglePredecessor() || isa<PHINode>(Dest->begin())) { + if (!Dest->getSinglePredecessor()) { ++I; continue; } - + + // We shouldn't be able to get single-entry PHI nodes here, as instsimplify + // above should have zapped all of them.. + assert(!isa<PHINode>(Dest->begin())); + // We know all single-entry PHI nodes in the inlined function have been // removed, so we just need to splice the blocks. BI->eraseFromParent(); @@ -558,4 +559,13 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, // Do not increment I, iteratively merge all things this block branches to. } + + // Make a final pass over the basic blocks from theh old function to gather + // any return instructions which survived folding. We have to do this here + // because we can iteratively remove and merge returns above. + for (Function::iterator I = cast<BasicBlock>(VMap[&OldFunc->getEntryBlock()]), + E = NewFunc->end(); + I != E; ++I) + if (ReturnInst *RI = dyn_cast<ReturnInst>(I->getTerminator())) + Returns.push_back(RI); } diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp index b84de05..d2b167a 100644 --- a/lib/Transforms/Utils/InlineFunction.cpp +++ b/lib/Transforms/Utils/InlineFunction.cpp @@ -31,10 +31,12 @@ #include "llvm/Support/IRBuilder.h" using namespace llvm; -bool llvm::InlineFunction(CallInst *CI, InlineFunctionInfo &IFI, bool InsertLifetime) { +bool llvm::InlineFunction(CallInst *CI, InlineFunctionInfo &IFI, + bool InsertLifetime) { return InlineFunction(CallSite(CI), IFI, InsertLifetime); } -bool llvm::InlineFunction(InvokeInst *II, InlineFunctionInfo &IFI, bool InsertLifetime) { +bool llvm::InlineFunction(InvokeInst *II, InlineFunctionInfo &IFI, + bool InsertLifetime) { return InlineFunction(CallSite(II), IFI, InsertLifetime); } @@ -434,8 +436,8 @@ static bool hasLifetimeMarkers(AllocaInst *AI) { return false; } -/// updateInlinedAtInfo - Helper function used by fixupLineNumbers to recursively -/// update InlinedAtEntry of a DebugLoc. +/// updateInlinedAtInfo - Helper function used by fixupLineNumbers to +/// recursively update InlinedAtEntry of a DebugLoc. static DebugLoc updateInlinedAtInfo(const DebugLoc &DL, const DebugLoc &InlinedAtDL, LLVMContext &Ctx) { @@ -445,7 +447,7 @@ static DebugLoc updateInlinedAtInfo(const DebugLoc &DL, return DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(Ctx), NewInlinedAtDL.getAsMDNode(Ctx)); } - + return DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(Ctx), InlinedAtDL.getAsMDNode(Ctx)); } @@ -453,7 +455,7 @@ static DebugLoc updateInlinedAtInfo(const DebugLoc &DL, /// fixupLineNumbers - Update inlined instructions' line numbers to /// to encode location where these instructions are inlined. 
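fixupLineNumbers, whose definition follows, rewrites every inlined instruction's debug location so the call site is recorded in the InlinedAt chain and debuggers can unwind the inlining. The essential construction, mirroring updateInlinedAtInfo above (illustrative sketch):

#include "llvm/LLVMContext.h"
#include "llvm/Support/DebugLoc.h"
using namespace llvm;

// Keep the instruction's own line/column/scope, but chain on the call site.
static DebugLoc withInlinedAt(const DebugLoc &DL, const DebugLoc &CallSiteDL,
                              LLVMContext &Ctx) {
  return DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(Ctx),
                       CallSiteDL.getAsMDNode(Ctx));
}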
static void fixupLineNumbers(Function *Fn, Function::iterator FI, - Instruction *TheCall) { + Instruction *TheCall) { DebugLoc TheCallDL = TheCall->getDebugLoc(); if (TheCallDL.isUnknown()) return; @@ -484,7 +486,8 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI, /// instruction 'call B' is inlined, and 'B' calls 'C', then the call to 'C' now /// exists in the instruction stream. Similarly this will inline a recursive /// function by one level. -bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, bool InsertLifetime) { +bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, + bool InsertLifetime) { Instruction *TheCall = CS.getInstruction(); assert(TheCall->getParent() && TheCall->getParent()->getParent() && "Instruction not in function!"); diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index 5f895eb..d1c4d59 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -355,22 +355,27 @@ bool llvm::RecursivelyDeleteDeadPHINode(PHINode *PN) { /// instructions in other blocks as well in this block. bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, const TargetData *TD) { bool MadeChange = false; - for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) { + +#ifndef NDEBUG + // In debug builds, ensure that the terminator of the block is never replaced + // or deleted by these simplifications. The idea of simplification is that it + // cannot introduce new instructions, and there is no way to replace the + // terminator of a block without introducing a new instruction. + AssertingVH<Instruction> TerminatorVH(--BB->end()); +#endif + + for (BasicBlock::iterator BI = BB->begin(), E = --BB->end(); BI != E; ) { + assert(!BI->isTerminator()); Instruction *Inst = BI++; - - if (Value *V = SimplifyInstruction(Inst, TD)) { - WeakVH BIHandle(BI); - ReplaceAndSimplifyAllUses(Inst, V, TD); + + WeakVH BIHandle(BI); + if (recursivelySimplifyInstruction(Inst, TD)) { MadeChange = true; if (BIHandle != BI) BI = BB->begin(); continue; } - if (Inst->isTerminator()) - break; - - WeakVH BIHandle(BI); MadeChange |= RecursivelyDeleteTriviallyDeadInstructions(Inst); if (BIHandle != BI) BI = BB->begin(); @@ -408,17 +413,11 @@ void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred, WeakVH PhiIt = &BB->front(); while (PHINode *PN = dyn_cast<PHINode>(PhiIt)) { PhiIt = &*++BasicBlock::iterator(cast<Instruction>(PhiIt)); + Value *OldPhiIt = PhiIt; - Value *PNV = SimplifyInstruction(PN, TD); - if (PNV == 0) continue; + if (!recursivelySimplifyInstruction(PN, TD)) + continue; - // If we're able to simplify the phi to a single value, substitute the new - // value into all of its uses. - assert(PNV != PN && "SimplifyInstruction broken!"); - - Value *OldPhiIt = PhiIt; - ReplaceAndSimplifyAllUses(PN, PNV, TD); - // If recursive simplification ended up deleting the next PHI node we would // iterate to, then our iterator is invalid, restart scanning from the top // of the block. @@ -763,9 +762,8 @@ unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign, assert(V->getType()->isPointerTy() && "getOrEnforceKnownAlignment expects a pointer!"); unsigned BitWidth = TD ? 
TD->getPointerSizeInBits() : 64; - APInt Mask = APInt::getAllOnesValue(BitWidth); APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); - ComputeMaskedBits(V, Mask, KnownZero, KnownOne, TD); + ComputeMaskedBits(V, KnownZero, KnownOne, TD); unsigned TrailZ = KnownZero.countTrailingOnes(); // Avoid trouble with rediculously large TrailZ values, such as diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp index 512b689..e15497a 100644 --- a/lib/Transforms/Utils/LoopUnroll.cpp +++ b/lib/Transforms/Utils/LoopUnroll.cpp @@ -149,6 +149,12 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, return false; } + // Loops with indirectbr cannot be cloned. + if (!L->isSafeToClone()) { + DEBUG(dbgs() << " Can't unroll; Loop body cannot be cloned.\n"); + return false; + } + BasicBlock *Header = L->getHeader(); BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator()); diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index d53a46e..66dd2c9 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -2562,7 +2562,7 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI) { Value *Cond = SI->getCondition(); unsigned Bits = cast<IntegerType>(Cond->getType())->getBitWidth(); APInt KnownZero(Bits, 0), KnownOne(Bits, 0); - ComputeMaskedBits(Cond, APInt::getAllOnesValue(Bits), KnownZero, KnownOne); + ComputeMaskedBits(Cond, KnownZero, KnownOne); // Gather dead cases. SmallVector<ConstantInt*, 8> DeadCases; diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp index e00565d..4030bef 100644 --- a/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -46,7 +46,6 @@ namespace { LoopInfo *LI; DominatorTree *DT; ScalarEvolution *SE; - IVUsers *IU; // NULL for DisableIVRewrite const TargetData *TD; // May be NULL SmallVectorImpl<WeakVH> &DeadInsts; @@ -59,7 +58,6 @@ namespace { L(Loop), LI(LPM->getAnalysisIfAvailable<LoopInfo>()), SE(SE), - IU(IVU), TD(LPM->getAnalysisIfAvailable<TargetData>()), DeadInsts(Dead), Changed(false) { @@ -229,13 +227,6 @@ void SimplifyIndvar::eliminateIVRemainder(BinaryOperator *Rem, Rem->replaceAllUsesWith(Sel); } - // Inform IVUsers about the new users. - if (IU) { - if (Instruction *I = dyn_cast<Instruction>(Rem->getOperand(0))) { - SmallPtrSet<Loop*, 16> SimplifiedLoopNests; - IU->AddUsersIfInteresting(I, SimplifiedLoopNests); - } - } DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n'); ++NumElimRem; Changed = true; @@ -401,36 +392,4 @@ bool simplifyLoopIVs(Loop *L, ScalarEvolution *SE, LPPassManager *LPM, return Changed; } -/// simplifyIVUsers - Perform simplification on instructions recorded by the -/// IVUsers pass. -/// -/// This is the old approach to IV simplification to be replaced by -/// SimplifyLoopIVs. -bool simplifyIVUsers(IVUsers *IU, ScalarEvolution *SE, LPPassManager *LPM, - SmallVectorImpl<WeakVH> &Dead) { - SimplifyIndvar SIV(IU->getLoop(), SE, LPM, Dead); - - // Each round of simplification involves a round of eliminating operations - // followed by a round of widening IVs. A single IVUsers worklist is used - // across all rounds. The inner loop advances the user. If widening exposes - // more uses, then another pass through the outer loop is triggered. 
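Both ComputeMaskedBits callers touched above drop the old all-ones mask argument; known bits are simply accumulated into the two APInts. A sketch of the common pattern of turning trailing known-zero bits into an alignment, simplified from getOrEnforceKnownAlignment (the clamp value is illustrative):

#include "llvm/ADT/APInt.h"
#include "llvm/Analysis/ValueTracking.h"
#include <algorithm>
using namespace llvm;

static unsigned alignmentFromKnownBits(Value *V, unsigned BitWidth,
                                       const TargetData *TD) {
  APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
  ComputeMaskedBits(V, KnownZero, KnownOne, TD);     // no mask argument any more
  unsigned TrailZ = KnownZero.countTrailingOnes();   // low bits proven zero
  return 1u << std::min(TrailZ, 29u);                // clamp to avoid shift overflow
}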
- for (IVUsers::iterator I = IU->begin(); I != IU->end(); ++I) { - Instruction *UseInst = I->getUser(); - Value *IVOperand = I->getOperandValToReplace(); - - if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) { - SIV.eliminateIVComparison(ICmp, IVOperand); - continue; - } - if (BinaryOperator *Rem = dyn_cast<BinaryOperator>(UseInst)) { - bool IsSigned = Rem->getOpcode() == Instruction::SRem; - if (IsSigned || Rem->getOpcode() == Instruction::URem) { - SIV.eliminateIVRemainder(Rem, IVOperand, IsSigned); - continue; - } - } - } - return SIV.hasChanged(); -} - } // namespace llvm diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp index 32eec79..9d62306 100644 --- a/lib/Transforms/Vectorize/BBVectorize.cpp +++ b/lib/Transforms/Vectorize/BBVectorize.cpp @@ -84,6 +84,10 @@ NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize floating-point values")); static cl::opt<bool> +NoPointers("bb-vectorize-no-pointers", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize pointer values")); + +static cl::opt<bool> NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize casting (conversion) operations")); @@ -96,6 +100,14 @@ NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize the fused-multiply-add intrinsic")); static cl::opt<bool> +NoSelect("bb-vectorize-no-select", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize select instructions")); + +static cl::opt<bool> +NoGEP("bb-vectorize-no-gep", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize getelementptr instructions")); + +static cl::opt<bool> NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize loads and stores")); @@ -140,10 +152,21 @@ STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize"); namespace { struct BBVectorize : public BasicBlockPass { static char ID; // Pass identification, replacement for typeid - BBVectorize() : BasicBlockPass(ID) { + + const VectorizeConfig Config; + + BBVectorize(const VectorizeConfig &C = VectorizeConfig()) + : BasicBlockPass(ID), Config(C) { initializeBBVectorizePass(*PassRegistry::getPassRegistry()); } + BBVectorize(Pass *P, const VectorizeConfig &C) + : BasicBlockPass(ID), Config(C) { + AA = &P->getAnalysis<AliasAnalysis>(); + SE = &P->getAnalysis<ScalarEvolution>(); + TD = P->getAnalysisIfAvailable<TargetData>(); + } + typedef std::pair<Value *, Value *> ValuePair; typedef std::pair<ValuePair, size_t> ValuePairWithDepth; typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair @@ -280,18 +303,15 @@ namespace { Instruction *&InsertionPt, Instruction *I, Instruction *J); - virtual bool runOnBasicBlock(BasicBlock &BB) { - AA = &getAnalysis<AliasAnalysis>(); - SE = &getAnalysis<ScalarEvolution>(); - TD = getAnalysisIfAvailable<TargetData>(); - + bool vectorizeBB(BasicBlock &BB) { bool changed = false; // Iterate a sufficient number of times to merge types of size 1 bit, // then 2 bits, then 4, etc. up to half of the target vector width of the // target vector register. 
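As a worked example of the comment above: the loop that follows doubles v each round, so with a 128-bit vector width it makes seven passes (v = 2, 4, ..., 128), and a non-zero MaxIter simply caps that count. A standalone restatement (illustrative only):

static unsigned fusingRounds(unsigned VectorBits, unsigned MaxIter) {
  unsigned Rounds = 0;
  for (unsigned v = 2, n = 1; v <= VectorBits && (!MaxIter || n <= MaxIter);
       v *= 2, ++n)
    ++Rounds;
  return Rounds;   // fusingRounds(128, 0) == 7
}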
- for (unsigned v = 2, n = 1; v <= VectorBits && (!MaxIter || n <= MaxIter); + for (unsigned v = 2, n = 1; + v <= Config.VectorBits && (!Config.MaxIter || n <= Config.MaxIter); v *= 2, ++n) { - DEBUG(dbgs() << "BBV: fusing loop #" << n << + DEBUG(dbgs() << "BBV: fusing loop #" << n << " for " << BB.getName() << " in " << BB.getParent()->getName() << "...\n"); if (vectorizePairs(BB)) @@ -304,6 +324,14 @@ namespace { return changed; } + virtual bool runOnBasicBlock(BasicBlock &BB) { + AA = &getAnalysis<AliasAnalysis>(); + SE = &getAnalysis<ScalarEvolution>(); + TD = getAnalysisIfAvailable<TargetData>(); + + return vectorizeBB(BB); + } + virtual void getAnalysisUsage(AnalysisUsage &AU) const { BasicBlockPass::getAnalysisUsage(AU); AU.addRequired<AliasAnalysis>(); @@ -333,7 +361,7 @@ namespace { // candidate chains where longer chains are considered to be better. // Note: when this function returns 0, the resulting instructions are // not actually fused. - static inline size_t getDepthFactor(Value *V) { + inline size_t getDepthFactor(Value *V) { // InsertElement and ExtractElement have a depth factor of zero. This is // for two reasons: First, they cannot be usefully fused. Second, because // the pass generates a lot of these, they can confuse the simple metric @@ -347,8 +375,8 @@ namespace { // Give a load or store half of the required depth so that load/store // pairs will vectorize. - if (!NoMemOpBoost && (isa<LoadInst>(V) || isa<StoreInst>(V))) - return ReqChainDepth/2; + if (!Config.NoMemOpBoost && (isa<LoadInst>(V) || isa<StoreInst>(V))) + return Config.ReqChainDepth/2; return 1; } @@ -421,9 +449,9 @@ namespace { case Intrinsic::exp: case Intrinsic::exp2: case Intrinsic::pow: - return !NoMath; + return Config.VectorizeMath; case Intrinsic::fma: - return !NoFMA; + return Config.VectorizeFMA; } } @@ -517,24 +545,34 @@ namespace { } else if (LoadInst *L = dyn_cast<LoadInst>(I)) { // Vectorize simple loads if possbile: IsSimpleLoadStore = L->isSimple(); - if (!IsSimpleLoadStore || NoMemOps) + if (!IsSimpleLoadStore || !Config.VectorizeMemOps) return false; } else if (StoreInst *S = dyn_cast<StoreInst>(I)) { // Vectorize simple stores if possbile: IsSimpleLoadStore = S->isSimple(); - if (!IsSimpleLoadStore || NoMemOps) + if (!IsSimpleLoadStore || !Config.VectorizeMemOps) return false; } else if (CastInst *C = dyn_cast<CastInst>(I)) { // We can vectorize casts, but not casts of pointer types, etc. - if (NoCasts) + if (!Config.VectorizeCasts) return false; Type *SrcTy = C->getSrcTy(); - if (!SrcTy->isSingleValueType() || SrcTy->isPointerTy()) + if (!SrcTy->isSingleValueType()) return false; Type *DestTy = C->getDestTy(); - if (!DestTy->isSingleValueType() || DestTy->isPointerTy()) + if (!DestTy->isSingleValueType()) + return false; + } else if (isa<SelectInst>(I)) { + if (!Config.VectorizeSelect) + return false; + } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(I)) { + if (!Config.VectorizeGEP) + return false; + + // Currently, vector GEPs exist only with one index. 
+ if (G->getNumIndices() != 1) return false; } else if (!(I->isBinaryOp() || isa<ShuffleVectorInst>(I) || isa<ExtractElementInst>(I) || isa<InsertElementInst>(I))) { @@ -566,14 +604,21 @@ namespace { !(VectorType::isValidElementType(T2) || T2->isVectorTy())) return false; - if (NoInts && (T1->isIntOrIntVectorTy() || T2->isIntOrIntVectorTy())) + if (!Config.VectorizeInts + && (T1->isIntOrIntVectorTy() || T2->isIntOrIntVectorTy())) + return false; + + if (!Config.VectorizeFloats + && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy())) return false; - if (NoFloats && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy())) + if ((!Config.VectorizePointers || TD == 0) && + (T1->getScalarType()->isPointerTy() || + T2->getScalarType()->isPointerTy())) return false; - if (T1->getPrimitiveSizeInBits() > VectorBits/2 || - T2->getPrimitiveSizeInBits() > VectorBits/2) + if (T1->getPrimitiveSizeInBits() > Config.VectorBits/2 || + T2->getPrimitiveSizeInBits() > Config.VectorBits/2) return false; return true; @@ -601,7 +646,7 @@ namespace { LI->isVolatile() != LJ->isVolatile() || LI->getOrdering() != LJ->getOrdering() || LI->getSynchScope() != LJ->getSynchScope()) - return false; + return false; } else if ((SI = dyn_cast<StoreInst>(I)) && (SJ = dyn_cast<StoreInst>(J))) { if (SI->getValueOperand()->getType() != SJ->getValueOperand()->getType() || @@ -622,7 +667,7 @@ namespace { int64_t OffsetInElmts = 0; if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, OffsetInElmts) && abs64(OffsetInElmts) == 1) { - if (AlignedOnly) { + if (Config.AlignedOnly) { Type *aType = isa<StoreInst>(I) ? cast<StoreInst>(I)->getValueOperand()->getType() : I->getType(); // An aligned load or store is possible only if the instruction @@ -647,6 +692,20 @@ namespace { // FIXME: We may want to vectorize non-constant shuffles also. } + // The powi intrinsic is special because only the first argument is + // vectorized, the second arguments must be equal. + CallInst *CI = dyn_cast<CallInst>(I); + Function *FI; + if (CI && (FI = CI->getCalledFunction()) && + FI->getIntrinsicID() == Intrinsic::powi) { + + Value *A1I = CI->getArgOperand(1), + *A1J = cast<CallInst>(J)->getArgOperand(1); + const SCEV *A1ISCEV = SE->getSCEV(A1I), + *A1JSCEV = SE->getSCEV(A1J); + return (A1ISCEV == A1JSCEV); + } + return true; } @@ -729,12 +788,12 @@ namespace { AliasSetTracker WriteSet(*AA); bool JAfterStart = IAfterStart; BasicBlock::iterator J = llvm::next(I); - for (unsigned ss = 0; J != E && ss <= SearchLimit; ++J, ++ss) { + for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) { if (J == Start) JAfterStart = true; // Determine if J uses I, if so, exit the loop. - bool UsesI = trackUsesOfI(Users, WriteSet, I, J, !FastDep); - if (FastDep) { + bool UsesI = trackUsesOfI(Users, WriteSet, I, J, !Config.FastDep); + if (Config.FastDep) { // Note: For this heuristic to be effective, independent operations // must tend to be intermixed. This is likely to be true from some // kinds of grouped loop unrolling (but not the generic LLVM pass), @@ -772,7 +831,7 @@ namespace { // If we have already found too many pairs, break here and this function // will be called again starting after the last instruction selected // during this invocation. 
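The powi special case above has a short justification: only the first operand is vectorized, so two calls are compatible only when their exponents are provably the same value, and because ScalarEvolution uniques its expressions, comparing the SCEV pointers is a cheap way to establish that. A sketch:

#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Instructions.h"
using namespace llvm;

static bool powiExponentsMatch(ScalarEvolution &SE, CallInst *I, CallInst *J) {
  const SCEV *A = SE.getSCEV(I->getArgOperand(1));
  const SCEV *B = SE.getSCEV(J->getArgOperand(1));
  return A == B;   // SCEVs are uniqued, so equal pointers imply equal expressions
}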
- if (PairableInsts.size() >= MaxInsts) { + if (PairableInsts.size() >= Config.MaxInsts) { ShouldContinue = true; break; } @@ -796,16 +855,33 @@ namespace { std::vector<Value *> &PairableInsts, std::multimap<ValuePair, ValuePair> &ConnectedPairs, ValuePair P) { + StoreInst *SI, *SJ; + // For each possible pairing for this variable, look at the uses of // the first value... for (Value::use_iterator I = P.first->use_begin(), E = P.first->use_end(); I != E; ++I) { + if (isa<LoadInst>(*I)) { + // A pair cannot be connected to a load because the load only takes one + // operand (the address) and it is a scalar even after vectorization. + continue; + } else if ((SI = dyn_cast<StoreInst>(*I)) && + P.first == SI->getPointerOperand()) { + // Similarly, a pair cannot be connected to a store through its + // pointer operand. + continue; + } + VPIteratorPair IPairRange = CandidatePairs.equal_range(*I); // For each use of the first variable, look for uses of the second // variable... for (Value::use_iterator J = P.second->use_begin(), E2 = P.second->use_end(); J != E2; ++J) { + if ((SJ = dyn_cast<StoreInst>(*J)) && + P.second == SJ->getPointerOperand()) + continue; + VPIteratorPair JPairRange = CandidatePairs.equal_range(*J); // Look for <I, J>: @@ -817,23 +893,37 @@ namespace { ConnectedPairs.insert(VPPair(P, ValuePair(*J, *I))); } - if (SplatBreaksChain) continue; + if (Config.SplatBreaksChain) continue; // Look for cases where just the first value in the pair is used by // both members of another pair (splatting). for (Value::use_iterator J = P.first->use_begin(); J != E; ++J) { + if ((SJ = dyn_cast<StoreInst>(*J)) && + P.first == SJ->getPointerOperand()) + continue; + if (isSecondInIteratorPair<Value*>(*J, IPairRange)) ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J))); } } - if (SplatBreaksChain) return; + if (Config.SplatBreaksChain) return; // Look for cases where just the second value in the pair is used by // both members of another pair (splatting). 
for (Value::use_iterator I = P.second->use_begin(), E = P.second->use_end(); I != E; ++I) { + if (isa<LoadInst>(*I)) + continue; + else if ((SI = dyn_cast<StoreInst>(*I)) && + P.second == SI->getPointerOperand()) + continue; + VPIteratorPair IPairRange = CandidatePairs.equal_range(*I); for (Value::use_iterator J = P.second->use_begin(); J != E; ++J) { + if ((SJ = dyn_cast<StoreInst>(*J)) && + P.second == SJ->getPointerOperand()) + continue; + if (isSecondInIteratorPair<Value*>(*J, IPairRange)) ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J))); } @@ -1256,7 +1346,7 @@ namespace { << *J->first << " <-> " << *J->second << "} of depth " << MaxDepth << " and size " << PrunedTree.size() << " (effective size: " << EffSize << ")\n"); - if (MaxDepth >= ReqChainDepth && EffSize > BestEffSize) { + if (MaxDepth >= Config.ReqChainDepth && EffSize > BestEffSize) { BestMaxDepth = MaxDepth; BestEffSize = EffSize; BestTree = PrunedTree; @@ -1272,7 +1362,8 @@ namespace { std::multimap<ValuePair, ValuePair> &ConnectedPairs, DenseSet<ValuePair> &PairableInstUsers, DenseMap<Value *, Value *>& ChosenPairs) { - bool UseCycleCheck = CandidatePairs.size() <= MaxCandPairsForCycleCheck; + bool UseCycleCheck = + CandidatePairs.size() <= Config.MaxCandPairsForCycleCheck; std::multimap<ValuePair, ValuePair> PairableInstUserMap; for (std::vector<Value *>::iterator I = PairableInsts.begin(), E = PairableInsts.end(); I != E; ++I) { @@ -1518,19 +1609,27 @@ namespace { ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o, FlipMemInputs); continue; - } else if (isa<CallInst>(I) && o == NumOperands-1) { + } else if (isa<CallInst>(I)) { Function *F = cast<CallInst>(I)->getCalledFunction(); unsigned IID = F->getIntrinsicID(); - BasicBlock &BB = *I->getParent(); + if (o == NumOperands-1) { + BasicBlock &BB = *I->getParent(); - Module *M = BB.getParent()->getParent(); - Type *ArgType = I->getType(); - Type *VArgType = getVecTypeForPair(ArgType); + Module *M = BB.getParent()->getParent(); + Type *ArgType = I->getType(); + Type *VArgType = getVecTypeForPair(ArgType); - // FIXME: is it safe to do this here? - ReplacedOperands[o] = Intrinsic::getDeclaration(M, - (Intrinsic::ID) IID, VArgType); - continue; + // FIXME: is it safe to do this here? + ReplacedOperands[o] = Intrinsic::getDeclaration(M, + (Intrinsic::ID) IID, VArgType); + continue; + } else if (IID == Intrinsic::powi && o == 1) { + // The second argument of powi is a single integer and we've already + // checked that both arguments are equal. As a result, we just keep + // I's second argument. 
+          ReplacedOperands[o] = I->getOperand(o);
+          continue;
+        }
       } else if (isa<ShuffleVectorInst>(I) && o == NumOperands-1) {
         ReplacedOperands[o] = getReplacementShuffleMask(Context, I, J);
         continue;
@@ -1835,7 +1934,35 @@
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
 INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
-BasicBlockPass *llvm::createBBVectorizePass() {
-  return new BBVectorize();
+BasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) {
+  return new BBVectorize(C);
+}
+
+bool
+llvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C) {
+  BBVectorize BBVectorizer(P, C);
+  return BBVectorizer.vectorizeBB(BB);
 }
+//===----------------------------------------------------------------------===//
+VectorizeConfig::VectorizeConfig() {
+  VectorBits = ::VectorBits;
+  VectorizeInts = !::NoInts;
+  VectorizeFloats = !::NoFloats;
+  VectorizePointers = !::NoPointers;
+  VectorizeCasts = !::NoCasts;
+  VectorizeMath = !::NoMath;
+  VectorizeFMA = !::NoFMA;
+  VectorizeSelect = !::NoSelect;
+  VectorizeGEP = !::NoGEP;
+  VectorizeMemOps = !::NoMemOps;
+  AlignedOnly = ::AlignedOnly;
+  ReqChainDepth= ::ReqChainDepth;
+  SearchLimit = ::SearchLimit;
+  MaxCandPairsForCycleCheck = ::MaxCandPairsForCycleCheck;
+  SplatBreaksChain = ::SplatBreaksChain;
+  MaxInsts = ::MaxInsts;
+  MaxIter = ::MaxIter;
+  NoMemOpBoost = ::NoMemOpBoost;
+  FastDep = ::FastDep;
+}
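
Usage note (not part of the patch): the hunks above replace the pass's cl::opt globals with a caller-supplied VectorizeConfig and expose two entry points, createBBVectorizePass(const VectorizeConfig &) and vectorizeBasicBlock(Pass *, BasicBlock &, const VectorizeConfig &). Below is a minimal sketch of how a client might drive them; the header path and the particular field values chosen are illustrative assumptions, and only the VectorizeConfig type and the two functions come from this patch.

  // Sketch: build a tuned BBVectorize pass from a VectorizeConfig.
  // Assumes the declarations are exported from llvm/Transforms/Vectorize.h
  // elsewhere in this patch series.
  #include "llvm/PassManager.h"
  #include "llvm/Transforms/Vectorize.h"

  using namespace llvm;

  static void addTunedBBVectorize(FunctionPassManager &FPM) {
    VectorizeConfig C;            // defaults mirror the command-line flags
    C.VectorBits = 256;           // hypothetical target: 256-bit vectors
    C.VectorizePointers = false;  // e.g. leave pointer chains alone
    C.MaxIter = 2;                // cap the pair-fusing iterations
    FPM.add(createBBVectorizePass(C));
  }

A pass that already provides the needed analyses can instead call vectorizeBasicBlock(this, BB, C) on individual blocks; the new runOnBasicBlock() override above shows what the vectorizer pulls in (AliasAnalysis, ScalarEvolution, and TargetData if available).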