Diffstat (limited to 'lib/Transforms')
140 files changed, 17081 insertions, 8113 deletions
diff --git a/lib/Transforms/IPO/Android.mk b/lib/Transforms/IPO/Android.mk index 1fe7d63..f08b0ad 100644 --- a/lib/Transforms/IPO/Android.mk +++ b/lib/Transforms/IPO/Android.mk @@ -16,6 +16,7 @@ transforms_ipo_SRC_FILES := \ Inliner.cpp \ Internalize.cpp \ LoopExtractor.cpp \ + LowerBitSets.cpp \ MergeFunctions.cpp \ PartialInlining.cpp \ PassManagerBuilder.cpp \ diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp index c4706e8..7e48ce3 100644 --- a/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -554,14 +554,14 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg, BasicBlock *BB = Load->getParent(); AliasAnalysis::Location Loc = AA.getLocation(Load); - if (AA.canInstructionRangeModify(BB->front(), *Load, Loc)) + if (AA.canInstructionRangeModRef(BB->front(), *Load, Loc, + AliasAnalysis::Mod)) return false; // Pointer is invalidated! // Now check every path from the entry block to the load for transparency. // To do this, we perform a depth first search on the inverse CFG from the // loading block. - for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { - BasicBlock *P = *PI; + for (BasicBlock *P : predecessors(BB)) { for (BasicBlock *TranspBB : inverse_depth_first_ext(P, TranspBlocks)) if (AA.canBasicBlockModify(*TranspBB, Loc)) return false; diff --git a/lib/Transforms/IPO/CMakeLists.txt b/lib/Transforms/IPO/CMakeLists.txt index 90c1c33..3df17b9 100644 --- a/lib/Transforms/IPO/CMakeLists.txt +++ b/lib/Transforms/IPO/CMakeLists.txt @@ -14,12 +14,17 @@ add_llvm_library(LLVMipo Inliner.cpp Internalize.cpp LoopExtractor.cpp + LowerBitSets.cpp MergeFunctions.cpp PartialInlining.cpp PassManagerBuilder.cpp PruneEH.cpp StripDeadPrototypes.cpp StripSymbols.cpp + + ADDITIONAL_HEADER_DIRS + ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms + ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms/IPO ) add_dependencies(LLVMipo intrinsics_gen) diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp index 4045c09..4431311 100644 --- a/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -146,7 +146,7 @@ namespace { private: Liveness MarkIfNotLive(RetOrArg Use, UseVector &MaybeLiveUses); Liveness SurveyUse(const Use *U, UseVector &MaybeLiveUses, - unsigned RetValNum = 0); + unsigned RetValNum = -1U); Liveness SurveyUses(const Value *V, UseVector &MaybeLiveUses); void SurveyFunction(const Function &F); @@ -387,14 +387,32 @@ bool DAE::RemoveDeadArgumentsFromCallers(Function &Fn) /// for void functions and 1 for functions not returning a struct. It returns /// the number of struct elements for functions returning a struct. static unsigned NumRetVals(const Function *F) { - if (F->getReturnType()->isVoidTy()) + Type *RetTy = F->getReturnType(); + if (RetTy->isVoidTy()) return 0; - else if (StructType *STy = dyn_cast<StructType>(F->getReturnType())) + else if (StructType *STy = dyn_cast<StructType>(RetTy)) return STy->getNumElements(); + else if (ArrayType *ATy = dyn_cast<ArrayType>(RetTy)) + return ATy->getNumElements(); else return 1; } +/// Returns the sub-type a function will return at a given Idx. Should +/// correspond to the result type of an ExtractValue instruction executed with +/// just that one Idx (i.e. only top-level structure is considered). 
+static Type *getRetComponentType(const Function *F, unsigned Idx) { + Type *RetTy = F->getReturnType(); + assert(!RetTy->isVoidTy() && "void type has no subtype"); + + if (StructType *STy = dyn_cast<StructType>(RetTy)) + return STy->getElementType(Idx); + else if (ArrayType *ATy = dyn_cast<ArrayType>(RetTy)) + return ATy->getElementType(); + else + return RetTy; +} + /// MarkIfNotLive - This checks Use for liveness in LiveValues. If Use is not /// live, it adds Use to the MaybeLiveUses argument. Returns the determined /// liveness of Use. @@ -425,9 +443,24 @@ DAE::Liveness DAE::SurveyUse(const Use *U, // function's return value is live. We use RetValNum here, for the case // that U is really a use of an insertvalue instruction that uses the // original Use. - RetOrArg Use = CreateRet(RI->getParent()->getParent(), RetValNum); - // We might be live, depending on the liveness of Use. - return MarkIfNotLive(Use, MaybeLiveUses); + const Function *F = RI->getParent()->getParent(); + if (RetValNum != -1U) { + RetOrArg Use = CreateRet(F, RetValNum); + // We might be live, depending on the liveness of Use. + return MarkIfNotLive(Use, MaybeLiveUses); + } else { + DAE::Liveness Result = MaybeLive; + for (unsigned i = 0; i < NumRetVals(F); ++i) { + RetOrArg Use = CreateRet(F, i); + // We might be live, depending on the liveness of Use. If any + // sub-value is live, then the entire value is considered live. This + // is a conservative choice, and better tracking is possible. + DAE::Liveness SubResult = MarkIfNotLive(Use, MaybeLiveUses); + if (Result != Live) + Result = SubResult; + } + return Result; + } } if (const InsertValueInst *IV = dyn_cast<InsertValueInst>(V)) { if (U->getOperandNo() != InsertValueInst::getAggregateOperandIndex() @@ -541,7 +574,6 @@ void DAE::SurveyFunction(const Function &F) { // Keep track of the number of live retvals, so we can skip checks once all // of them turn out to be live. unsigned NumLiveRetVals = 0; - Type *STy = dyn_cast<StructType>(F.getReturnType()); // Loop all uses of the function. for (const Use &U : F.uses()) { // If the function is PASSED IN as an argument, its address has been @@ -563,34 +595,35 @@ void DAE::SurveyFunction(const Function &F) { // Now, check how our return value(s) is/are used in this caller. Don't // bother checking return values if all of them are live already. - if (NumLiveRetVals != RetCount) { - if (STy) { - // Check all uses of the return value. - for (const User *U : TheCall->users()) { - const ExtractValueInst *Ext = dyn_cast<ExtractValueInst>(U); - if (Ext && Ext->hasIndices()) { - // This use uses a part of our return value, survey the uses of - // that part and store the results for this index only. - unsigned Idx = *Ext->idx_begin(); - if (RetValLiveness[Idx] != Live) { - RetValLiveness[Idx] = SurveyUses(Ext, MaybeLiveRetUses[Idx]); - if (RetValLiveness[Idx] == Live) - NumLiveRetVals++; - } - } else { - // Used by something else than extractvalue. Mark all return - // values as live. - for (unsigned i = 0; i != RetCount; ++i ) - RetValLiveness[i] = Live; - NumLiveRetVals = RetCount; - break; - } + if (NumLiveRetVals == RetCount) + continue; + + // Check all uses of the return value. + for (const Use &U : TheCall->uses()) { + if (ExtractValueInst *Ext = dyn_cast<ExtractValueInst>(U.getUser())) { + // This use uses a part of our return value, survey the uses of + // that part and store the results for this index only. 
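The split between NumRetVals and getRetComponentType above is easiest to see on concrete types: a struct return has one potentially distinct component type per field, while an array return has getNumElements() components that all share the array's element type. A minimal illustrative sketch using the LLVM type API (demo code, not part of this patch):

#include <cassert>
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
using namespace llvm;

void showReturnComponents() {
  LLVMContext Ctx;

  // { i32, float }: two components, and each index can have a different type.
  StructType *STy =
      StructType::get(Ctx, {Type::getInt32Ty(Ctx), Type::getFloatTy(Ctx)});
  assert(STy->getNumElements() == 2);
  assert(STy->getElementType(1)->isFloatTy());

  // [2 x i64]: still two components, but every index shares the element type.
  ArrayType *ATy = ArrayType::get(Type::getInt64Ty(Ctx), 2);
  assert(ATy->getNumElements() == 2);
  assert(ATy->getElementType()->isIntegerTy(64));
}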
+ unsigned Idx = *Ext->idx_begin(); + if (RetValLiveness[Idx] != Live) { + RetValLiveness[Idx] = SurveyUses(Ext, MaybeLiveRetUses[Idx]); + if (RetValLiveness[Idx] == Live) + NumLiveRetVals++; } } else { - // Single return value - RetValLiveness[0] = SurveyUses(TheCall, MaybeLiveRetUses[0]); - if (RetValLiveness[0] == Live) + // Used by something else than extractvalue. Survey, but assume that the + // result applies to all sub-values. + UseVector MaybeLiveAggregateUses; + if (SurveyUse(&U, MaybeLiveAggregateUses) == Live) { NumLiveRetVals = RetCount; + RetValLiveness.assign(RetCount, Live); + break; + } else { + for (unsigned i = 0; i != RetCount; ++i) { + if (RetValLiveness[i] != Live) + MaybeLiveRetUses[i].append(MaybeLiveAggregateUses.begin(), + MaybeLiveAggregateUses.end()); + } + } } } } @@ -775,39 +808,29 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { if (RetTy->isVoidTy() || HasLiveReturnedArg) { NRetTy = RetTy; } else { - StructType *STy = dyn_cast<StructType>(RetTy); - if (STy) - // Look at each of the original return values individually. - for (unsigned i = 0; i != RetCount; ++i) { - RetOrArg Ret = CreateRet(F, i); - if (LiveValues.erase(Ret)) { - RetTypes.push_back(STy->getElementType(i)); - NewRetIdxs[i] = RetTypes.size() - 1; - } else { - ++NumRetValsEliminated; - DEBUG(dbgs() << "DAE - Removing return value " << i << " from " - << F->getName() << "\n"); - } - } - else - // We used to return a single value. - if (LiveValues.erase(CreateRet(F, 0))) { - RetTypes.push_back(RetTy); - NewRetIdxs[0] = 0; + // Look at each of the original return values individually. + for (unsigned i = 0; i != RetCount; ++i) { + RetOrArg Ret = CreateRet(F, i); + if (LiveValues.erase(Ret)) { + RetTypes.push_back(getRetComponentType(F, i)); + NewRetIdxs[i] = RetTypes.size() - 1; } else { - DEBUG(dbgs() << "DAE - Removing return value from " << F->getName() - << "\n"); ++NumRetValsEliminated; + DEBUG(dbgs() << "DAE - Removing return value " << i << " from " + << F->getName() << "\n"); + } + } + if (RetTypes.size() > 1) { + // More than one return type? Reduce it down to size. + if (StructType *STy = dyn_cast<StructType>(RetTy)) { + // Make the new struct packed if we used to return a packed struct + // already. + NRetTy = StructType::get(STy->getContext(), RetTypes, STy->isPacked()); + } else { + assert(isa<ArrayType>(RetTy) && "unexpected multi-value return"); + NRetTy = ArrayType::get(RetTypes[0], RetTypes.size()); } - if (RetTypes.size() > 1) - // More than one return type? Return a struct with them. Also, if we used - // to return a struct and didn't change the number of return values, - // return a struct again. This prevents changing {something} into - // something and {} into void. - // Make the new struct packed if we used to return a packed struct - // already. - NRetTy = StructType::get(STy->getContext(), RetTypes, STy->isPacked()); - else if (RetTypes.size() == 1) + } else if (RetTypes.size() == 1) // One return type? Just a simple value then, but only if we didn't use to // return a struct with that simple value before. NRetTy = RetTypes.front(); @@ -959,9 +982,9 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { if (!Call->getType()->isX86_MMXTy()) Call->replaceAllUsesWith(Constant::getNullValue(Call->getType())); } else { - assert(RetTy->isStructTy() && + assert((RetTy->isStructTy() || RetTy->isArrayTy()) && "Return type changed, but not into a void. 
The old return type" - " must have been a struct!"); + " must have been a struct or an array!"); Instruction *InsertPt = Call; if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) { BasicBlock::iterator IP = II->getNormalDest()->begin(); @@ -969,9 +992,9 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { InsertPt = IP; } - // We used to return a struct. Instead of doing smart stuff with all the - // uses of this struct, we will just rebuild it using - // extract/insertvalue chaining and let instcombine clean that up. + // We used to return a struct or array. Instead of doing smart stuff + // with all the uses, we will just rebuild it using extract/insertvalue + // chaining and let instcombine clean that up. // // Start out building up our return value from undef Value *RetVal = UndefValue::get(RetTy); @@ -1034,8 +1057,8 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { if (NFTy->getReturnType()->isVoidTy()) { RetVal = nullptr; } else { - assert (RetTy->isStructTy()); - // The original return value was a struct, insert + assert(RetTy->isStructTy() || RetTy->isArrayTy()); + // The original return value was a struct or array, insert // extractvalue/insertvalue chains to extract only the values we need // to return and insert them into our new result. // This does generate messy code, but we'll let it to instcombine to diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp index 823ae53..8925e4c 100644 --- a/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/lib/Transforms/IPO/FunctionAttrs.cpp @@ -31,7 +31,7 @@ #include "llvm/IR/InstIterator.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" using namespace llvm; #define DEBUG_TYPE "functionattrs" @@ -124,7 +124,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired<AliasAnalysis>(); - AU.addRequired<TargetLibraryInfo>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); CallGraphSCCPass::getAnalysisUsage(AU); } @@ -139,7 +139,7 @@ INITIALIZE_PASS_BEGIN(FunctionAttrs, "functionattrs", "Deduce function attributes", false, false) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(FunctionAttrs, "functionattrs", "Deduce function attributes", false, false) @@ -1702,7 +1702,7 @@ bool FunctionAttrs::annotateLibraryCalls(const CallGraphSCC &SCC) { bool FunctionAttrs::runOnSCC(CallGraphSCC &SCC) { AA = &getAnalysis<AliasAnalysis>(); - TLI = &getAnalysis<TargetLibraryInfo>(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); bool Changed = annotateLibraryCalls(SCC); Changed |= AddReadAttrs(SCC); diff --git a/lib/Transforms/IPO/GlobalDCE.cpp b/lib/Transforms/IPO/GlobalDCE.cpp index 705e929..0c844fe 100644 --- a/lib/Transforms/IPO/GlobalDCE.cpp +++ b/lib/Transforms/IPO/GlobalDCE.cpp @@ -219,6 +219,9 @@ void GlobalDCE::GlobalIsNeeded(GlobalValue *G) { if (F->hasPrefixData()) MarkUsedGlobalsAsNeeded(F->getPrefixData()); + if (F->hasPrologueData()) + MarkUsedGlobalsAsNeeded(F->getPrologueData()); + for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) for (User::op_iterator U = I->op_begin(), E = I->op_end(); U != E; ++U) diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp 
index 6e0ae83..45e04f1 100644 --- a/lib/Transforms/IPO/GlobalOpt.cpp +++ b/lib/Transforms/IPO/GlobalOpt.cpp @@ -38,7 +38,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/CtorUtils.h" #include "llvm/Transforms/Utils/GlobalStatus.h" #include "llvm/Transforms/Utils/ModuleUtils.h" @@ -68,7 +68,7 @@ STATISTIC(NumCXXDtorsRemoved, "Number of global C++ destructors removed"); namespace { struct GlobalOpt : public ModulePass { void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<TargetLibraryInfo>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } static char ID; // Pass identification, replacement for typeid GlobalOpt() : ModulePass(ID) { @@ -95,7 +95,7 @@ namespace { char GlobalOpt::ID = 0; INITIALIZE_PASS_BEGIN(GlobalOpt, "globalopt", "Global Variable Optimizer", false, false) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(GlobalOpt, "globalopt", "Global Variable Optimizer", false, false) @@ -3042,7 +3042,7 @@ bool GlobalOpt::runOnModule(Module &M) { DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); DL = DLP ? &DLP->getDataLayout() : nullptr; - TLI = &getAnalysis<TargetLibraryInfo>(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); bool LocalChange = true; while (LocalChange) { diff --git a/lib/Transforms/IPO/IPO.cpp b/lib/Transforms/IPO/IPO.cpp index b4d31d8..fcacec3 100644 --- a/lib/Transforms/IPO/IPO.cpp +++ b/lib/Transforms/IPO/IPO.cpp @@ -16,7 +16,7 @@ #include "llvm-c/Initialization.h" #include "llvm-c/Transforms/IPO.h" #include "llvm/InitializePasses.h" -#include "llvm/PassManager.h" +#include "llvm/IR/LegacyPassManager.h" #include "llvm/Transforms/IPO.h" using namespace llvm; @@ -36,6 +36,7 @@ void llvm::initializeIPO(PassRegistry &Registry) { initializeLoopExtractorPass(Registry); initializeBlockExtractorPassPass(Registry); initializeSingleLoopExtractorPass(Registry); + initializeLowerBitSetsPass(Registry); initializeMergeFunctionsPass(Registry); initializePartialInlinerPass(Registry); initializePruneEHPass(Registry); diff --git a/lib/Transforms/IPO/InlineAlways.cpp b/lib/Transforms/IPO/InlineAlways.cpp index 819b2e0..dc56a02 100644 --- a/lib/Transforms/IPO/InlineAlways.cpp +++ b/lib/Transforms/IPO/InlineAlways.cpp @@ -15,7 +15,7 @@ #include "llvm/Transforms/IPO.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/AssumptionTracker.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/IR/CallSite.h" @@ -68,7 +68,7 @@ char AlwaysInliner::ID = 0; INITIALIZE_PASS_BEGIN(AlwaysInliner, "always-inline", "Inliner for always_inline functions", false, false) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) -INITIALIZE_PASS_DEPENDENCY(AssumptionTracker) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) INITIALIZE_PASS_DEPENDENCY(InlineCostAnalysis) INITIALIZE_PASS_END(AlwaysInliner, "always-inline", diff --git a/lib/Transforms/IPO/InlineSimple.cpp b/lib/Transforms/IPO/InlineSimple.cpp index d9a2b9e..9b01d81 100644 --- a/lib/Transforms/IPO/InlineSimple.cpp +++ b/lib/Transforms/IPO/InlineSimple.cpp @@ -13,7 +13,7 @@ #include "llvm/Transforms/IPO.h" #include "llvm/Analysis/AliasAnalysis.h" -#include 
"llvm/Analysis/AssumptionTracker.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/IR/CallSite.h" @@ -76,7 +76,7 @@ char SimpleInliner::ID = 0; INITIALIZE_PASS_BEGIN(SimpleInliner, "inline", "Function Integration/Inlining", false, false) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) -INITIALIZE_PASS_DEPENDENCY(AssumptionTracker) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) INITIALIZE_PASS_DEPENDENCY(InlineCostAnalysis) INITIALIZE_PASS_END(SimpleInliner, "inline", diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp index 3abe7a8..305ad7a 100644 --- a/lib/Transforms/IPO/Inliner.cpp +++ b/lib/Transforms/IPO/Inliner.cpp @@ -17,7 +17,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/AssumptionTracker.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/IR/CallSite.h" @@ -29,7 +29,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -77,7 +77,7 @@ Inliner::Inliner(char &ID, int Threshold, bool InsertLifetime) /// always explicitly call the implementation here. void Inliner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<AliasAnalysis>(); - AU.addRequired<AssumptionTracker>(); + AU.addRequired<AssumptionCacheTracker>(); CallGraphSCCPass::getAnalysisUsage(AU); } @@ -97,25 +97,17 @@ static void AdjustCallerSSPLevel(Function *Caller, Function *Callee) { AttributeSet OldSSPAttr = AttributeSet::get(Caller->getContext(), AttributeSet::FunctionIndex, B); - AttributeSet CallerAttr = Caller->getAttributes(), - CalleeAttr = Callee->getAttributes(); - if (CalleeAttr.hasAttribute(AttributeSet::FunctionIndex, - Attribute::StackProtectReq)) { + if (Callee->hasFnAttribute(Attribute::StackProtectReq)) { Caller->removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr); Caller->addFnAttr(Attribute::StackProtectReq); - } else if (CalleeAttr.hasAttribute(AttributeSet::FunctionIndex, - Attribute::StackProtectStrong) && - !CallerAttr.hasAttribute(AttributeSet::FunctionIndex, - Attribute::StackProtectReq)) { + } else if (Callee->hasFnAttribute(Attribute::StackProtectStrong) && + !Caller->hasFnAttribute(Attribute::StackProtectReq)) { Caller->removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr); Caller->addFnAttr(Attribute::StackProtectStrong); - } else if (CalleeAttr.hasAttribute(AttributeSet::FunctionIndex, - Attribute::StackProtect) && - !CallerAttr.hasAttribute(AttributeSet::FunctionIndex, - Attribute::StackProtectReq) && - !CallerAttr.hasAttribute(AttributeSet::FunctionIndex, - Attribute::StackProtectStrong)) + } else if (Callee->hasFnAttribute(Attribute::StackProtect) && + !Caller->hasFnAttribute(Attribute::StackProtectReq) && + !Caller->hasFnAttribute(Attribute::StackProtectStrong)) Caller->addFnAttr(Attribute::StackProtect); } @@ -273,8 +265,7 @@ unsigned Inliner::getInlineThreshold(CallSite CS) const { // would decrease the threshold. 
Function *Caller = CS.getCaller(); bool OptSize = Caller && !Caller->isDeclaration() && - Caller->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::OptimizeForSize); + Caller->hasFnAttribute(Attribute::OptimizeForSize); if (!(InlineLimit.getNumOccurrences() > 0) && OptSize && OptSizeThreshold < thres) thres = OptSizeThreshold; @@ -283,17 +274,14 @@ unsigned Inliner::getInlineThreshold(CallSite CS) const { // and the caller does not need to minimize its size. Function *Callee = CS.getCalledFunction(); bool InlineHint = Callee && !Callee->isDeclaration() && - Callee->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::InlineHint); - if (InlineHint && HintThreshold > thres - && !Caller->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::MinSize)) + Callee->hasFnAttribute(Attribute::InlineHint); + if (InlineHint && HintThreshold > thres && + !Caller->hasFnAttribute(Attribute::MinSize)) thres = HintThreshold; // Listen to the cold attribute when it would decrease the threshold. bool ColdCallee = Callee && !Callee->isDeclaration() && - Callee->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::Cold); + Callee->hasFnAttribute(Attribute::Cold); // Command line argument for InlineLimit will override the default // ColdThreshold. If we have -inline-threshold but no -inlinecold-threshold, // do not use the default cold threshold even if it is smaller. @@ -443,10 +431,11 @@ static bool InlineHistoryIncludes(Function *F, int InlineHistoryID, bool Inliner::runOnSCC(CallGraphSCC &SCC) { CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph(); - AssumptionTracker *AT = &getAnalysis<AssumptionTracker>(); + AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>(); DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr; - const TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); + auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); + const TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr; AliasAnalysis *AA = &getAnalysis<AliasAnalysis>(); SmallPtrSet<Function*, 8> SCCFunctions; @@ -506,8 +495,8 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) { InlinedArrayAllocasTy InlinedArrayAllocas; - InlineFunctionInfo InlineInfo(&CG, DL, AA, AT); - + InlineFunctionInfo InlineInfo(&CG, DL, AA, ACT); + // Now that we have all of the call sites, loop over them and inline them if // it looks profitable to do so. bool Changed = false; @@ -658,9 +647,7 @@ bool Inliner::removeDeadFunctions(CallGraph &CG, bool AlwaysInlineOnly) { // Handle the case when this function is called and we only want to care // about always-inline functions. This is a bit of a hack to share code // between here and the InlineAlways pass. 
- if (AlwaysInlineOnly && - !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::AlwaysInline)) + if (AlwaysInlineOnly && !F->hasFnAttribute(Attribute::AlwaysInline)) continue; // If the only remaining users of the function are dead constants, remove diff --git a/lib/Transforms/IPO/LLVMBuild.txt b/lib/Transforms/IPO/LLVMBuild.txt index 77e0b22..575dce4 100644 --- a/lib/Transforms/IPO/LLVMBuild.txt +++ b/lib/Transforms/IPO/LLVMBuild.txt @@ -20,4 +20,4 @@ type = Library name = IPO parent = Transforms library_name = ipo -required_libraries = Analysis Core IPA InstCombine Scalar Support Target TransformUtils Vectorize +required_libraries = Analysis Core IPA InstCombine Scalar Support TransformUtils Vectorize diff --git a/lib/Transforms/IPO/LoopExtractor.cpp b/lib/Transforms/IPO/LoopExtractor.cpp index 20414aa..41334ca 100644 --- a/lib/Transforms/IPO/LoopExtractor.cpp +++ b/lib/Transforms/IPO/LoopExtractor.cpp @@ -242,7 +242,7 @@ void BlockExtractorPass::SplitLandingPadPreds(Function *F) { if (!Split) continue; SmallVector<BasicBlock*, 2> NewBBs; - SplitLandingPadPredecessors(LPad, Parent, ".1", ".2", nullptr, NewBBs); + SplitLandingPadPredecessors(LPad, Parent, ".1", ".2", NewBBs); } } diff --git a/lib/Transforms/IPO/LowerBitSets.cpp b/lib/Transforms/IPO/LowerBitSets.cpp new file mode 100644 index 0000000..0a22a80 --- /dev/null +++ b/lib/Transforms/IPO/LowerBitSets.cpp @@ -0,0 +1,612 @@ +//===-- LowerBitSets.cpp - Bitset lowering pass ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass lowers bitset metadata and calls to the llvm.bitset.test intrinsic. +// See http://llvm.org/docs/LangRef.html#bitsets for more information. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/IPO/LowerBitSets.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/ADT/EquivalenceClasses.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "lowerbitsets" + +STATISTIC(NumBitSetsCreated, "Number of bitsets created"); +STATISTIC(NumBitSetCallsLowered, "Number of bitset calls lowered"); +STATISTIC(NumBitSetDisjointSets, "Number of disjoint sets of bitsets"); + +bool BitSetInfo::containsGlobalOffset(uint64_t Offset) const { + if (Offset < ByteOffset) + return false; + + if ((Offset - ByteOffset) % (uint64_t(1) << AlignLog2) != 0) + return false; + + uint64_t BitOffset = (Offset - ByteOffset) >> AlignLog2; + if (BitOffset >= BitSize) + return false; + + return (Bits[BitOffset / 8] >> (BitOffset % 8)) & 1; +} + +bool BitSetInfo::containsValue( + const DataLayout *DL, + const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout, Value *V, + uint64_t COffset) const { + if (auto GV = dyn_cast<GlobalVariable>(V)) { + auto I = GlobalLayout.find(GV); + if (I == GlobalLayout.end()) + return false; + return containsGlobalOffset(I->second + COffset); + } + + if (auto GEP = dyn_cast<GEPOperator>(V)) { + APInt APOffset(DL->getPointerSizeInBits(0), 0); + bool Result = GEP->accumulateConstantOffset(*DL, APOffset); + if (!Result) + return false; + COffset += APOffset.getZExtValue(); + return containsValue(DL, GlobalLayout, GEP->getPointerOperand(), + COffset); + } + + if (auto Op = dyn_cast<Operator>(V)) { + if (Op->getOpcode() == Instruction::BitCast) + return containsValue(DL, GlobalLayout, Op->getOperand(0), COffset); + + if (Op->getOpcode() == Instruction::Select) + return containsValue(DL, GlobalLayout, Op->getOperand(1), COffset) && + containsValue(DL, GlobalLayout, Op->getOperand(2), COffset); + } + + return false; +} + +BitSetInfo BitSetBuilder::build() { + if (Min > Max) + Min = 0; + + // Normalize each offset against the minimum observed offset, and compute + // the bitwise OR of each of the offsets. The number of trailing zeros + // in the mask gives us the log2 of the alignment of all offsets, which + // allows us to compress the bitset by only storing one bit per aligned + // address. + uint64_t Mask = 0; + for (uint64_t &Offset : Offsets) { + Offset -= Min; + Mask |= Offset; + } + + BitSetInfo BSI; + BSI.ByteOffset = Min; + + BSI.AlignLog2 = 0; + // FIXME: Can probably do something smarter if all offsets are 0. + if (Mask != 0) + BSI.AlignLog2 = countTrailingZeros(Mask, ZB_Undefined); + + // Build the compressed bitset while normalizing the offsets against the + // computed alignment. + BSI.BitSize = ((Max - Min) >> BSI.AlignLog2) + 1; + uint64_t ByteSize = (BSI.BitSize + 7) / 8; + BSI.Bits.resize(ByteSize); + for (uint64_t Offset : Offsets) { + Offset >>= BSI.AlignLog2; + BSI.Bits[Offset / 8] |= 1 << (Offset % 8); + } + + return BSI; +} + +void GlobalLayoutBuilder::addFragment(const std::set<uint64_t> &F) { + // Create a new fragment to hold the layout for F. 
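As a concrete illustration of the compression scheme used by BitSetBuilder::build and BitSetInfo::containsGlobalOffset above, here is a self-contained sketch with hypothetical names (plain C++, no LLVM dependencies): offsets are rebased against the smallest one, the common alignment comes from the trailing zeros of the OR of the rebased offsets, and one bit is stored per aligned address.

#include <cstdint>
#include <vector>

struct ToyBitSet {
  uint64_t ByteOffset = 0;   // smallest offset seen
  unsigned AlignLog2 = 0;    // log2 of the common alignment
  uint64_t BitSize = 0;      // number of bits stored
  std::vector<uint8_t> Bits; // compressed bit vector

  bool contains(uint64_t Offset) const {
    if (Offset < ByteOffset)
      return false;
    uint64_t Rebased = Offset - ByteOffset;
    if (Rebased % (uint64_t(1) << AlignLog2) != 0)
      return false;            // misaligned: cannot be a member
    uint64_t Bit = Rebased >> AlignLog2;
    if (Bit >= BitSize)
      return false;            // outside the stored range
    return (Bits[Bit / 8] >> (Bit % 8)) & 1;
  }
};

static ToyBitSet buildToyBitSet(std::vector<uint64_t> Offsets) {
  ToyBitSet BS;
  if (Offsets.empty())
    return BS;
  uint64_t Min = Offsets.front(), Max = Offsets.front();
  for (uint64_t O : Offsets) {
    Min = O < Min ? O : Min;
    Max = O > Max ? O : Max;
  }
  BS.ByteOffset = Min;

  uint64_t Mask = 0;
  for (uint64_t &O : Offsets) {
    O -= Min;  // rebase against the minimum
    Mask |= O; // OR of all rebased offsets
  }
  if (Mask != 0)
    BS.AlignLog2 = __builtin_ctzll(Mask); // trailing zeros = log2(alignment)
                                          // (GCC/Clang builtin)

  BS.BitSize = ((Max - Min) >> BS.AlignLog2) + 1;
  BS.Bits.resize((BS.BitSize + 7) / 8);
  for (uint64_t O : Offsets) {
    uint64_t Bit = O >> BS.AlignLog2;
    BS.Bits[Bit / 8] |= uint8_t(1u << (Bit % 8));
  }
  return BS;
}

For example, the offsets {8, 16, 24} compress to ByteOffset = 8, AlignLog2 = 3 and the three bits 0b111, so contains(12) fails the alignment test while contains(16) finds bit 1 set.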
+ Fragments.emplace_back(); + std::vector<uint64_t> &Fragment = Fragments.back(); + uint64_t FragmentIndex = Fragments.size() - 1; + + for (auto ObjIndex : F) { + uint64_t OldFragmentIndex = FragmentMap[ObjIndex]; + if (OldFragmentIndex == 0) { + // We haven't seen this object index before, so just add it to the current + // fragment. + Fragment.push_back(ObjIndex); + } else { + // This index belongs to an existing fragment. Copy the elements of the + // old fragment into this one and clear the old fragment. We don't update + // the fragment map just yet, this ensures that any further references to + // indices from the old fragment in this fragment do not insert any more + // indices. + std::vector<uint64_t> &OldFragment = Fragments[OldFragmentIndex]; + Fragment.insert(Fragment.end(), OldFragment.begin(), OldFragment.end()); + OldFragment.clear(); + } + } + + // Update the fragment map to point our object indices to this fragment. + for (uint64_t ObjIndex : Fragment) + FragmentMap[ObjIndex] = FragmentIndex; +} + +namespace { + +struct LowerBitSets : public ModulePass { + static char ID; + LowerBitSets() : ModulePass(ID) { + initializeLowerBitSetsPass(*PassRegistry::getPassRegistry()); + } + + const DataLayout *DL; + IntegerType *Int1Ty; + IntegerType *Int8Ty; + IntegerType *Int32Ty; + Type *Int32PtrTy; + IntegerType *Int64Ty; + Type *IntPtrTy; + + // The llvm.bitsets named metadata. + NamedMDNode *BitSetNM; + + // Mapping from bitset mdstrings to the call sites that test them. + DenseMap<MDString *, std::vector<CallInst *>> BitSetTestCallSites; + + BitSetInfo + buildBitSet(MDString *BitSet, + const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout); + Value *createBitSetTest(IRBuilder<> &B, const BitSetInfo &BSI, + GlobalVariable *BitSetGlobal, Value *BitOffset); + Value * + lowerBitSetCall(CallInst *CI, const BitSetInfo &BSI, + GlobalVariable *BitSetGlobal, GlobalVariable *CombinedGlobal, + const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout); + void buildBitSetsFromGlobals(Module &M, + const std::vector<MDString *> &BitSets, + const std::vector<GlobalVariable *> &Globals); + bool buildBitSets(Module &M); + bool eraseBitSetMetadata(Module &M); + + bool doInitialization(Module &M) override; + bool runOnModule(Module &M) override; +}; + +} // namespace + +INITIALIZE_PASS_BEGIN(LowerBitSets, "lowerbitsets", + "Lower bitset metadata", false, false) +INITIALIZE_PASS_END(LowerBitSets, "lowerbitsets", + "Lower bitset metadata", false, false) +char LowerBitSets::ID = 0; + +ModulePass *llvm::createLowerBitSetsPass() { return new LowerBitSets; } + +bool LowerBitSets::doInitialization(Module &M) { + DL = M.getDataLayout(); + if (!DL) + report_fatal_error("Data layout required"); + + Int1Ty = Type::getInt1Ty(M.getContext()); + Int8Ty = Type::getInt8Ty(M.getContext()); + Int32Ty = Type::getInt32Ty(M.getContext()); + Int32PtrTy = PointerType::getUnqual(Int32Ty); + Int64Ty = Type::getInt64Ty(M.getContext()); + IntPtrTy = DL->getIntPtrType(M.getContext(), 0); + + BitSetNM = M.getNamedMetadata("llvm.bitsets"); + + BitSetTestCallSites.clear(); + + return false; +} + +/// Build a bit set for BitSet using the object layouts in +/// GlobalLayout. +BitSetInfo LowerBitSets::buildBitSet( + MDString *BitSet, + const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout) { + BitSetBuilder BSB; + + // Compute the byte offset of each element of this bitset. 
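The merging behaviour of GlobalLayoutBuilder::addFragment above can be exercised in isolation with a toy version (hypothetical names, plain C++): each index set becomes a fragment, and any index already placed pulls its whole previous fragment along, so members of one bitset end up adjacent in the final order.

#include <cstdint>
#include <set>
#include <vector>

struct ToyLayoutBuilder {
  // Fragment 0 stays empty and means "object not placed yet".
  std::vector<std::vector<uint64_t>> Fragments{{}};
  std::vector<uint64_t> FragmentMap;

  explicit ToyLayoutBuilder(uint64_t NumObjects) : FragmentMap(NumObjects) {}

  void addFragment(const std::set<uint64_t> &F) {
    Fragments.emplace_back();
    uint64_t FragmentIndex = Fragments.size() - 1;
    for (uint64_t ObjIndex : F) {
      uint64_t Old = FragmentMap[ObjIndex];
      if (Old == 0) {
        // Object not placed yet: it simply joins the new fragment.
        Fragments[FragmentIndex].push_back(ObjIndex);
      } else {
        // Object already placed: absorb its whole old fragment and empty it,
        // keeping previously grouped objects together.
        std::vector<uint64_t> &OldFragment = Fragments[Old];
        Fragments[FragmentIndex].insert(Fragments[FragmentIndex].end(),
                                        OldFragment.begin(), OldFragment.end());
        OldFragment.clear();
      }
    }
    for (uint64_t ObjIndex : Fragments[FragmentIndex])
      FragmentMap[ObjIndex] = FragmentIndex;
  }
};

// With four objects, addFragment({0, 1}) followed by addFragment({1, 2})
// leaves a single non-empty fragment {0, 1, 2}: object 2 is laid out next to
// the fragment that already contained objects 0 and 1.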
+ if (BitSetNM) { + for (MDNode *Op : BitSetNM->operands()) { + if (Op->getOperand(0) != BitSet || !Op->getOperand(1)) + continue; + auto OpGlobal = cast<GlobalVariable>( + cast<ConstantAsMetadata>(Op->getOperand(1))->getValue()); + uint64_t Offset = + cast<ConstantInt>(cast<ConstantAsMetadata>(Op->getOperand(2)) + ->getValue())->getZExtValue(); + + Offset += GlobalLayout.find(OpGlobal)->second; + + BSB.addOffset(Offset); + } + } + + return BSB.build(); +} + +/// Build a test that bit BitOffset mod sizeof(Bits)*8 is set in +/// Bits. This pattern matches to the bt instruction on x86. +static Value *createMaskedBitTest(IRBuilder<> &B, Value *Bits, + Value *BitOffset) { + auto BitsType = cast<IntegerType>(Bits->getType()); + unsigned BitWidth = BitsType->getBitWidth(); + + BitOffset = B.CreateZExtOrTrunc(BitOffset, BitsType); + Value *BitIndex = + B.CreateAnd(BitOffset, ConstantInt::get(BitsType, BitWidth - 1)); + Value *BitMask = B.CreateShl(ConstantInt::get(BitsType, 1), BitIndex); + Value *MaskedBits = B.CreateAnd(Bits, BitMask); + return B.CreateICmpNE(MaskedBits, ConstantInt::get(BitsType, 0)); +} + +/// Build a test that bit BitOffset is set in BSI, where +/// BitSetGlobal is a global containing the bits in BSI. +Value *LowerBitSets::createBitSetTest(IRBuilder<> &B, const BitSetInfo &BSI, + GlobalVariable *BitSetGlobal, + Value *BitOffset) { + if (BSI.Bits.size() <= 8) { + // If the bit set is sufficiently small, we can avoid a load by bit testing + // a constant. + IntegerType *BitsTy; + if (BSI.Bits.size() <= 4) + BitsTy = Int32Ty; + else + BitsTy = Int64Ty; + + uint64_t Bits = 0; + for (auto I = BSI.Bits.rbegin(), E = BSI.Bits.rend(); I != E; ++I) { + Bits <<= 8; + Bits |= *I; + } + Constant *BitsConst = ConstantInt::get(BitsTy, Bits); + return createMaskedBitTest(B, BitsConst, BitOffset); + } else { + // TODO: We might want to use the memory variant of the bt instruction + // with the previously computed bit offset at -Os. This instruction does + // exactly what we want but has been benchmarked as being slower than open + // coding the load+bt. + Value *BitSetGlobalOffset = + B.CreateLShr(BitOffset, ConstantInt::get(IntPtrTy, 5)); + Value *BitSetEntryAddr = B.CreateGEP( + ConstantExpr::getBitCast(BitSetGlobal, Int32PtrTy), BitSetGlobalOffset); + Value *BitSetEntry = B.CreateLoad(BitSetEntryAddr); + + return createMaskedBitTest(B, BitSetEntry, BitOffset); + } +} + +/// Lower a llvm.bitset.test call to its implementation. Returns the value to +/// replace the call with. +Value *LowerBitSets::lowerBitSetCall( + CallInst *CI, const BitSetInfo &BSI, GlobalVariable *BitSetGlobal, + GlobalVariable *CombinedGlobal, + const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout) { + Value *Ptr = CI->getArgOperand(0); + + if (BSI.containsValue(DL, GlobalLayout, Ptr)) + return ConstantInt::getTrue(BitSetGlobal->getParent()->getContext()); + + Constant *GlobalAsInt = ConstantExpr::getPtrToInt(CombinedGlobal, IntPtrTy); + Constant *OffsetedGlobalAsInt = ConstantExpr::getAdd( + GlobalAsInt, ConstantInt::get(IntPtrTy, BSI.ByteOffset)); + + BasicBlock *InitialBB = CI->getParent(); + + IRBuilder<> B(CI); + + Value *PtrAsInt = B.CreatePtrToInt(Ptr, IntPtrTy); + + if (BSI.isSingleOffset()) + return B.CreateICmpEQ(PtrAsInt, OffsetedGlobalAsInt); + + Value *PtrOffset = B.CreateSub(PtrAsInt, OffsetedGlobalAsInt); + + Value *BitOffset; + if (BSI.AlignLog2 == 0) { + BitOffset = PtrOffset; + } else { + // We need to check that the offset both falls within our range and is + // suitably aligned. 
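A scalar sketch of what createMaskedBitTest and the small-bitset path of createBitSetTest above compute (hypothetical helpers, plain C++): up to eight stored bytes are packed little-endian into one machine word, and the bit index is reduced modulo the word width, the pattern x86 can lower to a single bt instruction.

#include <cstdint>
#include <vector>

// Pack up to 8 bitset bytes into a 64-bit constant, byte 0 in the low bits.
static uint64_t packBytesLE(const std::vector<uint8_t> &Bytes) {
  uint64_t Bits = 0;
  for (auto I = Bytes.rbegin(), E = Bytes.rend(); I != E; ++I)
    Bits = (Bits << 8) | *I;
  return Bits;
}

// Test bit (BitOffset mod 64) of Bits, mirroring the and/shl/and/icmp sequence.
static bool maskedBitTest(uint64_t Bits, uint64_t BitOffset) {
  uint64_t BitIndex = BitOffset & 63; // BitOffset mod the word width
  return (Bits & (uint64_t(1) << BitIndex)) != 0;
}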
We can check both properties at the same time by + // performing a right rotate by log2(alignment) followed by an integer + // comparison against the bitset size. The rotate will move the lower + // order bits that need to be zero into the higher order bits of the + // result, causing the comparison to fail if they are nonzero. The rotate + // also conveniently gives us a bit offset to use during the load from + // the bitset. + Value *OffsetSHR = + B.CreateLShr(PtrOffset, ConstantInt::get(IntPtrTy, BSI.AlignLog2)); + Value *OffsetSHL = B.CreateShl( + PtrOffset, ConstantInt::get(IntPtrTy, DL->getPointerSizeInBits(0) - + BSI.AlignLog2)); + BitOffset = B.CreateOr(OffsetSHR, OffsetSHL); + } + + Constant *BitSizeConst = ConstantInt::get(IntPtrTy, BSI.BitSize); + Value *OffsetInRange = B.CreateICmpULT(BitOffset, BitSizeConst); + + // If the bit set is all ones, testing against it is unnecessary. + if (BSI.isAllOnes()) + return OffsetInRange; + + TerminatorInst *Term = SplitBlockAndInsertIfThen(OffsetInRange, CI, false); + IRBuilder<> ThenB(Term); + + // Now that we know that the offset is in range and aligned, load the + // appropriate bit from the bitset. + Value *Bit = createBitSetTest(ThenB, BSI, BitSetGlobal, BitOffset); + + // The value we want is 0 if we came directly from the initial block + // (having failed the range or alignment checks), or the loaded bit if + // we came from the block in which we loaded it. + B.SetInsertPoint(CI); + PHINode *P = B.CreatePHI(Int1Ty, 2); + P->addIncoming(ConstantInt::get(Int1Ty, 0), InitialBB); + P->addIncoming(Bit, ThenB.GetInsertBlock()); + return P; +} + +/// Given a disjoint set of bitsets and globals, layout the globals, build the +/// bit sets and lower the llvm.bitset.test calls. +void LowerBitSets::buildBitSetsFromGlobals( + Module &M, + const std::vector<MDString *> &BitSets, + const std::vector<GlobalVariable *> &Globals) { + // Build a new global with the combined contents of the referenced globals. + std::vector<Constant *> GlobalInits; + for (GlobalVariable *G : Globals) { + GlobalInits.push_back(G->getInitializer()); + uint64_t InitSize = DL->getTypeAllocSize(G->getInitializer()->getType()); + + // Compute the amount of padding required to align the next element to the + // next power of 2. + uint64_t Padding = NextPowerOf2(InitSize - 1) - InitSize; + + // Cap at 128 was found experimentally to have a good data/instruction + // overhead tradeoff. + if (Padding > 128) + Padding = RoundUpToAlignment(InitSize, 128) - InitSize; + + GlobalInits.push_back( + ConstantAggregateZero::get(ArrayType::get(Int8Ty, Padding))); + } + if (!GlobalInits.empty()) + GlobalInits.pop_back(); + Constant *NewInit = ConstantStruct::getAnon(M.getContext(), GlobalInits); + auto CombinedGlobal = + new GlobalVariable(M, NewInit->getType(), /*isConstant=*/true, + GlobalValue::PrivateLinkage, NewInit); + + const StructLayout *CombinedGlobalLayout = + DL->getStructLayout(cast<StructType>(NewInit->getType())); + + // Compute the offsets of the original globals within the new global. + DenseMap<GlobalVariable *, uint64_t> GlobalLayout; + for (unsigned I = 0; I != Globals.size(); ++I) + // Multiply by 2 to account for padding elements. + GlobalLayout[Globals[I]] = CombinedGlobalLayout->getElementOffset(I * 2); + + // For each bitset in this disjoint set... + for (MDString *BS : BitSets) { + // Build the bitset. + BitSetInfo BSI = buildBitSet(BS, GlobalLayout); + + // Create a global in which to store it. 
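The rotate trick described in the comment above can be checked in isolation. A standalone sketch with hypothetical names (plain C++), assuming BitSize is far smaller than 2^(64 - AlignLog2), which holds for any realistic bitset:

#include <cstdint>

static uint64_t rotr64(uint64_t V, unsigned N) {
  return N == 0 ? V : (V >> N) | (V << (64 - N));
}

// One unsigned comparison checks both range and alignment: a misaligned
// PtrOffset rotates its nonzero low bits into the top bits, making the result
// larger than any valid BitSize, while an aligned offset simply becomes the
// bit index used for the later load from the bitset.
static bool offsetCouldBeInBitSet(uint64_t PtrOffset, unsigned AlignLog2,
                                  uint64_t BitSize, uint64_t &BitOffset) {
  BitOffset = rotr64(PtrOffset, AlignLog2);
  return BitOffset < BitSize;
}

// Equivalent, for such BitSize values, to the naive two-test form:
//   PtrOffset % (uint64_t(1) << AlignLog2) == 0 &&
//   (PtrOffset >> AlignLog2) < BitSize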
+ ++NumBitSetsCreated; + Constant *BitsConst = ConstantDataArray::get(M.getContext(), BSI.Bits); + auto BitSetGlobal = new GlobalVariable( + M, BitsConst->getType(), /*isConstant=*/true, + GlobalValue::PrivateLinkage, BitsConst, BS->getString() + ".bits"); + + // Lower each call to llvm.bitset.test for this bitset. + for (CallInst *CI : BitSetTestCallSites[BS]) { + ++NumBitSetCallsLowered; + Value *Lowered = + lowerBitSetCall(CI, BSI, BitSetGlobal, CombinedGlobal, GlobalLayout); + CI->replaceAllUsesWith(Lowered); + CI->eraseFromParent(); + } + } + + // Build aliases pointing to offsets into the combined global for each + // global from which we built the combined global, and replace references + // to the original globals with references to the aliases. + for (unsigned I = 0; I != Globals.size(); ++I) { + // Multiply by 2 to account for padding elements. + Constant *CombinedGlobalIdxs[] = {ConstantInt::get(Int32Ty, 0), + ConstantInt::get(Int32Ty, I * 2)}; + Constant *CombinedGlobalElemPtr = + ConstantExpr::getGetElementPtr(CombinedGlobal, CombinedGlobalIdxs); + GlobalAlias *GAlias = GlobalAlias::create( + Globals[I]->getType()->getElementType(), + Globals[I]->getType()->getAddressSpace(), Globals[I]->getLinkage(), + "", CombinedGlobalElemPtr, &M); + GAlias->takeName(Globals[I]); + Globals[I]->replaceAllUsesWith(GAlias); + Globals[I]->eraseFromParent(); + } +} + +/// Lower all bit sets in this module. +bool LowerBitSets::buildBitSets(Module &M) { + Function *BitSetTestFunc = + M.getFunction(Intrinsic::getName(Intrinsic::bitset_test)); + if (!BitSetTestFunc) + return false; + + // Equivalence class set containing bitsets and the globals they reference. + // This is used to partition the set of bitsets in the module into disjoint + // sets. + typedef EquivalenceClasses<PointerUnion<GlobalVariable *, MDString *>> + GlobalClassesTy; + GlobalClassesTy GlobalClasses; + + for (const Use &U : BitSetTestFunc->uses()) { + auto CI = cast<CallInst>(U.getUser()); + + auto BitSetMDVal = dyn_cast<MetadataAsValue>(CI->getArgOperand(1)); + if (!BitSetMDVal || !isa<MDString>(BitSetMDVal->getMetadata())) + report_fatal_error( + "Second argument of llvm.bitset.test must be metadata string"); + auto BitSet = cast<MDString>(BitSetMDVal->getMetadata()); + + // Add the call site to the list of call sites for this bit set. We also use + // BitSetTestCallSites to keep track of whether we have seen this bit set + // before. If we have, we don't need to re-add the referenced globals to the + // equivalence class. + std::pair<DenseMap<MDString *, std::vector<CallInst *>>::iterator, + bool> Ins = + BitSetTestCallSites.insert( + std::make_pair(BitSet, std::vector<CallInst *>())); + Ins.first->second.push_back(CI); + if (!Ins.second) + continue; + + // Add the bitset to the equivalence class. + GlobalClassesTy::iterator GCI = GlobalClasses.insert(BitSet); + GlobalClassesTy::member_iterator CurSet = GlobalClasses.findLeader(GCI); + + if (!BitSetNM) + continue; + + // Verify the bitset metadata and add the referenced globals to the bitset's + // equivalence class. 
+ for (MDNode *Op : BitSetNM->operands()) { + if (Op->getNumOperands() != 3) + report_fatal_error( + "All operands of llvm.bitsets metadata must have 3 elements"); + + if (Op->getOperand(0) != BitSet || !Op->getOperand(1)) + continue; + + auto OpConstMD = dyn_cast<ConstantAsMetadata>(Op->getOperand(1)); + if (!OpConstMD) + report_fatal_error("Bit set element must be a constant"); + auto OpGlobal = dyn_cast<GlobalVariable>(OpConstMD->getValue()); + if (!OpGlobal) + report_fatal_error("Bit set element must refer to global"); + + auto OffsetConstMD = dyn_cast<ConstantAsMetadata>(Op->getOperand(2)); + if (!OffsetConstMD) + report_fatal_error("Bit set element offset must be a constant"); + auto OffsetInt = dyn_cast<ConstantInt>(OffsetConstMD->getValue()); + if (!OffsetInt) + report_fatal_error( + "Bit set element offset must be an integer constant"); + + CurSet = GlobalClasses.unionSets( + CurSet, GlobalClasses.findLeader(GlobalClasses.insert(OpGlobal))); + } + } + + if (GlobalClasses.empty()) + return false; + + // For each disjoint set we found... + for (GlobalClassesTy::iterator I = GlobalClasses.begin(), + E = GlobalClasses.end(); + I != E; ++I) { + if (!I->isLeader()) continue; + + ++NumBitSetDisjointSets; + + // Build the list of bitsets and referenced globals in this disjoint set. + std::vector<MDString *> BitSets; + std::vector<GlobalVariable *> Globals; + llvm::DenseMap<MDString *, uint64_t> BitSetIndices; + llvm::DenseMap<GlobalVariable *, uint64_t> GlobalIndices; + for (GlobalClassesTy::member_iterator MI = GlobalClasses.member_begin(I); + MI != GlobalClasses.member_end(); ++MI) { + if ((*MI).is<MDString *>()) { + BitSetIndices[MI->get<MDString *>()] = BitSets.size(); + BitSets.push_back(MI->get<MDString *>()); + } else { + GlobalIndices[MI->get<GlobalVariable *>()] = Globals.size(); + Globals.push_back(MI->get<GlobalVariable *>()); + } + } + + // For each bitset, build a set of indices that refer to globals referenced + // by the bitset. + std::vector<std::set<uint64_t>> BitSetMembers(BitSets.size()); + if (BitSetNM) { + for (MDNode *Op : BitSetNM->operands()) { + // Op = { bitset name, global, offset } + if (!Op->getOperand(1)) + continue; + auto I = BitSetIndices.find(cast<MDString>(Op->getOperand(0))); + if (I == BitSetIndices.end()) + continue; + + auto OpGlobal = cast<GlobalVariable>( + cast<ConstantAsMetadata>(Op->getOperand(1))->getValue()); + BitSetMembers[I->second].insert(GlobalIndices[OpGlobal]); + } + } + + // Order the sets of indices by size. The GlobalLayoutBuilder works best + // when given small index sets first. + std::stable_sort( + BitSetMembers.begin(), BitSetMembers.end(), + [](const std::set<uint64_t> &O1, const std::set<uint64_t> &O2) { + return O1.size() < O2.size(); + }); + + // Create a GlobalLayoutBuilder and provide it with index sets as layout + // fragments. The GlobalLayoutBuilder tries to lay out members of fragments + // as close together as possible. + GlobalLayoutBuilder GLB(Globals.size()); + for (auto &&MemSet : BitSetMembers) + GLB.addFragment(MemSet); + + // Build a vector of globals with the computed layout. + std::vector<GlobalVariable *> OrderedGlobals(Globals.size()); + auto OGI = OrderedGlobals.begin(); + for (auto &&F : GLB.Fragments) + for (auto &&Offset : F) + *OGI++ = Globals[Offset]; + + // Order bitsets by name for determinism. + std::sort(BitSets.begin(), BitSets.end(), [](MDString *S1, MDString *S2) { + return S1->getString() < S2->getString(); + }); + + // Build the bitsets from this disjoint set. 
+ buildBitSetsFromGlobals(M, BitSets, OrderedGlobals); + } + + return true; +} + +bool LowerBitSets::eraseBitSetMetadata(Module &M) { + if (!BitSetNM) + return false; + + M.eraseNamedMetadata(BitSetNM); + return true; +} + +bool LowerBitSets::runOnModule(Module &M) { + bool Changed = buildBitSets(M); + Changed |= eraseBitSetMetadata(M); + return Changed; +} diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp index 76d6dfa..4a7cb7b 100644 --- a/lib/Transforms/IPO/PartialInlining.cpp +++ b/lib/Transforms/IPO/PartialInlining.cpp @@ -58,13 +58,13 @@ Function* PartialInliner::unswitchFunction(Function* F) { BasicBlock* returnBlock = nullptr; BasicBlock* nonReturnBlock = nullptr; unsigned returnCount = 0; - for (succ_iterator SI = succ_begin(entryBlock), SE = succ_end(entryBlock); - SI != SE; ++SI) - if (isa<ReturnInst>((*SI)->getTerminator())) { - returnBlock = *SI; + for (BasicBlock *BB : successors(entryBlock)) { + if (isa<ReturnInst>(BB->getTerminator())) { + returnBlock = BB; returnCount++; } else - nonReturnBlock = *SI; + nonReturnBlock = BB; + } if (returnCount != 1) return nullptr; diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index da85a91..9a75050 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -19,12 +19,11 @@ #include "llvm/Analysis/Passes.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Verifier.h" -#include "llvm/PassManager.h" +#include "llvm/IR/LegacyPassManager.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ManagedStatic.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetSubtargetInfo.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Vectorize.h" @@ -118,7 +117,7 @@ void PassManagerBuilder::addExtension(ExtensionPointTy Ty, ExtensionFn Fn) { } void PassManagerBuilder::addExtensionsToPM(ExtensionPointTy ETy, - PassManagerBase &PM) const { + legacy::PassManagerBase &PM) const { for (unsigned i = 0, e = GlobalExtensions->size(); i != e; ++i) if ((*GlobalExtensions)[i].first == ETy) (*GlobalExtensions)[i].second(*this, PM); @@ -127,8 +126,8 @@ void PassManagerBuilder::addExtensionsToPM(ExtensionPointTy ETy, Extensions[i].second(*this, PM); } -void -PassManagerBuilder::addInitialAliasAnalysisPasses(PassManagerBase &PM) const { +void PassManagerBuilder::addInitialAliasAnalysisPasses( + legacy::PassManagerBase &PM) const { // Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that // BasicAliasAnalysis wins if they disagree. This is intended to help // support "obvious" type-punning idioms. @@ -139,11 +138,13 @@ PassManagerBuilder::addInitialAliasAnalysisPasses(PassManagerBase &PM) const { PM.add(createBasicAliasAnalysisPass()); } -void PassManagerBuilder::populateFunctionPassManager(FunctionPassManager &FPM) { +void PassManagerBuilder::populateFunctionPassManager( + legacy::FunctionPassManager &FPM) { addExtensionsToPM(EP_EarlyAsPossible, FPM); // Add LibraryInfo if we have some. 
- if (LibraryInfo) FPM.add(new TargetLibraryInfo(*LibraryInfo)); + if (LibraryInfo) + FPM.add(new TargetLibraryInfoWrapperPass(*LibraryInfo)); if (OptLevel == 0) return; @@ -158,7 +159,8 @@ void PassManagerBuilder::populateFunctionPassManager(FunctionPassManager &FPM) { FPM.add(createLowerExpectIntrinsicPass()); } -void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { +void PassManagerBuilder::populateModulePassManager( + legacy::PassManagerBase &MPM) { // If all optimizations are disabled, just run the always-inline pass and, // if enabled, the function merging pass. if (OptLevel == 0) { @@ -182,7 +184,8 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { } // Add LibraryInfo if we have some. - if (LibraryInfo) MPM.add(new TargetLibraryInfo(*LibraryInfo)); + if (LibraryInfo) + MPM.add(new TargetLibraryInfoWrapperPass(*LibraryInfo)); addInitialAliasAnalysisPasses(MPM); @@ -228,7 +231,8 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { MPM.add(createTailCallEliminationPass()); // Eliminate tail calls MPM.add(createCFGSimplificationPass()); // Merge & remove BBs MPM.add(createReassociatePass()); // Reassociate expressions - MPM.add(createLoopRotatePass()); // Rotate Loop + // Rotate Loop - disable header duplication at -Oz + MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1)); MPM.add(createLICMPass()); // Hoist loop invariants MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3)); MPM.add(createInstructionCombiningPass()); @@ -248,6 +252,11 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { MPM.add(createMemCpyOptPass()); // Remove memcpy / form memset MPM.add(createSCCPPass()); // Constant prop with SCCP + // Delete dead bit computations (instcombine runs after to fold away the dead + // computations, and then ADCE will run later to exploit any new DCE + // opportunities that creates). + MPM.add(createBitTrackingDCEPass()); // Delete dead bit computations + // Run instcombine after redundancy elimination to exploit opportunities // opened up by them. MPM.add(createInstructionCombiningPass()); @@ -255,6 +264,7 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { MPM.add(createJumpThreadingPass()); // Thread jumps MPM.add(createCorrelatedValuePropagationPass()); MPM.add(createDeadStoreEliminationPass()); // Delete dead stores + MPM.add(createLICMPass()); addExtensionsToPM(EP_ScalarOptimizerLate, MPM); @@ -373,7 +383,7 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { addExtensionsToPM(EP_OptimizerLast, MPM); } -void PassManagerBuilder::addLTOOptimizationPasses(PassManagerBase &PM) { +void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { // Provide AliasAnalysis services for optimizations. addInitialAliasAnalysisPasses(PM); @@ -464,6 +474,9 @@ void PassManagerBuilder::addLTOOptimizationPasses(PassManagerBase &PM) { PM.add(createJumpThreadingPass()); + // Lower bitset metadata to bitsets. + PM.add(createLowerBitSetsPass()); + // Delete basic blocks, which optimization passes may have killed. 
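For reference, the new signatures above are driven like this from client code; a minimal usage sketch assuming the post-patch headers (module creation and codegen setup elided):

#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
using namespace llvm;

void buildPipelines(Module &M, const Triple &T) {
  PassManagerBuilder PMB;
  PMB.OptLevel = 2;
  PMB.SizeLevel = 0;
  // LibraryInfo is now wrapped in TargetLibraryInfoWrapperPass by the builder,
  // which also deletes this object in its destructor.
  PMB.LibraryInfo = new TargetLibraryInfoImpl(T);

  legacy::FunctionPassManager FPM(&M); // per-function pipeline
  legacy::PassManager MPM;             // module-level pipeline
  PMB.populateFunctionPassManager(FPM);
  PMB.populateModulePassManager(MPM);
  // Run FPM over each function, then MPM over the module.
}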
PM.add(createCFGSimplificationPass()); @@ -476,15 +489,9 @@ void PassManagerBuilder::addLTOOptimizationPasses(PassManagerBase &PM) { PM.add(createMergeFunctionsPass()); } -void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM, - TargetMachine *TM) { - if (TM) { - PM.add(new DataLayoutPass()); - TM->addAnalysisPasses(PM); - } - +void PassManagerBuilder::populateLTOPassManager(legacy::PassManagerBase &PM) { if (LibraryInfo) - PM.add(new TargetLibraryInfo(*LibraryInfo)); + PM.add(new TargetLibraryInfoWrapperPass(*LibraryInfo)); if (VerifyInput) PM.add(createVerifierPass()); @@ -567,7 +574,7 @@ void LLVMPassManagerBuilderPopulateFunctionPassManager(LLVMPassManagerBuilderRef PMB, LLVMPassManagerRef PM) { PassManagerBuilder *Builder = unwrap(PMB); - FunctionPassManager *FPM = unwrap<FunctionPassManager>(PM); + legacy::FunctionPassManager *FPM = unwrap<legacy::FunctionPassManager>(PM); Builder->populateFunctionPassManager(*FPM); } @@ -575,7 +582,7 @@ void LLVMPassManagerBuilderPopulateModulePassManager(LLVMPassManagerBuilderRef PMB, LLVMPassManagerRef PM) { PassManagerBuilder *Builder = unwrap(PMB); - PassManagerBase *MPM = unwrap(PM); + legacy::PassManagerBase *MPM = unwrap(PM); Builder->populateModulePassManager(*MPM); } @@ -584,7 +591,7 @@ void LLVMPassManagerBuilderPopulateLTOPassManager(LLVMPassManagerBuilderRef PMB, LLVMBool Internalize, LLVMBool RunInliner) { PassManagerBuilder *Builder = unwrap(PMB); - PassManagerBase *LPM = unwrap(PM); + legacy::PassManagerBase *LPM = unwrap(PM); // A small backwards compatibility hack. populateLTOPassManager used to take // an RunInliner option. diff --git a/lib/Transforms/IPO/PruneEH.cpp b/lib/Transforms/IPO/PruneEH.cpp index b2c4a09..1943b93 100644 --- a/lib/Transforms/IPO/PruneEH.cpp +++ b/lib/Transforms/IPO/PruneEH.cpp @@ -18,8 +18,10 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" +#include "llvm/Analysis/LibCallSemantics.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" @@ -175,7 +177,7 @@ bool PruneEH::SimplifyFunction(Function *F) { bool MadeChange = false; for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) - if (II->doesNotThrow()) { + if (II->doesNotThrow() && canSimplifyInvokeNoUnwind(II)) { SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3); // Insert a call instruction before the invoke. CallInst *Call = CallInst::Create(II->getCalledValue(), Args, "", II); @@ -200,7 +202,7 @@ bool PruneEH::SimplifyFunction(Function *F) { BB->getInstList().pop_back(); // If the unwind block is now dead, nuke it. - if (pred_begin(UnwindBlock) == pred_end(UnwindBlock)) + if (pred_empty(UnwindBlock)) DeleteBasicBlock(UnwindBlock); // Delete the new BB. ++NumRemoved; @@ -234,7 +236,7 @@ bool PruneEH::SimplifyFunction(Function *F) { /// updating the callgraph to reflect any now-obsolete edges due to calls that /// exist in the BB. 
void PruneEH::DeleteBasicBlock(BasicBlock *BB) { - assert(pred_begin(BB) == pred_end(BB) && "BB is not dead!"); + assert(pred_empty(BB) && "BB is not dead!"); CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph(); CallGraphNode *CGN = CG[BB->getParent()]; diff --git a/lib/Transforms/IPO/StripSymbols.cpp b/lib/Transforms/IPO/StripSymbols.cpp index 3412b9e..816978e 100644 --- a/lib/Transforms/IPO/StripSymbols.cpp +++ b/lib/Transforms/IPO/StripSymbols.cpp @@ -301,8 +301,8 @@ bool StripDeadDebugInfo::runOnModule(Module &M) { // For each compile unit, find the live set of global variables/functions and // replace the current list of potentially dead global variables/functions // with the live list. - SmallVector<Value *, 64> LiveGlobalVariables; - SmallVector<Value *, 64> LiveSubprograms; + SmallVector<Metadata *, 64> LiveGlobalVariables; + SmallVector<Metadata *, 64> LiveSubprograms; DenseSet<const MDNode *> VisitedSet; for (DICompileUnit DIC : F.compile_units()) { diff --git a/lib/Transforms/InstCombine/CMakeLists.txt b/lib/Transforms/InstCombine/CMakeLists.txt index a25696e..0ed8e62 100644 --- a/lib/Transforms/InstCombine/CMakeLists.txt +++ b/lib/Transforms/InstCombine/CMakeLists.txt @@ -12,6 +12,10 @@ add_llvm_library(LLVMInstCombine InstCombineShifts.cpp InstCombineSimplifyDemanded.cpp InstCombineVectorOps.cpp + + ADDITIONAL_HEADER_DIRS + ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms + ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms/InstCombine ) add_dependencies(LLVMInstCombine intrinsics_gen) diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 902b640..752f79d 100644 --- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -11,7 +11,7 @@ // //===----------------------------------------------------------------------===// -#include "InstCombine.h" +#include "InstCombineInternal.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/IR/DataLayout.h" @@ -751,8 +751,7 @@ Value *FAddCombine::createNaryFAdd return LastVal; } -Value *FAddCombine::createFSub - (Value *Opnd0, Value *Opnd1) { +Value *FAddCombine::createFSub(Value *Opnd0, Value *Opnd1) { Value *V = Builder->CreateFSub(Opnd0, Opnd1); if (Instruction *I = dyn_cast<Instruction>(V)) createInstPostProc(I); @@ -760,15 +759,14 @@ Value *FAddCombine::createFSub } Value *FAddCombine::createFNeg(Value *V) { - Value *Zero = cast<Value>(ConstantFP::get(V->getType(), 0.0)); + Value *Zero = cast<Value>(ConstantFP::getZeroValueForNegation(V->getType())); Value *NewV = createFSub(Zero, V); if (Instruction *I = dyn_cast<Instruction>(NewV)) createInstPostProc(I, true); // fneg's don't receive instruction numbers. return NewV; } -Value *FAddCombine::createFAdd - (Value *Opnd0, Value *Opnd1) { +Value *FAddCombine::createFAdd(Value *Opnd0, Value *Opnd1) { Value *V = Builder->CreateFAdd(Opnd0, Opnd1); if (Instruction *I = dyn_cast<Instruction>(V)) createInstPostProc(I); @@ -789,8 +787,7 @@ Value *FAddCombine::createFDiv(Value *Opnd0, Value *Opnd1) { return V; } -void FAddCombine::createInstPostProc(Instruction *NewInstr, - bool NoNumber) { +void FAddCombine::createInstPostProc(Instruction *NewInstr, bool NoNumber) { NewInstr->setDebugLoc(Instr->getDebugLoc()); // Keep track of the number of instruction created. 
@@ -840,8 +837,7 @@ unsigned FAddCombine::calcInstrNumber(const AddendVect &Opnds) { // <C, V> "fmul V, C" false // // NOTE: Keep this function in sync with FAddCombine::calcInstrNumber. -Value *FAddCombine::createAddendVal - (const FAddend &Opnd, bool &NeedNeg) { +Value *FAddCombine::createAddendVal(const FAddend &Opnd, bool &NeedNeg) { const FAddendCoef &Coeff = Opnd.getCoef(); if (Opnd.isConstant()) { @@ -894,7 +890,6 @@ static bool checkRippleForAdd(const APInt &Op0KnownZero, /// (sext (add LHS, RHS)) === (add (sext LHS), (sext RHS)) /// This basically requires proving that the add in the original type would not /// overflow to change the sign bit or have a carry out. -/// TODO: Handle this for Vectors. bool InstCombiner::WillNotOverflowSignedAdd(Value *LHS, Value *RHS, Instruction *CxtI) { // There are different heuristics we can use for this. Here are some simple @@ -918,42 +913,25 @@ bool InstCombiner::WillNotOverflowSignedAdd(Value *LHS, Value *RHS, ComputeNumSignBits(RHS, 0, CxtI) > 1) return true; - if (IntegerType *IT = dyn_cast<IntegerType>(LHS->getType())) { - int BitWidth = IT->getBitWidth(); - APInt LHSKnownZero(BitWidth, 0); - APInt LHSKnownOne(BitWidth, 0); - computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, 0, CxtI); - - APInt RHSKnownZero(BitWidth, 0); - APInt RHSKnownOne(BitWidth, 0); - computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, 0, CxtI); - - // Addition of two 2's compliment numbers having opposite signs will never - // overflow. - if ((LHSKnownOne[BitWidth - 1] && RHSKnownZero[BitWidth - 1]) || - (LHSKnownZero[BitWidth - 1] && RHSKnownOne[BitWidth - 1])) - return true; - - // Check if carry bit of addition will not cause overflow. - if (checkRippleForAdd(LHSKnownZero, RHSKnownZero)) - return true; - if (checkRippleForAdd(RHSKnownZero, LHSKnownZero)) - return true; - } - return false; -} + unsigned BitWidth = LHS->getType()->getScalarSizeInBits(); + APInt LHSKnownZero(BitWidth, 0); + APInt LHSKnownOne(BitWidth, 0); + computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, 0, CxtI); -/// WillNotOverflowUnsignedAdd - Return true if we can prove that: -/// (zext (add LHS, RHS)) === (add (zext LHS), (zext RHS)) -bool InstCombiner::WillNotOverflowUnsignedAdd(Value *LHS, Value *RHS, - Instruction *CxtI) { - // There are different heuristics we can use for this. Here is a simple one. - // If the sign bit of LHS and that of RHS are both zero, no unsigned wrap. - bool LHSKnownNonNegative, LHSKnownNegative; - bool RHSKnownNonNegative, RHSKnownNegative; - ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, DL, 0, AT, CxtI, DT); - ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, DL, 0, AT, CxtI, DT); - if (LHSKnownNonNegative && RHSKnownNonNegative) + APInt RHSKnownZero(BitWidth, 0); + APInt RHSKnownOne(BitWidth, 0); + computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, 0, CxtI); + + // Addition of two 2's compliment numbers having opposite signs will never + // overflow. + if ((LHSKnownOne[BitWidth - 1] && RHSKnownZero[BitWidth - 1]) || + (LHSKnownZero[BitWidth - 1] && RHSKnownOne[BitWidth - 1])) + return true; + + // Check if carry bit of addition will not cause overflow. 
+ if (checkRippleForAdd(LHSKnownZero, RHSKnownZero)) + return true; + if (checkRippleForAdd(RHSKnownZero, LHSKnownZero)) return true; return false; @@ -972,24 +950,22 @@ bool InstCombiner::WillNotOverflowSignedSub(Value *LHS, Value *RHS, ComputeNumSignBits(RHS, 0, CxtI) > 1) return true; - if (IntegerType *IT = dyn_cast<IntegerType>(LHS->getType())) { - unsigned BitWidth = IT->getBitWidth(); - APInt LHSKnownZero(BitWidth, 0); - APInt LHSKnownOne(BitWidth, 0); - computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, 0, CxtI); + unsigned BitWidth = LHS->getType()->getScalarSizeInBits(); + APInt LHSKnownZero(BitWidth, 0); + APInt LHSKnownOne(BitWidth, 0); + computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, 0, CxtI); - APInt RHSKnownZero(BitWidth, 0); - APInt RHSKnownOne(BitWidth, 0); - computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, 0, CxtI); + APInt RHSKnownZero(BitWidth, 0); + APInt RHSKnownOne(BitWidth, 0); + computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, 0, CxtI); - // Subtraction of two 2's compliment numbers having identical signs will - // never overflow. - if ((LHSKnownOne[BitWidth - 1] && RHSKnownOne[BitWidth - 1]) || - (LHSKnownZero[BitWidth - 1] && RHSKnownZero[BitWidth - 1])) - return true; + // Subtraction of two 2's compliment numbers having identical signs will + // never overflow. + if ((LHSKnownOne[BitWidth - 1] && RHSKnownOne[BitWidth - 1]) || + (LHSKnownZero[BitWidth - 1] && RHSKnownZero[BitWidth - 1])) + return true; - // TODO: implement logic similar to checkRippleForAdd - } + // TODO: implement logic similar to checkRippleForAdd return false; } @@ -1000,8 +976,8 @@ bool InstCombiner::WillNotOverflowUnsignedSub(Value *LHS, Value *RHS, // If the LHS is negative and the RHS is non-negative, no unsigned wrap. bool LHSKnownNonNegative, LHSKnownNegative; bool RHSKnownNonNegative, RHSKnownNegative; - ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, DL, 0, AT, CxtI, DT); - ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, DL, 0, AT, CxtI, DT); + ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, /*Depth=*/0, CxtI); + ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, /*Depth=*/0, CxtI); if (LHSKnownNegative && RHSKnownNonNegative) return true; @@ -1077,7 +1053,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { return ReplaceInstUsesWith(I, V); if (Value *V = SimplifyAddInst(LHS, RHS, I.hasNoSignedWrap(), - I.hasNoUnsignedWrap(), DL, TLI, DT, AT)) + I.hasNoUnsignedWrap(), DL, TLI, DT, AC)) return ReplaceInstUsesWith(I, V); // (A*B)+(A*C) -> A*(B+C) etc @@ -1335,7 +1311,9 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { Changed = true; I.setHasNoSignedWrap(true); } - if (!I.hasNoUnsignedWrap() && WillNotOverflowUnsignedAdd(LHS, RHS, &I)) { + if (!I.hasNoUnsignedWrap() && + computeOverflowForUnsignedAdd(LHS, RHS, &I) == + OverflowResult::NeverOverflows) { Changed = true; I.setHasNoUnsignedWrap(true); } @@ -1350,8 +1328,8 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { if (Value *V = SimplifyVectorOp(I)) return ReplaceInstUsesWith(I, V); - if (Value *V = SimplifyFAddInst(LHS, RHS, I.getFastMathFlags(), DL, - TLI, DT, AT)) + if (Value *V = + SimplifyFAddInst(LHS, RHS, I.getFastMathFlags(), DL, TLI, DT, AC)) return ReplaceInstUsesWith(I, V); if (isa<Constant>(RHS)) { @@ -1529,7 +1507,7 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { return ReplaceInstUsesWith(I, V); if (Value *V = SimplifySubInst(Op0, Op1, I.hasNoSignedWrap(), - I.hasNoUnsignedWrap(), DL, TLI, DT, AT)) + I.hasNoUnsignedWrap(), DL, TLI, DT, 
AC)) return ReplaceInstUsesWith(I, V); // (A*B)-(A*C) -> A*(B-C) etc @@ -1717,10 +1695,18 @@ Instruction *InstCombiner::visitFSub(BinaryOperator &I) { if (Value *V = SimplifyVectorOp(I)) return ReplaceInstUsesWith(I, V); - if (Value *V = SimplifyFSubInst(Op0, Op1, I.getFastMathFlags(), DL, - TLI, DT, AT)) + if (Value *V = + SimplifyFSubInst(Op0, Op1, I.getFastMathFlags(), DL, TLI, DT, AC)) return ReplaceInstUsesWith(I, V); + // fsub nsz 0, X ==> fsub nsz -0.0, X + if (I.getFastMathFlags().noSignedZeros() && match(Op0, m_Zero())) { + // Subtraction from -0.0 is the canonical form of fneg. + Instruction *NewI = BinaryOperator::CreateFNeg(Op1); + NewI->copyFastMathFlags(&I); + return NewI; + } + if (isa<Constant>(Op0)) if (SelectInst *SI = dyn_cast<SelectInst>(Op1)) if (Instruction *NV = FoldOpIntoSelect(I, SI)) diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 55ebced..863eeaf 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -11,7 +11,7 @@ // //===----------------------------------------------------------------------===// -#include "InstCombine.h" +#include "InstCombineInternal.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/Intrinsics.h" @@ -22,30 +22,12 @@ using namespace PatternMatch; #define DEBUG_TYPE "instcombine" -/// isFreeToInvert - Return true if the specified value is free to invert (apply -/// ~ to). This happens in cases where the ~ can be eliminated. -static inline bool isFreeToInvert(Value *V) { - // ~(~(X)) -> X. - if (BinaryOperator::isNot(V)) - return true; - - // Constants can be considered to be not'ed values. - if (isa<ConstantInt>(V)) - return true; - - // Compares can be inverted if they have a single use. - if (CmpInst *CI = dyn_cast<CmpInst>(V)) - return CI->hasOneUse(); - - return false; -} - static inline Value *dyn_castNotVal(Value *V) { // If this is not(not(x)) don't return that this is a not: we want the two // not's to be folded first. if (BinaryOperator::isNot(V)) { Value *Operand = BinaryOperator::getNotArgument(V); - if (!isFreeToInvert(Operand)) + if (!IsFreeToInvert(Operand, Operand->hasOneUse())) return Operand; } @@ -117,6 +99,61 @@ static Value *getFCmpValue(bool isordered, unsigned code, return Builder->CreateFCmp(Pred, LHS, RHS); } +/// \brief Transform BITWISE_OP(BSWAP(A),BSWAP(B)) to BSWAP(BITWISE_OP(A, B)) +/// \param I Binary operator to transform. +/// \return Pointer to node that must replace the original binary operator, or +/// null pointer if no transformation was made. +Value *InstCombiner::SimplifyBSwap(BinaryOperator &I) { + IntegerType *ITy = dyn_cast<IntegerType>(I.getType()); + + // Can't do vectors. + if (I.getType()->isVectorTy()) return nullptr; + + // Can only do bitwise ops. 
+ unsigned Op = I.getOpcode(); + if (Op != Instruction::And && Op != Instruction::Or && + Op != Instruction::Xor) + return nullptr; + + Value *OldLHS = I.getOperand(0); + Value *OldRHS = I.getOperand(1); + ConstantInt *ConstLHS = dyn_cast<ConstantInt>(OldLHS); + ConstantInt *ConstRHS = dyn_cast<ConstantInt>(OldRHS); + IntrinsicInst *IntrLHS = dyn_cast<IntrinsicInst>(OldLHS); + IntrinsicInst *IntrRHS = dyn_cast<IntrinsicInst>(OldRHS); + bool IsBswapLHS = (IntrLHS && IntrLHS->getIntrinsicID() == Intrinsic::bswap); + bool IsBswapRHS = (IntrRHS && IntrRHS->getIntrinsicID() == Intrinsic::bswap); + + if (!IsBswapLHS && !IsBswapRHS) + return nullptr; + + if (!IsBswapLHS && !ConstLHS) + return nullptr; + + if (!IsBswapRHS && !ConstRHS) + return nullptr; + + /// OP( BSWAP(x), BSWAP(y) ) -> BSWAP( OP(x, y) ) + /// OP( BSWAP(x), CONSTANT ) -> BSWAP( OP(x, BSWAP(CONSTANT) ) ) + Value *NewLHS = IsBswapLHS ? IntrLHS->getOperand(0) : + Builder->getInt(ConstLHS->getValue().byteSwap()); + + Value *NewRHS = IsBswapRHS ? IntrRHS->getOperand(0) : + Builder->getInt(ConstRHS->getValue().byteSwap()); + + Value *BinOp = nullptr; + if (Op == Instruction::And) + BinOp = Builder->CreateAnd(NewLHS, NewRHS); + else if (Op == Instruction::Or) + BinOp = Builder->CreateOr(NewLHS, NewRHS); + else //if (Op == Instruction::Xor) + BinOp = Builder->CreateXor(NewLHS, NewRHS); + + Module *M = I.getParent()->getParent()->getParent(); + Function *F = Intrinsic::getDeclaration(M, Intrinsic::bswap, ITy); + return Builder->CreateCall(F, BinOp); +} + // OptAndOp - This handles expressions of the form ((val OP C1) & C2). Where // the Op parameter is 'OP', OpRHS is 'C1', and AndRHS is 'C2'. Op is // guaranteed to be a binary operator. @@ -785,6 +822,62 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, return nullptr; } +/// Try to fold a signed range checked with lower bound 0 to an unsigned icmp. +/// Example: (icmp sge x, 0) & (icmp slt x, n) --> icmp ult x, n +/// If \p Inverted is true then the check is for the inverted range, e.g. +/// (icmp slt x, 0) | (icmp sgt x, n) --> icmp ugt x, n +Value *InstCombiner::simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1, + bool Inverted) { + // Check the lower range comparison, e.g. x >= 0 + // InstCombine already ensured that if there is a constant it's on the RHS. + ConstantInt *RangeStart = dyn_cast<ConstantInt>(Cmp0->getOperand(1)); + if (!RangeStart) + return nullptr; + + ICmpInst::Predicate Pred0 = (Inverted ? Cmp0->getInversePredicate() : + Cmp0->getPredicate()); + + // Accept x > -1 or x >= 0 (after potentially inverting the predicate). + if (!((Pred0 == ICmpInst::ICMP_SGT && RangeStart->isMinusOne()) || + (Pred0 == ICmpInst::ICMP_SGE && RangeStart->isZero()))) + return nullptr; + + ICmpInst::Predicate Pred1 = (Inverted ? Cmp1->getInversePredicate() : + Cmp1->getPredicate()); + + Value *Input = Cmp0->getOperand(0); + Value *RangeEnd; + if (Cmp1->getOperand(0) == Input) { + // For the upper range compare we have: icmp x, n + RangeEnd = Cmp1->getOperand(1); + } else if (Cmp1->getOperand(1) == Input) { + // For the upper range compare we have: icmp n, x + RangeEnd = Cmp1->getOperand(0); + Pred1 = ICmpInst::getSwappedPredicate(Pred1); + } else { + return nullptr; + } + + // Check the upper range comparison, e.g. 
x < n + ICmpInst::Predicate NewPred; + switch (Pred1) { + case ICmpInst::ICMP_SLT: NewPred = ICmpInst::ICMP_ULT; break; + case ICmpInst::ICMP_SLE: NewPred = ICmpInst::ICMP_ULE; break; + default: return nullptr; + } + + // This simplification is only valid if the upper range is not negative. + bool IsNegative, IsNotNegative; + ComputeSignBit(RangeEnd, IsNotNegative, IsNegative, /*Depth=*/0, Cmp1); + if (!IsNotNegative) + return nullptr; + + if (Inverted) + NewPred = ICmpInst::getInversePredicate(NewPred); + + return Builder->CreateICmp(NewPred, Input, RangeEnd); +} + /// FoldAndOfICmps - Fold (icmp)&(icmp) if possible. Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate(); @@ -807,6 +900,14 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, true, Builder)) return V; + // E.g. (icmp sge x, 0) & (icmp slt x, n) --> icmp ult x, n + if (Value *V = simplifyRangeCheck(LHS, RHS, /*Inverted=*/false)) + return V; + + // E.g. (icmp slt x, n) & (icmp sge x, 0) --> icmp ult x, n + if (Value *V = simplifyRangeCheck(RHS, LHS, /*Inverted=*/false)) + return V; + // This only handles icmp of constants: (icmp1 A, C1) & (icmp2 B, C2). Value *Val = LHS->getOperand(0), *Val2 = RHS->getOperand(0); ConstantInt *LHSCst = dyn_cast<ConstantInt>(LHS->getOperand(1)); @@ -1108,7 +1209,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { if (Value *V = SimplifyVectorOp(I)) return ReplaceInstUsesWith(I, V); - if (Value *V = SimplifyAndInst(Op0, Op1, DL, TLI, DT, AT)) + if (Value *V = SimplifyAndInst(Op0, Op1, DL, TLI, DT, AC)) return ReplaceInstUsesWith(I, V); // (A|B)&(A|C) -> A|(B&C) etc @@ -1120,6 +1221,9 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { if (SimplifyDemandedInstructionBits(I)) return &I; + if (Value *V = SimplifyBSwap(I)) + return ReplaceInstUsesWith(I, V); + if (ConstantInt *AndRHS = dyn_cast<ConstantInt>(Op1)) { const APInt &AndRHSMask = AndRHS->getValue(); @@ -1605,15 +1709,15 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, Value *Mask = nullptr; Value *Masked = nullptr; if (LAnd->getOperand(0) == RAnd->getOperand(0) && - isKnownToBeAPowerOfTwo(LAnd->getOperand(1), false, 0, AT, CxtI, DT) && - isKnownToBeAPowerOfTwo(RAnd->getOperand(1), false, 0, AT, CxtI, DT)) { + isKnownToBeAPowerOfTwo(LAnd->getOperand(1), false, 0, AC, CxtI, DT) && + isKnownToBeAPowerOfTwo(RAnd->getOperand(1), false, 0, AC, CxtI, DT)) { Mask = Builder->CreateOr(LAnd->getOperand(1), RAnd->getOperand(1)); Masked = Builder->CreateAnd(LAnd->getOperand(0), Mask); } else if (LAnd->getOperand(1) == RAnd->getOperand(1) && - isKnownToBeAPowerOfTwo(LAnd->getOperand(0), - false, 0, AT, CxtI, DT) && - isKnownToBeAPowerOfTwo(RAnd->getOperand(0), - false, 0, AT, CxtI, DT)) { + isKnownToBeAPowerOfTwo(LAnd->getOperand(0), false, 0, AC, CxtI, + DT) && + isKnownToBeAPowerOfTwo(RAnd->getOperand(0), false, 0, AC, CxtI, + DT)) { Mask = Builder->CreateOr(LAnd->getOperand(0), RAnd->getOperand(0)); Masked = Builder->CreateAnd(LAnd->getOperand(1), Mask); } @@ -1724,6 +1828,14 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, Builder->CreateAdd(B, ConstantInt::getSigned(B->getType(), -1)), A); } + // E.g. (icmp slt x, 0) | (icmp sgt x, n) --> icmp ugt x, n + if (Value *V = simplifyRangeCheck(LHS, RHS, /*Inverted=*/true)) + return V; + + // E.g. 
(icmp sgt x, n) | (icmp slt x, 0) --> icmp ugt x, n + if (Value *V = simplifyRangeCheck(RHS, LHS, /*Inverted=*/true)) + return V; + // This only handles icmp of constants: (icmp1 A, C1) | (icmp2 B, C2). if (!LHSCst || !RHSCst) return nullptr; @@ -2033,7 +2145,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { if (Value *V = SimplifyVectorOp(I)) return ReplaceInstUsesWith(I, V); - if (Value *V = SimplifyOrInst(Op0, Op1, DL, TLI, DT, AT)) + if (Value *V = SimplifyOrInst(Op0, Op1, DL, TLI, DT, AC)) return ReplaceInstUsesWith(I, V); // (A&B)|(A&C) -> A&(B|C) etc @@ -2045,6 +2157,9 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { if (SimplifyDemandedInstructionBits(I)) return &I; + if (Value *V = SimplifyBSwap(I)) + return ReplaceInstUsesWith(I, V); + if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) { ConstantInt *C1 = nullptr; Value *X = nullptr; // (X & C1) | C2 --> (X | C2) & (C1|C2) @@ -2305,11 +2420,34 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { if (SwappedForXor) std::swap(Op0, Op1); - if (ICmpInst *RHS = dyn_cast<ICmpInst>(I.getOperand(1))) - if (ICmpInst *LHS = dyn_cast<ICmpInst>(I.getOperand(0))) + { + ICmpInst *LHS = dyn_cast<ICmpInst>(Op0); + ICmpInst *RHS = dyn_cast<ICmpInst>(Op1); + if (LHS && RHS) if (Value *Res = FoldOrOfICmps(LHS, RHS, &I)) return ReplaceInstUsesWith(I, Res); + // TODO: Make this recursive; it's a little tricky because an arbitrary + // number of 'or' instructions might have to be created. + Value *X, *Y; + if (LHS && match(Op1, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) { + if (auto *Cmp = dyn_cast<ICmpInst>(X)) + if (Value *Res = FoldOrOfICmps(LHS, Cmp, &I)) + return ReplaceInstUsesWith(I, Builder->CreateOr(Res, Y)); + if (auto *Cmp = dyn_cast<ICmpInst>(Y)) + if (Value *Res = FoldOrOfICmps(LHS, Cmp, &I)) + return ReplaceInstUsesWith(I, Builder->CreateOr(Res, X)); + } + if (RHS && match(Op0, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) { + if (auto *Cmp = dyn_cast<ICmpInst>(X)) + if (Value *Res = FoldOrOfICmps(Cmp, RHS, &I)) + return ReplaceInstUsesWith(I, Builder->CreateOr(Res, Y)); + if (auto *Cmp = dyn_cast<ICmpInst>(Y)) + if (Value *Res = FoldOrOfICmps(Cmp, RHS, &I)) + return ReplaceInstUsesWith(I, Builder->CreateOr(Res, X)); + } + } + // (fcmp uno x, c) | (fcmp uno y, c) -> (fcmp uno x, y) if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0))) if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1))) @@ -2394,7 +2532,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { if (Value *V = SimplifyVectorOp(I)) return ReplaceInstUsesWith(I, V); - if (Value *V = SimplifyXorInst(Op0, Op1, DL, TLI, DT, AT)) + if (Value *V = SimplifyXorInst(Op0, Op1, DL, TLI, DT, AC)) return ReplaceInstUsesWith(I, V); // (A&B)^(A&C) -> A&(B^C) etc @@ -2406,6 +2544,9 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { if (SimplifyDemandedInstructionBits(I)) return &I; + if (Value *V = SimplifyBSwap(I)) + return ReplaceInstUsesWith(I, V); + // Is this a ~ operation? 
if (Value *NotOp = dyn_castNotVal(&I)) { if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(NotOp)) { @@ -2426,8 +2567,10 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { // ~(X & Y) --> (~X | ~Y) - De Morgan's Law // ~(X | Y) === (~X & ~Y) - De Morgan's Law - if (isFreeToInvert(Op0I->getOperand(0)) && - isFreeToInvert(Op0I->getOperand(1))) { + if (IsFreeToInvert(Op0I->getOperand(0), + Op0I->getOperand(0)->hasOneUse()) && + IsFreeToInvert(Op0I->getOperand(1), + Op0I->getOperand(1)->hasOneUse())) { Value *NotX = Builder->CreateNot(Op0I->getOperand(0), "notlhs"); Value *NotY = @@ -2445,15 +2588,16 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { } } - - if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) { - if (RHS->isOne() && Op0->hasOneUse()) + if (Constant *RHS = dyn_cast<Constant>(Op1)) { + if (RHS->isAllOnesValue() && Op0->hasOneUse()) // xor (cmp A, B), true = not (cmp A, B) = !cmp A, B if (CmpInst *CI = dyn_cast<CmpInst>(Op0)) return CmpInst::Create(CI->getOpcode(), CI->getInversePredicate(), CI->getOperand(0), CI->getOperand(1)); + } + if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) { // fold (xor(zext(cmp)), 1) and (xor(sext(cmp)), -1) to ext(!cmp). if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) { if (CmpInst *CI = dyn_cast<CmpInst>(Op0C->getOperand(0))) { diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index 87e49a1..05e7162 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -11,15 +11,17 @@ // //===----------------------------------------------------------------------===// -#include "InstCombine.h" +#include "InstCombineInternal.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Statepoint.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/SimplifyLibCalls.h" using namespace llvm; using namespace PatternMatch; @@ -59,8 +61,8 @@ static Type *reduceToSingleValueType(Type *T) { } Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { - unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), DL, AT, MI, DT); - unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), DL, AT, MI, DT); + unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), DL, AC, MI, DT); + unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), DL, AC, MI, DT); unsigned MinAlign = std::min(DstAlign, SrcAlign); unsigned CopyAlign = MI->getAlignment(); @@ -118,15 +120,14 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { // If the memcpy has metadata describing the members, see if we can // get the TBAA tag describing our copy. 
if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) { - if (M->getNumOperands() == 3 && - M->getOperand(0) && - isa<ConstantInt>(M->getOperand(0)) && - cast<ConstantInt>(M->getOperand(0))->isNullValue() && + if (M->getNumOperands() == 3 && M->getOperand(0) && + mdconst::hasa<ConstantInt>(M->getOperand(0)) && + mdconst::extract<ConstantInt>(M->getOperand(0))->isNullValue() && M->getOperand(1) && - isa<ConstantInt>(M->getOperand(1)) && - cast<ConstantInt>(M->getOperand(1))->getValue() == Size && - M->getOperand(2) && - isa<MDNode>(M->getOperand(2))) + mdconst::hasa<ConstantInt>(M->getOperand(1)) && + mdconst::extract<ConstantInt>(M->getOperand(1))->getValue() == + Size && + M->getOperand(2) && isa<MDNode>(M->getOperand(2))) CopyMD = cast<MDNode>(M->getOperand(2)); } } @@ -155,7 +156,7 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { } Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) { - unsigned Alignment = getKnownAlignment(MI->getDest(), DL, AT, MI, DT); + unsigned Alignment = getKnownAlignment(MI->getDest(), DL, AC, MI, DT); if (MI->getAlignment() < Alignment) { MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), Alignment, false)); @@ -352,48 +353,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; case Intrinsic::uadd_with_overflow: { Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1); - IntegerType *IT = cast<IntegerType>(II->getArgOperand(0)->getType()); - uint32_t BitWidth = IT->getBitWidth(); - APInt LHSKnownZero(BitWidth, 0); - APInt LHSKnownOne(BitWidth, 0); - computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, 0, II); - bool LHSKnownNegative = LHSKnownOne[BitWidth - 1]; - bool LHSKnownPositive = LHSKnownZero[BitWidth - 1]; - - if (LHSKnownNegative || LHSKnownPositive) { - APInt RHSKnownZero(BitWidth, 0); - APInt RHSKnownOne(BitWidth, 0); - computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, 0, II); - bool RHSKnownNegative = RHSKnownOne[BitWidth - 1]; - bool RHSKnownPositive = RHSKnownZero[BitWidth - 1]; - if (LHSKnownNegative && RHSKnownNegative) { - // The sign bit is set in both cases: this MUST overflow. - // Create a simple add instruction, and insert it into the struct. - Value *Add = Builder->CreateAdd(LHS, RHS); - Add->takeName(&CI); - Constant *V[] = { - UndefValue::get(LHS->getType()), - ConstantInt::getTrue(II->getContext()) - }; - StructType *ST = cast<StructType>(II->getType()); - Constant *Struct = ConstantStruct::get(ST, V); - return InsertValueInst::Create(Struct, Add, 0); - } - - if (LHSKnownPositive && RHSKnownPositive) { - // The sign bit is clear in both cases: this CANNOT overflow. - // Create a simple add instruction, and insert it into the struct. 
- Value *Add = Builder->CreateNUWAdd(LHS, RHS); - Add->takeName(&CI); - Constant *V[] = { - UndefValue::get(LHS->getType()), - ConstantInt::getFalse(II->getContext()) - }; - StructType *ST = cast<StructType>(II->getType()); - Constant *Struct = ConstantStruct::get(ST, V); - return InsertValueInst::Create(Struct, Add, 0); - } - } + OverflowResult OR = computeOverflowForUnsignedAdd(LHS, RHS, II); + if (OR == OverflowResult::NeverOverflows) + return CreateOverflowTuple(II, Builder->CreateNUWAdd(LHS, RHS), false); + if (OR == OverflowResult::AlwaysOverflows) + return CreateOverflowTuple(II, Builder->CreateAdd(LHS, RHS), true); } // FALL THROUGH uadd into sadd case Intrinsic::sadd_with_overflow: @@ -413,13 +377,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (ConstantInt *RHS = dyn_cast<ConstantInt>(II->getArgOperand(1))) { // X + 0 -> {X, false} if (RHS->isZero()) { - Constant *V[] = { - UndefValue::get(II->getArgOperand(0)->getType()), - ConstantInt::getFalse(II->getContext()) - }; - Constant *Struct = - ConstantStruct::get(cast<StructType>(II->getType()), V); - return InsertValueInst::Create(Struct, II->getArgOperand(0), 0); + return CreateOverflowTuple(II, II->getArgOperand(0), false, + /*ReUseName*/false); } } @@ -428,65 +387,43 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (II->getIntrinsicID() == Intrinsic::sadd_with_overflow) { Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1); if (WillNotOverflowSignedAdd(LHS, RHS, II)) { - Value *Add = Builder->CreateNSWAdd(LHS, RHS); - Add->takeName(&CI); - Constant *V[] = {UndefValue::get(Add->getType()), Builder->getFalse()}; - StructType *ST = cast<StructType>(II->getType()); - Constant *Struct = ConstantStruct::get(ST, V); - return InsertValueInst::Create(Struct, Add, 0); + return CreateOverflowTuple(II, Builder->CreateNSWAdd(LHS, RHS), false); } } break; case Intrinsic::usub_with_overflow: - case Intrinsic::ssub_with_overflow: + case Intrinsic::ssub_with_overflow: { + Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1); // undef - X -> undef // X - undef -> undef - if (isa<UndefValue>(II->getArgOperand(0)) || - isa<UndefValue>(II->getArgOperand(1))) + if (isa<UndefValue>(LHS) || isa<UndefValue>(RHS)) return ReplaceInstUsesWith(CI, UndefValue::get(II->getType())); - if (ConstantInt *RHS = dyn_cast<ConstantInt>(II->getArgOperand(1))) { + if (ConstantInt *ConstRHS = dyn_cast<ConstantInt>(RHS)) { // X - 0 -> {X, false} - if (RHS->isZero()) { - Constant *V[] = { - UndefValue::get(II->getArgOperand(0)->getType()), - ConstantInt::getFalse(II->getContext()) - }; - Constant *Struct = - ConstantStruct::get(cast<StructType>(II->getType()), V); - return InsertValueInst::Create(Struct, II->getArgOperand(0), 0); + if (ConstRHS->isZero()) { + return CreateOverflowTuple(II, LHS, false, /*ReUseName*/false); + } + } + if (II->getIntrinsicID() == Intrinsic::ssub_with_overflow) { + if (WillNotOverflowSignedSub(LHS, RHS, II)) { + return CreateOverflowTuple(II, Builder->CreateNSWSub(LHS, RHS), false); + } + } else { + if (WillNotOverflowUnsignedSub(LHS, RHS, II)) { + return CreateOverflowTuple(II, Builder->CreateNUWSub(LHS, RHS), false); } } break; + } case Intrinsic::umul_with_overflow: { Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1); - unsigned BitWidth = cast<IntegerType>(LHS->getType())->getBitWidth(); - - APInt LHSKnownZero(BitWidth, 0); - APInt LHSKnownOne(BitWidth, 0); - computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, 0, II); - APInt RHSKnownZero(BitWidth, 0); - APInt 
RHSKnownOne(BitWidth, 0); - computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, 0, II); - - // Get the largest possible values for each operand. - APInt LHSMax = ~LHSKnownZero; - APInt RHSMax = ~RHSKnownZero; - - // If multiplying the maximum values does not overflow then we can turn - // this into a plain NUW mul. - bool Overflow; - LHSMax.umul_ov(RHSMax, Overflow); - if (!Overflow) { - Value *Mul = Builder->CreateNUWMul(LHS, RHS, "umul_with_overflow"); - Constant *V[] = { - UndefValue::get(LHS->getType()), - Builder->getFalse() - }; - Constant *Struct = ConstantStruct::get(cast<StructType>(II->getType()),V); - return InsertValueInst::Create(Struct, Mul, 0); - } + OverflowResult OR = computeOverflowForUnsignedMul(LHS, RHS, II); + if (OR == OverflowResult::NeverOverflows) + return CreateOverflowTuple(II, Builder->CreateNUWMul(LHS, RHS), false); + if (OR == OverflowResult::AlwaysOverflows) + return CreateOverflowTuple(II, Builder->CreateMul(LHS, RHS), true); } // FALL THROUGH case Intrinsic::smul_with_overflow: // Canonicalize constants into the RHS. @@ -509,13 +446,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // X * 1 -> {X, false} if (RHSI->equalsInt(1)) { - Constant *V[] = { - UndefValue::get(II->getArgOperand(0)->getType()), - ConstantInt::getFalse(II->getContext()) - }; - Constant *Struct = - ConstantStruct::get(cast<StructType>(II->getType()), V); - return InsertValueInst::Create(Struct, II->getArgOperand(0), 0); + return CreateOverflowTuple(II, II->getArgOperand(0), false, + /*ReUseName*/false); + } + } + if (II->getIntrinsicID() == Intrinsic::smul_with_overflow) { + Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1); + if (WillNotOverflowSignedMul(LHS, RHS, II)) { + return CreateOverflowTuple(II, Builder->CreateNSWMul(LHS, RHS), false); } } break; @@ -606,8 +544,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::ppc_altivec_lvx: case Intrinsic::ppc_altivec_lvxl: // Turn PPC lvx -> load if the pointer is known aligned. - if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, - DL, AT, II, DT) >= 16) { + if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, AC, II, DT) >= + 16) { Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0), PointerType::getUnqual(II->getType())); return new LoadInst(Ptr); @@ -623,8 +561,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::ppc_altivec_stvx: case Intrinsic::ppc_altivec_stvxl: // Turn stvx -> store if the pointer is known aligned. - if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, - DL, AT, II, DT) >= 16) { + if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, AC, II, DT) >= + 16) { Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(0)->getType()); Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy); @@ -638,12 +576,50 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy); return new StoreInst(II->getArgOperand(0), Ptr, false, 1); } + case Intrinsic::ppc_qpx_qvlfs: + // Turn PPC QPX qvlfs -> load if the pointer is known aligned. + if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, AC, II, DT) >= + 16) { + Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0), + PointerType::getUnqual(II->getType())); + return new LoadInst(Ptr); + } + break; + case Intrinsic::ppc_qpx_qvlfd: + // Turn PPC QPX qvlfd -> load if the pointer is known aligned. 
+ if (getOrEnforceKnownAlignment(II->getArgOperand(0), 32, DL, AC, II, DT) >= + 32) { + Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0), + PointerType::getUnqual(II->getType())); + return new LoadInst(Ptr); + } + break; + case Intrinsic::ppc_qpx_qvstfs: + // Turn PPC QPX qvstfs -> store if the pointer is known aligned. + if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, AC, II, DT) >= + 16) { + Type *OpPtrTy = + PointerType::getUnqual(II->getArgOperand(0)->getType()); + Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy); + return new StoreInst(II->getArgOperand(0), Ptr); + } + break; + case Intrinsic::ppc_qpx_qvstfd: + // Turn PPC QPX qvstfd -> store if the pointer is known aligned. + if (getOrEnforceKnownAlignment(II->getArgOperand(1), 32, DL, AC, II, DT) >= + 32) { + Type *OpPtrTy = + PointerType::getUnqual(II->getArgOperand(0)->getType()); + Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy); + return new StoreInst(II->getArgOperand(0), Ptr); + } + break; case Intrinsic::x86_sse_storeu_ps: case Intrinsic::x86_sse2_storeu_pd: case Intrinsic::x86_sse2_storeu_dq: // Turn X86 storeu -> store if the pointer is known aligned. - if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, - DL, AT, II, DT) >= 16) { + if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, AC, II, DT) >= + 16) { Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(1)->getType()); Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0), OpPtrTy); @@ -774,7 +750,22 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // TODO: eventually we should lower this intrinsic to IR if (auto CIWidth = dyn_cast<ConstantInt>(II->getArgOperand(2))) { if (auto CIStart = dyn_cast<ConstantInt>(II->getArgOperand(3))) { - if (CIWidth->equalsInt(64) && CIStart->isZero()) { + unsigned Index = CIStart->getZExtValue(); + // From AMD documentation: "a value of zero in the field length is + // defined as length of 64". + unsigned Length = CIWidth->equalsInt(0) ? 64 : CIWidth->getZExtValue(); + + // From AMD documentation: "If the sum of the bit index + length field + // is greater than 64, the results are undefined". + + // Note that both field index and field length are 8-bit quantities. + // Since variables 'Index' and 'Length' are unsigned values + // obtained from zero-extending field index and field length + // respectively, their sum should never wrap around. 
+ if ((Index + Length) > 64) + return ReplaceInstUsesWith(CI, UndefValue::get(II->getType())); + + if (Length == 64 && Index == 0) { Value *Vec = II->getArgOperand(1); Value *Undef = UndefValue::get(Vec->getType()); const uint32_t Mask[] = { 0, 2 }; @@ -988,7 +979,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::arm_neon_vst2lane: case Intrinsic::arm_neon_vst3lane: case Intrinsic::arm_neon_vst4lane: { - unsigned MemAlign = getKnownAlignment(II->getArgOperand(0), DL, AT, II, DT); + unsigned MemAlign = getKnownAlignment(II->getArgOperand(0), DL, AC, II, DT); unsigned AlignArg = II->getNumArgOperands() - 1; ConstantInt *IntrAlign = dyn_cast<ConstantInt>(II->getArgOperand(AlignArg)); if (IntrAlign && IntrAlign->getZExtValue() < MemAlign) { @@ -1128,7 +1119,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { cast<Constant>(RHS)->isNullValue()) { LoadInst* LI = cast<LoadInst>(LHS); if (isValidAssumeForContext(II, LI, DL, DT)) { - MDNode* MD = MDNode::get(II->getContext(), ArrayRef<Value*>()); + MDNode *MD = MDNode::get(II->getContext(), None); LI->setMetadata(LLVMContext::MD_nonnull, MD); return EraseInstFromFunction(*II); } @@ -1145,6 +1136,48 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } + case Intrinsic::experimental_gc_relocate: { + // Translate facts known about a pointer before relocating into + // facts about the relocate value, while being careful to + // preserve relocation semantics. + GCRelocateOperands Operands(II); + Value *DerivedPtr = Operands.derivedPtr(); + + // Remove the relocation if unused, note that this check is required + // to prevent the cases below from looping forever. + if (II->use_empty()) + return EraseInstFromFunction(*II); + + // Undef is undef, even after relocation. + // TODO: provide a hook for this in GCStrategy. This is clearly legal for + // most practical collectors, but there was discussion in the review thread + // about whether it was legal for all possible collectors. + if (isa<UndefValue>(DerivedPtr)) + return ReplaceInstUsesWith(*II, DerivedPtr); + + // The relocation of null will be null for most any collector. + // TODO: provide a hook for this in GCStrategy. There might be some weird + // collector this property does not hold for. + if (isa<ConstantPointerNull>(DerivedPtr)) + return ReplaceInstUsesWith(*II, DerivedPtr); + + // isKnownNonNull -> nonnull attribute + if (isKnownNonNull(DerivedPtr)) + II->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull); + + // isDereferenceablePointer -> deref attribute + if (DerivedPtr->isDereferenceablePointer(DL)) { + if (Argument *A = dyn_cast<Argument>(DerivedPtr)) { + uint64_t Bytes = A->getDereferenceableBytes(); + II->addDereferenceableAttr(AttributeSet::ReturnIndex, Bytes); + } + } + + // TODO: bitcast(relocate(p)) -> relocate(bitcast(p)) + // Canonicalize on the type from the uses to the defs + + // TODO: relocate((gep p, C, C2, ...)) -> gep(relocate(p), C, C2, ...) + } } return visitCallSite(II); @@ -1165,6 +1198,14 @@ static bool isSafeToEliminateVarargsCast(const CallSite CS, if (!CI->isLosslessCast()) return false; + // If this is a GC intrinsic, avoid munging types. We need types for + // statepoint reconstruction in SelectionDAG. + // TODO: This is probably something which should be expanded to all + // intrinsics since the entire point of intrinsics is that + // they are understandable by the optimizer. 
+ if (isStatepoint(CS) || isGCRelocate(CS) || isGCResult(CS)) + return false; + // The size of ByVal or InAlloca arguments is derived from the type, so we // can't change to a type with a different size. If the size were // passed explicitly we could avoid this check. @@ -1188,7 +1229,11 @@ static bool isSafeToEliminateVarargsCast(const CallSite CS, Instruction *InstCombiner::tryOptimizeCall(CallInst *CI, const DataLayout *DL) { if (!CI->getCalledFunction()) return nullptr; - if (Value *With = Simplifier->optimizeCall(CI)) { + auto InstCombineRAUW = [this](Instruction *From, Value *With) { + ReplaceInstUsesWith(*From, With); + }; + LibCallSimplifier Simplifier(DL, TLI, InstCombineRAUW); + if (Value *With = Simplifier.optimizeCall(CI)) { ++NumSimplified; return CI->use_empty() ? CI : ReplaceInstUsesWith(*CI, With); } @@ -1380,6 +1425,10 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts()); if (!Callee) return false; + // The prototype of thunks are a lie, don't try to directly call such + // functions. + if (Callee->hasFnAttribute("thunk")) + return false; Instruction *Caller = CS.getInstruction(); const AttributeSet &CallerPAL = CS.getAttributes(); @@ -1397,7 +1446,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { if (NewRetTy->isStructTy()) return false; // TODO: Handle multiple return values. - if (!CastInst::isBitCastable(NewRetTy, OldRetTy)) { + if (!CastInst::isBitOrNoopPointerCastable(NewRetTy, OldRetTy, DL)) { if (Callee->isDeclaration()) return false; // Cannot transform this return value. @@ -1432,12 +1481,21 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { unsigned NumActualArgs = CS.arg_size(); unsigned NumCommonArgs = std::min(FT->getNumParams(), NumActualArgs); + // Prevent us turning: + // declare void @takes_i32_inalloca(i32* inalloca) + // call void bitcast (void (i32*)* @takes_i32_inalloca to void (i32)*)(i32 0) + // + // into: + // call void @takes_i32_inalloca(i32* null) + if (Callee->getAttributes().hasAttrSomewhere(Attribute::InAlloca)) + return false; + CallSite::arg_iterator AI = CS.arg_begin(); for (unsigned i = 0, e = NumCommonArgs; i != e; ++i, ++AI) { Type *ParamTy = FT->getParamType(i); Type *ActTy = (*AI)->getType(); - if (!CastInst::isBitCastable(ActTy, ParamTy)) + if (!CastInst::isBitOrNoopPointerCastable(ActTy, ParamTy, DL)) return false; // Cannot transform this parameter value. if (AttrBuilder(CallerPAL.getParamAttributes(i + 1), i + 1). @@ -1532,7 +1590,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { if ((*AI)->getType() == ParamTy) { Args.push_back(*AI); } else { - Args.push_back(Builder->CreateBitCast(*AI, ParamTy)); + Args.push_back(Builder->CreateBitOrPointerCast(*AI, ParamTy)); } // Add any parameter attributes. 
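The visitCallInst hunks above replace the open-coded known-bits reasoning for the unsigned *.with.overflow intrinsics with calls to computeOverflowForUnsignedAdd / computeOverflowForUnsignedMul plus a CreateOverflowTuple helper. The intuition for the add case can be modelled in a few lines of standalone C++ (illustrative only; the enum and function below are invented for this sketch, and the real analysis works on LLVM's known-bits information rather than concrete values):

    #include <cstdint>
    #include <cstdio>

    enum class Overflow { Never, Always, May };

    // If the top bit is clear in both operands, both are < 2^31 and a 32-bit
    // unsigned add cannot wrap; if it is set in both, both are >= 2^31 and the
    // add always wraps. Everything else is a "may overflow" answer.
    Overflow unsignedAddOverflow(uint32_t A, uint32_t B) {
      const uint32_t TopBit = 1u << 31;
      if (!(A & TopBit) && !(B & TopBit))
        return Overflow::Never;
      if ((A & TopBit) && (B & TopBit))
        return Overflow::Always;
      return Overflow::May;
    }

    int main() {
      std::printf("%d\n", unsignedAddOverflow(0x40000000u, 0x30000000u) == Overflow::Never);  // 1
      std::printf("%d\n", unsignedAddOverflow(0x80000000u, 0x80000000u) == Overflow::Always); // 1
      std::printf("%d\n", unsignedAddOverflow(0xFFFFFFFFu, 0x00000001u) == Overflow::May);    // 1
      return 0;
    }

A "never overflows" answer lets the intrinsic be rewritten as a plain nuw add with a constant-false overflow flag, and an "always overflows" answer as a plain add with a constant-true flag, which is what the CreateOverflowTuple calls in the diff package up.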
@@ -1603,7 +1661,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { Value *NV = NC; if (OldRetTy != NV->getType() && !Caller->use_empty()) { if (!NV->getType()->isVoidTy()) { - NV = NC = CastInst::Create(CastInst::BitCast, NC, OldRetTy); + NV = NC = CastInst::CreateBitOrPointerCast(NC, OldRetTy); NC->setDebugLoc(Caller->getDebugLoc()); // If this is an invoke instruction, we should insert it after the first diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp index aba77bb..3e2b719 100644 --- a/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -11,11 +11,11 @@ // //===----------------------------------------------------------------------===// -#include "InstCombine.h" +#include "InstCombineInternal.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/PatternMatch.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" using namespace llvm; using namespace PatternMatch; @@ -1064,6 +1064,15 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) { Value *Src = CI.getOperand(0); Type *SrcTy = Src->getType(), *DestTy = CI.getType(); + // If we know that the value being extended is positive, we can use a zext + // instead. + bool KnownZero, KnownOne; + ComputeSignBit(Src, KnownZero, KnownOne, 0, &CI); + if (KnownZero) { + Value *ZExt = Builder->CreateZExt(Src, DestTy); + return ReplaceInstUsesWith(CI, ZExt); + } + // Attempt to extend the entire input expression tree to the destination // type. Only do this if the dest type is a simple type, don't convert the // expression tree to something weird like i93 unless the source is also @@ -1269,6 +1278,8 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { // type of OpI doesn't enter into things at all. We simply evaluate // in whichever source type is larger, then convert to the // destination type. + if (SrcWidth == OpWidth) + break; if (LHSWidth < SrcWidth) LHSOrig = Builder->CreateFPExt(LHSOrig, RHSOrig->getType()); else if (RHSWidth <= SrcWidth) @@ -1330,22 +1341,57 @@ Instruction *InstCombiner::visitFPExt(CastInst &CI) { return commonCastTransforms(CI); } +// fpto{s/u}i({u/s}itofp(X)) --> X or zext(X) or sext(X) or trunc(X) +// This is safe if the intermediate type has enough bits in its mantissa to +// accurately represent all values of X. For example, this won't work with +// i64 -> float -> i64. +Instruction *InstCombiner::FoldItoFPtoI(Instruction &FI) { + if (!isa<UIToFPInst>(FI.getOperand(0)) && !isa<SIToFPInst>(FI.getOperand(0))) + return nullptr; + Instruction *OpI = cast<Instruction>(FI.getOperand(0)); + + Value *SrcI = OpI->getOperand(0); + Type *FITy = FI.getType(); + Type *OpITy = OpI->getType(); + Type *SrcTy = SrcI->getType(); + bool IsInputSigned = isa<SIToFPInst>(OpI); + bool IsOutputSigned = isa<FPToSIInst>(FI); + + // We can safely assume the conversion won't overflow the output range, + // because (for example) (uint8_t)18293.f is undefined behavior. + + // Since we can assume the conversion won't overflow, our decision as to + // whether the input will fit in the float should depend on the minimum + // of the input range and output range. + + // This means this is also safe for a signed input and unsigned output, since + // a negative input would lead to undefined behavior. 
+ int InputSize = (int)SrcTy->getScalarSizeInBits() - IsInputSigned; + int OutputSize = (int)FITy->getScalarSizeInBits() - IsOutputSigned; + int ActualSize = std::min(InputSize, OutputSize); + + if (ActualSize <= OpITy->getFPMantissaWidth()) { + if (FITy->getScalarSizeInBits() > SrcTy->getScalarSizeInBits()) { + if (IsInputSigned && IsOutputSigned) + return new SExtInst(SrcI, FITy); + return new ZExtInst(SrcI, FITy); + } + if (FITy->getScalarSizeInBits() < SrcTy->getScalarSizeInBits()) + return new TruncInst(SrcI, FITy); + if (SrcTy == FITy) + return ReplaceInstUsesWith(FI, SrcI); + return new BitCastInst(SrcI, FITy); + } + return nullptr; +} + Instruction *InstCombiner::visitFPToUI(FPToUIInst &FI) { Instruction *OpI = dyn_cast<Instruction>(FI.getOperand(0)); if (!OpI) return commonCastTransforms(FI); - // fptoui(uitofp(X)) --> X - // fptoui(sitofp(X)) --> X - // This is safe if the intermediate type has enough bits in its mantissa to - // accurately represent all values of X. For example, do not do this with - // i64->float->i64. This is also safe for sitofp case, because any negative - // 'X' value would cause an undefined result for the fptoui. - if ((isa<UIToFPInst>(OpI) || isa<SIToFPInst>(OpI)) && - OpI->getOperand(0)->getType() == FI.getType() && - (int)FI.getType()->getScalarSizeInBits() < /*extra bit for sign */ - OpI->getType()->getFPMantissaWidth()) - return ReplaceInstUsesWith(FI, OpI->getOperand(0)); + if (Instruction *I = FoldItoFPtoI(FI)) + return I; return commonCastTransforms(FI); } @@ -1355,17 +1401,8 @@ Instruction *InstCombiner::visitFPToSI(FPToSIInst &FI) { if (!OpI) return commonCastTransforms(FI); - // fptosi(sitofp(X)) --> X - // fptosi(uitofp(X)) --> X - // This is safe if the intermediate type has enough bits in its mantissa to - // accurately represent all values of X. For example, do not do this with - // i64->float->i64. This is also safe for sitofp case, because any negative - // 'X' value would cause an undefined result for the fptoui. - if ((isa<UIToFPInst>(OpI) || isa<SIToFPInst>(OpI)) && - OpI->getOperand(0)->getType() == FI.getType() && - (int)FI.getType()->getScalarSizeInBits() <= - OpI->getType()->getFPMantissaWidth()) - return ReplaceInstUsesWith(FI, OpI->getOperand(0)); + if (Instruction *I = FoldItoFPtoI(FI)) + return I; return commonCastTransforms(FI); } diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 399f1c3..f48d89b 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -11,7 +11,9 @@ // //===----------------------------------------------------------------------===// -#include "InstCombine.h" +#include "InstCombineInternal.h" +#include "llvm/ADT/APSInt.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/MemoryBuiltins.h" @@ -20,12 +22,20 @@ #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PatternMatch.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Analysis/TargetLibraryInfo.h" + using namespace llvm; using namespace PatternMatch; #define DEBUG_TYPE "instcombine" +// How many times is a select replaced by one of its operands? 
+STATISTIC(NumSel, "Number of select opts"); + +// Initialization Routines + static ConstantInt *getOne(Constant *C) { return ConstantInt::get(cast<IntegerType>(C->getType()), 1); } @@ -1921,14 +1931,17 @@ Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) { if (DL && LHSCI->getOpcode() == Instruction::PtrToInt && DL->getPointerTypeSizeInBits(SrcTy) == DestTy->getIntegerBitWidth()) { Value *RHSOp = nullptr; - if (Constant *RHSC = dyn_cast<Constant>(ICI.getOperand(1))) { + if (PtrToIntOperator *RHSC = dyn_cast<PtrToIntOperator>(ICI.getOperand(1))) { + Value *RHSCIOp = RHSC->getOperand(0); + if (RHSCIOp->getType()->getPointerAddressSpace() == + LHSCIOp->getType()->getPointerAddressSpace()) { + RHSOp = RHSC->getOperand(0); + // If the pointer types don't match, insert a bitcast. + if (LHSCIOp->getType() != RHSOp->getType()) + RHSOp = Builder->CreateBitCast(RHSOp, LHSCIOp->getType()); + } + } else if (Constant *RHSC = dyn_cast<Constant>(ICI.getOperand(1))) RHSOp = ConstantExpr::getIntToPtr(RHSC, SrcTy); - } else if (PtrToIntInst *RHSC = dyn_cast<PtrToIntInst>(ICI.getOperand(1))) { - RHSOp = RHSC->getOperand(0); - // If the pointer types don't match, insert a bitcast. - if (LHSCIOp->getType() != RHSOp->getType()) - RHSOp = Builder->CreateBitCast(RHSOp, LHSCIOp->getType()); - } if (RHSOp) return new ICmpInst(ICI.getPredicate(), LHSCIOp, RHSOp); @@ -2446,6 +2459,122 @@ static bool swapMayExposeCSEOpportunities(const Value * Op0, return GlobalSwapBenefits > 0; } +/// \brief Check that one use is in the same block as the definition and all +/// other uses are in blocks dominated by a given block +/// +/// \param DI Definition +/// \param UI Use +/// \param DB Block that must dominate all uses of \p DI outside +/// the parent block +/// \return true when \p UI is the only use of \p DI in the parent block +/// and all other uses of \p DI are in blocks dominated by \p DB. +/// +bool InstCombiner::dominatesAllUses(const Instruction *DI, + const Instruction *UI, + const BasicBlock *DB) const { + assert(DI && UI && "Instruction not defined\n"); + // ignore incomplete definitions + if (!DI->getParent()) + return false; + // DI and UI must be in the same block + if (DI->getParent() != UI->getParent()) + return false; + // Protect from self-referencing blocks + if (DI->getParent() == DB) + return false; + // DominatorTree available? + if (!DT) + return false; + for (const User *U : DI->users()) { + auto *Usr = cast<Instruction>(U); + if (Usr != UI && !DT->dominates(DB, Usr->getParent())) + return false; + } + return true; +} + +/// +/// true when the instruction sequence within a block is select-cmp-br. +/// +static bool isChainSelectCmpBranch(const SelectInst *SI) { + const BasicBlock *BB = SI->getParent(); + if (!BB) + return false; + auto *BI = dyn_cast_or_null<BranchInst>(BB->getTerminator()); + if (!BI || BI->getNumSuccessors() != 2) + return false; + auto *IC = dyn_cast<ICmpInst>(BI->getCondition()); + if (!IC || (IC->getOperand(0) != SI && IC->getOperand(1) != SI)) + return false; + return true; +} + +/// +/// \brief True when a select result is replaced by one of its operands +/// in select-icmp sequence. This will eventually result in the elimination +/// of the select. 
+/// +/// \param SI Select instruction +/// \param Icmp Compare instruction +/// \param SIOpd Operand that replaces the select +/// +/// Notes: +/// - The replacement is global and requires dominator information +/// - The caller is responsible for the actual replacement +/// +/// Example: +/// +/// entry: +/// %4 = select i1 %3, %C* %0, %C* null +/// %5 = icmp eq %C* %4, null +/// br i1 %5, label %9, label %7 +/// ... +/// ; <label>:7 ; preds = %entry +/// %8 = getelementptr inbounds %C* %4, i64 0, i32 0 +/// ... +/// +/// can be transformed to +/// +/// %5 = icmp eq %C* %0, null +/// %6 = select i1 %3, i1 %5, i1 true +/// br i1 %6, label %9, label %7 +/// ... +/// ; <label>:7 ; preds = %entry +/// %8 = getelementptr inbounds %C* %0, i64 0, i32 0 // replace by %0! +/// +/// Similar when the first operand of the select is a constant or/and +/// the compare is for not equal rather than equal. +/// +/// NOTE: The function is only called when the select and compare constants +/// are equal, the optimization can work only for EQ predicates. This is not a +/// major restriction since a NE compare should be 'normalized' to an equal +/// compare, which usually happens in the combiner and test case +/// select-cmp-br.ll +/// checks for it. +bool InstCombiner::replacedSelectWithOperand(SelectInst *SI, + const ICmpInst *Icmp, + const unsigned SIOpd) { + assert((SIOpd == 1 || SIOpd == 2) && "Invalid select operand!"); + if (isChainSelectCmpBranch(SI) && Icmp->getPredicate() == ICmpInst::ICMP_EQ) { + BasicBlock *Succ = SI->getParent()->getTerminator()->getSuccessor(1); + // The check for the unique predecessor is not the best that can be + // done. But it protects efficiently against cases like when SI's + // home block has two successors, Succ and Succ1, and Succ1 predecessor + // of Succ. Then SI can't be replaced by SIOpd because the use that gets + // replaced can be reached on either path. So the uniqueness check + // guarantees that the path all uses of SI (outside SI's parent) are on + // is disjoint from all other paths out of SI. But that information + // is more expensive to compute, and the trade-off here is in favor + // of compile-time. + if (Succ->getUniquePredecessor() && dominatesAllUses(SI, Icmp, Succ)) { + NumSel++; + SI->replaceUsesOutsideBlock(SI->getOperand(SIOpd), SI->getParent()); + return true; + } + } + return false; +} + Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { bool Changed = false; Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); @@ -2463,7 +2592,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { Changed = true; } - if (Value *V = SimplifyICmpInst(I.getPredicate(), Op0, Op1, DL, TLI, DT, AT)) + if (Value *V = SimplifyICmpInst(I.getPredicate(), Op0, Op1, DL, TLI, DT, AC)) return ReplaceInstUsesWith(I, V); // comparing -val or val with non-zero is the same as just comparing val @@ -2560,11 +2689,33 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { return Res; } - // (icmp ne/eq (sub A B) 0) -> (icmp ne/eq A, B) - if (I.isEquality() && CI->isZero() && - match(Op0, m_Sub(m_Value(A), m_Value(B)))) { - // (icmp cond A B) if cond is equality - return new ICmpInst(I.getPredicate(), A, B); + // The following transforms are only 'worth it' if the only user of the + // subtraction is the icmp. 
+ if (Op0->hasOneUse()) { + // (icmp ne/eq (sub A B) 0) -> (icmp ne/eq A, B) + if (I.isEquality() && CI->isZero() && + match(Op0, m_Sub(m_Value(A), m_Value(B)))) + return new ICmpInst(I.getPredicate(), A, B); + + // (icmp sgt (sub nsw A B), -1) -> (icmp sge A, B) + if (I.getPredicate() == ICmpInst::ICMP_SGT && CI->isAllOnesValue() && + match(Op0, m_NSWSub(m_Value(A), m_Value(B)))) + return new ICmpInst(ICmpInst::ICMP_SGE, A, B); + + // (icmp sgt (sub nsw A B), 0) -> (icmp sgt A, B) + if (I.getPredicate() == ICmpInst::ICMP_SGT && CI->isZero() && + match(Op0, m_NSWSub(m_Value(A), m_Value(B)))) + return new ICmpInst(ICmpInst::ICMP_SGT, A, B); + + // (icmp slt (sub nsw A B), 0) -> (icmp slt A, B) + if (I.getPredicate() == ICmpInst::ICMP_SLT && CI->isZero() && + match(Op0, m_NSWSub(m_Value(A), m_Value(B)))) + return new ICmpInst(ICmpInst::ICMP_SLT, A, B); + + // (icmp slt (sub nsw A B), 1) -> (icmp sle A, B) + if (I.getPredicate() == ICmpInst::ICMP_SLT && CI->isOne() && + match(Op0, m_NSWSub(m_Value(A), m_Value(B)))) + return new ICmpInst(ICmpInst::ICMP_SLE, A, B); } // If we have an icmp le or icmp ge instruction, turn it into the @@ -2898,18 +3049,39 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { // comparison into the select arms, which will cause one to be // constant folded and the select turned into a bitwise or. Value *Op1 = nullptr, *Op2 = nullptr; - if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(1))) + ConstantInt *CI = 0; + if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(1))) { Op1 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC); - if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(2))) + CI = dyn_cast<ConstantInt>(Op1); + } + if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(2))) { Op2 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC); + CI = dyn_cast<ConstantInt>(Op2); + } // We only want to perform this transformation if it will not lead to // additional code. This is true if either both sides of the select // fold to a constant (in which case the icmp is replaced with a select // which will usually simplify) or this is the only user of the // select (in which case we are trading a select+icmp for a simpler - // select+icmp). - if ((Op1 && Op2) || (LHSI->hasOneUse() && (Op1 || Op2))) { + // select+icmp) or all uses of the select can be replaced based on + // dominance information ("Global cases"). + bool Transform = false; + if (Op1 && Op2) + Transform = true; + else if (Op1 || Op2) { + // Local case + if (LHSI->hasOneUse()) + Transform = true; + // Global cases + else if (CI && !CI->isZero()) + // When Op1 is constant try replacing select with second operand. + // Otherwise Op2 is constant and try replacing select with first + // operand. + Transform = replacedSelectWithOperand(cast<SelectInst>(LHSI), &I, + Op1 ? 2 : 1); + } + if (Transform) { if (!Op1) Op1 = Builder->CreateICmp(I.getPredicate(), LHSI->getOperand(1), RHSC, I.getName()); @@ -3255,9 +3427,8 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { // and (A & ~B) != 0 --> (A & B) == 0 // if A is a power of 2. if (match(Op0, m_And(m_Value(A), m_Not(m_Value(B)))) && - match(Op1, m_Zero()) && isKnownToBeAPowerOfTwo(A, false, - 0, AT, &I, DT) && - I.isEquality()) + match(Op1, m_Zero()) && + isKnownToBeAPowerOfTwo(A, false, 0, AC, &I, DT) && I.isEquality()) return new ICmpInst(I.getInversePredicate(), Builder->CreateAnd(A, B), Op1); @@ -3448,7 +3619,6 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { } /// FoldFCmp_IntToFP_Cst - Fold fcmp ([us]itofp x, cst) if possible. 
-/// Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, Instruction *LHSI, Constant *RHSC) { @@ -3460,18 +3630,49 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, int MantissaWidth = LHSI->getType()->getFPMantissaWidth(); if (MantissaWidth == -1) return nullptr; // Unknown. + IntegerType *IntTy = cast<IntegerType>(LHSI->getOperand(0)->getType()); + // Check to see that the input is converted from an integer type that is small // enough that preserves all bits. TODO: check here for "known" sign bits. // This would allow us to handle (fptosi (x >>s 62) to float) if x is i64 f.e. - unsigned InputSize = LHSI->getOperand(0)->getType()->getScalarSizeInBits(); + unsigned InputSize = IntTy->getScalarSizeInBits(); // If this is a uitofp instruction, we need an extra bit to hold the sign. bool LHSUnsigned = isa<UIToFPInst>(LHSI); if (LHSUnsigned) ++InputSize; + if (I.isEquality()) { + FCmpInst::Predicate P = I.getPredicate(); + bool IsExact = false; + APSInt RHSCvt(IntTy->getBitWidth(), LHSUnsigned); + RHS.convertToInteger(RHSCvt, APFloat::rmNearestTiesToEven, &IsExact); + + // If the floating point constant isn't an integer value, we know if we will + // ever compare equal / not equal to it. + if (!IsExact) { + // TODO: Can never be -0.0 and other non-representable values + APFloat RHSRoundInt(RHS); + RHSRoundInt.roundToIntegral(APFloat::rmNearestTiesToEven); + if (RHS.compare(RHSRoundInt) != APFloat::cmpEqual) { + if (P == FCmpInst::FCMP_OEQ || P == FCmpInst::FCMP_UEQ) + return ReplaceInstUsesWith(I, Builder->getFalse()); + + assert(P == FCmpInst::FCMP_ONE || P == FCmpInst::FCMP_UNE); + return ReplaceInstUsesWith(I, Builder->getTrue()); + } + } + + // TODO: If the constant is exactly representable, is it always OK to do + // equality compares as integer? + } + + // Comparisons with zero are a special case where we know we won't lose + // information. + bool IsCmpZero = RHS.isPosZero(); + // If the conversion would lose info, don't hack on this. - if ((int)InputSize > MantissaWidth) + if ((int)InputSize > MantissaWidth && !IsCmpZero) return nullptr; // Otherwise, we can potentially simplify the comparison. We know that it @@ -3512,8 +3713,6 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, return ReplaceInstUsesWith(I, Builder->getFalse()); } - IntegerType *IntTy = cast<IntegerType>(LHSI->getOperand(0)->getType()); - // Now we know that the APFloat is a normal number, zero or inf. // See if the FP constant is too large for the integer. For example, @@ -3663,7 +3862,7 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); - if (Value *V = SimplifyFCmpInst(I.getPredicate(), Op0, Op1, DL, TLI, DT, AT)) + if (Value *V = SimplifyFCmpInst(I.getPredicate(), Op0, Op1, DL, TLI, DT, AC)) return ReplaceInstUsesWith(I, V); // Simplify 'fcmp pred X, X' @@ -3766,40 +3965,42 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) { } break; case Instruction::Call: { + if (!RHSC->isNullValue()) + break; + CallInst *CI = cast<CallInst>(LHSI); - LibFunc::Func Func; + const Function *F = CI->getCalledFunction(); + if (!F) + break; + // Various optimization for fabs compared with zero. 
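// For example (hypothetical values), with the intrinsic form now handled as
// well as the libcall form:
//   %a = call float @llvm.fabs.f32(float %x)
//   %c = fcmp ogt float %a, 0.0
// becomes
//   %c = fcmp one float %x, 0.0
// and fabs(x) >= 0.0 similarly becomes an ordered check, fcmp ord %x, 0.0.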
- if (RHSC->isNullValue() && CI->getCalledFunction() && - TLI->getLibFunc(CI->getCalledFunction()->getName(), Func) && - TLI->has(Func)) { - if (Func == LibFunc::fabs || Func == LibFunc::fabsf || - Func == LibFunc::fabsl) { - switch (I.getPredicate()) { - default: break; + LibFunc::Func Func; + if (F->getIntrinsicID() == Intrinsic::fabs || + (TLI->getLibFunc(F->getName(), Func) && TLI->has(Func) && + (Func == LibFunc::fabs || Func == LibFunc::fabsf || + Func == LibFunc::fabsl))) { + switch (I.getPredicate()) { + default: + break; // fabs(x) < 0 --> false - case FCmpInst::FCMP_OLT: - return ReplaceInstUsesWith(I, Builder->getFalse()); + case FCmpInst::FCMP_OLT: + return ReplaceInstUsesWith(I, Builder->getFalse()); // fabs(x) > 0 --> x != 0 - case FCmpInst::FCMP_OGT: - return new FCmpInst(FCmpInst::FCMP_ONE, CI->getArgOperand(0), - RHSC); + case FCmpInst::FCMP_OGT: + return new FCmpInst(FCmpInst::FCMP_ONE, CI->getArgOperand(0), RHSC); // fabs(x) <= 0 --> x == 0 - case FCmpInst::FCMP_OLE: - return new FCmpInst(FCmpInst::FCMP_OEQ, CI->getArgOperand(0), - RHSC); + case FCmpInst::FCMP_OLE: + return new FCmpInst(FCmpInst::FCMP_OEQ, CI->getArgOperand(0), RHSC); // fabs(x) >= 0 --> !isnan(x) - case FCmpInst::FCMP_OGE: - return new FCmpInst(FCmpInst::FCMP_ORD, CI->getArgOperand(0), - RHSC); + case FCmpInst::FCMP_OGE: + return new FCmpInst(FCmpInst::FCMP_ORD, CI->getArgOperand(0), RHSC); // fabs(x) == 0 --> x == 0 // fabs(x) != 0 --> x != 0 - case FCmpInst::FCMP_OEQ: - case FCmpInst::FCMP_UEQ: - case FCmpInst::FCMP_ONE: - case FCmpInst::FCMP_UNE: - return new FCmpInst(I.getPredicate(), CI->getArgOperand(0), - RHSC); - } + case FCmpInst::FCMP_OEQ: + case FCmpInst::FCMP_UEQ: + case FCmpInst::FCMP_ONE: + case FCmpInst::FCMP_UNE: + return new FCmpInst(I.getPredicate(), CI->getArgOperand(0), RHSC); } } } diff --git a/lib/Transforms/InstCombine/InstCombine.h b/lib/Transforms/InstCombine/InstCombineInternal.h index d4b252b..2fd5318 100644 --- a/lib/Transforms/InstCombine/InstCombine.h +++ b/lib/Transforms/InstCombine/InstCombineInternal.h @@ -1,4 +1,4 @@ -//===- InstCombine.h - Main InstCombine pass definition ---------*- C++ -*-===// +//===- InstCombineInternal.h - InstCombine pass internals -------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -6,21 +6,27 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// +/// \file +/// +/// This file provides internal interfaces used to implement the InstCombine. 
+/// +//===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINE_H -#define LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINE_H +#ifndef LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H +#define LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H -#include "InstCombineWorklist.h" -#include "llvm/Analysis/AssumptionTracker.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetFolder.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Pass.h" -#include "llvm/Transforms/Utils/SimplifyLibCalls.h" +#include "llvm/Transforms/InstCombine/InstCombineWorklist.h" #define DEBUG_TYPE "instcombine" @@ -33,8 +39,7 @@ class DbgDeclareInst; class MemIntrinsic; class MemSetInst; -/// SelectPatternFlavor - We can match a variety of different patterns for -/// select operations. +/// \brief Specific patterns of select instructions we can match. enum SelectPatternFlavor { SPF_UNKNOWN = 0, SPF_SMIN, @@ -45,8 +50,15 @@ enum SelectPatternFlavor { SPF_NABS }; -/// getComplexity: Assign a complexity or rank value to LLVM Values... -/// 0 -> undef, 1 -> Const, 2 -> Other, 3 -> Arg, 3 -> Unary, 4 -> OtherInst +/// \brief Assign a complexity or rank value to LLVM Values. +/// +/// This routine maps IR values to various complexity ranks: +/// 0 -> undef +/// 1 -> Constants +/// 2 -> Other non-instructions +/// 3 -> Arguments +/// 3 -> Unary operations +/// 4 -> Other instructions static inline unsigned getComplexity(Value *V) { if (isa<Instruction>(V)) { if (BinaryOperator::isNeg(V) || BinaryOperator::isFNeg(V) || @@ -59,26 +71,55 @@ static inline unsigned getComplexity(Value *V) { return isa<Constant>(V) ? (isa<UndefValue>(V) ? 0 : 1) : 2; } -/// AddOne - Add one to a Constant +/// \brief Add one to a Constant static inline Constant *AddOne(Constant *C) { return ConstantExpr::getAdd(C, ConstantInt::get(C->getType(), 1)); } -/// SubOne - Subtract one from a Constant +/// \brief Subtract one from a Constant static inline Constant *SubOne(Constant *C) { return ConstantExpr::getSub(C, ConstantInt::get(C->getType(), 1)); } -/// InstCombineIRInserter - This is an IRBuilder insertion helper that works -/// just like the normal insertion helper, but also adds any new instructions -/// to the instcombine worklist. +/// \brief Return true if the specified value is free to invert (apply ~ to). +/// This happens in cases where the ~ can be eliminated. If WillInvertAllUses +/// is true, work under the assumption that the caller intends to remove all +/// uses of V and only keep uses of ~V. +/// +static inline bool IsFreeToInvert(Value *V, bool WillInvertAllUses) { + // ~(~(X)) -> X. + if (BinaryOperator::isNot(V)) + return true; + + // Constants can be considered to be not'ed values. + if (isa<ConstantInt>(V)) + return true; + + // Compares can be inverted if all of their uses are being modified to use the + // ~V. + if (isa<CmpInst>(V)) + return WillInvertAllUses; + + // If `V` is of the form `A + Constant` then `-1 - V` can be folded into `(-1 + // - Constant) - A` if we are willing to invert all of the uses. 
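// Concretely, for a hypothetical value %x:
//   ~(%x + 7) == -1 - (%x + 7) == (-1 - 7) - %x == -8 - %x
// so the add disappears into the new constant, provided every use of the
// original value really is rewritten to use the inverted one.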
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V)) + if (BO->getOpcode() == Instruction::Add || + BO->getOpcode() == Instruction::Sub) + if (isa<Constant>(BO->getOperand(0)) || isa<Constant>(BO->getOperand(1))) + return WillInvertAllUses; + + return false; +} + +/// \brief An IRBuilder inserter that adds new instructions to the instcombine +/// worklist. class LLVM_LIBRARY_VISIBILITY InstCombineIRInserter : public IRBuilderDefaultInserter<true> { InstCombineWorklist &Worklist; - AssumptionTracker *AT; + AssumptionCache *AC; public: - InstCombineIRInserter(InstCombineWorklist &WL, AssumptionTracker *AT) - : Worklist(WL), AT(AT) {} + InstCombineIRInserter(InstCombineWorklist &WL, AssumptionCache *AC) + : Worklist(WL), AC(AC) {} void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB, BasicBlock::iterator InsertPt) const { @@ -87,50 +128,64 @@ public: using namespace llvm::PatternMatch; if (match(I, m_Intrinsic<Intrinsic::assume>())) - AT->registerAssumption(cast<CallInst>(I)); + AC->registerAssumption(cast<CallInst>(I)); } }; -/// InstCombiner - The -instcombine pass. +/// \brief The core instruction combiner logic. +/// +/// This class provides both the logic to recursively visit instructions and +/// combine them, as well as the pass infrastructure for running this as part +/// of the LLVM pass pipeline. class LLVM_LIBRARY_VISIBILITY InstCombiner - : public FunctionPass, - public InstVisitor<InstCombiner, Instruction *> { - AssumptionTracker *AT; - const DataLayout *DL; - TargetLibraryInfo *TLI; - DominatorTree *DT; // not required - bool MadeIRChange; - LibCallSimplifier *Simplifier; - bool MinimizeSize; - + : public InstVisitor<InstCombiner, Instruction *> { + // FIXME: These members shouldn't be public. public: - /// Worklist - All of the instructions that need to be simplified. - InstCombineWorklist Worklist; + /// \brief A worklist of the instructions that need to be simplified. + InstCombineWorklist &Worklist; - /// Builder - This is an IRBuilder that automatically inserts new - /// instructions into the worklist when they are created. + /// \brief An IRBuilder that automatically inserts new instructions into the + /// worklist. typedef IRBuilder<true, TargetFolder, InstCombineIRInserter> BuilderTy; BuilderTy *Builder; - static char ID; // Pass identification, replacement for typeid - InstCombiner() : FunctionPass(ID), DL(nullptr), Builder(nullptr) { - MinimizeSize = false; - initializeInstCombinerPass(*PassRegistry::getPassRegistry()); - } +private: + // Mode in which we are running the combiner. + const bool MinimizeSize; -public: - bool runOnFunction(Function &F) override; + // Required analyses. + // FIXME: These can never be null and should be references. + AssumptionCache *AC; + TargetLibraryInfo *TLI; + DominatorTree *DT; - bool DoOneIteration(Function &F, unsigned ItNum); + // Optional analyses. When non-null, these can both be used to do better + // combining and will be updated to reflect any changes. 
+ const DataLayout *DL; + LoopInfo *LI; - void getAnalysisUsage(AnalysisUsage &AU) const override; + bool MadeIRChange; - AssumptionTracker *getAssumptionTracker() const { return AT; } +public: + InstCombiner(InstCombineWorklist &Worklist, BuilderTy *Builder, + bool MinimizeSize, AssumptionCache *AC, TargetLibraryInfo *TLI, + DominatorTree *DT, const DataLayout *DL, LoopInfo *LI) + : Worklist(Worklist), Builder(Builder), MinimizeSize(MinimizeSize), + AC(AC), TLI(TLI), DT(DT), DL(DL), LI(LI), MadeIRChange(false) {} + + /// \brief Run the combiner over the entire worklist until it is empty. + /// + /// \returns true if the IR is changed. + bool run(); + + AssumptionCache *getAssumptionCache() const { return AC; } const DataLayout *getDataLayout() const { return DL; } - + DominatorTree *getDominatorTree() const { return DT; } + LoopInfo *getLoopInfo() const { return LI; } + TargetLibraryInfo *getTargetLibraryInfo() const { return TLI; } // Visitation implementation - Implement instruction combining for different @@ -160,6 +215,7 @@ public: Instruction *visitUDiv(BinaryOperator &I); Instruction *visitSDiv(BinaryOperator &I); Instruction *visitFDiv(BinaryOperator &I); + Value *simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1, bool Inverted); Value *FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS); Value *FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS); Instruction *visitAnd(BinaryOperator &I); @@ -219,6 +275,7 @@ public: Instruction *FoldSPFofSPF(Instruction *Inner, SelectPatternFlavor SPF1, Value *A, Value *B, Instruction &Outer, SelectPatternFlavor SPF2, Value *C); + Instruction *FoldItoFPtoI(Instruction &FI); Instruction *visitSelectInst(SelectInst &SI); Instruction *visitSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI); Instruction *visitCallInst(CallInst &CI); @@ -245,6 +302,16 @@ public: // visitInstruction - Specify what to return for unhandled instructions... Instruction *visitInstruction(Instruction &I) { return nullptr; } + // True when DB dominates all uses of DI execpt UI. + // UI must be in the same block as DI. + // The routine checks that the DI parent and DB are different. + bool dominatesAllUses(const Instruction *DI, const Instruction *UI, + const BasicBlock *DB) const; + + // Replace select with select operand SIOpd in SI-ICmp sequence when possible + bool replacedSelectWithOperand(SelectInst *SI, const ICmpInst *Icmp, + const unsigned SIOpd); + private: bool ShouldChangeType(Type *From, Type *To) const; Value *dyn_castNegVal(Value *V) const; @@ -253,10 +320,12 @@ private: SmallVectorImpl<Value *> &NewIndices); Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI); - /// ShouldOptimizeCast - Return true if the cast from "V to Ty" actually - /// results in any code being generated and is interesting to optimize out. If - /// the cast can be eliminated by some other simple transformation, we prefer - /// to do the simplification first. + /// \brief Classify whether a cast is worth optimizing. + /// + /// Returns true if the cast from "V to Ty" actually results in any code + /// being generated and is interesting to optimize out. If the cast can be + /// eliminated by some other simple transformation, we prefer to do the + /// simplification first. 
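  // A hypothetical driver sketch for the refactored, library-style combiner.
  // The names Worklist, Builder, AC, TLI, DT, DL and LI are assumptions: the
  // caller is assumed to have seeded the worklist and fetched the analyses.
  // This is not the pass code from this patch, only an illustration:
  //
  //   InstCombiner IC(Worklist, &Builder, /*MinimizeSize=*/false,
  //                   AC, TLI, DT, DL, LI);
  //   bool MadeChange = IC.run();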
bool ShouldOptimizeCast(Instruction::CastOps opcode, const Value *V, Type *Ty); @@ -269,17 +338,18 @@ private: bool DoXform = true); Instruction *transformSExtICmp(ICmpInst *ICI, Instruction &CI); bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS, Instruction *CxtI); - bool WillNotOverflowUnsignedAdd(Value *LHS, Value *RHS, Instruction *CxtI); bool WillNotOverflowSignedSub(Value *LHS, Value *RHS, Instruction *CxtI); bool WillNotOverflowUnsignedSub(Value *LHS, Value *RHS, Instruction *CxtI); + bool WillNotOverflowSignedMul(Value *LHS, Value *RHS, Instruction *CxtI); Value *EmitGEPOffset(User *GEP); Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN); Value *EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask); public: - // InsertNewInstBefore - insert an instruction New before instruction Old - // in the program. Add the new instruction to the worklist. - // + /// \brief Inserts an instruction \p New before instruction \p Old + /// + /// Also adds the new instruction to the worklist and returns \p New so that + /// it is suitable for use as the return from the visitation patterns. Instruction *InsertNewInstBefore(Instruction *New, Instruction &Old) { assert(New && !New->getParent() && "New instruction already inserted into a basic block!"); @@ -289,20 +359,18 @@ public: return New; } - // InsertNewInstWith - same as InsertNewInstBefore, but also sets the - // debug loc. - // + /// \brief Same as InsertNewInstBefore, but also sets the debug loc. Instruction *InsertNewInstWith(Instruction *New, Instruction &Old) { New->setDebugLoc(Old.getDebugLoc()); return InsertNewInstBefore(New, Old); } - // ReplaceInstUsesWith - This method is to be used when an instruction is - // found to be dead, replacable with another preexisting expression. Here - // we add all uses of I to the worklist, replace all uses of I with the new - // value, then return I, so that the inst combiner will know that I was - // modified. - // + /// \brief A combiner-aware RAUW-like routine. + /// + /// This method is to be used when an instruction is found to be dead, + /// replacable with another preexisting expression. Here we add all uses of + /// I to the worklist, replace all uses of I with the new value, then return + /// I, so that the inst combiner will know that I was modified. Instruction *ReplaceInstUsesWith(Instruction &I, Value *V) { Worklist.AddUsersToWorkList(I); // Add all modified instrs to worklist. @@ -312,16 +380,31 @@ public: V = UndefValue::get(I.getType()); DEBUG(dbgs() << "IC: Replacing " << I << "\n" - " with " << *V << '\n'); + << " with " << *V << '\n'); I.replaceAllUsesWith(V); return &I; } - // EraseInstFromFunction - When dealing with an instruction that has side - // effects or produces a void value, we can't rely on DCE to delete the - // instruction. Instead, visit methods should return the value returned by - // this function. + /// Creates a result tuple for an overflow intrinsic \p II with a given + /// \p Result and a constant \p Overflow value. If \p ReUseName is true the + /// \p Result's name is taken from \p II. + Instruction *CreateOverflowTuple(IntrinsicInst *II, Value *Result, + bool Overflow, bool ReUseName = true) { + if (ReUseName) + Result->takeName(II); + Constant *V[] = {UndefValue::get(Result->getType()), + Overflow ? 
Builder->getTrue() : Builder->getFalse()}; + StructType *ST = cast<StructType>(II->getType()); + Constant *Struct = ConstantStruct::get(ST, V); + return InsertValueInst::Create(Struct, Result, 0); + } + + /// \brief Combiner aware instruction erasure. + /// + /// When dealing with an instruction that has side effects or produces a void + /// value, we can't rely on DCE to delete the instruction. Instead, visit + /// methods should return the value returned by this function. Instruction *EraseInstFromFunction(Instruction &I) { DEBUG(dbgs() << "IC: ERASE " << I << '\n'); @@ -341,34 +424,48 @@ public: void computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne, unsigned Depth = 0, Instruction *CxtI = nullptr) const { - return llvm::computeKnownBits(V, KnownZero, KnownOne, DL, Depth, - AT, CxtI, DT); + return llvm::computeKnownBits(V, KnownZero, KnownOne, DL, Depth, AC, CxtI, + DT); } - bool MaskedValueIsZero(Value *V, const APInt &Mask, - unsigned Depth = 0, + bool MaskedValueIsZero(Value *V, const APInt &Mask, unsigned Depth = 0, Instruction *CxtI = nullptr) const { - return llvm::MaskedValueIsZero(V, Mask, DL, Depth, AT, CxtI, DT); + return llvm::MaskedValueIsZero(V, Mask, DL, Depth, AC, CxtI, DT); } unsigned ComputeNumSignBits(Value *Op, unsigned Depth = 0, Instruction *CxtI = nullptr) const { - return llvm::ComputeNumSignBits(Op, DL, Depth, AT, CxtI, DT); + return llvm::ComputeNumSignBits(Op, DL, Depth, AC, CxtI, DT); + } + void ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne, + unsigned Depth = 0, Instruction *CxtI = nullptr) const { + return llvm::ComputeSignBit(V, KnownZero, KnownOne, DL, Depth, AC, CxtI, + DT); + } + OverflowResult computeOverflowForUnsignedMul(Value *LHS, Value *RHS, + const Instruction *CxtI) { + return llvm::computeOverflowForUnsignedMul(LHS, RHS, DL, AC, CxtI, DT); + } + OverflowResult computeOverflowForUnsignedAdd(Value *LHS, Value *RHS, + const Instruction *CxtI) { + return llvm::computeOverflowForUnsignedAdd(LHS, RHS, DL, AC, CxtI, DT); } private: - /// SimplifyAssociativeOrCommutative - This performs a few simplifications for - /// operators which are associative or commutative. + /// \brief Performs a few simplifications for operators which are associative + /// or commutative. bool SimplifyAssociativeOrCommutative(BinaryOperator &I); - /// SimplifyUsingDistributiveLaws - This tries to simplify binary operations - /// which some other binary operation distributes over either by factorizing - /// out common terms (eg "(A*B)+(A*C)" -> "A*(B+C)") or expanding out if this - /// results in simplifications (eg: "A & (B | C) -> (A&B) | (A&C)" if this is - /// a win). Returns the simplified value, or null if it didn't simplify. + /// \brief Tries to simplify binary operations which some other binary + /// operation distributes over. + /// + /// It does this by either by factorizing out common terms (eg "(A*B)+(A*C)" + /// -> "A*(B+C)") or expanding out if this results in simplifications (eg: "A + /// & (B | C) -> (A&B) | (A&C)" if this is a win). Returns the simplified + /// value, or null if it didn't simplify. Value *SimplifyUsingDistributiveLaws(BinaryOperator &I); - /// SimplifyDemandedUseBits - Attempts to replace V with a simpler value - /// based on the demanded bits. + /// \brief Attempts to replace V with a simpler value based on the demanded + /// bits. 
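  // Usage sketch for the CreateOverflowTuple helper above. II, LHS and RHS
  // are hypothetical (the overflow intrinsic call and its two arguments); a
  // visitor that has proven the operation cannot overflow could emit:
  //
  //   Value *Sum = Builder->CreateAdd(LHS, RHS);
  //   return CreateOverflowTuple(II, Sum, /*Overflow=*/false);
  //
  // which produces the {result, i1 false} aggregate the intrinsic's users
  // expect.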
Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask, APInt &KnownZero, APInt &KnownOne, unsigned Depth, Instruction *CxtI = nullptr); @@ -380,15 +477,15 @@ private: APInt DemandedMask, APInt &KnownZero, APInt &KnownOne); - /// SimplifyDemandedInstructionBits - Inst is an integer instruction that - /// SimplifyDemandedBits knows about. See if the instruction has any - /// properties that allow us to simplify its operands. + /// \brief Tries to simplify operands to an integer instruction based on its + /// demanded bits. bool SimplifyDemandedInstructionBits(Instruction &Inst); Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, APInt &UndefElts, unsigned Depth = 0); Value *SimplifyVectorOp(BinaryOperator &Inst); + Value *SimplifyBSwap(BinaryOperator &Inst); // FoldOpIntoPhi - Given a binary operator, cast instruction, or select // which has a PHI node as operand #0, see if we can fold the instruction @@ -397,9 +494,8 @@ private: // Instruction *FoldOpIntoPhi(Instruction &I); - // FoldPHIArgOpIntoPHI - If all operands to a PHI node are the same "unary" - // operator and they all are only used by the PHI, PHI together their - // inputs, and do the operation once, to the result of the PHI. + /// \brief Try to rotate an operation below a PHI node, using PHI nodes for + /// its operands. Instruction *FoldPHIArgOpIntoPHI(PHINode &PN); Instruction *FoldPHIArgBinOpIntoPHI(PHINode &PN); Instruction *FoldPHIArgGEPIntoPHI(PHINode &PN); @@ -420,8 +516,9 @@ private: Value *EvaluateInDifferentType(Value *V, Type *Ty, bool isSigned); - /// Descale - Return a value X such that Val = X * Scale, or null if none. If - /// the multiplication is known not to overflow then NoSignedWrap is set. + /// \brief Returns a value X such that Val = X * Scale, or null if none. + /// + /// If the multiplication is known not to overflow then NoSignedWrap is set. Value *Descale(Value *Val, APInt Scale, bool &NoSignedWrap); }; diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index f3ac44c..b9eb986 100644 --- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -11,12 +11,13 @@ // //===----------------------------------------------------------------------===// -#include "InstCombine.h" +#include "InstCombineInternal.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/Loads.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -268,9 +269,8 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { // is only subsequently read. 
SmallVector<Instruction *, 4> ToDelete; if (MemTransferInst *Copy = isOnlyCopiedFromConstantGlobal(&AI, ToDelete)) { - unsigned SourceAlign = getOrEnforceKnownAlignment(Copy->getSource(), - AI.getAlignment(), - DL, AT, &AI, DT); + unsigned SourceAlign = getOrEnforceKnownAlignment( + Copy->getSource(), AI.getAlignment(), DL, AC, &AI, DT); if (AI.getAlignment() <= SourceAlign) { DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n'); DEBUG(dbgs() << " memcpy = " << *Copy << '\n'); @@ -310,6 +310,7 @@ static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewT LoadInst *NewLoad = IC.Builder->CreateAlignedLoad( IC.Builder->CreateBitCast(Ptr, NewTy->getPointerTo(AS)), LI.getAlignment(), LI.getName()); + MDBuilder MDB(NewLoad->getContext()); for (const auto &MDPair : MD) { unsigned ID = MDPair.first; MDNode *N = MDPair.second; @@ -331,20 +332,86 @@ static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewT case LLVMContext::MD_noalias: case LLVMContext::MD_nontemporal: case LLVMContext::MD_mem_parallel_loop_access: - case LLVMContext::MD_nonnull: // All of these directly apply. NewLoad->setMetadata(ID, N); break; + case LLVMContext::MD_nonnull: + // This only directly applies if the new type is also a pointer. + if (NewTy->isPointerTy()) { + NewLoad->setMetadata(ID, N); + break; + } + // If it's integral now, translate it to !range metadata. + if (NewTy->isIntegerTy()) { + auto *ITy = cast<IntegerType>(NewTy); + auto *NullInt = ConstantExpr::getPtrToInt( + ConstantPointerNull::get(cast<PointerType>(Ptr->getType())), ITy); + auto *NonNullInt = + ConstantExpr::getAdd(NullInt, ConstantInt::get(ITy, 1)); + NewLoad->setMetadata(LLVMContext::MD_range, + MDB.createRange(NonNullInt, NullInt)); + } + break; + case LLVMContext::MD_range: // FIXME: It would be nice to propagate this in some way, but the type - // conversions make it hard. + // conversions make it hard. If the new type is a pointer, we could + // translate it to !nonnull metadata. break; } } return NewLoad; } +/// \brief Combine a store to a new type. +/// +/// Returns the newly created store instruction. +static StoreInst *combineStoreToNewValue(InstCombiner &IC, StoreInst &SI, Value *V) { + Value *Ptr = SI.getPointerOperand(); + unsigned AS = SI.getPointerAddressSpace(); + SmallVector<std::pair<unsigned, MDNode *>, 8> MD; + SI.getAllMetadata(MD); + + StoreInst *NewStore = IC.Builder->CreateAlignedStore( + V, IC.Builder->CreateBitCast(Ptr, V->getType()->getPointerTo(AS)), + SI.getAlignment()); + for (const auto &MDPair : MD) { + unsigned ID = MDPair.first; + MDNode *N = MDPair.second; + // Note, essentially every kind of metadata should be preserved here! This + // routine is supposed to clone a store instruction changing *only its + // type*. The only metadata it makes sense to drop is metadata which is + // invalidated when the pointer type changes. This should essentially + // never be the case in LLVM, but we explicitly switch over only known + // metadata to be conservatively correct. If you are adding metadata to + // LLVM which pertains to stores, you almost certainly want to add it + // here. + switch (ID) { + case LLVMContext::MD_dbg: + case LLVMContext::MD_tbaa: + case LLVMContext::MD_prof: + case LLVMContext::MD_fpmath: + case LLVMContext::MD_tbaa_struct: + case LLVMContext::MD_alias_scope: + case LLVMContext::MD_noalias: + case LLVMContext::MD_nontemporal: + case LLVMContext::MD_mem_parallel_loop_access: + // All of these directly apply. 
+ NewStore->setMetadata(ID, N); + break; + + case LLVMContext::MD_invariant_load: + case LLVMContext::MD_nonnull: + case LLVMContext::MD_range: + // These don't apply for stores. + break; + } + } + + return NewStore; +} + /// \brief Combine loads to match the type of value their uses after looking /// through intervening bitcasts. /// @@ -371,6 +438,35 @@ static Instruction *combineLoadToOperationType(InstCombiner &IC, LoadInst &LI) { if (LI.use_empty()) return nullptr; + Type *Ty = LI.getType(); + + // Try to canonicalize loads which are only ever stored to operate over + // integers instead of any other type. We only do this when the loaded type + // is sized and has a size exactly the same as its store size and the store + // size is a legal integer type. + const DataLayout *DL = IC.getDataLayout(); + if (!Ty->isIntegerTy() && Ty->isSized() && DL && + DL->isLegalInteger(DL->getTypeStoreSizeInBits(Ty)) && + DL->getTypeStoreSizeInBits(Ty) == DL->getTypeSizeInBits(Ty)) { + if (std::all_of(LI.user_begin(), LI.user_end(), [&LI](User *U) { + auto *SI = dyn_cast<StoreInst>(U); + return SI && SI->getPointerOperand() != &LI; + })) { + LoadInst *NewLoad = combineLoadToNewType( + IC, LI, + Type::getIntNTy(LI.getContext(), DL->getTypeStoreSizeInBits(Ty))); + // Replace all the stores with stores of the newly loaded value. + for (auto UI = LI.user_begin(), UE = LI.user_end(); UI != UE;) { + auto *SI = cast<StoreInst>(*UI++); + IC.Builder->SetInsertPoint(SI); + combineStoreToNewValue(IC, *SI, NewLoad); + IC.EraseInstFromFunction(*SI); + } + assert(LI.use_empty() && "Failed to remove all users of the load!"); + // Return the old load so the combiner can delete it safely. + return &LI; + } + } // Fold away bit casts of the loaded value by loading the desired type. if (LI.hasOneUse()) @@ -386,6 +482,181 @@ static Instruction *combineLoadToOperationType(InstCombiner &IC, LoadInst &LI) { return nullptr; } +// If we can determine that all possible objects pointed to by the provided +// pointer value are, not only dereferenceable, but also definitively less than +// or equal to the provided maximum size, then return true. Otherwise, return +// false (constant global values and allocas fall into this category). +// +// FIXME: This should probably live in ValueTracking (or similar). +static bool isObjectSizeLessThanOrEq(Value *V, uint64_t MaxSize, + const DataLayout *DL) { + SmallPtrSet<Value *, 4> Visited; + SmallVector<Value *, 4> Worklist(1, V); + + do { + Value *P = Worklist.pop_back_val(); + P = P->stripPointerCasts(); + + if (!Visited.insert(P).second) + continue; + + if (SelectInst *SI = dyn_cast<SelectInst>(P)) { + Worklist.push_back(SI->getTrueValue()); + Worklist.push_back(SI->getFalseValue()); + continue; + } + + if (PHINode *PN = dyn_cast<PHINode>(P)) { + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) + Worklist.push_back(PN->getIncomingValue(i)); + continue; + } + + if (GlobalAlias *GA = dyn_cast<GlobalAlias>(P)) { + if (GA->mayBeOverridden()) + return false; + Worklist.push_back(GA->getAliasee()); + continue; + } + + // If we know how big this object is, and it is less than MaxSize, continue + // searching. Otherwise, return false. 
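// For instance (hypothetical IR, assuming 4-byte i32): for
//   %a = alloca [8 x i32]
// the object is a known 8 * 4 = 32 bytes, so the walk continues only when
// MaxSize >= 32; for
//   %p = alloca i32, i32 %n
// the array size is not a ConstantInt and the routine conservatively
// returns false.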
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(P)) { + if (!AI->getAllocatedType()->isSized()) + return false; + + ConstantInt *CS = dyn_cast<ConstantInt>(AI->getArraySize()); + if (!CS) + return false; + + uint64_t TypeSize = DL->getTypeAllocSize(AI->getAllocatedType()); + // Make sure that, even if the multiplication below would wrap as an + // uint64_t, we still do the right thing. + if ((CS->getValue().zextOrSelf(128)*APInt(128, TypeSize)).ugt(MaxSize)) + return false; + continue; + } + + if (GlobalVariable *GV = dyn_cast<GlobalVariable>(P)) { + if (!GV->hasDefinitiveInitializer() || !GV->isConstant()) + return false; + + uint64_t InitSize = DL->getTypeAllocSize(GV->getType()->getElementType()); + if (InitSize > MaxSize) + return false; + continue; + } + + return false; + } while (!Worklist.empty()); + + return true; +} + +// If we're indexing into an object of a known size, and the outer index is +// not a constant, but having any value but zero would lead to undefined +// behavior, replace it with zero. +// +// For example, if we have: +// @f.a = private unnamed_addr constant [1 x i32] [i32 12], align 4 +// ... +// %arrayidx = getelementptr inbounds [1 x i32]* @f.a, i64 0, i64 %x +// ... = load i32* %arrayidx, align 4 +// Then we know that we can replace %x in the GEP with i64 0. +// +// FIXME: We could fold any GEP index to zero that would cause UB if it were +// not zero. Currently, we only handle the first such index. Also, we could +// also search through non-zero constant indices if we kept track of the +// offsets those indices implied. +static bool canReplaceGEPIdxWithZero(InstCombiner &IC, GetElementPtrInst *GEPI, + Instruction *MemI, unsigned &Idx) { + const DataLayout *DL = IC.getDataLayout(); + if (GEPI->getNumOperands() < 2 || !DL) + return false; + + // Find the first non-zero index of a GEP. If all indices are zero, return + // one past the last index. + auto FirstNZIdx = [](const GetElementPtrInst *GEPI) { + unsigned I = 1; + for (unsigned IE = GEPI->getNumOperands(); I != IE; ++I) { + Value *V = GEPI->getOperand(I); + if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) + if (CI->isZero()) + continue; + + break; + } + + return I; + }; + + // Skip through initial 'zero' indices, and find the corresponding pointer + // type. See if the next index is not a constant. + Idx = FirstNZIdx(GEPI); + if (Idx == GEPI->getNumOperands()) + return false; + if (isa<Constant>(GEPI->getOperand(Idx))) + return false; + + SmallVector<Value *, 4> Ops(GEPI->idx_begin(), GEPI->idx_begin() + Idx); + Type *AllocTy = + GetElementPtrInst::getIndexedType(GEPI->getOperand(0)->getType(), Ops); + if (!AllocTy || !AllocTy->isSized()) + return false; + uint64_t TyAllocSize = DL->getTypeAllocSize(AllocTy); + + // If there are more indices after the one we might replace with a zero, make + // sure they're all non-negative. If any of them are negative, the overall + // address being computed might be before the base address determined by the + // first non-zero index. + auto IsAllNonNegative = [&]() { + for (unsigned i = Idx+1, e = GEPI->getNumOperands(); i != e; ++i) { + bool KnownNonNegative, KnownNegative; + IC.ComputeSignBit(GEPI->getOperand(i), KnownNonNegative, + KnownNegative, 0, MemI); + if (KnownNonNegative) + continue; + return false; + } + + return true; + }; + + // FIXME: If the GEP is not inbounds, and there are extra indices after the + // one we'll replace, those could cause the address computation to wrap + // (rendering the IsAllNonNegative() check below insufficient). 
We can do + // better, ignoring zero indicies (and other indicies we can prove small + // enough not to wrap). + if (Idx+1 != GEPI->getNumOperands() && !GEPI->isInBounds()) + return false; + + // Note that isObjectSizeLessThanOrEq will return true only if the pointer is + // also known to be dereferenceable. + return isObjectSizeLessThanOrEq(GEPI->getOperand(0), TyAllocSize, DL) && + IsAllNonNegative(); +} + +// If we're indexing into an object with a variable index for the memory +// access, but the object has only one element, we can assume that the index +// will always be zero. If we replace the GEP, return it. +template <typename T> +static Instruction *replaceGEPIdxWithZero(InstCombiner &IC, Value *Ptr, + T &MemI) { + if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr)) { + unsigned Idx; + if (canReplaceGEPIdxWithZero(IC, GEPI, &MemI, Idx)) { + Instruction *NewGEPI = GEPI->clone(); + NewGEPI->setOperand(Idx, + ConstantInt::get(GEPI->getOperand(Idx)->getType(), 0)); + NewGEPI->insertBefore(GEPI); + MemI.setOperand(MemI.getPointerOperandIndex(), NewGEPI); + return NewGEPI; + } + } + + return nullptr; +} + Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { Value *Op = LI.getOperand(0); @@ -395,9 +666,8 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { // Attempt to improve the alignment. if (DL) { - unsigned KnownAlign = - getOrEnforceKnownAlignment(Op, DL->getPrefTypeAlignment(LI.getType()), - DL, AT, &LI, DT); + unsigned KnownAlign = getOrEnforceKnownAlignment( + Op, DL->getPrefTypeAlignment(LI.getType()), DL, AC, &LI, DT); unsigned LoadAlign = LI.getAlignment(); unsigned EffectiveLoadAlign = LoadAlign != 0 ? LoadAlign : DL->getABITypeAlignment(LI.getType()); @@ -408,6 +678,12 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { LI.setAlignment(EffectiveLoadAlign); } + // Replace GEP indices if possible. + if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Op, LI)) { + Worklist.Add(NewGEPI); + return &LI; + } + // None of the following transforms are legal for volatile/atomic loads. // FIXME: Some of it is okay for atomic loads; needs refactoring. if (!LI.isSimple()) return nullptr; @@ -418,7 +694,8 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { BasicBlock::iterator BBI = &LI; if (Value *AvailableVal = FindAvailableLoadedValue(Op, LI.getParent(), BBI,6)) return ReplaceInstUsesWith( - LI, Builder->CreateBitCast(AvailableVal, LI.getType())); + LI, Builder->CreateBitOrPointerCast(AvailableVal, LI.getType(), + LI.getName() + ".cast")); // load(gep null, ...) -> unreachable if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Op)) { @@ -473,119 +750,61 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { } // load (select (cond, null, P)) -> load P - if (Constant *C = dyn_cast<Constant>(SI->getOperand(1))) - if (C->isNullValue()) { - LI.setOperand(0, SI->getOperand(2)); - return &LI; - } + if (isa<ConstantPointerNull>(SI->getOperand(1)) && + LI.getPointerAddressSpace() == 0) { + LI.setOperand(0, SI->getOperand(2)); + return &LI; + } // load (select (cond, P, null)) -> load P - if (Constant *C = dyn_cast<Constant>(SI->getOperand(2))) - if (C->isNullValue()) { - LI.setOperand(0, SI->getOperand(1)); - return &LI; - } + if (isa<ConstantPointerNull>(SI->getOperand(2)) && + LI.getPointerAddressSpace() == 0) { + LI.setOperand(0, SI->getOperand(1)); + return &LI; + } } } return nullptr; } -/// InstCombineStoreToCast - Fold store V, (cast P) -> store (cast V), P -/// when possible. 
This makes it generally easy to do alias analysis and/or -/// SROA/mem2reg of the memory object. -static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) { - User *CI = cast<User>(SI.getOperand(1)); - Value *CastOp = CI->getOperand(0); - - Type *DestPTy = CI->getType()->getPointerElementType(); - PointerType *SrcTy = dyn_cast<PointerType>(CastOp->getType()); - if (!SrcTy) return nullptr; - - Type *SrcPTy = SrcTy->getElementType(); - - if (!DestPTy->isIntegerTy() && !DestPTy->isPointerTy()) - return nullptr; - - /// NewGEPIndices - If SrcPTy is an aggregate type, we can emit a "noop gep" - /// to its first element. This allows us to handle things like: - /// store i32 xxx, (bitcast {foo*, float}* %P to i32*) - /// on 32-bit hosts. - SmallVector<Value*, 4> NewGEPIndices; - - // If the source is an array, the code below will not succeed. Check to - // see if a trivial 'gep P, 0, 0' will help matters. Only do this for - // constants. - if (SrcPTy->isArrayTy() || SrcPTy->isStructTy()) { - // Index through pointer. - Constant *Zero = Constant::getNullValue(Type::getInt32Ty(SI.getContext())); - NewGEPIndices.push_back(Zero); - - while (1) { - if (StructType *STy = dyn_cast<StructType>(SrcPTy)) { - if (!STy->getNumElements()) /* Struct can be empty {} */ - break; - NewGEPIndices.push_back(Zero); - SrcPTy = STy->getElementType(0); - } else if (ArrayType *ATy = dyn_cast<ArrayType>(SrcPTy)) { - NewGEPIndices.push_back(Zero); - SrcPTy = ATy->getElementType(); - } else { - break; - } - } - - SrcTy = PointerType::get(SrcPTy, SrcTy->getAddressSpace()); - } - - if (!SrcPTy->isIntegerTy() && !SrcPTy->isPointerTy()) - return nullptr; - - // If the pointers point into different address spaces don't do the - // transformation. - if (SrcTy->getAddressSpace() != CI->getType()->getPointerAddressSpace()) - return nullptr; - - // If the pointers point to values of different sizes don't do the - // transformation. - if (!IC.getDataLayout() || - IC.getDataLayout()->getTypeSizeInBits(SrcPTy) != - IC.getDataLayout()->getTypeSizeInBits(DestPTy)) - return nullptr; +/// \brief Combine stores to match the type of value being stored. +/// +/// The core idea here is that the memory does not have any intrinsic type and +/// where we can we should match the type of a store to the type of value being +/// stored. +/// +/// However, this routine must never change the width of a store or the number of +/// stores as that would introduce a semantic change. This combine is expected to +/// be a semantic no-op which just allows stores to more closely model the types +/// of their incoming values. +/// +/// Currently, we also refuse to change the precise type used for an atomic or +/// volatile store. This is debatable, and might be reasonable to change later. +/// However, it is risky in case some backend or other part of LLVM is relying +/// on the exact type stored to select appropriate atomic operations. +/// +/// \returns true if the store was successfully combined away. This indicates +/// the caller must erase the store instruction. We have to let the caller erase +/// the store instruction sas otherwise there is no way to signal whether it was +/// combined or not: IC.EraseInstFromFunction returns a null pointer. +static bool combineStoreToValueType(InstCombiner &IC, StoreInst &SI) { + // FIXME: We could probably with some care handle both volatile and atomic + // stores here but it isn't clear that this is important. 
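// A minimal sketch of the simple (non-volatile, non-atomic) case handled
// here, with hypothetical values:
//   %v = bitcast float %f to i32
//   store i32 %v, i32* %p
// becomes
//   %p.cast = bitcast i32* %p to float*
//   store float %f, float* %p.cast
// The cast moves from the stored value to the pointer and the store width
// is unchanged.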
+ if (!SI.isSimple()) + return false; - // If the pointers point to pointers to different address spaces don't do the - // transformation. It is not safe to introduce an addrspacecast instruction in - // this case since, depending on the target, addrspacecast may not be a no-op - // cast. - if (SrcPTy->isPointerTy() && DestPTy->isPointerTy() && - SrcPTy->getPointerAddressSpace() != DestPTy->getPointerAddressSpace()) - return nullptr; + Value *V = SI.getValueOperand(); - // Okay, we are casting from one integer or pointer type to another of - // the same size. Instead of casting the pointer before - // the store, cast the value to be stored. - Value *NewCast; - Instruction::CastOps opcode = Instruction::BitCast; - Type* CastSrcTy = DestPTy; - Type* CastDstTy = SrcPTy; - if (CastDstTy->isPointerTy()) { - if (CastSrcTy->isIntegerTy()) - opcode = Instruction::IntToPtr; - } else if (CastDstTy->isIntegerTy()) { - if (CastSrcTy->isPointerTy()) - opcode = Instruction::PtrToInt; + // Fold away bit casts of the stored value by storing the original type. + if (auto *BC = dyn_cast<BitCastInst>(V)) { + V = BC->getOperand(0); + combineStoreToNewValue(IC, SI, V); + return true; } - // SIOp0 is a pointer to aggregate and this is a store to the first field, - // emit a GEP to index into its first field. - if (!NewGEPIndices.empty()) - CastOp = IC.Builder->CreateInBoundsGEP(CastOp, NewGEPIndices); - - Value *SIOp0 = SI.getOperand(0); - NewCast = IC.Builder->CreateCast(opcode, SIOp0, CastDstTy, - SIOp0->getName()+".c"); - SI.setOperand(0, NewCast); - SI.setOperand(1, CastOp); - return &SI; + // FIXME: We should also canonicalize loads of vectors when their elements are + // cast to other types. + return false; } /// equivalentAddressValues - Test if A and B will obviously have the same @@ -621,11 +840,14 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { Value *Val = SI.getOperand(0); Value *Ptr = SI.getOperand(1); + // Try to canonicalize the stored type. + if (combineStoreToValueType(*this, SI)) + return EraseInstFromFunction(SI); + // Attempt to improve the alignment. if (DL) { - unsigned KnownAlign = - getOrEnforceKnownAlignment(Ptr, DL->getPrefTypeAlignment(Val->getType()), - DL, AT, &SI, DT); + unsigned KnownAlign = getOrEnforceKnownAlignment( + Ptr, DL->getPrefTypeAlignment(Val->getType()), DL, AC, &SI, DT); unsigned StoreAlign = SI.getAlignment(); unsigned EffectiveStoreAlign = StoreAlign != 0 ? StoreAlign : DL->getABITypeAlignment(Val->getType()); @@ -636,6 +858,12 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { SI.setAlignment(EffectiveStoreAlign); } + // Replace GEP indices if possible. + if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Ptr, SI)) { + Worklist.Add(NewGEPI); + return &SI; + } + // Don't hack volatile/atomic stores. // FIXME: Some bits are legal for atomic stores; needs refactoring. if (!SI.isSimple()) return nullptr; @@ -712,17 +940,6 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { if (isa<UndefValue>(Val)) return EraseInstFromFunction(SI); - // If the pointer destination is a cast, see if we can fold the cast into the - // source instead. 
- if (isa<CastInst>(Ptr)) - if (Instruction *Res = InstCombineStoreToCast(*this, SI)) - return Res; - if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) - if (CE->isCast()) - if (Instruction *Res = InstCombineStoreToCast(*this, SI)) - return Res; - - // If this store is the last instruction in the basic block (possibly // excepting debug info instructions), and if the block ends with an // unconditional branch, try to move it to the successor block. diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index 8c48dce..c48e3c9 100644 --- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -12,7 +12,7 @@ // //===----------------------------------------------------------------------===// -#include "InstCombine.h" +#include "InstCombineInternal.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PatternMatch.h" @@ -46,10 +46,10 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC, // (PowerOfTwo >>u B) --> isExact since shifting out the result would make it // inexact. Similarly for <<. if (BinaryOperator *I = dyn_cast<BinaryOperator>(V)) - if (I->isLogicalShift() && isKnownToBeAPowerOfTwo(I->getOperand(0), false, - 0, IC.getAssumptionTracker(), - CxtI, - IC.getDominatorTree())) { + if (I->isLogicalShift() && + isKnownToBeAPowerOfTwo(I->getOperand(0), false, 0, + IC.getAssumptionCache(), CxtI, + IC.getDominatorTree())) { // We know that this is an exact/nuw shift and that the input is a // non-zero context as well. if (Value *V2 = simplifyValueKnownNonZero(I->getOperand(0), IC, CxtI)) { @@ -123,6 +123,48 @@ static Constant *getLogBase2Vector(ConstantDataVector *CV) { return ConstantVector::get(Elts); } +/// \brief Return true if we can prove that: +/// (mul LHS, RHS) === (mul nsw LHS, RHS) +bool InstCombiner::WillNotOverflowSignedMul(Value *LHS, Value *RHS, + Instruction *CxtI) { + // Multiplying n * m significant bits yields a result of n + m significant + // bits. If the total number of significant bits does not exceed the + // result bit width (minus 1), there is no overflow. + // This means if we have enough leading sign bits in the operands + // we can guarantee that the result does not overflow. + // Ref: "Hacker's Delight" by Henry Warren + unsigned BitWidth = LHS->getType()->getScalarSizeInBits(); + + // Note that underestimating the number of sign bits gives a more + // conservative answer. + unsigned SignBits = ComputeNumSignBits(LHS, 0, CxtI) + + ComputeNumSignBits(RHS, 0, CxtI); + + // First handle the easy case: if we have enough sign bits there's + // definitely no overflow. + if (SignBits > BitWidth + 1) + return true; + + // There are two ambiguous cases where there can be no overflow: + // SignBits == BitWidth + 1 and + // SignBits == BitWidth + // The second case is difficult to check, therefore we only handle the + // first case. + if (SignBits == BitWidth + 1) { + // It overflows only when both arguments are negative and the true + // product is exactly the minimum negative number. + // E.g. mul i16 with 17 sign bits: 0xff00 * 0xff80 = 0x8000 + // For simplicity we just check if at least one side is not negative. 
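  // Numeric illustration with hypothetical i16 operands: if %a and %b each
  // carry 9 sign bits they both lie in [-128, 127], so |%a * %b| <= 16384,
  // which fits in i16, and SignBits = 18 > 17 proves nsw directly. With
  // SignBits == 17 the only overflowing product is the 0xff00 * 0xff80 case
  // above, which needs both operands negative, so knowing either operand is
  // non-negative rules it out.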
+ bool LHSNonNegative, LHSNegative; + bool RHSNonNegative, RHSNegative; + ComputeSignBit(LHS, LHSNonNegative, LHSNegative, /*Depth=*/0, CxtI); + ComputeSignBit(RHS, RHSNonNegative, RHSNegative, /*Depth=*/0, CxtI); + if (LHSNonNegative || RHSNonNegative) + return true; + } + return false; +} + Instruction *InstCombiner::visitMul(BinaryOperator &I) { bool Changed = SimplifyAssociativeOrCommutative(I); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); @@ -130,14 +172,19 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { if (Value *V = SimplifyVectorOp(I)) return ReplaceInstUsesWith(I, V); - if (Value *V = SimplifyMulInst(Op0, Op1, DL, TLI, DT, AT)) + if (Value *V = SimplifyMulInst(Op0, Op1, DL, TLI, DT, AC)) return ReplaceInstUsesWith(I, V); if (Value *V = SimplifyUsingDistributiveLaws(I)) return ReplaceInstUsesWith(I, V); - if (match(Op1, m_AllOnes())) // X * -1 == 0 - X - return BinaryOperator::CreateNeg(Op0, I.getName()); + // X * -1 == 0 - X + if (match(Op1, m_AllOnes())) { + BinaryOperator *BO = BinaryOperator::CreateNeg(Op0, I.getName()); + if (I.hasNoSignedWrap()) + BO->setHasNoSignedWrap(); + return BO; + } // Also allow combining multiply instructions on vectors. { @@ -146,9 +193,18 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { const APInt *IVal; if (match(&I, m_Mul(m_Shl(m_Value(NewOp), m_Constant(C2)), m_Constant(C1))) && - match(C1, m_APInt(IVal))) - // ((X << C1)*C2) == (X * (C2 << C1)) - return BinaryOperator::CreateMul(NewOp, ConstantExpr::getShl(C1, C2)); + match(C1, m_APInt(IVal))) { + // ((X << C2)*C1) == (X * (C1 << C2)) + Constant *Shl = ConstantExpr::getShl(C1, C2); + BinaryOperator *Mul = cast<BinaryOperator>(I.getOperand(0)); + BinaryOperator *BO = BinaryOperator::CreateMul(NewOp, Shl); + if (I.hasNoUnsignedWrap() && Mul->hasNoUnsignedWrap()) + BO->setHasNoUnsignedWrap(); + if (I.hasNoSignedWrap() && Mul->hasNoSignedWrap() && + Shl->isNotMinSignedValue()) + BO->setHasNoSignedWrap(); + return BO; + } if (match(&I, m_Mul(m_Value(NewOp), m_Constant(C1)))) { Constant *NewCst = nullptr; @@ -165,6 +221,8 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { if (I.hasNoUnsignedWrap()) Shl->setHasNoUnsignedWrap(); + if (I.hasNoSignedWrap() && NewCst->isNotMinSignedValue()) + Shl->setHasNoSignedWrap(); return Shl; } @@ -221,9 +279,16 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { } } - if (Value *Op0v = dyn_castNegVal(Op0)) // -X * -Y = X*Y - if (Value *Op1v = dyn_castNegVal(Op1)) - return BinaryOperator::CreateMul(Op0v, Op1v); + if (Value *Op0v = dyn_castNegVal(Op0)) { // -X * -Y = X*Y + if (Value *Op1v = dyn_castNegVal(Op1)) { + BinaryOperator *BO = BinaryOperator::CreateMul(Op0v, Op1v); + if (I.hasNoSignedWrap() && + match(Op0, m_NSWSub(m_Value(), m_Value())) && + match(Op1, m_NSWSub(m_Value(), m_Value()))) + BO->setHasNoSignedWrap(); + return BO; + } + } // (X / Y) * Y = X - (X % Y) // (X / Y) * -Y = (X % Y) - X @@ -272,10 +337,22 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { // (1 << Y)*X --> X << Y { Value *Y; - if (match(Op0, m_Shl(m_One(), m_Value(Y)))) - return BinaryOperator::CreateShl(Op1, Y); - if (match(Op1, m_Shl(m_One(), m_Value(Y)))) - return BinaryOperator::CreateShl(Op0, Y); + BinaryOperator *BO = nullptr; + bool ShlNSW = false; + if (match(Op0, m_Shl(m_One(), m_Value(Y)))) { + BO = BinaryOperator::CreateShl(Op1, Y); + ShlNSW = cast<ShlOperator>(Op0)->hasNoSignedWrap(); + } else if (match(Op1, m_Shl(m_One(), m_Value(Y)))) { + BO = BinaryOperator::CreateShl(Op0, Y); + ShlNSW = 
cast<ShlOperator>(Op1)->hasNoSignedWrap(); + } + if (BO) { + if (I.hasNoUnsignedWrap()) + BO->setHasNoUnsignedWrap(); + if (I.hasNoSignedWrap() && ShlNSW) + BO->setHasNoSignedWrap(); + return BO; + } } // If one of the operands of the multiply is a cast from a boolean value, then @@ -298,6 +375,18 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { } } + if (!I.hasNoSignedWrap() && WillNotOverflowSignedMul(Op0, Op1, &I)) { + Changed = true; + I.setHasNoSignedWrap(true); + } + + if (!I.hasNoUnsignedWrap() && + computeOverflowForUnsignedMul(Op0, Op1, &I) == + OverflowResult::NeverOverflows) { + Changed = true; + I.setHasNoUnsignedWrap(true); + } + return Changed ? &I : nullptr; } @@ -441,8 +530,8 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { if (isa<Constant>(Op0)) std::swap(Op0, Op1); - if (Value *V = SimplifyFMulInst(Op0, Op1, I.getFastMathFlags(), DL, TLI, - DT, AT)) + if (Value *V = + SimplifyFMulInst(Op0, Op1, I.getFastMathFlags(), DL, TLI, DT, AC)) return ReplaceInstUsesWith(I, V); bool AllowReassociate = I.hasUnsafeAlgebra(); @@ -946,7 +1035,7 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) { if (Value *V = SimplifyVectorOp(I)) return ReplaceInstUsesWith(I, V); - if (Value *V = SimplifyUDivInst(Op0, Op1, DL, TLI, DT, AT)) + if (Value *V = SimplifyUDivInst(Op0, Op1, DL, TLI, DT, AC)) return ReplaceInstUsesWith(I, V); // Handle the integer div common cases @@ -961,9 +1050,14 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) { match(Op1, m_APInt(C2))) { bool Overflow; APInt C2ShlC1 = C2->ushl_ov(*C1, Overflow); - if (!Overflow) - return BinaryOperator::CreateUDiv( + if (!Overflow) { + bool IsExact = I.isExact() && match(Op0, m_Exact(m_Value())); + BinaryOperator *BO = BinaryOperator::CreateUDiv( X, ConstantInt::get(X->getType(), C2ShlC1)); + if (IsExact) + BO->setIsExact(); + return BO; + } } } @@ -1014,7 +1108,7 @@ Instruction *InstCombiner::visitSDiv(BinaryOperator &I) { if (Value *V = SimplifyVectorOp(I)) return ReplaceInstUsesWith(I, V); - if (Value *V = SimplifySDivInst(Op0, Op1, DL, TLI, DT, AT)) + if (Value *V = SimplifySDivInst(Op0, Op1, DL, TLI, DT, AC)) return ReplaceInstUsesWith(I, V); // Handle the integer div common cases @@ -1041,10 +1135,12 @@ Instruction *InstCombiner::visitSDiv(BinaryOperator &I) { return new ZExtInst(Builder->CreateICmpEQ(Op0, Op1), I.getType()); // -X/C --> X/-C provided the negation doesn't overflow. - if (SubOperator *Sub = dyn_cast<SubOperator>(Op0)) - if (match(Sub->getOperand(0), m_Zero()) && Sub->hasNoSignedWrap()) - return BinaryOperator::CreateSDiv(Sub->getOperand(1), - ConstantExpr::getNeg(RHS)); + Value *X; + if (match(Op0, m_NSWSub(m_Zero(), m_Value(X)))) { + auto *BO = BinaryOperator::CreateSDiv(X, ConstantExpr::getNeg(RHS)); + BO->setIsExact(I.isExact()); + return BO; + } } // If the sign bits of both operands are zero (i.e. 
we can prove they are @@ -1054,15 +1150,19 @@ Instruction *InstCombiner::visitSDiv(BinaryOperator &I) { if (MaskedValueIsZero(Op0, Mask, 0, &I)) { if (MaskedValueIsZero(Op1, Mask, 0, &I)) { // X sdiv Y -> X udiv Y, iff X and Y don't have sign bit set - return BinaryOperator::CreateUDiv(Op0, Op1, I.getName()); + auto *BO = BinaryOperator::CreateUDiv(Op0, Op1, I.getName()); + BO->setIsExact(I.isExact()); + return BO; } - if (match(Op1, m_Shl(m_Power2(), m_Value()))) { + if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, AC, &I, DT)) { // X sdiv (1 << Y) -> X udiv (1 << Y) ( -> X u>> Y) // Safe because the only negative value (1 << Y) can take on is // INT_MIN, and X sdiv INT_MIN == X udiv INT_MIN == 0 if X doesn't have // the sign bit set. - return BinaryOperator::CreateUDiv(Op0, Op1, I.getName()); + auto *BO = BinaryOperator::CreateUDiv(Op0, Op1, I.getName()); + BO->setIsExact(I.isExact()); + return BO; } } } @@ -1106,7 +1206,8 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) { if (Value *V = SimplifyVectorOp(I)) return ReplaceInstUsesWith(I, V); - if (Value *V = SimplifyFDivInst(Op0, Op1, DL, TLI, DT, AT)) + if (Value *V = SimplifyFDivInst(Op0, Op1, I.getFastMathFlags(), + DL, TLI, DT, AC)) return ReplaceInstUsesWith(I, V); if (isa<Constant>(Op0)) @@ -1271,7 +1372,7 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) { if (Value *V = SimplifyVectorOp(I)) return ReplaceInstUsesWith(I, V); - if (Value *V = SimplifyURemInst(Op0, Op1, DL, TLI, DT, AT)) + if (Value *V = SimplifyURemInst(Op0, Op1, DL, TLI, DT, AC)) return ReplaceInstUsesWith(I, V); if (Instruction *common = commonIRemTransforms(I)) @@ -1284,7 +1385,7 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) { I.getType()); // X urem Y -> X and Y-1, where Y is a power of 2, - if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/true, 0, AT, &I, DT)) { + if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, AC, &I, DT)) { Constant *N1 = Constant::getAllOnesValue(I.getType()); Value *Add = Builder->CreateAdd(Op1, N1); return BinaryOperator::CreateAnd(Op0, Add); @@ -1306,7 +1407,7 @@ Instruction *InstCombiner::visitSRem(BinaryOperator &I) { if (Value *V = SimplifyVectorOp(I)) return ReplaceInstUsesWith(I, V); - if (Value *V = SimplifySRemInst(Op0, Op1, DL, TLI, DT, AT)) + if (Value *V = SimplifySRemInst(Op0, Op1, DL, TLI, DT, AC)) return ReplaceInstUsesWith(I, V); // Handle the integer rem common cases @@ -1381,7 +1482,8 @@ Instruction *InstCombiner::visitFRem(BinaryOperator &I) { if (Value *V = SimplifyVectorOp(I)) return ReplaceInstUsesWith(I, V); - if (Value *V = SimplifyFRemInst(Op0, Op1, DL, TLI, DT, AT)) + if (Value *V = SimplifyFRemInst(Op0, Op1, I.getFastMathFlags(), + DL, TLI, DT, AC)) return ReplaceInstUsesWith(I, V); // Handle cases involving: rem X, (select Cond, Y, Z) diff --git a/lib/Transforms/InstCombine/InstCombinePHI.cpp b/lib/Transforms/InstCombine/InstCombinePHI.cpp index 794263a..0e73db8 100644 --- a/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -11,7 +11,7 @@ // //===----------------------------------------------------------------------===// -#include "InstCombine.h" +#include "InstCombineInternal.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/InstructionSimplify.h" @@ -788,7 +788,7 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) { // PHINode simplification // Instruction *InstCombiner::visitPHINode(PHINode &PN) { - if (Value *V = SimplifyInstruction(&PN, DL, TLI, DT, AT)) + if 
(Value *V = SimplifyInstruction(&PN, DL, TLI, DT, AC)) return ReplaceInstUsesWith(PN, V); // If all PHI operands are the same operation, pull them through the PHI, diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp index 079ae34..dd0e65f 100644 --- a/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -11,7 +11,7 @@ // //===----------------------------------------------------------------------===// -#include "InstCombine.h" +#include "InstCombineInternal.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/IR/PatternMatch.h" @@ -314,8 +314,7 @@ Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal, static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, const DataLayout *TD, const TargetLibraryInfo *TLI, - DominatorTree *DT, - AssumptionTracker *AT) { + DominatorTree *DT, AssumptionCache *AC) { // Trivial replacement. if (V == Op) return RepOp; @@ -336,10 +335,10 @@ static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, if (CmpInst *C = dyn_cast<CmpInst>(I)) { if (C->getOperand(0) == Op) return SimplifyCmpInst(C->getPredicate(), RepOp, C->getOperand(1), TD, - TLI, DT, AT); + TLI, DT, AC); if (C->getOperand(1) == Op) return SimplifyCmpInst(C->getPredicate(), C->getOperand(0), RepOp, TD, - TLI, DT, AT); + TLI, DT, AC); } // TODO: We could hand off more cases to instsimplify here. @@ -389,15 +388,7 @@ static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, /// 1. The icmp predicate is inverted /// 2. The select operands are reversed /// 3. The magnitude of C2 and C1 are flipped -/// -/// This also tries to turn -/// --- Single bit tests: -/// if ((x & C) == 0) x |= C to x |= C -/// if ((x & C) != 0) x ^= C to x &= ~C -/// if ((x & C) == 0) x ^= C to x |= C -/// if ((x & C) != 0) x &= ~C to x &= ~C -/// if ((x & C) == 0) x &= ~C to nothing -static Value *foldSelectICmpAndOr(SelectInst &SI, Value *TrueVal, +static Value *foldSelectICmpAndOr(const SelectInst &SI, Value *TrueVal, Value *FalseVal, InstCombiner::BuilderTy *Builder) { const ICmpInst *IC = dyn_cast<ICmpInst>(SI.getCondition()); @@ -416,25 +407,6 @@ static Value *foldSelectICmpAndOr(SelectInst &SI, Value *TrueVal, return nullptr; const APInt *C2; - if (match(TrueVal, m_Specific(X))) { - // if ((X & C) != 0) X ^= C becomes X &= ~C - if (match(FalseVal, m_Xor(m_Specific(X), m_APInt(C2))) && C1 == C2) - return Builder->CreateAnd(X, ~(*C1)); - // if ((X & C) != 0) X &= ~C becomes X &= ~C - if (match(FalseVal, m_And(m_Specific(X), m_APInt(C2))) && *C1 == ~(*C2)) - return FalseVal; - } else if (match(FalseVal, m_Specific(X))) { - // if ((X & C) == 0) X ^= C becomes X |= C - if (match(TrueVal, m_Xor(m_Specific(X), m_APInt(C2))) && C1 == C2) - return Builder->CreateOr(X, *C1); - // if ((X & C) == 0) X &= ~C becomes nothing - if (match(TrueVal, m_And(m_Specific(X), m_APInt(C2))) && *C1 == ~(*C2)) - return X; - // if ((X & C) == 0) X |= C becomes X |= C - if (match(TrueVal, m_Or(m_Specific(X), m_APInt(C2))) && C1 == C2) - return TrueVal; - } - bool OrOnTrueVal = false; bool OrOnFalseVal = match(FalseVal, m_Or(m_Specific(TrueVal), m_Power2(C2))); if (!OrOnFalseVal) @@ -465,6 +437,62 @@ static Value *foldSelectICmpAndOr(SelectInst &SI, Value *TrueVal, return Builder->CreateOr(V, Y); } +/// Attempt to fold a cttz/ctlz followed by a icmp plus select into a single +/// call to cttz/ctlz with flag 'is_zero_undef' 
cleared. +/// +/// For example, we can fold the following code sequence: +/// \code +/// %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 true) +/// %1 = icmp ne i32 %x, 0 +/// %2 = select i1 %1, i32 %0, i32 32 +/// \code +/// +/// into: +/// %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 false) +static Value *foldSelectCttzCtlz(ICmpInst *ICI, Value *TrueVal, Value *FalseVal, + InstCombiner::BuilderTy *Builder) { + ICmpInst::Predicate Pred = ICI->getPredicate(); + Value *CmpLHS = ICI->getOperand(0); + Value *CmpRHS = ICI->getOperand(1); + + // Check if the condition value compares a value for equality against zero. + if (!ICI->isEquality() || !match(CmpRHS, m_Zero())) + return nullptr; + + Value *Count = FalseVal; + Value *ValueOnZero = TrueVal; + if (Pred == ICmpInst::ICMP_NE) + std::swap(Count, ValueOnZero); + + // Skip zero extend/truncate. + Value *V = nullptr; + if (match(Count, m_ZExt(m_Value(V))) || + match(Count, m_Trunc(m_Value(V)))) + Count = V; + + // Check if the value propagated on zero is a constant number equal to the + // sizeof in bits of 'Count'. + unsigned SizeOfInBits = Count->getType()->getScalarSizeInBits(); + if (!match(ValueOnZero, m_SpecificInt(SizeOfInBits))) + return nullptr; + + // Check that 'Count' is a call to intrinsic cttz/ctlz. Also check that the + // input to the cttz/ctlz is used as LHS for the compare instruction. + if (match(Count, m_Intrinsic<Intrinsic::cttz>(m_Specific(CmpLHS))) || + match(Count, m_Intrinsic<Intrinsic::ctlz>(m_Specific(CmpLHS)))) { + IntrinsicInst *II = cast<IntrinsicInst>(Count); + IRBuilder<> Builder(II); + // Explicitly clear the 'undef_on_zero' flag. + IntrinsicInst *NewI = cast<IntrinsicInst>(II->clone()); + Type *Ty = NewI->getArgOperand(1)->getType(); + NewI->setArgOperand(1, Constant::getNullValue(Ty)); + Builder.Insert(NewI); + return Builder.CreateZExtOrTrunc(NewI, ValueOnZero->getType()); + } + + return nullptr; +} + /// visitSelectInstWithICmp - Visit a SelectInst that has an /// ICmpInst as its first operand. /// @@ -607,26 +635,26 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI, // arms of the select. See if substituting this value into the arm and // simplifying the result yields the same value as the other arm. 
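For context on the new foldSelectCttzCtlz helper above: the icmp+select+cttz sequence it matches is what the guarded count-trailing/leading-zeros idiom looks like after lowering, since the raw builtins are undefined for a zero input. A minimal C++ sketch of that source-level idiom (illustrative only, not part of the patch; the wrapper name is invented):

#include <cstdint>

// __builtin_ctz(0) is undefined, so callers guard it. Clang lowers the
// builtin to llvm.cttz(x, /*is_zero_undef=*/true), and after optimization the
// guard becomes the icmp+select pattern shown in the comment above. The fold
// then collapses it to a single llvm.cttz(x, false), whose result is defined
// to be the bit width (32 here) when x == 0.
inline uint32_t safe_ctz32(uint32_t x) {
  return x == 0 ? 32u : static_cast<uint32_t>(__builtin_ctz(x));
}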
if (Pred == ICmpInst::ICMP_EQ) { - if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, DL, TLI, - DT, AT) == TrueVal || - SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, DL, TLI, - DT, AT) == TrueVal) + if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, DL, TLI, DT, AC) == + TrueVal || + SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, DL, TLI, DT, AC) == + TrueVal) return ReplaceInstUsesWith(SI, FalseVal); - if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, DL, TLI, - DT, AT) == FalseVal || - SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, DL, TLI, - DT, AT) == FalseVal) + if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, DL, TLI, DT, AC) == + FalseVal || + SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, DL, TLI, DT, AC) == + FalseVal) return ReplaceInstUsesWith(SI, FalseVal); } else if (Pred == ICmpInst::ICMP_NE) { - if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, DL, TLI, - DT, AT) == FalseVal || - SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, DL, TLI, - DT, AT) == FalseVal) + if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, DL, TLI, DT, AC) == + FalseVal || + SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, DL, TLI, DT, AC) == + FalseVal) return ReplaceInstUsesWith(SI, TrueVal); - if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, DL, TLI, - DT, AT) == TrueVal || - SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, DL, TLI, - DT, AT) == TrueVal) + if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, DL, TLI, DT, AC) == + TrueVal || + SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, DL, TLI, DT, AC) == + TrueVal) return ReplaceInstUsesWith(SI, TrueVal); } @@ -644,9 +672,58 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI, } } + if (unsigned BitWidth = TrueVal->getType()->getScalarSizeInBits()) { + APInt MinSignedValue = APInt::getSignBit(BitWidth); + Value *X; + const APInt *Y, *C; + bool TrueWhenUnset; + bool IsBitTest = false; + if (ICmpInst::isEquality(Pred) && + match(CmpLHS, m_And(m_Value(X), m_Power2(Y))) && + match(CmpRHS, m_Zero())) { + IsBitTest = true; + TrueWhenUnset = Pred == ICmpInst::ICMP_EQ; + } else if (Pred == ICmpInst::ICMP_SLT && match(CmpRHS, m_Zero())) { + X = CmpLHS; + Y = &MinSignedValue; + IsBitTest = true; + TrueWhenUnset = false; + } else if (Pred == ICmpInst::ICMP_SGT && match(CmpRHS, m_AllOnes())) { + X = CmpLHS; + Y = &MinSignedValue; + IsBitTest = true; + TrueWhenUnset = true; + } + if (IsBitTest) { + Value *V = nullptr; + // (X & Y) == 0 ? X : X ^ Y --> X & ~Y + if (TrueWhenUnset && TrueVal == X && + match(FalseVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C) + V = Builder->CreateAnd(X, ~(*Y)); + // (X & Y) != 0 ? X ^ Y : X --> X & ~Y + else if (!TrueWhenUnset && FalseVal == X && + match(TrueVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C) + V = Builder->CreateAnd(X, ~(*Y)); + // (X & Y) == 0 ? X ^ Y : X --> X | Y + else if (TrueWhenUnset && FalseVal == X && + match(TrueVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C) + V = Builder->CreateOr(X, *Y); + // (X & Y) != 0 ? X : X ^ Y --> X | Y + else if (!TrueWhenUnset && TrueVal == X && + match(FalseVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C) + V = Builder->CreateOr(X, *Y); + + if (V) + return ReplaceInstUsesWith(SI, V); + } + } + if (Value *V = foldSelectICmpAndOr(SI, TrueVal, FalseVal, Builder)) return ReplaceInstUsesWith(SI, V); + if (Value *V = foldSelectCttzCtlz(ICI, TrueVal, FalseVal, Builder)) + return ReplaceInstUsesWith(SI, V); + return Changed ? 
&SI : nullptr; } @@ -835,8 +912,8 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { Value *TrueVal = SI.getTrueValue(); Value *FalseVal = SI.getFalseValue(); - if (Value *V = SimplifySelectInst(CondVal, TrueVal, FalseVal, DL, TLI, - DT, AT)) + if (Value *V = + SimplifySelectInst(CondVal, TrueVal, FalseVal, DL, TLI, DT, AC)) return ReplaceInstUsesWith(SI, V); if (SI.getType()->isIntegerTy(1)) { @@ -928,8 +1005,22 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { !CFPf->getValueAPF().isZero())) return ReplaceInstUsesWith(SI, TrueVal); } - // NOTE: if we wanted to, this is where to detect MIN/MAX + // Canonicalize to use ordered comparisons by swapping the select + // operands. + // + // e.g. + // (X ugt Y) ? X : Y -> (X ole Y) ? Y : X + if (FCI->hasOneUse() && FCmpInst::isUnordered(FCI->getPredicate())) { + FCmpInst::Predicate InvPred = FCI->getInversePredicate(); + Value *NewCond = Builder->CreateFCmp(InvPred, TrueVal, FalseVal, + FCI->getName() + ".inv"); + + return SelectInst::Create(NewCond, FalseVal, TrueVal, + SI.getName() + ".p"); + } + + // NOTE: if we wanted to, this is where to detect MIN/MAX } else if (FCI->getOperand(0) == FalseVal && FCI->getOperand(1) == TrueVal){ // Transform (X == Y) ? Y : X -> X if (FCI->getPredicate() == FCmpInst::FCMP_OEQ) { @@ -955,6 +1046,21 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { !CFPf->getValueAPF().isZero())) return ReplaceInstUsesWith(SI, TrueVal); } + + // Canonicalize to use ordered comparisons by swapping the select + // operands. + // + // e.g. + // (X ugt Y) ? X : Y -> (X ole Y) ? X : Y + if (FCI->hasOneUse() && FCmpInst::isUnordered(FCI->getPredicate())) { + FCmpInst::Predicate InvPred = FCI->getInversePredicate(); + Value *NewCond = Builder->CreateFCmp(InvPred, FalseVal, TrueVal, + FCI->getName() + ".inv"); + + return SelectInst::Create(NewCond, FalseVal, TrueVal, + SI.getName() + ".p"); + } + // NOTE: if we wanted to, this is where to detect MIN/MAX } // NOTE: if we wanted to, this is where to detect ABS @@ -1039,12 +1145,14 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { if (Instruction *FoldI = FoldSelectIntoOp(SI, TrueVal, FalseVal)) return FoldI; + Value *LHS, *RHS, *LHS2, *RHS2; + SelectPatternFlavor SPF = MatchSelectPattern(&SI, LHS, RHS); + // MAX(MAX(a, b), a) -> MAX(a, b) // MIN(MIN(a, b), a) -> MIN(a, b) // MAX(MIN(a, b), a) -> a // MIN(MAX(a, b), a) -> a - Value *LHS, *RHS, *LHS2, *RHS2; - if (SelectPatternFlavor SPF = MatchSelectPattern(&SI, LHS, RHS)) { + if (SPF) { if (SelectPatternFlavor SPF2 = MatchSelectPattern(LHS, LHS2, RHS2)) if (Instruction *R = FoldSPFofSPF(cast<Instruction>(LHS),SPF2,LHS2,RHS2, SI, SPF, RHS)) @@ -1055,6 +1163,33 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { return R; } + // MAX(~a, ~b) -> ~MIN(a, b) + if (SPF == SPF_SMAX || SPF == SPF_UMAX) { + if (IsFreeToInvert(LHS, LHS->hasNUses(2)) && + IsFreeToInvert(RHS, RHS->hasNUses(2))) { + + // This transform adds a xor operation and that extra cost needs to be + // justified. We look for simplifications that will result from + // applying this rule: + + bool Profitable = + (LHS->hasNUses(2) && match(LHS, m_Not(m_Value()))) || + (RHS->hasNUses(2) && match(RHS, m_Not(m_Value()))) || + (SI.hasOneUse() && match(*SI.user_begin(), m_Not(m_Value()))); + + if (Profitable) { + Value *NewLHS = Builder->CreateNot(LHS); + Value *NewRHS = Builder->CreateNot(RHS); + Value *NewCmp = SPF == SPF_SMAX + ? 
Builder->CreateICmpSLT(NewLHS, NewRHS) + : Builder->CreateICmpULT(NewLHS, NewRHS); + Value *NewSI = + Builder->CreateNot(Builder->CreateSelect(NewCmp, NewLHS, NewRHS)); + return ReplaceInstUsesWith(SI, NewSI); + } + } + } + // TODO. // ABS(-X) -> ABS(X) } @@ -1068,20 +1203,38 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { return NV; if (SelectInst *TrueSI = dyn_cast<SelectInst>(TrueVal)) { + // select(C, select(C, a, b), c) -> select(C, a, c) if (TrueSI->getCondition() == CondVal) { if (SI.getTrueValue() == TrueSI->getTrueValue()) return nullptr; SI.setOperand(1, TrueSI->getTrueValue()); return &SI; } + // select(C0, select(C1, a, b), b) -> select(C0&C1, a, b) + // We choose this as normal form to enable folding on the And and shortening + // paths for the values (this helps GetUnderlyingObjects() for example). + if (TrueSI->getFalseValue() == FalseVal && TrueSI->hasOneUse()) { + Value *And = Builder->CreateAnd(CondVal, TrueSI->getCondition()); + SI.setOperand(0, And); + SI.setOperand(1, TrueSI->getTrueValue()); + return &SI; + } } if (SelectInst *FalseSI = dyn_cast<SelectInst>(FalseVal)) { + // select(C, a, select(C, b, c)) -> select(C, a, c) if (FalseSI->getCondition() == CondVal) { if (SI.getFalseValue() == FalseSI->getFalseValue()) return nullptr; SI.setOperand(2, FalseSI->getFalseValue()); return &SI; } + // select(C0, a, select(C1, a, b)) -> select(C0|C1, a, b) + if (FalseSI->getTrueValue() == TrueVal && FalseSI->hasOneUse()) { + Value *Or = Builder->CreateOr(CondVal, FalseSI->getCondition()); + SI.setOperand(0, Or); + SI.setOperand(2, FalseSI->getFalseValue()); + return &SI; + } } if (BinaryOperator::isNot(CondVal)) { diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp index afa907a..b4976e0 100644 --- a/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -11,7 +11,7 @@ // //===----------------------------------------------------------------------===// -#include "InstCombine.h" +#include "InstCombineInternal.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/IR/IntrinsicInst.h" @@ -693,9 +693,9 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) { if (Value *V = SimplifyVectorOp(I)) return ReplaceInstUsesWith(I, V); - if (Value *V = SimplifyShlInst(I.getOperand(0), I.getOperand(1), - I.hasNoSignedWrap(), I.hasNoUnsignedWrap(), - DL, TLI, DT, AT)) + if (Value *V = + SimplifyShlInst(I.getOperand(0), I.getOperand(1), I.hasNoSignedWrap(), + I.hasNoUnsignedWrap(), DL, TLI, DT, AC)) return ReplaceInstUsesWith(I, V); if (Instruction *V = commonShiftTransforms(I)) @@ -735,8 +735,8 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) { if (Value *V = SimplifyVectorOp(I)) return ReplaceInstUsesWith(I, V); - if (Value *V = SimplifyLShrInst(I.getOperand(0), I.getOperand(1), - I.isExact(), DL, TLI, DT, AT)) + if (Value *V = SimplifyLShrInst(I.getOperand(0), I.getOperand(1), I.isExact(), + DL, TLI, DT, AC)) return ReplaceInstUsesWith(I, V); if (Instruction *R = commonShiftTransforms(I)) @@ -779,8 +779,8 @@ Instruction *InstCombiner::visitAShr(BinaryOperator &I) { if (Value *V = SimplifyVectorOp(I)) return ReplaceInstUsesWith(I, V); - if (Value *V = SimplifyAShrInst(I.getOperand(0), I.getOperand(1), - I.isExact(), DL, TLI, DT, AT)) + if (Value *V = SimplifyAShrInst(I.getOperand(0), I.getOperand(1), I.isExact(), + DL, TLI, DT, AC)) return ReplaceInstUsesWith(I, V); if (Instruction *R = 
commonShiftTransforms(I)) diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index ad6983a..c5603aa 100644 --- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -12,7 +12,7 @@ // //===----------------------------------------------------------------------===// -#include "InstCombine.h" +#include "InstCombineInternal.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PatternMatch.h" diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index cb16584..e07efb5 100644 --- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -12,7 +12,8 @@ // //===----------------------------------------------------------------------===// -#include "InstCombine.h" +#include "InstCombineInternal.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/IR/PatternMatch.h" using namespace llvm; using namespace PatternMatch; @@ -853,10 +854,32 @@ static void RecognizeIdentityMask(const SmallVectorImpl<int> &Mask, } } +// Returns true if the shuffle is extracting a contiguous range of values from +// LHS, for example: +// +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ +// Input: |AA|BB|CC|DD|EE|FF|GG|HH|II|JJ|KK|LL|MM|NN|OO|PP| +// Shuffles to: |EE|FF|GG|HH| +// +--+--+--+--+ +static bool isShuffleExtractingFromLHS(ShuffleVectorInst &SVI, + SmallVector<int, 16> &Mask) { + unsigned LHSElems = + cast<VectorType>(SVI.getOperand(0)->getType())->getNumElements(); + unsigned MaskElems = Mask.size(); + unsigned BegIdx = Mask.front(); + unsigned EndIdx = Mask.back(); + if (BegIdx > EndIdx || EndIdx >= LHSElems || EndIdx - BegIdx != MaskElems - 1) + return false; + for (unsigned I = 0; I != MaskElems; ++I) + if (static_cast<unsigned>(Mask[I]) != BegIdx + I) + return false; + return true; +} + Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { Value *LHS = SVI.getOperand(0); Value *RHS = SVI.getOperand(1); SmallVector<int, 16> Mask = SVI.getShuffleMask(); + Type *Int32Ty = Type::getInt32Ty(SVI.getContext()); bool MadeChange = false; @@ -892,18 +915,17 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { SmallVector<Constant*, 16> Elts; for (unsigned i = 0, e = LHSWidth; i != VWidth; ++i) { if (Mask[i] < 0) { - Elts.push_back(UndefValue::get(Type::getInt32Ty(SVI.getContext()))); + Elts.push_back(UndefValue::get(Int32Ty)); continue; } if ((Mask[i] >= (int)e && isa<UndefValue>(RHS)) || (Mask[i] < (int)e && isa<UndefValue>(LHS))) { Mask[i] = -1; // Turn into undef. - Elts.push_back(UndefValue::get(Type::getInt32Ty(SVI.getContext()))); + Elts.push_back(UndefValue::get(Int32Ty)); } else { Mask[i] = Mask[i] % e; // Force to LHS. - Elts.push_back(ConstantInt::get(Type::getInt32Ty(SVI.getContext()), - Mask[i])); + Elts.push_back(ConstantInt::get(Int32Ty, Mask[i])); } } SVI.setOperand(0, SVI.getOperand(1)); @@ -929,6 +951,96 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { return ReplaceInstUsesWith(SVI, V); } + // SROA generates shuffle+bitcast when the extracted sub-vector is bitcast to + // a non-vector type. 
We can instead bitcast the original vector followed by + // an extract of the desired element: + // + // %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, + // <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // %1 = bitcast <4 x i8> %sroa to i32 + // Becomes: + // %bc = bitcast <16 x i8> %in to <4 x i32> + // %ext = extractelement <4 x i32> %bc, i32 0 + // + // If the shuffle is extracting a contiguous range of values from the input + // vector then each use which is a bitcast of the extracted size can be + // replaced. This will work if the vector types are compatible, and the begin + // index is aligned to a value in the casted vector type. If the begin index + // isn't aligned then we can shuffle the original vector (keeping the same + // vector type) before extracting. + // + // This code will bail out if the target type is fundamentally incompatible + // with vectors of the source type. + // + // Example of <16 x i8>, target type i32: + // Index range [4,8): v-----------v Will work. + // +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + // <16 x i8>: | | | | | | | | | | | | | | | | | + // <4 x i32>: | | | | | + // +-----------+-----------+-----------+-----------+ + // Index range [6,10): ^-----------^ Needs an extra shuffle. + // Target type i40: ^--------------^ Won't work, bail. + if (isShuffleExtractingFromLHS(SVI, Mask)) { + Value *V = LHS; + unsigned MaskElems = Mask.size(); + unsigned BegIdx = Mask.front(); + VectorType *SrcTy = cast<VectorType>(V->getType()); + unsigned VecBitWidth = SrcTy->getBitWidth(); + unsigned SrcElemBitWidth = + SrcTy->getElementType()->getPrimitiveSizeInBits(); + assert(SrcElemBitWidth && "vector elements must have a bitwidth"); + unsigned SrcNumElems = SrcTy->getNumElements(); + SmallVector<BitCastInst *, 8> BCs; + DenseMap<Type *, Value *> NewBCs; + for (User *U : SVI.users()) + if (BitCastInst *BC = dyn_cast<BitCastInst>(U)) + if (!BC->use_empty()) + // Only visit bitcasts that weren't previously handled. + BCs.push_back(BC); + for (BitCastInst *BC : BCs) { + Type *TgtTy = BC->getDestTy(); + unsigned TgtElemBitWidth = TgtTy->getPrimitiveSizeInBits(); + if (!TgtElemBitWidth) + continue; + unsigned TgtNumElems = VecBitWidth / TgtElemBitWidth; + bool VecBitWidthsEqual = VecBitWidth == TgtNumElems * TgtElemBitWidth; + bool BegIsAligned = 0 == ((SrcElemBitWidth * BegIdx) % TgtElemBitWidth); + if (!VecBitWidthsEqual) + continue; + if (!VectorType::isValidElementType(TgtTy)) + continue; + VectorType *CastSrcTy = VectorType::get(TgtTy, TgtNumElems); + if (!BegIsAligned) { + // Shuffle the input so [0,NumElements) contains the output, and + // [NumElems,SrcNumElems) is undef. + SmallVector<Constant *, 16> ShuffleMask(SrcNumElems, + UndefValue::get(Int32Ty)); + for (unsigned I = 0, E = MaskElems, Idx = BegIdx; I != E; ++Idx, ++I) + ShuffleMask[I] = ConstantInt::get(Int32Ty, Idx); + V = Builder->CreateShuffleVector(V, UndefValue::get(V->getType()), + ConstantVector::get(ShuffleMask), + SVI.getName() + ".extract"); + BegIdx = 0; + } + unsigned SrcElemsPerTgtElem = TgtElemBitWidth / SrcElemBitWidth; + assert(SrcElemsPerTgtElem); + BegIdx /= SrcElemsPerTgtElem; + bool BCAlreadyExists = NewBCs.find(CastSrcTy) != NewBCs.end(); + auto *NewBC = + BCAlreadyExists + ? 
NewBCs[CastSrcTy] + : Builder->CreateBitCast(V, CastSrcTy, SVI.getName() + ".bc"); + if (!BCAlreadyExists) + NewBCs[CastSrcTy] = NewBC; + auto *Ext = Builder->CreateExtractElement( + NewBC, ConstantInt::get(Int32Ty, BegIdx), SVI.getName() + ".extract"); + // The shufflevector isn't being replaced: the bitcast that used it + // is. InstCombine will visit the newly-created instructions. + ReplaceInstUsesWith(*BC, Ext); + MadeChange = true; + } + } + // If the LHS is a shufflevector itself, see if we can combine it with this // one without producing an unusual shuffle. // Cases that might be simplified: @@ -1099,7 +1211,6 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { // or is a splat, do the replacement. if (isSplat || newMask == LHSMask || newMask == RHSMask || newMask == Mask) { SmallVector<Constant*, 16> Elts; - Type *Int32Ty = Type::getInt32Ty(SVI.getContext()); for (unsigned i = 0, e = newMask.size(); i != e; ++i) { if (newMask[i] < 0) { Elts.push_back(UndefValue::get(Int32Ty)); diff --git a/lib/Transforms/InstCombine/InstCombineWorklist.h b/lib/Transforms/InstCombine/InstCombineWorklist.h deleted file mode 100644 index 8d857d0..0000000 --- a/lib/Transforms/InstCombine/InstCombineWorklist.h +++ /dev/null @@ -1,107 +0,0 @@ -//===- InstCombineWorklist.h - Worklist for InstCombine pass ----*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEWORKLIST_H -#define LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEWORKLIST_H - -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/IR/Instruction.h" -#include "llvm/Support/Compiler.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" - -#define DEBUG_TYPE "instcombine" - -namespace llvm { - -/// InstCombineWorklist - This is the worklist management logic for -/// InstCombine. -class LLVM_LIBRARY_VISIBILITY InstCombineWorklist { - SmallVector<Instruction*, 256> Worklist; - DenseMap<Instruction*, unsigned> WorklistMap; - - void operator=(const InstCombineWorklist&RHS) LLVM_DELETED_FUNCTION; - InstCombineWorklist(const InstCombineWorklist&) LLVM_DELETED_FUNCTION; -public: - InstCombineWorklist() {} - - bool isEmpty() const { return Worklist.empty(); } - - /// Add - Add the specified instruction to the worklist if it isn't already - /// in it. - void Add(Instruction *I) { - if (WorklistMap.insert(std::make_pair(I, Worklist.size())).second) { - DEBUG(dbgs() << "IC: ADD: " << *I << '\n'); - Worklist.push_back(I); - } - } - - void AddValue(Value *V) { - if (Instruction *I = dyn_cast<Instruction>(V)) - Add(I); - } - - /// AddInitialGroup - Add the specified batch of stuff in reverse order. - /// which should only be done when the worklist is empty and when the group - /// has no duplicates. - void AddInitialGroup(Instruction *const *List, unsigned NumEntries) { - assert(Worklist.empty() && "Worklist must be empty to add initial group"); - Worklist.reserve(NumEntries+16); - WorklistMap.resize(NumEntries); - DEBUG(dbgs() << "IC: ADDING: " << NumEntries << " instrs to worklist\n"); - for (unsigned Idx = 0; NumEntries; --NumEntries) { - Instruction *I = List[NumEntries-1]; - WorklistMap.insert(std::make_pair(I, Idx++)); - Worklist.push_back(I); - } - } - - // Remove - remove I from the worklist if it exists. 
- void Remove(Instruction *I) { - DenseMap<Instruction*, unsigned>::iterator It = WorklistMap.find(I); - if (It == WorklistMap.end()) return; // Not in worklist. - - // Don't bother moving everything down, just null out the slot. - Worklist[It->second] = nullptr; - - WorklistMap.erase(It); - } - - Instruction *RemoveOne() { - Instruction *I = Worklist.pop_back_val(); - WorklistMap.erase(I); - return I; - } - - /// AddUsersToWorkList - When an instruction is simplified, add all users of - /// the instruction to the work lists because they might get more simplified - /// now. - /// - void AddUsersToWorkList(Instruction &I) { - for (User *U : I.users()) - Add(cast<Instruction>(U)); - } - - - /// Zap - check that the worklist is empty and nuke the backing store for - /// the map if it is large. - void Zap() { - assert(WorklistMap.empty() && "Worklist empty, but map not?"); - - // Do an explicit clear, this shrinks the map if needed. - WorklistMap.clear(); - } -}; - -} // end namespace llvm. - -#undef DEBUG_TYPE - -#endif diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index e4a4fef..88fcd53 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -33,18 +33,20 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" -#include "InstCombine.h" +#include "llvm/Transforms/InstCombine/InstCombine.h" +#include "InstCombineInternal.h" #include "llvm-c/Initialization.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringSwitch.h" -#include "llvm/Analysis/AssumptionTracker.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LibCallSemantics.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" #include "llvm/IR/DataLayout.h" @@ -55,7 +57,7 @@ #include "llvm/IR/ValueHandle.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" #include <algorithm> #include <climits> @@ -72,30 +74,6 @@ STATISTIC(NumExpand, "Number of expansions"); STATISTIC(NumFactor , "Number of factorizations"); STATISTIC(NumReassoc , "Number of reassociations"); -// Initialization Routines -void llvm::initializeInstCombine(PassRegistry &Registry) { - initializeInstCombinerPass(Registry); -} - -void LLVMInitializeInstCombine(LLVMPassRegistryRef R) { - initializeInstCombine(*unwrap(R)); -} - -char InstCombiner::ID = 0; -INITIALIZE_PASS_BEGIN(InstCombiner, "instcombine", - "Combine redundant instructions", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionTracker) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) -INITIALIZE_PASS_END(InstCombiner, "instcombine", - "Combine redundant instructions", false, false) - -void InstCombiner::getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesCFG(); - AU.addRequired<AssumptionTracker>(); - AU.addRequired<TargetLibraryInfo>(); -} - - Value *InstCombiner::EmitGEPOffset(User *GEP) { return llvm::EmitGEPOffset(Builder, *getDataLayout(), GEP); } @@ -796,8 +774,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { // If the 
incoming non-constant value is in I's block, we will remove one // instruction, but insert another equivalent one, leading to infinite // instcombine. - if (isPotentiallyReachable(I.getParent(), NonConstBB, DT, - getAnalysisIfAvailable<LoopInfo>())) + if (isPotentiallyReachable(I.getParent(), NonConstBB, DT, LI)) return nullptr; } @@ -1316,7 +1293,7 @@ Value *InstCombiner::SimplifyVectorOp(BinaryOperator &Inst) { Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { SmallVector<Value*, 8> Ops(GEP.op_begin(), GEP.op_end()); - if (Value *V = SimplifyGEPInst(Ops, DL, TLI, DT, AT)) + if (Value *V = SimplifyGEPInst(Ops, DL, TLI, DT, AC)) return ReplaceInstUsesWith(GEP, V); Value *PtrOp = GEP.getOperand(0); @@ -1414,8 +1391,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { if (DI == -1) { // All the GEPs feeding the PHI are identical. Clone one down into our // BB so that it can be merged with the current GEP. - GEP.getParent()->getInstList().insert(GEP.getParent()->getFirstNonPHI(), - NewGEP); + GEP.getParent()->getInstList().insert( + GEP.getParent()->getFirstInsertionPt(), NewGEP); } else { // All the GEPs feeding the PHI differ at a single offset. Clone a GEP // into the current block so it can be merged, and create a new PHI to @@ -1431,8 +1408,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { PN->getIncomingBlock(I)); NewGEP->setOperand(DI, NewPN); - GEP.getParent()->getInstList().insert(GEP.getParent()->getFirstNonPHI(), - NewGEP); + GEP.getParent()->getInstList().insert( + GEP.getParent()->getFirstInsertionPt(), NewGEP); NewGEP->setOperand(DI, NewPN); } @@ -2092,7 +2069,10 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) { // the largest legal integer type. We need to be conservative here since // x86 generates redundant zero-extenstion instructions if the operand is // truncated to i8 or i16. - if (BitWidth > NewWidth && NewWidth >= DL->getLargestLegalIntTypeSize()) { + bool TruncCond = false; + if (DL && BitWidth > NewWidth && + NewWidth >= DL->getLargestLegalIntTypeSize()) { + TruncCond = true; IntegerType *Ty = IntegerType::get(SI.getContext(), NewWidth); Builder->SetInsertPoint(&SI); Value *NewCond = Builder->CreateTrunc(SI.getCondition(), Ty, "trunc"); @@ -2111,8 +2091,12 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) { for (SwitchInst::CaseIt i = SI.case_begin(), e = SI.case_end(); i != e; ++i) { ConstantInt* CaseVal = i.getCaseValue(); - Constant* NewCaseVal = ConstantExpr::getSub(cast<Constant>(CaseVal), - AddRHS); + Constant *LHS = CaseVal; + if (TruncCond) + LHS = LeadingKnownZeros + ? ConstantExpr::getZExt(CaseVal, Cond->getType()) + : ConstantExpr::getSExt(CaseVal, Cond->getType()); + Constant* NewCaseVal = ConstantExpr::getSub(LHS, AddRHS); assert(isa<ConstantInt>(NewCaseVal) && "Result of expression should be constant"); i.setValue(cast<ConstantInt>(NewCaseVal)); @@ -2122,7 +2106,8 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) { return &SI; } } - return nullptr; + + return TruncCond ? &SI : nullptr; } Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { @@ -2275,41 +2260,27 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { return nullptr; } -enum Personality_Type { - Unknown_Personality, - GNU_Ada_Personality, - GNU_CXX_Personality, - GNU_ObjC_Personality -}; - -/// RecognizePersonality - See if the given exception handling personality -/// function is one that we understand. 
If so, return a description of it; -/// otherwise return Unknown_Personality. -static Personality_Type RecognizePersonality(Value *Pers) { - Function *F = dyn_cast<Function>(Pers->stripPointerCasts()); - if (!F) - return Unknown_Personality; - return StringSwitch<Personality_Type>(F->getName()) - .Case("__gnat_eh_personality", GNU_Ada_Personality) - .Case("__gxx_personality_v0", GNU_CXX_Personality) - .Case("__objc_personality_v0", GNU_ObjC_Personality) - .Default(Unknown_Personality); -} - /// isCatchAll - Return 'true' if the given typeinfo will match anything. -static bool isCatchAll(Personality_Type Personality, Constant *TypeInfo) { +static bool isCatchAll(EHPersonality Personality, Constant *TypeInfo) { switch (Personality) { - case Unknown_Personality: + case EHPersonality::GNU_C: + // The GCC C EH personality only exists to support cleanups, so it's not + // clear what the semantics of catch clauses are. return false; - case GNU_Ada_Personality: + case EHPersonality::Unknown: + return false; + case EHPersonality::GNU_Ada: // While __gnat_all_others_value will match any Ada exception, it doesn't // match foreign exceptions (or didn't, before gcc-4.7). return false; - case GNU_CXX_Personality: - case GNU_ObjC_Personality: + case EHPersonality::GNU_CXX: + case EHPersonality::GNU_ObjC: + case EHPersonality::MSVC_X86SEH: + case EHPersonality::MSVC_Win64SEH: + case EHPersonality::MSVC_CXX: return TypeInfo->isNullValue(); } - llvm_unreachable("Unknown personality!"); + llvm_unreachable("invalid enum"); } static bool shorter_filter(const Value *LHS, const Value *RHS) { @@ -2323,7 +2294,7 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) { // The logic here should be correct for any real-world personality function. // However if that turns out not to be true, the offending logic can always // be conditioned on the personality function, like the catch-all logic is. - Personality_Type Personality = RecognizePersonality(LI.getPersonalityFn()); + EHPersonality Personality = classifyEHPersonality(LI.getPersonalityFn()); // Simplify the list of clauses, eg by removing repeated catch clauses // (these are often created by inlining). @@ -2614,9 +2585,6 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) { return nullptr; } - - - /// TryToSinkInstruction - Try to move the specified instruction from its /// current block into the beginning of DestBlock, which can only happen if it's /// safe to move the instruction past all of the instructions between it and the @@ -2649,6 +2617,135 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) { return true; } +bool InstCombiner::run() { + while (!Worklist.isEmpty()) { + Instruction *I = Worklist.RemoveOne(); + if (I == nullptr) continue; // skip null values. + + // Check to see if we can DCE the instruction. + if (isInstructionTriviallyDead(I, TLI)) { + DEBUG(dbgs() << "IC: DCE: " << *I << '\n'); + EraseInstFromFunction(*I); + ++NumDeadInst; + MadeIRChange = true; + continue; + } + + // Instruction isn't dead, see if we can constant propagate it. + if (!I->use_empty() && isa<Constant>(I->getOperand(0))) + if (Constant *C = ConstantFoldInstruction(I, DL, TLI)) { + DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *I << '\n'); + + // Add operands to the worklist. + ReplaceInstUsesWith(*I, C); + ++NumConstProp; + EraseInstFromFunction(*I); + MadeIRChange = true; + continue; + } + + // See if we can trivially sink this instruction to a successor basic block. 
+ if (I->hasOneUse()) { + BasicBlock *BB = I->getParent(); + Instruction *UserInst = cast<Instruction>(*I->user_begin()); + BasicBlock *UserParent; + + // Get the block the use occurs in. + if (PHINode *PN = dyn_cast<PHINode>(UserInst)) + UserParent = PN->getIncomingBlock(*I->use_begin()); + else + UserParent = UserInst->getParent(); + + if (UserParent != BB) { + bool UserIsSuccessor = false; + // See if the user is one of our successors. + for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI) + if (*SI == UserParent) { + UserIsSuccessor = true; + break; + } + + // If the user is one of our immediate successors, and if that successor + // only has us as a predecessors (we'd have to split the critical edge + // otherwise), we can keep going. + if (UserIsSuccessor && UserParent->getSinglePredecessor()) { + // Okay, the CFG is simple enough, try to sink this instruction. + if (TryToSinkInstruction(I, UserParent)) { + MadeIRChange = true; + // We'll add uses of the sunk instruction below, but since sinking + // can expose opportunities for it's *operands* add them to the + // worklist + for (Use &U : I->operands()) + if (Instruction *OpI = dyn_cast<Instruction>(U.get())) + Worklist.Add(OpI); + } + } + } + } + + // Now that we have an instruction, try combining it to simplify it. + Builder->SetInsertPoint(I->getParent(), I); + Builder->SetCurrentDebugLocation(I->getDebugLoc()); + +#ifndef NDEBUG + std::string OrigI; +#endif + DEBUG(raw_string_ostream SS(OrigI); I->print(SS); OrigI = SS.str();); + DEBUG(dbgs() << "IC: Visiting: " << OrigI << '\n'); + + if (Instruction *Result = visit(*I)) { + ++NumCombined; + // Should we replace the old instruction with a new one? + if (Result != I) { + DEBUG(dbgs() << "IC: Old = " << *I << '\n' + << " New = " << *Result << '\n'); + + if (!I->getDebugLoc().isUnknown()) + Result->setDebugLoc(I->getDebugLoc()); + // Everything uses the new instruction now. + I->replaceAllUsesWith(Result); + + // Move the name to the new instruction first. + Result->takeName(I); + + // Push the new instruction and any users onto the worklist. + Worklist.Add(Result); + Worklist.AddUsersToWorkList(*Result); + + // Insert the new instruction into the basic block... + BasicBlock *InstParent = I->getParent(); + BasicBlock::iterator InsertPos = I; + + // If we replace a PHI with something that isn't a PHI, fix up the + // insertion point. + if (!isa<PHINode>(Result) && isa<PHINode>(InsertPos)) + InsertPos = InstParent->getFirstInsertionPt(); + + InstParent->getInstList().insert(InsertPos, Result); + + EraseInstFromFunction(*I); + } else { +#ifndef NDEBUG + DEBUG(dbgs() << "IC: Mod = " << OrigI << '\n' + << " New = " << *I << '\n'); +#endif + + // If the instruction was modified, it's possible that it is now dead. + // if so, remove it. + if (isInstructionTriviallyDead(I, TLI)) { + EraseInstFromFunction(*I); + } else { + Worklist.Add(I); + Worklist.AddUsersToWorkList(*I); + } + } + MadeIRChange = true; + } + } + + Worklist.Zap(); + return MadeIRChange; +} /// AddReachableCodeToWorklist - Walk the function in depth-first order, adding /// all reachable code to the worklist. 
@@ -2661,7 +2758,7 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) { /// static bool AddReachableCodeToWorklist(BasicBlock *BB, SmallPtrSetImpl<BasicBlock*> &Visited, - InstCombiner &IC, + InstCombineWorklist &ICWorklist, const DataLayout *DL, const TargetLibraryInfo *TLI) { bool MadeIRChange = false; @@ -2759,244 +2856,183 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, // of the function down. This jives well with the way that it adds all uses // of instructions to the worklist after doing a transformation, thus avoiding // some N^2 behavior in pathological cases. - IC.Worklist.AddInitialGroup(&InstrsForInstCombineWorklist[0], - InstrsForInstCombineWorklist.size()); + ICWorklist.AddInitialGroup(&InstrsForInstCombineWorklist[0], + InstrsForInstCombineWorklist.size()); return MadeIRChange; } -bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { - MadeIRChange = false; - - DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on " - << F.getName() << "\n"); - - { - // Do a depth-first traversal of the function, populate the worklist with - // the reachable instructions. Ignore blocks that are not reachable. Keep - // track of which blocks we visit. - SmallPtrSet<BasicBlock*, 64> Visited; - MadeIRChange |= AddReachableCodeToWorklist(F.begin(), Visited, *this, DL, - TLI); - - // Do a quick scan over the function. If we find any blocks that are - // unreachable, remove any instructions inside of them. This prevents - // the instcombine code from having to deal with some bad special cases. - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - if (Visited.count(BB)) continue; - - // Delete the instructions backwards, as it has a reduced likelihood of - // having to update as many def-use and use-def chains. - Instruction *EndInst = BB->getTerminator(); // Last not to be deleted. - while (EndInst != BB->begin()) { - // Delete the next to last instruction. - BasicBlock::iterator I = EndInst; - Instruction *Inst = --I; - if (!Inst->use_empty()) - Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); - if (isa<LandingPadInst>(Inst)) { - EndInst = Inst; - continue; - } - if (!isa<DbgInfoIntrinsic>(Inst)) { - ++NumDeadInst; - MadeIRChange = true; - } - Inst->eraseFromParent(); - } - } - } - - while (!Worklist.isEmpty()) { - Instruction *I = Worklist.RemoveOne(); - if (I == nullptr) continue; // skip null values. +/// \brief Populate the IC worklist from a function, and prune any dead basic +/// blocks discovered in the process. +/// +/// This also does basic constant propagation and other forward fixing to make +/// the combiner itself run much faster. +static bool prepareICWorklistFromFunction(Function &F, const DataLayout *DL, + TargetLibraryInfo *TLI, + InstCombineWorklist &ICWorklist) { + bool MadeIRChange = false; - // Check to see if we can DCE the instruction. - if (isInstructionTriviallyDead(I, TLI)) { - DEBUG(dbgs() << "IC: DCE: " << *I << '\n'); - EraseInstFromFunction(*I); - ++NumDeadInst; - MadeIRChange = true; + // Do a depth-first traversal of the function, populate the worklist with + // the reachable instructions. Ignore blocks that are not reachable. Keep + // track of which blocks we visit. + SmallPtrSet<BasicBlock *, 64> Visited; + MadeIRChange |= + AddReachableCodeToWorklist(F.begin(), Visited, ICWorklist, DL, TLI); + + // Do a quick scan over the function. If we find any blocks that are + // unreachable, remove any instructions inside of them. 
This prevents + // the instcombine code from having to deal with some bad special cases. + for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { + if (Visited.count(BB)) continue; - } - // Instruction isn't dead, see if we can constant propagate it. - if (!I->use_empty() && isa<Constant>(I->getOperand(0))) - if (Constant *C = ConstantFoldInstruction(I, DL, TLI)) { - DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *I << '\n'); - - // Add operands to the worklist. - ReplaceInstUsesWith(*I, C); - ++NumConstProp; - EraseInstFromFunction(*I); - MadeIRChange = true; + // Delete the instructions backwards, as it has a reduced likelihood of + // having to update as many def-use and use-def chains. + Instruction *EndInst = BB->getTerminator(); // Last not to be deleted. + while (EndInst != BB->begin()) { + // Delete the next to last instruction. + BasicBlock::iterator I = EndInst; + Instruction *Inst = --I; + if (!Inst->use_empty()) + Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); + if (isa<LandingPadInst>(Inst)) { + EndInst = Inst; continue; } - - // See if we can trivially sink this instruction to a successor basic block. - if (I->hasOneUse()) { - BasicBlock *BB = I->getParent(); - Instruction *UserInst = cast<Instruction>(*I->user_begin()); - BasicBlock *UserParent; - - // Get the block the use occurs in. - if (PHINode *PN = dyn_cast<PHINode>(UserInst)) - UserParent = PN->getIncomingBlock(*I->use_begin()); - else - UserParent = UserInst->getParent(); - - if (UserParent != BB) { - bool UserIsSuccessor = false; - // See if the user is one of our successors. - for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI) - if (*SI == UserParent) { - UserIsSuccessor = true; - break; - } - - // If the user is one of our immediate successors, and if that successor - // only has us as a predecessors (we'd have to split the critical edge - // otherwise), we can keep going. - if (UserIsSuccessor && UserParent->getSinglePredecessor()) { - // Okay, the CFG is simple enough, try to sink this instruction. - if (TryToSinkInstruction(I, UserParent)) { - MadeIRChange = true; - // We'll add uses of the sunk instruction below, but since sinking - // can expose opportunities for it's *operands* add them to the - // worklist - for (Use &U : I->operands()) - if (Instruction *OpI = dyn_cast<Instruction>(U.get())) - Worklist.Add(OpI); - } - } + if (!isa<DbgInfoIntrinsic>(Inst)) { + ++NumDeadInst; + MadeIRChange = true; } + Inst->eraseFromParent(); } + } - // Now that we have an instruction, try combining it to simplify it. - Builder->SetInsertPoint(I->getParent(), I); - Builder->SetCurrentDebugLocation(I->getDebugLoc()); + return MadeIRChange; +} -#ifndef NDEBUG - std::string OrigI; -#endif - DEBUG(raw_string_ostream SS(OrigI); I->print(SS); OrigI = SS.str();); - DEBUG(dbgs() << "IC: Visiting: " << OrigI << '\n'); +static bool combineInstructionsOverFunction( + Function &F, InstCombineWorklist &Worklist, AssumptionCache &AC, + TargetLibraryInfo &TLI, DominatorTree &DT, const DataLayout *DL = nullptr, + LoopInfo *LI = nullptr) { + // Minimizing size? + bool MinimizeSize = F.hasFnAttribute(Attribute::MinSize); - if (Instruction *Result = visit(*I)) { - ++NumCombined; - // Should we replace the old instruction with a new one? - if (Result != I) { - DEBUG(dbgs() << "IC: Old = " << *I << '\n' - << " New = " << *Result << '\n'); + /// Builder - This is an IRBuilder that automatically inserts new + /// instructions into the worklist when they are created. 
+ IRBuilder<true, TargetFolder, InstCombineIRInserter> Builder( + F.getContext(), TargetFolder(DL), InstCombineIRInserter(Worklist, &AC)); - if (!I->getDebugLoc().isUnknown()) - Result->setDebugLoc(I->getDebugLoc()); - // Everything uses the new instruction now. - I->replaceAllUsesWith(Result); + // Lower dbg.declare intrinsics otherwise their value may be clobbered + // by instcombiner. + bool DbgDeclaresChanged = LowerDbgDeclare(F); - // Move the name to the new instruction first. - Result->takeName(I); + // Iterate while there is work to do. + int Iteration = 0; + for (;;) { + ++Iteration; + DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on " + << F.getName() << "\n"); - // Push the new instruction and any users onto the worklist. - Worklist.Add(Result); - Worklist.AddUsersToWorkList(*Result); + bool Changed = false; + if (prepareICWorklistFromFunction(F, DL, &TLI, Worklist)) + Changed = true; - // Insert the new instruction into the basic block... - BasicBlock *InstParent = I->getParent(); - BasicBlock::iterator InsertPos = I; + InstCombiner IC(Worklist, &Builder, MinimizeSize, &AC, &TLI, &DT, DL, LI); + if (IC.run()) + Changed = true; - // If we replace a PHI with something that isn't a PHI, fix up the - // insertion point. - if (!isa<PHINode>(Result) && isa<PHINode>(InsertPos)) - InsertPos = InstParent->getFirstInsertionPt(); + if (!Changed) + break; + } - InstParent->getInstList().insert(InsertPos, Result); + return DbgDeclaresChanged || Iteration > 1; +} - EraseInstFromFunction(*I); - } else { -#ifndef NDEBUG - DEBUG(dbgs() << "IC: Mod = " << OrigI << '\n' - << " New = " << *I << '\n'); -#endif +PreservedAnalyses InstCombinePass::run(Function &F, + AnalysisManager<Function> *AM) { + auto *DL = F.getParent()->getDataLayout(); - // If the instruction was modified, it's possible that it is now dead. - // if so, remove it. - if (isInstructionTriviallyDead(I, TLI)) { - EraseInstFromFunction(*I); - } else { - Worklist.Add(I); - Worklist.AddUsersToWorkList(*I); - } - } - MadeIRChange = true; - } - } + auto &AC = AM->getResult<AssumptionAnalysis>(F); + auto &DT = AM->getResult<DominatorTreeAnalysis>(F); + auto &TLI = AM->getResult<TargetLibraryAnalysis>(F); - Worklist.Zap(); - return MadeIRChange; + auto *LI = AM->getCachedResult<LoopAnalysis>(F); + + if (!combineInstructionsOverFunction(F, Worklist, AC, TLI, DT, DL, LI)) + // No changes, all analyses are preserved. + return PreservedAnalyses::all(); + + // Mark all the analyses that instcombine updates as preserved. + // FIXME: Need a way to preserve CFG analyses here! + PreservedAnalyses PA; + PA.preserve<DominatorTreeAnalysis>(); + return PA; } namespace { -class InstCombinerLibCallSimplifier final : public LibCallSimplifier { - InstCombiner *IC; +/// \brief The legacy pass manager's instcombine pass. +/// +/// This is a basic whole-function wrapper around the instcombine utility. It +/// will try to combine all instructions in the function. +class InstructionCombiningPass : public FunctionPass { + InstCombineWorklist Worklist; + public: - InstCombinerLibCallSimplifier(const DataLayout *DL, - const TargetLibraryInfo *TLI, - InstCombiner *IC) - : LibCallSimplifier(DL, TLI) { - this->IC = IC; - } + static char ID; // Pass identification, replacement for typeid - /// replaceAllUsesWith - override so that instruction replacement - /// can be defined in terms of the instruction combiner framework. 
- void replaceAllUsesWith(Instruction *I, Value *With) const override { - IC->ReplaceInstUsesWith(*I, With); + InstructionCombiningPass() : FunctionPass(ID) { + initializeInstructionCombiningPassPass(*PassRegistry::getPassRegistry()); } + + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnFunction(Function &F) override; }; } -bool InstCombiner::runOnFunction(Function &F) { +void InstructionCombiningPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); +} + +bool InstructionCombiningPass::runOnFunction(Function &F) { if (skipOptnoneFunction(F)) return false; - AT = &getAnalysis<AssumptionTracker>(); - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : nullptr; - TLI = &getAnalysis<TargetLibraryInfo>(); + // Required analyses. + auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - DominatorTreeWrapperPass *DTWP = - getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - DT = DTWP ? &DTWP->getDomTree() : nullptr; - - // Minimizing size? - MinimizeSize = F.getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::MinSize); - - /// Builder - This is an IRBuilder that automatically inserts new - /// instructions into the worklist when they are created. - IRBuilder<true, TargetFolder, InstCombineIRInserter> - TheBuilder(F.getContext(), TargetFolder(DL), - InstCombineIRInserter(Worklist, AT)); - Builder = &TheBuilder; + // Optional analyses. + auto *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + auto *DL = DLP ? &DLP->getDataLayout() : nullptr; + auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>(); + auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; - InstCombinerLibCallSimplifier TheSimplifier(DL, TLI, this); - Simplifier = &TheSimplifier; + return combineInstructionsOverFunction(F, Worklist, AC, TLI, DT, DL, LI); +} - bool EverMadeChange = false; +char InstructionCombiningPass::ID = 0; +INITIALIZE_PASS_BEGIN(InstructionCombiningPass, "instcombine", + "Combine redundant instructions", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(InstructionCombiningPass, "instcombine", + "Combine redundant instructions", false, false) - // Lower dbg.declare intrinsics otherwise their value may be clobbered - // by instcombiner. - EverMadeChange = LowerDbgDeclare(F); - - // Iterate while there is work to do. 
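As a usage sketch for the new legacy wrapper pass (illustrative only, not part of the patch): existing clients keep creating the combiner through the factory function, which now returns an InstructionCombiningPass. The header choice below is an assumption about this tree; historically the factory has been declared in llvm/Transforms/Scalar.h.

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Scalar.h" // assumed home of the factory declaration

// Hypothetical driver: run instcombine over a module with the legacy pass
// manager; the analyses required by getAnalysisUsage above (assumption cache,
// TLI, dominator tree) are scheduled automatically by the pass manager.
static void runInstCombine(llvm::Module &M) {
  llvm::legacy::PassManager PM;
  PM.add(llvm::createInstructionCombiningPass());
  PM.run(M);
}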
- unsigned Iteration = 0; - while (DoOneIteration(F, Iteration++)) - EverMadeChange = true; +// Initialization Routines +void llvm::initializeInstCombine(PassRegistry &Registry) { + initializeInstructionCombiningPassPass(Registry); +} - Builder = nullptr; - return EverMadeChange; +void LLVMInitializeInstCombine(LLVMPassRegistryRef R) { + initializeInstructionCombiningPassPass(*unwrap(R)); } FunctionPass *llvm::createInstructionCombiningPass() { - return new InstCombiner(); + return new InstructionCombiningPass(); } diff --git a/lib/Transforms/InstCombine/LLVMBuild.txt b/lib/Transforms/InstCombine/LLVMBuild.txt index 62c6161..c26e0e3 100644 --- a/lib/Transforms/InstCombine/LLVMBuild.txt +++ b/lib/Transforms/InstCombine/LLVMBuild.txt @@ -19,4 +19,4 @@ type = Library name = InstCombine parent = Transforms -required_libraries = Analysis Core Support Target TransformUtils +required_libraries = Analysis Core Support TransformUtils diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 38f587f..882aab0 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -27,6 +27,7 @@ #include "llvm/IR/CallSite.h" #include "llvm/IR/DIBuilder.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InlineAsm.h" @@ -36,10 +37,12 @@ #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" +#include "llvm/MC/MCSectionMachO.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Endian.h" +#include "llvm/Support/SwapByteOrder.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/ASanStackFrameLayout.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -61,9 +64,11 @@ static const uint64_t kDefaultShadowOffset64 = 1ULL << 44; static const uint64_t kSmallX86_64ShadowOffset = 0x7FFF8000; // < 2G. static const uint64_t kPPC64_ShadowOffset64 = 1ULL << 41; static const uint64_t kMIPS32_ShadowOffset32 = 0x0aaa0000; -static const uint64_t kMIPS64_ShadowOffset64 = 1ULL << 36; +static const uint64_t kMIPS64_ShadowOffset64 = 1ULL << 37; +static const uint64_t kAArch64_ShadowOffset64 = 1ULL << 36; static const uint64_t kFreeBSD_ShadowOffset32 = 1ULL << 30; static const uint64_t kFreeBSD_ShadowOffset64 = 1ULL << 46; +static const uint64_t kWindowsShadowOffset32 = 3ULL << 28; static const size_t kMinStackMallocSize = 1 << 6; // 64B static const size_t kMaxStackMallocSize = 1 << 16; // 64K @@ -81,7 +86,7 @@ static const char *const kAsanUnregisterGlobalsName = "__asan_unregister_globals"; static const char *const kAsanPoisonGlobalsName = "__asan_before_dynamic_init"; static const char *const kAsanUnpoisonGlobalsName = "__asan_after_dynamic_init"; -static const char *const kAsanInitName = "__asan_init_v4"; +static const char *const kAsanInitName = "__asan_init_v5"; static const char *const kAsanPtrCmp = "__sanitizer_ptr_cmp"; static const char *const kAsanPtrSub = "__sanitizer_ptr_sub"; static const char *const kAsanHandleNoReturnName = "__asan_handle_no_return"; @@ -105,6 +110,12 @@ static const int kAsanStackAfterReturnMagic = 0xf5; // Accesses sizes are powers of two: 1, 2, 4, 8, 16. 
static const size_t kNumberOfAccessSizes = 5; +static const unsigned kAllocaRzSize = 32; +static const unsigned kAsanAllocaLeftMagic = 0xcacacacaU; +static const unsigned kAsanAllocaRightMagic = 0xcbcbcbcbU; +static const unsigned kAsanAllocaPartialVal1 = 0xcbcbcb00U; +static const unsigned kAsanAllocaPartialVal2 = 0x000000cbU; + // Command-line flags. // This flag may need to be replaced with -f[no-]asan-reads. @@ -152,19 +163,8 @@ static cl::opt<std::string> ClMemoryAccessCallbackPrefix( "asan-memory-access-callback-prefix", cl::desc("Prefix for memory access callbacks"), cl::Hidden, cl::init("__asan_")); - -// This is an experimental feature that will allow to choose between -// instrumented and non-instrumented code at link-time. -// If this option is on, just before instrumenting a function we create its -// clone; if the function is not changed by asan the clone is deleted. -// If we end up with a clone, we put the instrumented function into a section -// called "ASAN" and the uninstrumented function into a section called "NOASAN". -// -// This is still a prototype, we need to figure out a way to keep two copies of -// a function so that the linker can easily choose one of them. -static cl::opt<bool> ClKeepUninstrumented("asan-keep-uninstrumented-functions", - cl::desc("Keep uninstrumented copies of functions"), - cl::Hidden, cl::init(false)); +static cl::opt<bool> ClInstrumentAllocas("asan-instrument-allocas", + cl::desc("instrument dynamic allocas"), cl::Hidden, cl::init(false)); // These flags allow to change the shadow mapping. // The shadow mapping looks like @@ -186,6 +186,11 @@ static cl::opt<bool> ClCheckLifetime("asan-check-lifetime", cl::desc("Use llvm.lifetime intrinsics to insert extra checks"), cl::Hidden, cl::init(false)); +static cl::opt<bool> ClDynamicAllocaStack( + "asan-stack-dynamic-alloca", + cl::desc("Use dynamic alloca to represent stack variables"), cl::Hidden, + cl::init(true)); + // Debug flags. static cl::opt<int> ClDebug("asan-debug", cl::desc("debug"), cl::Hidden, cl::init(0)); @@ -200,6 +205,8 @@ static cl::opt<int> ClDebugMax("asan-debug-max", cl::desc("Debug man inst"), STATISTIC(NumInstrumentedReads, "Number of instrumented reads"); STATISTIC(NumInstrumentedWrites, "Number of instrumented writes"); +STATISTIC(NumInstrumentedDynamicAllocas, + "Number of instrumented dynamic allocas"); STATISTIC(NumOptimizedAccessesToGlobalArray, "Number of optimized accesses to global arrays"); STATISTIC(NumOptimizedAccessesToGlobalVar, @@ -220,8 +227,10 @@ struct LocationMetadata { assert(MDN->getNumOperands() == 3); MDString *MDFilename = cast<MDString>(MDN->getOperand(0)); Filename = MDFilename->getString(); - LineNo = cast<ConstantInt>(MDN->getOperand(1))->getLimitedValue(); - ColumnNo = cast<ConstantInt>(MDN->getOperand(2))->getLimitedValue(); + LineNo = + mdconst::extract<ConstantInt>(MDN->getOperand(1))->getLimitedValue(); + ColumnNo = + mdconst::extract<ConstantInt>(MDN->getOperand(2))->getLimitedValue(); } }; @@ -249,23 +258,22 @@ class GlobalsMetadata { for (auto MDN : Globals->operands()) { // Metadata node contains the global and the fields of "Entry". assert(MDN->getNumOperands() == 5); - Value *V = MDN->getOperand(0); + auto *GV = mdconst::extract_or_null<GlobalVariable>(MDN->getOperand(0)); // The optimizer may optimize away a global entirely. - if (!V) + if (!GV) continue; - GlobalVariable *GV = cast<GlobalVariable>(V); // We can already have an entry for GV if it was merged with another // global. 
Entry &E = Entries[GV]; - if (Value *Loc = MDN->getOperand(1)) - E.SourceLoc.parse(cast<MDNode>(Loc)); - if (Value *Name = MDN->getOperand(2)) { - MDString *MDName = cast<MDString>(Name); - E.Name = MDName->getString(); - } - ConstantInt *IsDynInit = cast<ConstantInt>(MDN->getOperand(3)); + if (auto *Loc = cast_or_null<MDNode>(MDN->getOperand(1))) + E.SourceLoc.parse(Loc); + if (auto *Name = cast_or_null<MDString>(MDN->getOperand(2))) + E.Name = Name->getString(); + ConstantInt *IsDynInit = + mdconst::extract<ConstantInt>(MDN->getOperand(3)); E.IsDynInit |= IsDynInit->isOne(); - ConstantInt *IsBlacklisted = cast<ConstantInt>(MDN->getOperand(4)); + ConstantInt *IsBlacklisted = + mdconst::extract<ConstantInt>(MDN->getOperand(4)); E.IsBlacklisted |= IsBlacklisted->isOne(); } } @@ -289,12 +297,11 @@ struct ShadowMapping { bool OrShadowOffset; }; -static ShadowMapping getShadowMapping(const Module &M, int LongSize) { - llvm::Triple TargetTriple(M.getTargetTriple()); +static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize) { bool IsAndroid = TargetTriple.getEnvironment() == llvm::Triple::Android; bool IsIOS = TargetTriple.isiOS(); - bool IsFreeBSD = TargetTriple.getOS() == llvm::Triple::FreeBSD; - bool IsLinux = TargetTriple.getOS() == llvm::Triple::Linux; + bool IsFreeBSD = TargetTriple.isOSFreeBSD(); + bool IsLinux = TargetTriple.isOSLinux(); bool IsPPC64 = TargetTriple.getArch() == llvm::Triple::ppc64 || TargetTriple.getArch() == llvm::Triple::ppc64le; bool IsX86_64 = TargetTriple.getArch() == llvm::Triple::x86_64; @@ -302,6 +309,8 @@ static ShadowMapping getShadowMapping(const Module &M, int LongSize) { TargetTriple.getArch() == llvm::Triple::mipsel; bool IsMIPS64 = TargetTriple.getArch() == llvm::Triple::mips64 || TargetTriple.getArch() == llvm::Triple::mips64el; + bool IsAArch64 = TargetTriple.getArch() == llvm::Triple::aarch64; + bool IsWindows = TargetTriple.isOSWindows(); ShadowMapping Mapping; @@ -314,6 +323,8 @@ static ShadowMapping getShadowMapping(const Module &M, int LongSize) { Mapping.Offset = kFreeBSD_ShadowOffset32; else if (IsIOS) Mapping.Offset = kIOSShadowOffset32; + else if (IsWindows) + Mapping.Offset = kWindowsShadowOffset32; else Mapping.Offset = kDefaultShadowOffset32; } else { // LongSize == 64 @@ -325,6 +336,8 @@ static ShadowMapping getShadowMapping(const Module &M, int LongSize) { Mapping.Offset = kSmallX86_64ShadowOffset; else if (IsMIPS64) Mapping.Offset = kMIPS64_ShadowOffset64; + else if (IsAArch64) + Mapping.Offset = kAArch64_ShadowOffset64; else Mapping.Offset = kDefaultShadowOffset64; } @@ -350,10 +363,15 @@ static size_t RedzoneSizeForScale(int MappingScale) { /// AddressSanitizer: instrument the code in module to find memory bugs. 
struct AddressSanitizer : public FunctionPass { - AddressSanitizer() : FunctionPass(ID) {} + AddressSanitizer() : FunctionPass(ID) { + initializeAddressSanitizerPass(*PassRegistry::getPassRegistry()); + } const char *getPassName() const override { return "AddressSanitizerFunctionPass"; } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DominatorTreeWrapperPass>(); + } void instrumentMop(Instruction *I, bool UseCalls); void instrumentPointerComparisonOrSubtraction(Instruction *I); void instrumentAddress(Instruction *OrigIns, Instruction *InsertBefore, @@ -371,6 +389,8 @@ struct AddressSanitizer : public FunctionPass { bool doInitialization(Module &M) override; static char ID; // Pass identification, replacement for typeid + DominatorTree &getDominatorTree() const { return *DT; } + private: void initializeCallbacks(Module &M); @@ -379,9 +399,11 @@ struct AddressSanitizer : public FunctionPass { LLVMContext *C; const DataLayout *DL; + Triple TargetTriple; int LongSize; Type *IntptrTy; ShadowMapping Mapping; + DominatorTree *DT; Function *AsanCtorFunction; Function *AsanInitFunction; Function *AsanHandleNoReturnFunc; @@ -423,6 +445,7 @@ class AddressSanitizerModule : public ModulePass { Type *IntptrTy; LLVMContext *C; const DataLayout *DL; + Triple TargetTriple; ShadowMapping Mapping; Function *AsanPoisonGlobals; Function *AsanUnpoisonGlobals; @@ -465,15 +488,36 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { }; SmallVector<AllocaPoisonCall, 8> AllocaPoisonCallVec; + // Stores left and right redzone shadow addresses for dynamic alloca + // and pointer to alloca instruction itself. + // LeftRzAddr is a shadow address for alloca left redzone. + // RightRzAddr is a shadow address for alloca right redzone. + struct DynamicAllocaCall { + AllocaInst *AI; + Value *LeftRzAddr; + Value *RightRzAddr; + bool Poison; + explicit DynamicAllocaCall(AllocaInst *AI, + Value *LeftRzAddr = nullptr, + Value *RightRzAddr = nullptr) + : AI(AI), LeftRzAddr(LeftRzAddr), RightRzAddr(RightRzAddr), Poison(true) + {} + }; + SmallVector<DynamicAllocaCall, 1> DynamicAllocaVec; + // Maps Value to an AllocaInst from which the Value is originated. typedef DenseMap<Value*, AllocaInst*> AllocaForValueMapTy; AllocaForValueMapTy AllocaForValue; + bool HasNonEmptyInlineAsm; + std::unique_ptr<CallInst> EmptyInlineAsm; + FunctionStackPoisoner(Function &F, AddressSanitizer &ASan) - : F(F), ASan(ASan), DIB(*F.getParent()), C(ASan.C), - IntptrTy(ASan.IntptrTy), IntptrPtrTy(PointerType::get(IntptrTy, 0)), - Mapping(ASan.Mapping), - StackAlignment(1 << Mapping.Scale) {} + : F(F), ASan(ASan), DIB(*F.getParent(), /*AllowUnresolved*/ false), + C(ASan.C), IntptrTy(ASan.IntptrTy), + IntptrPtrTy(PointerType::get(IntptrTy, 0)), Mapping(ASan.Mapping), + StackAlignment(1 << Mapping.Scale), HasNonEmptyInlineAsm(false), + EmptyInlineAsm(CallInst::Create(ASan.EmptyAsm)) {} bool runOnFunction() { if (!ClStack) return false; @@ -481,7 +525,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { for (BasicBlock *BB : depth_first(&F.getEntryBlock())) visit(*BB); - if (AllocaVec.empty()) return false; + if (AllocaVec.empty() && DynamicAllocaVec.empty()) return false; initializeCallbacks(*F.getParent()); @@ -493,7 +537,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { return true; } - // Finds all static Alloca instructions and puts + // Finds all Alloca instructions and puts // poisoned red zones around all of them. 
// Then unpoison everything back before the function returns. void poisonStack(); @@ -504,12 +548,64 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { RetVec.push_back(&RI); } + // Unpoison dynamic allocas redzones. + void unpoisonDynamicAlloca(DynamicAllocaCall &AllocaCall) { + if (!AllocaCall.Poison) + return; + for (auto Ret : RetVec) { + IRBuilder<> IRBRet(Ret); + PointerType *Int32PtrTy = PointerType::getUnqual(IRBRet.getInt32Ty()); + Value *Zero = Constant::getNullValue(IRBRet.getInt32Ty()); + Value *PartialRzAddr = IRBRet.CreateSub(AllocaCall.RightRzAddr, + ConstantInt::get(IntptrTy, 4)); + IRBRet.CreateStore(Zero, IRBRet.CreateIntToPtr(AllocaCall.LeftRzAddr, + Int32PtrTy)); + IRBRet.CreateStore(Zero, IRBRet.CreateIntToPtr(PartialRzAddr, + Int32PtrTy)); + IRBRet.CreateStore(Zero, IRBRet.CreateIntToPtr(AllocaCall.RightRzAddr, + Int32PtrTy)); + } + } + + // Right shift for BigEndian and left shift for LittleEndian. + Value *shiftAllocaMagic(Value *Val, IRBuilder<> &IRB, Value *Shift) { + return ASan.DL->isLittleEndian() ? IRB.CreateShl(Val, Shift) + : IRB.CreateLShr(Val, Shift); + } + + // Compute PartialRzMagic for dynamic alloca call. Since we don't know the + // size of requested memory until runtime, we should compute it dynamically. + // If PartialSize is 0, PartialRzMagic would contain kAsanAllocaRightMagic, + // otherwise it would contain the value that we will use to poison the + // partial redzone for alloca call. + Value *computePartialRzMagic(Value *PartialSize, IRBuilder<> &IRB); + + // Deploy and poison redzones around dynamic alloca call. To do this, we + // should replace this call with another one with changed parameters and + // replace all its uses with new address, so + // addr = alloca type, old_size, align + // is replaced by + // new_size = (old_size + additional_size) * sizeof(type) + // tmp = alloca i8, new_size, max(align, 32) + // addr = tmp + 32 (first 32 bytes are for the left redzone). + // Additional_size is added to make new memory allocation contain not only + // requested memory, but also left, partial and right redzones. + // After that, we should poison redzones: + // (1) Left redzone with kAsanAllocaLeftMagic. + // (2) Partial redzone with the value, computed in runtime by + // computePartialRzMagic function. + // (3) Right redzone with kAsanAllocaRightMagic. + void handleDynamicAllocaCall(DynamicAllocaCall &AllocaCall); + /// \brief Collect Alloca instructions we want (and can) handle. void visitAllocaInst(AllocaInst &AI) { if (!isInterestingAlloca(AI)) return; StackAlignment = std::max(StackAlignment, AI.getAlignment()); - AllocaVec.push_back(&AI); + if (isDynamicAlloca(AI)) + DynamicAllocaVec.push_back(DynamicAllocaCall(&AI)); + else + AllocaVec.push_back(&AI); } /// \brief Collect lifetime intrinsic calls to check for use-after-scope @@ -538,13 +634,29 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { AllocaPoisonCallVec.push_back(APC); } + void visitCallInst(CallInst &CI) { + HasNonEmptyInlineAsm |= + CI.isInlineAsm() && !CI.isIdenticalTo(EmptyInlineAsm.get()); + } + // ---------------------- Helpers. void initializeCallbacks(Module &M); + bool doesDominateAllExits(const Instruction *I) const { + for (auto Ret : RetVec) { + if (!ASan.getDominatorTree().dominates(I, Ret)) + return false; + } + return true; + } + + bool isDynamicAlloca(AllocaInst &AI) const { + return AI.isArrayAllocation() || !AI.isStaticAlloca(); + } + // Check if we want (and can) handle this alloca. 
bool isInterestingAlloca(AllocaInst &AI) const { - return (!AI.isArrayAllocation() && AI.isStaticAlloca() && - AI.getAllocatedType()->isSized() && + return (AI.getAllocatedType()->isSized() && // alloca() may be called with 0 size, ignore it. getAllocaSizeInBytes(&AI) > 0); } @@ -562,12 +674,20 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { void SetShadowToStackAfterReturnInlined(IRBuilder<> &IRB, Value *ShadowBase, int Size); + Value *createAllocaForLayout(IRBuilder<> &IRB, const ASanStackFrameLayout &L, + bool Dynamic); + PHINode *createPHI(IRBuilder<> &IRB, Value *Cond, Value *ValueIfTrue, + Instruction *ThenTerm, Value *ValueIfFalse); }; } // namespace char AddressSanitizer::ID = 0; -INITIALIZE_PASS(AddressSanitizer, "asan", +INITIALIZE_PASS_BEGIN(AddressSanitizer, "asan", + "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", + false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(AddressSanitizer, "asan", "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false, false) FunctionPass *llvm::createAddressSanitizerFunctionPass() { @@ -951,37 +1071,47 @@ bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) { if (G->hasSection()) { StringRef Section(G->getSection()); - // Ignore the globals from the __OBJC section. The ObjC runtime assumes - // those conform to /usr/lib/objc/runtime.h, so we can't add redzones to - // them. - if (Section.startswith("__OBJC,") || - Section.startswith("__DATA, __objc_")) { - DEBUG(dbgs() << "Ignoring ObjC runtime global: " << *G << "\n"); - return false; - } - // See http://code.google.com/p/address-sanitizer/issues/detail?id=32 - // Constant CFString instances are compiled in the following way: - // -- the string buffer is emitted into - // __TEXT,__cstring,cstring_literals - // -- the constant NSConstantString structure referencing that buffer - // is placed into __DATA,__cfstring - // Therefore there's no point in placing redzones into __DATA,__cfstring. - // Moreover, it causes the linker to crash on OS X 10.7 - if (Section.startswith("__DATA,__cfstring")) { - DEBUG(dbgs() << "Ignoring CFString: " << *G << "\n"); - return false; - } - // The linker merges the contents of cstring_literals and removes the - // trailing zeroes. - if (Section.startswith("__TEXT,__cstring,cstring_literals")) { - DEBUG(dbgs() << "Ignoring a cstring literal: " << *G << "\n"); - return false; - } - if (Section.startswith("__TEXT,__objc_methname,cstring_literals")) { - DEBUG(dbgs() << "Ignoring objc_methname cstring global: " << *G << "\n"); - return false; - } + if (TargetTriple.isOSBinFormatMachO()) { + StringRef ParsedSegment, ParsedSection; + unsigned TAA = 0, StubSize = 0; + bool TAAParsed; + std::string ErrorCode = + MCSectionMachO::ParseSectionSpecifier(Section, ParsedSegment, + ParsedSection, TAA, TAAParsed, + StubSize); + if (!ErrorCode.empty()) { + report_fatal_error("Invalid section specifier '" + ParsedSection + + "': " + ErrorCode + "."); + } + + // Ignore the globals from the __OBJC section. The ObjC runtime assumes + // those conform to /usr/lib/objc/runtime.h, so we can't add redzones to + // them. 
+ if (ParsedSegment == "__OBJC" || + (ParsedSegment == "__DATA" && ParsedSection.startswith("__objc_"))) { + DEBUG(dbgs() << "Ignoring ObjC runtime global: " << *G << "\n"); + return false; + } + // See http://code.google.com/p/address-sanitizer/issues/detail?id=32 + // Constant CFString instances are compiled in the following way: + // -- the string buffer is emitted into + // __TEXT,__cstring,cstring_literals + // -- the constant NSConstantString structure referencing that buffer + // is placed into __DATA,__cfstring + // Therefore there's no point in placing redzones into __DATA,__cfstring. + // Moreover, it causes the linker to crash on OS X 10.7 + if (ParsedSegment == "__DATA" && ParsedSection == "__cfstring") { + DEBUG(dbgs() << "Ignoring CFString: " << *G << "\n"); + return false; + } + // The linker merges the contents of cstring_literals and removes the + // trailing zeroes. + if (ParsedSegment == "__TEXT" && (TAA & MachO::S_CSTRING_LITERALS)) { + DEBUG(dbgs() << "Ignoring a cstring literal: " << *G << "\n"); + return false; + } + } // Callbacks put into the CRT initializer/terminator sections // should not be instrumented. @@ -1165,7 +1295,8 @@ bool AddressSanitizerModule::runOnModule(Module &M) { C = &(M.getContext()); int LongSize = DL->getPointerSizeInBits(); IntptrTy = Type::getIntNTy(*C, LongSize); - Mapping = getShadowMapping(M, LongSize); + TargetTriple = Triple(M.getTargetTriple()); + Mapping = getShadowMapping(TargetTriple, LongSize); initializeCallbacks(M); bool Changed = false; @@ -1247,6 +1378,7 @@ bool AddressSanitizer::doInitialization(Module &M) { C = &(M.getContext()); LongSize = DL->getPointerSizeInBits(); IntptrTy = Type::getIntNTy(*C, LongSize); + TargetTriple = Triple(M.getTargetTriple()); AsanCtorFunction = Function::Create( FunctionType::get(Type::getVoidTy(*C), false), @@ -1259,7 +1391,7 @@ bool AddressSanitizer::doInitialization(Module &M) { AsanInitFunction->setLinkage(Function::ExternalLinkage); IRB.CreateCall(AsanInitFunction); - Mapping = getShadowMapping(M, LongSize); + Mapping = getShadowMapping(TargetTriple, LongSize); appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndDtorPriority); return true; @@ -1287,6 +1419,8 @@ bool AddressSanitizer::runOnFunction(Function &F) { DEBUG(dbgs() << "ASAN instrumenting:\n" << F << "\n"); initializeCallbacks(*F.getParent()); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + // If needed, insert __asan_init before checking for SanitizeAddress attr. maybeInsertAsanInitAtFunctionEntry(F); @@ -1345,17 +1479,6 @@ bool AddressSanitizer::runOnFunction(Function &F) { } } - Function *UninstrumentedDuplicate = nullptr; - bool LikelyToInstrument = - !NoReturnCalls.empty() || !ToInstrument.empty() || (NumAllocas > 0); - if (ClKeepUninstrumented && LikelyToInstrument) { - ValueToValueMapTy VMap; - UninstrumentedDuplicate = CloneFunction(&F, VMap, false); - UninstrumentedDuplicate->removeFnAttr(Attribute::SanitizeAddress); - UninstrumentedDuplicate->setName("NOASAN_" + F.getName()); - F.getParent()->getFunctionList().push_back(UninstrumentedDuplicate); - } - bool UseCalls = false; if (ClInstrumentationWithCallsThreshold >= 0 && ToInstrument.size() > (unsigned)ClInstrumentationWithCallsThreshold) @@ -1393,20 +1516,6 @@ bool AddressSanitizer::runOnFunction(Function &F) { DEBUG(dbgs() << "ASAN done instrumenting: " << res << " " << F << "\n"); - if (ClKeepUninstrumented) { - if (!res) { - // No instrumentation is done, no need for the duplicate. 
- if (UninstrumentedDuplicate) - UninstrumentedDuplicate->eraseFromParent(); - } else { - // The function was instrumented. We must have the duplicate. - assert(UninstrumentedDuplicate); - UninstrumentedDuplicate->setSection("NOASAN"); - assert(!F.hasSection()); - F.setSection("ASAN"); - } - } - return res; } @@ -1426,12 +1535,11 @@ void FunctionStackPoisoner::initializeCallbacks(Module &M) { IRBuilder<> IRB(*C); for (int i = 0; i <= kMaxAsanStackMallocSizeClass; i++) { std::string Suffix = itostr(i); - AsanStackMallocFunc[i] = checkInterfaceFunction( - M.getOrInsertFunction(kAsanStackMallocNameTemplate + Suffix, IntptrTy, - IntptrTy, IntptrTy, nullptr)); - AsanStackFreeFunc[i] = checkInterfaceFunction(M.getOrInsertFunction( - kAsanStackFreeNameTemplate + Suffix, IRB.getVoidTy(), IntptrTy, - IntptrTy, IntptrTy, nullptr)); + AsanStackMallocFunc[i] = checkInterfaceFunction(M.getOrInsertFunction( + kAsanStackMallocNameTemplate + Suffix, IntptrTy, IntptrTy, nullptr)); + AsanStackFreeFunc[i] = checkInterfaceFunction( + M.getOrInsertFunction(kAsanStackFreeNameTemplate + Suffix, + IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr)); } AsanPoisonStackMemoryFunc = checkInterfaceFunction( M.getOrInsertFunction(kAsanPoisonStackMemoryName, IRB.getVoidTy(), @@ -1503,11 +1611,52 @@ static DebugLoc getFunctionEntryDebugLocation(Function &F) { return DebugLoc(); } +PHINode *FunctionStackPoisoner::createPHI(IRBuilder<> &IRB, Value *Cond, + Value *ValueIfTrue, + Instruction *ThenTerm, + Value *ValueIfFalse) { + PHINode *PHI = IRB.CreatePHI(IntptrTy, 2); + BasicBlock *CondBlock = cast<Instruction>(Cond)->getParent(); + PHI->addIncoming(ValueIfFalse, CondBlock); + BasicBlock *ThenBlock = ThenTerm->getParent(); + PHI->addIncoming(ValueIfTrue, ThenBlock); + return PHI; +} + +Value *FunctionStackPoisoner::createAllocaForLayout( + IRBuilder<> &IRB, const ASanStackFrameLayout &L, bool Dynamic) { + AllocaInst *Alloca; + if (Dynamic) { + Alloca = IRB.CreateAlloca(IRB.getInt8Ty(), + ConstantInt::get(IRB.getInt64Ty(), L.FrameSize), + "MyAlloca"); + } else { + Alloca = IRB.CreateAlloca(ArrayType::get(IRB.getInt8Ty(), L.FrameSize), + nullptr, "MyAlloca"); + assert(Alloca->isStaticAlloca()); + } + assert((ClRealignStack & (ClRealignStack - 1)) == 0); + size_t FrameAlignment = std::max(L.FrameAlignment, (size_t)ClRealignStack); + Alloca->setAlignment(FrameAlignment); + return IRB.CreatePointerCast(Alloca, IntptrTy); +} + void FunctionStackPoisoner::poisonStack() { + assert(AllocaVec.size() > 0 || DynamicAllocaVec.size() > 0); + + if (ClInstrumentAllocas) { + // Handle dynamic allocas. + for (auto &AllocaCall : DynamicAllocaVec) { + handleDynamicAllocaCall(AllocaCall); + unpoisonDynamicAlloca(AllocaCall); + } + } + + if (AllocaVec.size() == 0) return; + int StackMallocIdx = -1; DebugLoc EntryDebugLocation = getFunctionEntryDebugLocation(F); - assert(AllocaVec.size() > 0); Instruction *InsBefore = AllocaVec[0]; IRBuilder<> IRB(InsBefore); IRB.SetCurrentDebugLocation(EntryDebugLocation); @@ -1529,42 +1678,56 @@ void FunctionStackPoisoner::poisonStack() { uint64_t LocalStackSize = L.FrameSize; bool DoStackMalloc = ClUseAfterReturn && LocalStackSize <= kMaxStackMallocSize; + // Don't do dynamic alloca in presence of inline asm: too often it + // makes assumptions on which registers are available. 
+ bool DoDynamicAlloca = ClDynamicAllocaStack && !HasNonEmptyInlineAsm; - Type *ByteArrayTy = ArrayType::get(IRB.getInt8Ty(), LocalStackSize); - AllocaInst *MyAlloca = - new AllocaInst(ByteArrayTy, "MyAlloca", InsBefore); - MyAlloca->setDebugLoc(EntryDebugLocation); - assert((ClRealignStack & (ClRealignStack - 1)) == 0); - size_t FrameAlignment = std::max(L.FrameAlignment, (size_t)ClRealignStack); - MyAlloca->setAlignment(FrameAlignment); - assert(MyAlloca->isStaticAlloca()); - Value *OrigStackBase = IRB.CreatePointerCast(MyAlloca, IntptrTy); - Value *LocalStackBase = OrigStackBase; + Value *StaticAlloca = + DoDynamicAlloca ? nullptr : createAllocaForLayout(IRB, L, false); + + Value *FakeStack; + Value *LocalStackBase; if (DoStackMalloc) { - // LocalStackBase = OrigStackBase - // if (__asan_option_detect_stack_use_after_return) - // LocalStackBase = __asan_stack_malloc_N(LocalStackBase, OrigStackBase); - StackMallocIdx = StackMallocSizeClass(LocalStackSize); - assert(StackMallocIdx <= kMaxAsanStackMallocSizeClass); + // void *FakeStack = __asan_option_detect_stack_use_after_return + // ? __asan_stack_malloc_N(LocalStackSize) + // : nullptr; + // void *LocalStackBase = (FakeStack) ? FakeStack : alloca(LocalStackSize); Constant *OptionDetectUAR = F.getParent()->getOrInsertGlobal( kAsanOptionDetectUAR, IRB.getInt32Ty()); - Value *Cmp = IRB.CreateICmpNE(IRB.CreateLoad(OptionDetectUAR), - Constant::getNullValue(IRB.getInt32Ty())); - Instruction *Term = SplitBlockAndInsertIfThen(Cmp, InsBefore, false); - BasicBlock *CmpBlock = cast<Instruction>(Cmp)->getParent(); + Value *UARIsEnabled = + IRB.CreateICmpNE(IRB.CreateLoad(OptionDetectUAR), + Constant::getNullValue(IRB.getInt32Ty())); + Instruction *Term = + SplitBlockAndInsertIfThen(UARIsEnabled, InsBefore, false); IRBuilder<> IRBIf(Term); IRBIf.SetCurrentDebugLocation(EntryDebugLocation); - LocalStackBase = IRBIf.CreateCall2( - AsanStackMallocFunc[StackMallocIdx], - ConstantInt::get(IntptrTy, LocalStackSize), OrigStackBase); - BasicBlock *SetBlock = cast<Instruction>(LocalStackBase)->getParent(); + StackMallocIdx = StackMallocSizeClass(LocalStackSize); + assert(StackMallocIdx <= kMaxAsanStackMallocSizeClass); + Value *FakeStackValue = + IRBIf.CreateCall(AsanStackMallocFunc[StackMallocIdx], + ConstantInt::get(IntptrTy, LocalStackSize)); IRB.SetInsertPoint(InsBefore); IRB.SetCurrentDebugLocation(EntryDebugLocation); - PHINode *Phi = IRB.CreatePHI(IntptrTy, 2); - Phi->addIncoming(OrigStackBase, CmpBlock); - Phi->addIncoming(LocalStackBase, SetBlock); - LocalStackBase = Phi; + FakeStack = createPHI(IRB, UARIsEnabled, FakeStackValue, Term, + ConstantInt::get(IntptrTy, 0)); + + Value *NoFakeStack = + IRB.CreateICmpEQ(FakeStack, Constant::getNullValue(IntptrTy)); + Term = SplitBlockAndInsertIfThen(NoFakeStack, InsBefore, false); + IRBIf.SetInsertPoint(Term); + IRBIf.SetCurrentDebugLocation(EntryDebugLocation); + Value *AllocaValue = + DoDynamicAlloca ? createAllocaForLayout(IRBIf, L, true) : StaticAlloca; + IRB.SetInsertPoint(InsBefore); + IRB.SetCurrentDebugLocation(EntryDebugLocation); + LocalStackBase = createPHI(IRB, NoFakeStack, AllocaValue, Term, FakeStack); + } else { + // void *FakeStack = nullptr; + // void *LocalStackBase = alloca(LocalStackSize); + FakeStack = ConstantInt::get(IntptrTy, 0); + LocalStackBase = + DoDynamicAlloca ? createAllocaForLayout(IRB, L, true) : StaticAlloca; } // Insert poison calls for lifetime intrinsics for alloca. 
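[Editorial note, illustrative only -- not part of the patch. The dynamic-alloca instrumentation added in this file (see the handleDynamicAllocaCall hunks further down) resizes the replacement alloca so that a 32-byte left redzone, a partial redzone and a 32-byte right redzone fit around the user data. A minimal stand-alone sketch of that size arithmetic, done on constants on the host side, is shown below; kAllocaRzSize = 32 matches the constant defined earlier in this file, while the 100-byte request and 8-byte original alignment are hypothetical values chosen for the example.]

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Mirrors the redzone size constant used by the instrumentation above.
static const uint64_t kAllocaRzSize = 32;

int main() {
  // Hypothetical request: 100 bytes, originally 8-byte aligned.
  uint64_t OldSize = 100;
  uint64_t Align = std::max<uint64_t>(kAllocaRzSize, 8);

  // Same shape of arithmetic as handleDynamicAllocaCall, evaluated statically.
  uint64_t PartialSize = OldSize & (kAllocaRzSize - 1);               // 100 % 32 = 4
  uint64_t Misalign = kAllocaRzSize - PartialSize;                    // 28
  uint64_t PartialPadding = (Misalign != kAllocaRzSize) ? Misalign : 0;
  uint64_t NewSize = OldSize + Align + PartialPadding + kAllocaRzSize; // 192

  // User data starts Align bytes into the new allocation; the 32 bytes in
  // front of it form the left redzone, and the bytes past OldSize hold the
  // partial and right redzones.
  std::printf("new size = %llu, user data offset = %llu\n",
              (unsigned long long)NewSize, (unsigned long long)Align);
  return 0;
}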
@@ -1583,7 +1746,7 @@ void FunctionStackPoisoner::poisonStack() {
     Value *NewAllocaPtr = IRB.CreateIntToPtr(
         IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, Desc.Offset)),
         AI->getType());
-    replaceDbgDeclareForAlloca(AI, NewAllocaPtr, DIB);
+    replaceDbgDeclareForAlloca(AI, NewAllocaPtr, DIB, /*Deref=*/true);
     AI->replaceAllUsesWith(NewAllocaPtr);
   }
 
@@ -1621,17 +1784,18 @@ void FunctionStackPoisoner::poisonStack() {
                        BasePlus0);
   if (DoStackMalloc) {
     assert(StackMallocIdx >= 0);
-    // if LocalStackBase != OrigStackBase:
+    // if FakeStack != 0  // LocalStackBase == FakeStack
     //     // In use-after-return mode, poison the whole stack frame.
     //     if StackMallocIdx <= 4
     //         // For small sizes inline the whole thing:
     //         memset(ShadowBase, kAsanStackAfterReturnMagic, ShadowSize);
-    //         **SavedFlagPtr(LocalStackBase) = 0
+    //         **SavedFlagPtr(FakeStack) = 0
     //     else
-    //         __asan_stack_free_N(LocalStackBase, OrigStackBase)
+    //         __asan_stack_free_N(FakeStack, LocalStackSize)
     // else
     //     <This is not a fake stack; unpoison the redzones>
-    Value *Cmp = IRBRet.CreateICmpNE(LocalStackBase, OrigStackBase);
+    Value *Cmp =
+        IRBRet.CreateICmpNE(FakeStack, Constant::getNullValue(IntptrTy));
     TerminatorInst *ThenTerm, *ElseTerm;
     SplitBlockAndInsertIfThenElse(Cmp, Ret, &ThenTerm, &ElseTerm);
 
@@ -1641,7 +1805,7 @@ void FunctionStackPoisoner::poisonStack() {
       SetShadowToStackAfterReturnInlined(IRBPoison, ShadowBase,
                                          ClassSize >> Mapping.Scale);
       Value *SavedFlagPtrPtr = IRBPoison.CreateAdd(
-          LocalStackBase,
+          FakeStack,
           ConstantInt::get(IntptrTy, ClassSize - ASan.LongSize / 8));
       Value *SavedFlagPtr = IRBPoison.CreateLoad(
           IRBPoison.CreateIntToPtr(SavedFlagPtrPtr, IntptrPtrTy));
@@ -1650,9 +1814,8 @@
           IRBPoison.CreateIntToPtr(SavedFlagPtr, IRBPoison.getInt8PtrTy()));
     } else {
       // For larger frames call __asan_stack_free_*.
-      IRBPoison.CreateCall3(AsanStackFreeFunc[StackMallocIdx], LocalStackBase,
-                            ConstantInt::get(IntptrTy, LocalStackSize),
-                            OrigStackBase);
+      IRBPoison.CreateCall2(AsanStackFreeFunc[StackMallocIdx], FakeStack,
+                            ConstantInt::get(IntptrTy, LocalStackSize));
     }
 
     IRBuilder<> IRBElse(ElseTerm);
@@ -1660,7 +1823,6 @@
   } else if (HavePoisonedAllocas) {
     // If we poisoned some allocas in llvm.lifetime analysis,
     // unpoison whole stack frame now.
-    assert(LocalStackBase == OrigStackBase);
     poisonAlloca(LocalStackBase, LocalStackSize, IRBRet, false);
   } else {
     poisonRedZones(L.ShadowBytes, IRBRet, ShadowBase, false);
@@ -1722,3 +1884,140 @@ AllocaInst *FunctionStackPoisoner::findAllocaForValue(Value *V) {
     AllocaForValue[V] = Res;
   return Res;
 }
+
+// Compute PartialRzMagic for dynamic alloca call. PartialRzMagic is
+// constructed from two separate 32-bit numbers: PartialRzMagic = Val1 | Val2.
+// (1) Val1 is responsible for forming base value for PartialRzMagic, containing
+//     only 00 for fully addressable and 0xcb for fully poisoned bytes for each
+//     8-byte chunk of user memory respectively.
+// (2) Val2 forms the value for marking first poisoned byte in shadow memory
+//     with appropriate value (0x01 - 0x07 or 0xcb if Padding % 8 == 0).
+
+// Shift = Padding & ~7; // the number of bits we need to shift to access first
+//                          chunk in shadow memory, containing nonzero bytes.
+// Example:
+// Padding = 21                       Padding = 16
+// Shadow:  |00|00|05|cb|             Shadow:  |00|00|cb|cb|
+//                ^                                  ^
+//                |                                  |
+// Shift = 21 & ~7 = 16               Shift = 16 & ~7 = 16
+//
+// Val1 = 0xcbcbcbcb << Shift;
+// PartialBits = Padding ?
Padding & 7 : 0xcb; +// Val2 = PartialBits << Shift; +// Result = Val1 | Val2; +Value *FunctionStackPoisoner::computePartialRzMagic(Value *PartialSize, + IRBuilder<> &IRB) { + PartialSize = IRB.CreateIntCast(PartialSize, IRB.getInt32Ty(), false); + Value *Shift = IRB.CreateAnd(PartialSize, IRB.getInt32(~7)); + unsigned Val1Int = kAsanAllocaPartialVal1; + unsigned Val2Int = kAsanAllocaPartialVal2; + if (!ASan.DL->isLittleEndian()) { + Val1Int = sys::getSwappedBytes(Val1Int); + Val2Int = sys::getSwappedBytes(Val2Int); + } + Value *Val1 = shiftAllocaMagic(IRB.getInt32(Val1Int), IRB, Shift); + Value *PartialBits = IRB.CreateAnd(PartialSize, IRB.getInt32(7)); + // For BigEndian get 0x000000YZ -> 0xYZ000000. + if (ASan.DL->isBigEndian()) + PartialBits = IRB.CreateShl(PartialBits, IRB.getInt32(24)); + Value *Val2 = IRB.getInt32(Val2Int); + Value *Cond = + IRB.CreateICmpNE(PartialBits, Constant::getNullValue(IRB.getInt32Ty())); + Val2 = IRB.CreateSelect(Cond, shiftAllocaMagic(PartialBits, IRB, Shift), + shiftAllocaMagic(Val2, IRB, Shift)); + return IRB.CreateOr(Val1, Val2); +} + +void FunctionStackPoisoner::handleDynamicAllocaCall( + DynamicAllocaCall &AllocaCall) { + AllocaInst *AI = AllocaCall.AI; + if (!doesDominateAllExits(AI)) { + // We do not yet handle complex allocas + AllocaCall.Poison = false; + return; + } + + IRBuilder<> IRB(AI); + + PointerType *Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty()); + const unsigned Align = std::max(kAllocaRzSize, AI->getAlignment()); + const uint64_t AllocaRedzoneMask = kAllocaRzSize - 1; + + Value *Zero = Constant::getNullValue(IntptrTy); + Value *AllocaRzSize = ConstantInt::get(IntptrTy, kAllocaRzSize); + Value *AllocaRzMask = ConstantInt::get(IntptrTy, AllocaRedzoneMask); + Value *NotAllocaRzMask = ConstantInt::get(IntptrTy, ~AllocaRedzoneMask); + + // Since we need to extend alloca with additional memory to locate + // redzones, and OldSize is number of allocated blocks with + // ElementSize size, get allocated memory size in bytes by + // OldSize * ElementSize. + unsigned ElementSize = ASan.DL->getTypeAllocSize(AI->getAllocatedType()); + Value *OldSize = IRB.CreateMul(AI->getArraySize(), + ConstantInt::get(IntptrTy, ElementSize)); + + // PartialSize = OldSize % 32 + Value *PartialSize = IRB.CreateAnd(OldSize, AllocaRzMask); + + // Misalign = kAllocaRzSize - PartialSize; + Value *Misalign = IRB.CreateSub(AllocaRzSize, PartialSize); + + // PartialPadding = Misalign != kAllocaRzSize ? Misalign : 0; + Value *Cond = IRB.CreateICmpNE(Misalign, AllocaRzSize); + Value *PartialPadding = IRB.CreateSelect(Cond, Misalign, Zero); + + // AdditionalChunkSize = Align + PartialPadding + kAllocaRzSize + // Align is added to locate left redzone, PartialPadding for possible + // partial redzone and kAllocaRzSize for right redzone respectively. + Value *AdditionalChunkSize = IRB.CreateAdd( + ConstantInt::get(IntptrTy, Align + kAllocaRzSize), PartialPadding); + + Value *NewSize = IRB.CreateAdd(OldSize, AdditionalChunkSize); + + // Insert new alloca with new NewSize and Align params. + AllocaInst *NewAlloca = IRB.CreateAlloca(IRB.getInt8Ty(), NewSize); + NewAlloca->setAlignment(Align); + + // NewAddress = Address + Align + Value *NewAddress = IRB.CreateAdd(IRB.CreatePtrToInt(NewAlloca, IntptrTy), + ConstantInt::get(IntptrTy, Align)); + + Value *NewAddressPtr = IRB.CreateIntToPtr(NewAddress, AI->getType()); + + // LeftRzAddress = NewAddress - kAllocaRzSize + Value *LeftRzAddress = IRB.CreateSub(NewAddress, AllocaRzSize); + + // Poisoning left redzone. 
+  AllocaCall.LeftRzAddr = ASan.memToShadow(LeftRzAddress, IRB);
+  IRB.CreateStore(ConstantInt::get(IRB.getInt32Ty(), kAsanAllocaLeftMagic),
+                  IRB.CreateIntToPtr(AllocaCall.LeftRzAddr, Int32PtrTy));
+
+  // PartialRzAligned = PartialRzAddr & ~AllocaRzMask
+  Value *PartialRzAddr = IRB.CreateAdd(NewAddress, OldSize);
+  Value *PartialRzAligned = IRB.CreateAnd(PartialRzAddr, NotAllocaRzMask);
+
+  // Poisoning partial redzone.
+  Value *PartialRzMagic = computePartialRzMagic(PartialSize, IRB);
+  Value *PartialRzShadowAddr = ASan.memToShadow(PartialRzAligned, IRB);
+  IRB.CreateStore(PartialRzMagic,
+                  IRB.CreateIntToPtr(PartialRzShadowAddr, Int32PtrTy));
+
+  // RightRzAddress
+  //   =  (PartialRzAddr + AllocaRzMask) & ~AllocaRzMask
+  Value *RightRzAddress = IRB.CreateAnd(
+      IRB.CreateAdd(PartialRzAddr, AllocaRzMask), NotAllocaRzMask);
+
+  // Poisoning right redzone.
+  AllocaCall.RightRzAddr = ASan.memToShadow(RightRzAddress, IRB);
+  IRB.CreateStore(ConstantInt::get(IRB.getInt32Ty(), kAsanAllocaRightMagic),
+                  IRB.CreateIntToPtr(AllocaCall.RightRzAddr, Int32PtrTy));
+
+  // Replace all uses of AddressReturnedByAlloca with NewAddress.
+  AI->replaceAllUsesWith(NewAddressPtr);
+
+  // We are done. Erase old alloca and store left, partial and right redzones
+  // shadow addresses for future unpoisoning.
+  AI->eraseFromParent();
+  NumInstrumentedDynamicAllocas++;
+}
diff --git a/lib/Transforms/Instrumentation/Android.mk b/lib/Transforms/Instrumentation/Android.mk
index 1f21028..46f0281 100644
--- a/lib/Transforms/Instrumentation/Android.mk
+++ b/lib/Transforms/Instrumentation/Android.mk
@@ -4,8 +4,8 @@ instrumentation_SRC_FILES := \
   AddressSanitizer.cpp \
   BoundsChecking.cpp \
   DataFlowSanitizer.cpp \
-  DebugIR.cpp \
   GCOVProfiling.cpp \
+  InstrProfiling.cpp \
   Instrumentation.cpp \
   MemorySanitizer.cpp \
   SanitizerCoverage.cpp \
diff --git a/lib/Transforms/Instrumentation/BoundsChecking.cpp b/lib/Transforms/Instrumentation/BoundsChecking.cpp
index 9a5cea8..2b5f39c 100644
--- a/lib/Transforms/Instrumentation/BoundsChecking.cpp
+++ b/lib/Transforms/Instrumentation/BoundsChecking.cpp
@@ -24,7 +24,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "bounds-checking"
@@ -50,7 +50,7 @@ namespace {
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<DataLayoutPass>();
-      AU.addRequired<TargetLibraryInfo>();
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
     }
 
   private:
@@ -166,7 +166,7 @@ bool BoundsChecking::instrument(Value *Ptr, Value *InstVal) {
 
 bool BoundsChecking::runOnFunction(Function &F) {
   DL = &getAnalysis<DataLayoutPass>().getDataLayout();
-  TLI = &getAnalysis<TargetLibraryInfo>();
+  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
 
   TrapBB = nullptr;
   BuilderTy TheBuilder(F.getContext(), TargetFolder(DL));
diff --git a/lib/Transforms/Instrumentation/CMakeLists.txt b/lib/Transforms/Instrumentation/CMakeLists.txt
index 139e514..b2ff033 100644
--- a/lib/Transforms/Instrumentation/CMakeLists.txt
+++ b/lib/Transforms/Instrumentation/CMakeLists.txt
@@ -2,12 +2,15 @@ add_llvm_library(LLVMInstrumentation
   AddressSanitizer.cpp
   BoundsChecking.cpp
   DataFlowSanitizer.cpp
-  DebugIR.cpp
   GCOVProfiling.cpp
   MemorySanitizer.cpp
   Instrumentation.cpp
+  InstrProfiling.cpp
   SanitizerCoverage.cpp
   ThreadSanitizer.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms
   )
 
 add_dependencies(LLVMInstrumentation
intrinsics_gen) diff --git a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index c5a4860..6adf0d2 100644 --- a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -49,6 +49,7 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/Triple.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/DebugInfo.h" @@ -82,14 +83,14 @@ static cl::opt<bool> ClPreserveAlignment( cl::desc("respect alignment requirements provided by input IR"), cl::Hidden, cl::init(false)); -// The ABI list file controls how shadow parameters are passed. The pass treats +// The ABI list files control how shadow parameters are passed. The pass treats // every function labelled "uninstrumented" in the ABI list file as conforming // to the "native" (i.e. unsanitized) ABI. Unless the ABI list contains // additional annotations for those functions, a call to one of those functions // will produce a warning message, as the labelling behaviour of the function is // unknown. The other supported annotations are "functional" and "discard", // which are described below under DataFlowSanitizer::WrapperKind. -static cl::opt<std::string> ClABIListFile( +static cl::list<std::string> ClABIListFiles( "dfsan-abilist", cl::desc("File listing native ABI functions and how the pass treats them"), cl::Hidden); @@ -140,7 +141,9 @@ class DFSanABIList { std::unique_ptr<SpecialCaseList> SCL; public: - DFSanABIList(std::unique_ptr<SpecialCaseList> SCL) : SCL(std::move(SCL)) {} + DFSanABIList() {} + + void set(std::unique_ptr<SpecialCaseList> List) { SCL = std::move(List); } /// Returns whether either this function or its source file are listed in the /// given category. @@ -263,9 +266,9 @@ class DataFlowSanitizer : public ModulePass { Constant *getOrBuildTrampolineFunction(FunctionType *FT, StringRef FName); public: - DataFlowSanitizer(StringRef ABIListFile = StringRef(), - void *(*getArgTLS)() = nullptr, - void *(*getRetValTLS)() = nullptr); + DataFlowSanitizer( + const std::vector<std::string> &ABIListFiles = std::vector<std::string>(), + void *(*getArgTLS)() = nullptr, void *(*getRetValTLS)() = nullptr); static char ID; bool doInitialization(Module &M) override; bool runOnModule(Module &M) override; @@ -350,25 +353,26 @@ char DataFlowSanitizer::ID; INITIALIZE_PASS(DataFlowSanitizer, "dfsan", "DataFlowSanitizer: dynamic data flow analysis.", false, false) -ModulePass *llvm::createDataFlowSanitizerPass(StringRef ABIListFile, - void *(*getArgTLS)(), - void *(*getRetValTLS)()) { - return new DataFlowSanitizer(ABIListFile, getArgTLS, getRetValTLS); +ModulePass * +llvm::createDataFlowSanitizerPass(const std::vector<std::string> &ABIListFiles, + void *(*getArgTLS)(), + void *(*getRetValTLS)()) { + return new DataFlowSanitizer(ABIListFiles, getArgTLS, getRetValTLS); } -DataFlowSanitizer::DataFlowSanitizer(StringRef ABIListFile, - void *(*getArgTLS)(), - void *(*getRetValTLS)()) - : ModulePass(ID), GetArgTLSPtr(getArgTLS), GetRetvalTLSPtr(getRetValTLS), - ABIList(SpecialCaseList::createOrDie(ABIListFile.empty() ? 
ClABIListFile - : ABIListFile)) { +DataFlowSanitizer::DataFlowSanitizer( + const std::vector<std::string> &ABIListFiles, void *(*getArgTLS)(), + void *(*getRetValTLS)()) + : ModulePass(ID), GetArgTLSPtr(getArgTLS), GetRetvalTLSPtr(getRetValTLS) { + std::vector<std::string> AllABIListFiles(std::move(ABIListFiles)); + AllABIListFiles.insert(AllABIListFiles.end(), ClABIListFiles.begin(), + ClABIListFiles.end()); + ABIList.set(SpecialCaseList::createOrDie(AllABIListFiles)); } FunctionType *DataFlowSanitizer::getArgsFunctionType(FunctionType *T) { - llvm::SmallVector<Type *, 4> ArgTypes; - std::copy(T->param_begin(), T->param_end(), std::back_inserter(ArgTypes)); - for (unsigned i = 0, e = T->getNumParams(); i != e; ++i) - ArgTypes.push_back(ShadowTy); + llvm::SmallVector<Type *, 4> ArgTypes(T->param_begin(), T->param_end()); + ArgTypes.append(T->getNumParams(), ShadowTy); if (T->isVarArg()) ArgTypes.push_back(ShadowPtrTy); Type *RetType = T->getReturnType(); @@ -381,9 +385,8 @@ FunctionType *DataFlowSanitizer::getTrampolineFunctionType(FunctionType *T) { assert(!T->isVarArg()); llvm::SmallVector<Type *, 4> ArgTypes; ArgTypes.push_back(T->getPointerTo()); - std::copy(T->param_begin(), T->param_end(), std::back_inserter(ArgTypes)); - for (unsigned i = 0, e = T->getNumParams(); i != e; ++i) - ArgTypes.push_back(ShadowTy); + ArgTypes.append(T->param_begin(), T->param_end()); + ArgTypes.append(T->getNumParams(), ShadowTy); Type *RetType = T->getReturnType(); if (!RetType->isVoidTy()) ArgTypes.push_back(ShadowPtrTy); @@ -414,6 +417,11 @@ FunctionType *DataFlowSanitizer::getCustomFunctionType(FunctionType *T) { } bool DataFlowSanitizer::doInitialization(Module &M) { + llvm::Triple TargetTriple(M.getTargetTriple()); + bool IsX86_64 = TargetTriple.getArch() == llvm::Triple::x86_64; + bool IsMIPS64 = TargetTriple.getArch() == llvm::Triple::mips64 || + TargetTriple.getArch() == llvm::Triple::mips64el; + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); if (!DLP) report_fatal_error("data layout missing"); @@ -425,8 +433,13 @@ bool DataFlowSanitizer::doInitialization(Module &M) { ShadowPtrTy = PointerType::getUnqual(ShadowTy); IntptrTy = DL->getIntPtrType(*Ctx); ZeroShadow = ConstantInt::getSigned(ShadowTy, 0); - ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0x700000000000LL); ShadowPtrMul = ConstantInt::getSigned(IntptrTy, ShadowWidth / 8); + if (IsX86_64) + ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0x700000000000LL); + else if (IsMIPS64) + ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0xF000000000LL); + else + report_fatal_error("unsupported triple"); Type *DFSanUnionArgs[2] = { ShadowTy, ShadowTy }; DFSanUnionFnTy = @@ -1521,7 +1534,7 @@ void DFSanVisitor::visitCallSite(CallSite CS) { Next = II->getNormalDest()->begin(); } else { BasicBlock *NewBB = - SplitEdge(II->getParent(), II->getNormalDest(), &DFSF.DFS); + SplitEdge(II->getParent(), II->getNormalDest(), &DFSF.DT); Next = NewBB->begin(); } } else { diff --git a/lib/Transforms/Instrumentation/DebugIR.cpp b/lib/Transforms/Instrumentation/DebugIR.cpp deleted file mode 100644 index 5234341..0000000 --- a/lib/Transforms/Instrumentation/DebugIR.cpp +++ /dev/null @@ -1,617 +0,0 @@ -//===--- DebugIR.cpp - Transform debug metadata to allow debugging IR -----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// A Module transform pass that emits a succinct version of the IR and replaces -// the source file metadata to allow debuggers to step through the IR. -// -// FIXME: instead of replacing debug metadata, this pass should allow for -// additional metadata to be used to point capable debuggers to the IR file -// without destroying the mapping to the original source file. -// -//===----------------------------------------------------------------------===// - -#include "llvm/IR/ValueMap.h" -#include "DebugIR.h" -#include "llvm/IR/AssemblyAnnotationWriter.h" -#include "llvm/IR/DIBuilder.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/DebugInfo.h" -#include "llvm/IR/InstVisitor.h" -#include "llvm/IR/Instruction.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/FileSystem.h" -#include "llvm/Support/FormattedStream.h" -#include "llvm/Support/Path.h" -#include "llvm/Support/ToolOutputFile.h" -#include "llvm/Transforms/Instrumentation.h" -#include "llvm/Transforms/Utils/Cloning.h" -#include <string> - -#define STR_HELPER(x) #x -#define STR(x) STR_HELPER(x) - -using namespace llvm; - -#define DEBUG_TYPE "debug-ir" - -namespace { - -/// Builds a map of Value* to line numbers on which the Value appears in a -/// textual representation of the IR by plugging into the AssemblyWriter by -/// masquerading as an AssemblyAnnotationWriter. -class ValueToLineMap : public AssemblyAnnotationWriter { - ValueMap<const Value *, unsigned int> Lines; - typedef ValueMap<const Value *, unsigned int>::const_iterator LineIter; - - void addEntry(const Value *V, formatted_raw_ostream &Out) { - Out.flush(); - Lines.insert(std::make_pair(V, Out.getLine() + 1)); - } - -public: - - /// Prints Module to a null buffer in order to build the map of Value pointers - /// to line numbers. - ValueToLineMap(const Module *M) { - raw_null_ostream ThrowAway; - M->print(ThrowAway, this); - } - - // This function is called after an Instruction, GlobalValue, or GlobalAlias - // is printed. - void printInfoComment(const Value &V, formatted_raw_ostream &Out) override { - addEntry(&V, Out); - } - - void emitFunctionAnnot(const Function *F, - formatted_raw_ostream &Out) override { - addEntry(F, Out); - } - - /// If V appears on a line in the textual IR representation, sets Line to the - /// line number and returns true, otherwise returns false. - bool getLine(const Value *V, unsigned int &Line) const { - LineIter i = Lines.find(V); - if (i != Lines.end()) { - Line = i->second; - return true; - } - return false; - } -}; - -/// Removes debug intrisncs like llvm.dbg.declare and llvm.dbg.value. -class DebugIntrinsicsRemover : public InstVisitor<DebugIntrinsicsRemover> { - void remove(Instruction &I) { I.eraseFromParent(); } - -public: - static void process(Module &M) { - DebugIntrinsicsRemover Remover; - Remover.visit(&M); - } - void visitDbgDeclareInst(DbgDeclareInst &I) { remove(I); } - void visitDbgValueInst(DbgValueInst &I) { remove(I); } - void visitDbgInfoIntrinsic(DbgInfoIntrinsic &I) { remove(I); } -}; - -/// Removes debug metadata (!dbg) nodes from all instructions, and optionally -/// metadata named "llvm.dbg.cu" if RemoveNamedInfo is true. 
-class DebugMetadataRemover : public InstVisitor<DebugMetadataRemover> { - bool RemoveNamedInfo; - -public: - static void process(Module &M, bool RemoveNamedInfo = true) { - DebugMetadataRemover Remover(RemoveNamedInfo); - Remover.run(&M); - } - - DebugMetadataRemover(bool RemoveNamedInfo) - : RemoveNamedInfo(RemoveNamedInfo) {} - - void visitInstruction(Instruction &I) { - if (I.getMetadata(LLVMContext::MD_dbg)) - I.setMetadata(LLVMContext::MD_dbg, nullptr); - } - - void run(Module *M) { - // Remove debug metadata attached to instructions - visit(M); - - if (RemoveNamedInfo) { - // Remove CU named metadata (and all children nodes) - NamedMDNode *Node = M->getNamedMetadata("llvm.dbg.cu"); - if (Node) - M->eraseNamedMetadata(Node); - } - } -}; - -/// Updates debug metadata in a Module: -/// - changes Filename/Directory to values provided on construction -/// - adds/updates line number (DebugLoc) entries associated with each -/// instruction to reflect the instruction's location in an LLVM IR file -class DIUpdater : public InstVisitor<DIUpdater> { - /// Builder of debug information - DIBuilder Builder; - - /// Helper for type attributes/sizes/etc - DataLayout Layout; - - /// Map of Value* to line numbers - const ValueToLineMap LineTable; - - /// Map of Value* (in original Module) to Value* (in optional cloned Module) - const ValueToValueMapTy *VMap; - - /// Directory of debug metadata - DebugInfoFinder Finder; - - /// Source filename and directory - StringRef Filename; - StringRef Directory; - - // CU nodes needed when creating DI subprograms - MDNode *FileNode; - MDNode *LexicalBlockFileNode; - const MDNode *CUNode; - - ValueMap<const Function *, MDNode *> SubprogramDescriptors; - DenseMap<const Type *, MDNode *> TypeDescriptors; - -public: - DIUpdater(Module &M, StringRef Filename = StringRef(), - StringRef Directory = StringRef(), const Module *DisplayM = nullptr, - const ValueToValueMapTy *VMap = nullptr) - : Builder(M), Layout(&M), LineTable(DisplayM ? DisplayM : &M), VMap(VMap), - Finder(), Filename(Filename), Directory(Directory), FileNode(nullptr), - LexicalBlockFileNode(nullptr), CUNode(nullptr) { - Finder.processModule(M); - visit(&M); - } - - ~DIUpdater() { Builder.finalize(); } - - void visitModule(Module &M) { - if (Finder.compile_unit_count() > 1) - report_fatal_error("DebugIR pass supports only a signle compile unit per " - "Module."); - createCompileUnit(Finder.compile_unit_count() == 1 ? 
- (MDNode*)*Finder.compile_units().begin() : nullptr); - } - - void visitFunction(Function &F) { - if (F.isDeclaration() || findDISubprogram(&F)) - return; - - StringRef MangledName = F.getName(); - DICompositeType Sig = createFunctionSignature(&F); - - // find line of function declaration - unsigned Line = 0; - if (!findLine(&F, Line)) { - DEBUG(dbgs() << "WARNING: No line for Function " << F.getName().str() - << "\n"); - return; - } - - Instruction *FirstInst = F.begin()->begin(); - unsigned ScopeLine = 0; - if (!findLine(FirstInst, ScopeLine)) { - DEBUG(dbgs() << "WARNING: No line for 1st Instruction in Function " - << F.getName().str() << "\n"); - return; - } - - bool Local = F.hasInternalLinkage(); - bool IsDefinition = !F.isDeclaration(); - bool IsOptimized = false; - - int FuncFlags = llvm::DIDescriptor::FlagPrototyped; - assert(CUNode && FileNode); - DISubprogram Sub = Builder.createFunction( - DICompileUnit(CUNode), F.getName(), MangledName, DIFile(FileNode), Line, - Sig, Local, IsDefinition, ScopeLine, FuncFlags, IsOptimized, &F); - assert(Sub.isSubprogram()); - DEBUG(dbgs() << "create subprogram mdnode " << *Sub << ": " - << "\n"); - - SubprogramDescriptors.insert(std::make_pair(&F, Sub)); - } - - void visitInstruction(Instruction &I) { - DebugLoc Loc(I.getDebugLoc()); - - /// If a ValueToValueMap is provided, use it to get the real instruction as - /// the line table was generated on a clone of the module on which we are - /// operating. - Value *RealInst = nullptr; - if (VMap) - RealInst = VMap->lookup(&I); - - if (!RealInst) - RealInst = &I; - - unsigned Col = 0; // FIXME: support columns - unsigned Line; - if (!LineTable.getLine(RealInst, Line)) { - // Instruction has no line, it may have been removed (in the module that - // will be passed to the debugger) so there is nothing to do here. - DEBUG(dbgs() << "WARNING: no LineTable entry for instruction " << RealInst - << "\n"); - DEBUG(RealInst->dump()); - return; - } - - DebugLoc NewLoc; - if (!Loc.isUnknown()) - // I had a previous debug location: re-use the DebugLoc - NewLoc = DebugLoc::get(Line, Col, Loc.getScope(RealInst->getContext()), - Loc.getInlinedAt(RealInst->getContext())); - else if (MDNode *scope = findScope(&I)) - NewLoc = DebugLoc::get(Line, Col, scope, nullptr); - else { - DEBUG(dbgs() << "WARNING: no valid scope for instruction " << &I - << ". no DebugLoc will be present." - << "\n"); - return; - } - - addDebugLocation(I, NewLoc); - } - -private: - - void createCompileUnit(MDNode *CUToReplace) { - std::string Flags; - bool IsOptimized = false; - StringRef Producer; - unsigned RuntimeVersion(0); - StringRef SplitName; - - if (CUToReplace) { - // save fields from existing CU to re-use in the new CU - DICompileUnit ExistingCU(CUToReplace); - Producer = ExistingCU.getProducer(); - IsOptimized = ExistingCU.isOptimized(); - Flags = ExistingCU.getFlags(); - RuntimeVersion = ExistingCU.getRunTimeVersion(); - SplitName = ExistingCU.getSplitDebugFilename(); - } else { - Producer = - "LLVM Version " STR(LLVM_VERSION_MAJOR) "." 
STR(LLVM_VERSION_MINOR); - } - - CUNode = - Builder.createCompileUnit(dwarf::DW_LANG_C99, Filename, Directory, - Producer, IsOptimized, Flags, RuntimeVersion); - - if (CUToReplace) - CUToReplace->replaceAllUsesWith(const_cast<MDNode *>(CUNode)); - - DICompileUnit CU(CUNode); - FileNode = Builder.createFile(Filename, Directory); - LexicalBlockFileNode = Builder.createLexicalBlockFile(CU, DIFile(FileNode)); - } - - /// Returns the MDNode* that represents the DI scope to associate with I - MDNode *findScope(const Instruction *I) { - const Function *F = I->getParent()->getParent(); - if (MDNode *ret = findDISubprogram(F)) - return ret; - - DEBUG(dbgs() << "WARNING: Using fallback lexical block file scope " - << LexicalBlockFileNode << " as scope for instruction " << I - << "\n"); - return LexicalBlockFileNode; - } - - /// Returns the MDNode* that is the descriptor for F - MDNode *findDISubprogram(const Function *F) { - typedef ValueMap<const Function *, MDNode *>::const_iterator FuncNodeIter; - FuncNodeIter i = SubprogramDescriptors.find(F); - if (i != SubprogramDescriptors.end()) - return i->second; - - DEBUG(dbgs() << "searching for DI scope node for Function " << F - << " in a list of " << Finder.subprogram_count() - << " subprogram nodes" - << "\n"); - - for (DISubprogram S : Finder.subprograms()) { - if (S.getFunction() == F) { - DEBUG(dbgs() << "Found DISubprogram " << S << " for function " - << S.getFunction() << "\n"); - return S; - } - } - DEBUG(dbgs() << "unable to find DISubprogram node for function " - << F->getName().str() << "\n"); - return nullptr; - } - - /// Sets Line to the line number on which V appears and returns true. If a - /// line location for V is not found, returns false. - bool findLine(const Value *V, unsigned &Line) { - if (LineTable.getLine(V, Line)) - return true; - - if (VMap) { - Value *mapped = VMap->lookup(V); - if (mapped && LineTable.getLine(mapped, Line)) - return true; - } - return false; - } - - std::string getTypeName(Type *T) { - std::string TypeName; - raw_string_ostream TypeStream(TypeName); - if (T) - T->print(TypeStream); - else - TypeStream << "Printing <null> Type"; - TypeStream.flush(); - return TypeName; - } - - /// Returns the MDNode that represents type T if it is already created, or 0 - /// if it is not. - MDNode *getType(const Type *T) { - typedef DenseMap<const Type *, MDNode *>::const_iterator TypeNodeIter; - TypeNodeIter i = TypeDescriptors.find(T); - if (i != TypeDescriptors.end()) - return i->second; - return nullptr; - } - - /// Returns a DebugInfo type from an LLVM type T. - DIDerivedType getOrCreateType(Type *T) { - MDNode *N = getType(T); - if (N) - return DIDerivedType(N); - else if (T->isVoidTy()) - return DIDerivedType(nullptr); - else if (T->isStructTy()) { - N = Builder.createStructType( - DIScope(LexicalBlockFileNode), T->getStructName(), DIFile(FileNode), - 0, Layout.getTypeSizeInBits(T), Layout.getABITypeAlignment(T), 0, - DIType(nullptr), DIArray(nullptr)); // filled in later - - // N is added to the map (early) so that element search below can find it, - // so as to avoid infinite recursion for structs that contain pointers to - // their own type. 
- TypeDescriptors[T] = N; - DICompositeType StructDescriptor(N); - - SmallVector<Value *, 4> Elements; - for (unsigned i = 0; i < T->getStructNumElements(); ++i) - Elements.push_back(getOrCreateType(T->getStructElementType(i))); - - // set struct elements - StructDescriptor.setArrays(Builder.getOrCreateArray(Elements)); - } else if (T->isPointerTy()) { - Type *PointeeTy = T->getPointerElementType(); - if (!(N = getType(PointeeTy))) - N = Builder.createPointerType( - getOrCreateType(PointeeTy), Layout.getPointerTypeSizeInBits(T), - Layout.getPrefTypeAlignment(T), getTypeName(T)); - } else if (T->isArrayTy()) { - SmallVector<Value *, 1> Subrange; - Subrange.push_back( - Builder.getOrCreateSubrange(0, T->getArrayNumElements() - 1)); - - N = Builder.createArrayType(Layout.getTypeSizeInBits(T), - Layout.getPrefTypeAlignment(T), - getOrCreateType(T->getArrayElementType()), - Builder.getOrCreateArray(Subrange)); - } else { - int encoding = llvm::dwarf::DW_ATE_signed; - if (T->isIntegerTy()) - encoding = llvm::dwarf::DW_ATE_unsigned; - else if (T->isFloatingPointTy()) - encoding = llvm::dwarf::DW_ATE_float; - - N = Builder.createBasicType(getTypeName(T), T->getPrimitiveSizeInBits(), - 0, encoding); - } - TypeDescriptors[T] = N; - return DIDerivedType(N); - } - - /// Returns a DebugInfo type that represents a function signature for Func. - DICompositeType createFunctionSignature(const Function *Func) { - SmallVector<Value *, 4> Params; - DIDerivedType ReturnType(getOrCreateType(Func->getReturnType())); - Params.push_back(ReturnType); - - const Function::ArgumentListType &Args(Func->getArgumentList()); - for (Function::ArgumentListType::const_iterator i = Args.begin(), - e = Args.end(); - i != e; ++i) { - Type *T(i->getType()); - Params.push_back(getOrCreateType(T)); - } - - DITypeArray ParamArray = Builder.getOrCreateTypeArray(Params); - return Builder.createSubroutineType(DIFile(FileNode), ParamArray); - } - - /// Associates Instruction I with debug location Loc. - void addDebugLocation(Instruction &I, DebugLoc Loc) { - MDNode *MD = Loc.getAsMDNode(I.getContext()); - I.setMetadata(LLVMContext::MD_dbg, MD); - } -}; - -/// Sets Filename/Directory from the Module identifier and returns true, or -/// false if source information is not present. -bool getSourceInfoFromModule(const Module &M, std::string &Directory, - std::string &Filename) { - std::string PathStr(M.getModuleIdentifier()); - if (PathStr.length() == 0 || PathStr == "<stdin>") - return false; - - Filename = sys::path::filename(PathStr); - SmallVector<char, 16> Path(PathStr.begin(), PathStr.end()); - sys::path::remove_filename(Path); - Directory = StringRef(Path.data(), Path.size()); - return true; -} - -// Sets Filename/Directory from debug information in M and returns true, or -// false if no debug information available, or cannot be parsed. 
-bool getSourceInfoFromDI(const Module &M, std::string &Directory, - std::string &Filename) { - NamedMDNode *CUNode = M.getNamedMetadata("llvm.dbg.cu"); - if (!CUNode || CUNode->getNumOperands() == 0) - return false; - - DICompileUnit CU(CUNode->getOperand(0)); - if (!CU.Verify()) - return false; - - Filename = CU.getFilename(); - Directory = CU.getDirectory(); - return true; -} - -} // anonymous namespace - -namespace llvm { - -bool DebugIR::getSourceInfo(const Module &M) { - ParsedPath = getSourceInfoFromDI(M, Directory, Filename) || - getSourceInfoFromModule(M, Directory, Filename); - return ParsedPath; -} - -bool DebugIR::updateExtension(StringRef NewExtension) { - size_t dot = Filename.find_last_of("."); - if (dot == std::string::npos) - return false; - - Filename.erase(dot); - Filename += NewExtension.str(); - return true; -} - -void DebugIR::generateFilename(std::unique_ptr<int> &fd) { - SmallVector<char, 16> PathVec; - fd.reset(new int); - sys::fs::createTemporaryFile("debug-ir", "ll", *fd, PathVec); - StringRef Path(PathVec.data(), PathVec.size()); - Filename = sys::path::filename(Path); - sys::path::remove_filename(PathVec); - Directory = StringRef(PathVec.data(), PathVec.size()); - - GeneratedPath = true; -} - -std::string DebugIR::getPath() { - SmallVector<char, 16> Path; - sys::path::append(Path, Directory, Filename); - Path.resize(Filename.size() + Directory.size() + 2); - Path[Filename.size() + Directory.size() + 1] = '\0'; - return std::string(Path.data()); -} - -void DebugIR::writeDebugBitcode(const Module *M, int *fd) { - std::unique_ptr<raw_fd_ostream> Out; - std::error_code EC; - - if (!fd) { - std::string Path = getPath(); - Out.reset(new raw_fd_ostream(Path, EC, sys::fs::F_Text)); - DEBUG(dbgs() << "WRITING debug bitcode from Module " << M << " to file " - << Path << "\n"); - } else { - DEBUG(dbgs() << "WRITING debug bitcode from Module " << M << " to fd " - << *fd << "\n"); - Out.reset(new raw_fd_ostream(*fd, true)); - } - - M->print(*Out, nullptr); - Out->close(); -} - -void DebugIR::createDebugInfo(Module &M, std::unique_ptr<Module> &DisplayM) { - if (M.getFunctionList().size() == 0) - // no functions -- no debug info needed - return; - - std::unique_ptr<ValueToValueMapTy> VMap; - - if (WriteSourceToDisk && (HideDebugIntrinsics || HideDebugMetadata)) { - VMap.reset(new ValueToValueMapTy); - DisplayM.reset(CloneModule(&M, *VMap)); - - if (HideDebugIntrinsics) - DebugIntrinsicsRemover::process(*DisplayM); - - if (HideDebugMetadata) - DebugMetadataRemover::process(*DisplayM); - } - - DIUpdater R(M, Filename, Directory, DisplayM.get(), VMap.get()); -} - -bool DebugIR::isMissingPath() { return Filename.empty() || Directory.empty(); } - -bool DebugIR::runOnModule(Module &M) { - std::unique_ptr<int> fd; - - if (isMissingPath() && !getSourceInfo(M)) { - if (!WriteSourceToDisk) - report_fatal_error("DebugIR unable to determine file name in input. " - "Ensure Module contains an identifier, a valid " - "DICompileUnit, or construct DebugIR with " - "non-empty Filename/Directory parameters."); - else - generateFilename(fd); - } - - if (!GeneratedPath && WriteSourceToDisk) - updateExtension(".debug-ll"); - - // Clear line numbers. Keep debug info (if any) if we were able to read the - // file name from the DICompileUnit descriptor. - DebugMetadataRemover::process(M, !ParsedPath); - - std::unique_ptr<Module> DisplayM; - createDebugInfo(M, DisplayM); - if (WriteSourceToDisk) { - Module *OutputM = DisplayM.get() ? 
DisplayM.get() : &M; - writeDebugBitcode(OutputM, fd.get()); - } - - DEBUG(M.dump()); - return true; -} - -bool DebugIR::runOnModule(Module &M, std::string &Path) { - bool result = runOnModule(M); - Path = getPath(); - return result; -} - -} // llvm namespace - -char DebugIR::ID = 0; -INITIALIZE_PASS(DebugIR, "debug-ir", "Enable debugging IR", false, false) - -ModulePass *llvm::createDebugIRPass(bool HideDebugIntrinsics, - bool HideDebugMetadata, StringRef Directory, - StringRef Filename) { - return new DebugIR(HideDebugIntrinsics, HideDebugMetadata, Directory, - Filename); -} - -ModulePass *llvm::createDebugIRPass() { return new DebugIR(); } diff --git a/lib/Transforms/Instrumentation/DebugIR.h b/lib/Transforms/Instrumentation/DebugIR.h deleted file mode 100644 index 8d74a4d..0000000 --- a/lib/Transforms/Instrumentation/DebugIR.h +++ /dev/null @@ -1,98 +0,0 @@ -//===- llvm/Transforms/Instrumentation/DebugIR.h - Interface ----*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the interface of the DebugIR pass. For most users, -// including Instrumentation.h and calling createDebugIRPass() is sufficient and -// there is no need to include this file. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TRANSFORMS_INSTRUMENTATION_DEBUGIR_H -#define LLVM_LIB_TRANSFORMS_INSTRUMENTATION_DEBUGIR_H - -#include "llvm/Pass.h" - -namespace llvm { - -class DebugIR : public llvm::ModulePass { - /// If true, write a source file to disk. - bool WriteSourceToDisk; - - /// Hide certain (non-essential) debug information (only relevant if - /// createSource is true. - bool HideDebugIntrinsics; - bool HideDebugMetadata; - - /// The location of the source file. - std::string Directory; - std::string Filename; - - /// True if a temporary file name was generated. - bool GeneratedPath; - - /// True if the file name was read from the Module. - bool ParsedPath; - -public: - static char ID; - - const char *getPassName() const override { return "DebugIR"; } - - /// Generate a file on disk to be displayed in a debugger. If Filename and - /// Directory are empty, a temporary path will be generated. - DebugIR(bool HideDebugIntrinsics, bool HideDebugMetadata, - llvm::StringRef Directory, llvm::StringRef Filename) - : ModulePass(ID), WriteSourceToDisk(true), - HideDebugIntrinsics(HideDebugIntrinsics), - HideDebugMetadata(HideDebugMetadata), Directory(Directory), - Filename(Filename), GeneratedPath(false), ParsedPath(false) {} - - /// Modify input in-place; do not generate additional files, and do not hide - /// any debug intrinsics/metadata that might be present. - DebugIR() - : ModulePass(ID), WriteSourceToDisk(false), HideDebugIntrinsics(false), - HideDebugMetadata(false), GeneratedPath(false), ParsedPath(false) {} - - /// Run pass on M and set Path to the source file path in the output module. - bool runOnModule(llvm::Module &M, std::string &Path); - bool runOnModule(llvm::Module &M) override; - -private: - - /// Returns the concatenated Directory + Filename, without error checking - std::string getPath(); - - /// Attempts to read source information from debug information in M, and if - /// that fails, from M's identifier. Returns true on success, false otherwise. 
- bool getSourceInfo(const llvm::Module &M); - - /// Replace the extension of Filename with NewExtension, and return true if - /// successful. Return false if extension could not be found or Filename is - /// empty. - bool updateExtension(llvm::StringRef NewExtension); - - /// Generate a temporary filename and open an fd - void generateFilename(std::unique_ptr<int> &fd); - - /// Creates DWARF CU/Subroutine metadata - void createDebugInfo(llvm::Module &M, - std::unique_ptr<llvm::Module> &DisplayM); - - /// Returns true if either Directory or Filename is missing, false otherwise. - bool isMissingPath(); - - /// Write M to disk, optionally passing in an fd to an open file which is - /// closed by this function after writing. If no fd is specified, a new file - /// is opened, written, and closed. - void writeDebugBitcode(const llvm::Module *M, int *fd = nullptr); -}; - -} // llvm namespace - -#endif diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp index 220d7f8..cb965fb 100644 --- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -285,6 +285,14 @@ namespace { DeleteContainerSeconds(LinesByFile); } + GCOVBlock(const GCOVBlock &RHS) : GCOVRecord(RHS), Number(RHS.Number) { + // Only allow copy before edges and lines have been added. After that, + // there are inter-block pointers (eg: edges) that won't take kindly to + // blocks being copied or moved around. + assert(LinesByFile.empty()); + assert(OutEdges.empty()); + } + private: friend class GCOVFunction; @@ -303,18 +311,22 @@ namespace { // object users can construct, the blocks and lines will be rooted here. class GCOVFunction : public GCOVRecord { public: - GCOVFunction(DISubprogram SP, raw_ostream *os, uint32_t Ident, - bool UseCfgChecksum) : - SP(SP), Ident(Ident), UseCfgChecksum(UseCfgChecksum), CfgChecksum(0) { + GCOVFunction(DISubprogram SP, raw_ostream *os, uint32_t Ident, + bool UseCfgChecksum) + : SP(SP), Ident(Ident), UseCfgChecksum(UseCfgChecksum), CfgChecksum(0), + ReturnBlock(1, os) { this->os = os; Function *F = SP.getFunction(); DEBUG(dbgs() << "Function: " << getFunctionName(SP) << "\n"); + uint32_t i = 0; - for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { - Blocks[BB] = new GCOVBlock(i++, os); + for (auto &BB : *F) { + // Skip index 1 (0, 2, 3, 4, ...) because that's assigned to the + // ReturnBlock. 
+ bool first = i == 0; + Blocks.insert(std::make_pair(&BB, GCOVBlock(i++ + !first, os))); } - ReturnBlock = new GCOVBlock(i++, os); std::string FunctionNameAndLine; raw_string_ostream FNLOS(FunctionNameAndLine); @@ -323,17 +335,12 @@ namespace { FuncChecksum = hash_value(FunctionNameAndLine); } - ~GCOVFunction() { - DeleteContainerSeconds(Blocks); - delete ReturnBlock; - } - GCOVBlock &getBlock(BasicBlock *BB) { - return *Blocks[BB]; + return Blocks.find(BB)->second; } GCOVBlock &getReturnBlock() { - return *ReturnBlock; + return ReturnBlock; } std::string getEdgeDestinations() { @@ -341,7 +348,7 @@ namespace { raw_string_ostream EDOS(EdgeDestinations); Function *F = Blocks.begin()->first->getParent(); for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) { - GCOVBlock &Block = *Blocks[I]; + GCOVBlock &Block = getBlock(I); for (int i = 0, e = Block.OutEdges.size(); i != e; ++i) EDOS << Block.OutEdges[i]->Number; } @@ -383,7 +390,7 @@ namespace { if (Blocks.empty()) return; Function *F = Blocks.begin()->first->getParent(); for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) { - GCOVBlock &Block = *Blocks[I]; + GCOVBlock &Block = getBlock(I); if (Block.OutEdges.empty()) continue; writeBytes(EdgeTag, 4); @@ -399,7 +406,7 @@ namespace { // Emit lines for each block. for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) { - Blocks[I]->writeOut(); + getBlock(I).writeOut(); } } @@ -409,8 +416,8 @@ namespace { uint32_t FuncChecksum; bool UseCfgChecksum; uint32_t CfgChecksum; - DenseMap<BasicBlock *, GCOVBlock *> Blocks; - GCOVBlock *ReturnBlock; + DenseMap<BasicBlock *, GCOVBlock> Blocks; + GCOVBlock ReturnBlock; }; } diff --git a/lib/Transforms/Instrumentation/InstrProfiling.cpp b/lib/Transforms/Instrumentation/InstrProfiling.cpp new file mode 100644 index 0000000..b5a491f --- /dev/null +++ b/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -0,0 +1,351 @@ +//===-- InstrProfiling.cpp - Frontend instrumentation based profiling -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass lowers instrprof_increment intrinsics emitted by a frontend for +// profiling. It also builds the data structures and initialization code needed +// for updating execution counts and emitting the profile at runtime. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Instrumentation.h" + +#include "llvm/ADT/Triple.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "instrprof" + +namespace { + +class InstrProfiling : public ModulePass { +public: + static char ID; + + InstrProfiling() : ModulePass(ID) {} + + InstrProfiling(const InstrProfOptions &Options) + : ModulePass(ID), Options(Options) {} + + const char *getPassName() const override { + return "Frontend instrumentation-based coverage lowering"; + } + + bool runOnModule(Module &M) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + } + +private: + InstrProfOptions Options; + Module *M; + DenseMap<GlobalVariable *, GlobalVariable *> RegionCounters; + std::vector<Value *> UsedVars; + + bool isMachO() const { + return Triple(M->getTargetTriple()).isOSBinFormatMachO(); + } + + /// Get the section name for the counter variables. + StringRef getCountersSection() const { + return isMachO() ? "__DATA,__llvm_prf_cnts" : "__llvm_prf_cnts"; + } + + /// Get the section name for the name variables. + StringRef getNameSection() const { + return isMachO() ? "__DATA,__llvm_prf_names" : "__llvm_prf_names"; + } + + /// Get the section name for the profile data variables. + StringRef getDataSection() const { + return isMachO() ? "__DATA,__llvm_prf_data" : "__llvm_prf_data"; + } + + /// Get the section name for the coverage mapping data. + StringRef getCoverageSection() const { + return isMachO() ? "__DATA,__llvm_covmap" : "__llvm_covmap"; + } + + /// Replace instrprof_increment with an increment of the appropriate value. + void lowerIncrement(InstrProfIncrementInst *Inc); + + /// Set up the section and uses for coverage data and its references. + void lowerCoverageData(GlobalVariable *CoverageData); + + /// Get the region counters for an increment, creating them if necessary. + /// + /// If the counter array doesn't yet exist, the profile data variables + /// referring to them will also be created. + GlobalVariable *getOrCreateRegionCounters(InstrProfIncrementInst *Inc); + + /// Emit runtime registration functions for each profile data variable. + void emitRegistration(); + + /// Emit the necessary plumbing to pull in the runtime initialization. + void emitRuntimeHook(); + + /// Add uses of our data variables and runtime hook. + void emitUses(); + + /// Create a static initializer for our data, on platforms that need it. 
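The class above is an ordinary ModulePass, registered and exposed through createInstrProfilingPass() a few lines further down. A minimal out-of-tree driver might look like the following sketch; it assumes the usual declaration of createInstrProfilingPass() in llvm/Transforms/Instrumentation.h, and the module name and option values are placeholders rather than anything taken from this patch.

    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/Instrumentation.h"

    using namespace llvm;

    int main() {
      LLVMContext Ctx;
      // A real module would come from the frontend and contain
      // llvm.instrprof.increment calls; an empty one keeps the sketch small.
      Module M("profile-example", Ctx);

      InstrProfOptions Options;   // defaults; e.g. Options.NoRedZone = true;
      legacy::PassManager PM;
      PM.add(createInstrProfilingPass(Options));
      PM.run(M);                  // lowers increments, emits counters and data
      return 0;
    }
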
+ void emitInitialization(); +}; + +} // anonymous namespace + +char InstrProfiling::ID = 0; +INITIALIZE_PASS(InstrProfiling, "instrprof", + "Frontend instrumentation-based coverage lowering.", false, + false) + +ModulePass *llvm::createInstrProfilingPass(const InstrProfOptions &Options) { + return new InstrProfiling(Options); +} + +bool InstrProfiling::runOnModule(Module &M) { + bool MadeChange = false; + + this->M = &M; + RegionCounters.clear(); + UsedVars.clear(); + + for (Function &F : M) + for (BasicBlock &BB : F) + for (auto I = BB.begin(), E = BB.end(); I != E;) + if (auto *Inc = dyn_cast<InstrProfIncrementInst>(I++)) { + lowerIncrement(Inc); + MadeChange = true; + } + if (GlobalVariable *Coverage = M.getNamedGlobal("__llvm_coverage_mapping")) { + lowerCoverageData(Coverage); + MadeChange = true; + } + if (!MadeChange) + return false; + + emitRegistration(); + emitRuntimeHook(); + emitUses(); + emitInitialization(); + return true; +} + +void InstrProfiling::lowerIncrement(InstrProfIncrementInst *Inc) { + GlobalVariable *Counters = getOrCreateRegionCounters(Inc); + + IRBuilder<> Builder(Inc->getParent(), *Inc); + uint64_t Index = Inc->getIndex()->getZExtValue(); + llvm::Value *Addr = Builder.CreateConstInBoundsGEP2_64(Counters, 0, Index); + llvm::Value *Count = Builder.CreateLoad(Addr, "pgocount"); + Count = Builder.CreateAdd(Count, Builder.getInt64(1)); + Inc->replaceAllUsesWith(Builder.CreateStore(Count, Addr)); + Inc->eraseFromParent(); +} + +void InstrProfiling::lowerCoverageData(GlobalVariable *CoverageData) { + CoverageData->setSection(getCoverageSection()); + CoverageData->setAlignment(8); + + Constant *Init = CoverageData->getInitializer(); + // We're expecting { i32, i32, i32, i32, [n x { i8*, i32, i32 }], [m x i8] } + // for some C. If not, the frontend's given us something broken. + assert(Init->getNumOperands() == 6 && "bad number of fields in coverage map"); + assert(isa<ConstantArray>(Init->getAggregateElement(4)) && + "invalid function list in coverage map"); + ConstantArray *Records = cast<ConstantArray>(Init->getAggregateElement(4)); + for (unsigned I = 0, E = Records->getNumOperands(); I < E; ++I) { + Constant *Record = Records->getOperand(I); + Value *V = const_cast<Value *>(Record->getOperand(0))->stripPointerCasts(); + + assert(isa<GlobalVariable>(V) && "Missing reference to function name"); + GlobalVariable *Name = cast<GlobalVariable>(V); + + // If we have region counters for this name, we've already handled it. + auto It = RegionCounters.find(Name); + if (It != RegionCounters.end()) + continue; + + // Move the name variable to the right section. + Name->setSection(getNameSection()); + Name->setAlignment(1); + } +} + +/// Get the name of a profiling variable for a particular function. +static std::string getVarName(InstrProfIncrementInst *Inc, StringRef VarName) { + auto *Arr = cast<ConstantDataArray>(Inc->getName()->getInitializer()); + StringRef Name = Arr->isCString() ? Arr->getAsCString() : Arr->getAsString(); + return ("__llvm_profile_" + VarName + "_" + Name).str(); +} + +GlobalVariable * +InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { + GlobalVariable *Name = Inc->getName(); + auto It = RegionCounters.find(Name); + if (It != RegionCounters.end()) + return It->second; + + // Move the name variable to the right section. 
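lowerIncrement() above rewrites each llvm.instrprof.increment call into a plain load/add/store on one slot of that function's counter array. A rough C++ equivalent of the generated code, with purely illustrative names and sizes:

    #include <cstdint>

    // Stand-in for the zero-initialized i64 counter array that
    // getOrCreateRegionCounters() creates per instrumented function.
    static uint64_t FooCounters[3];

    // What a lowered llvm.instrprof.increment(name, hash, 3, Index) becomes:
    // a load ("pgocount"), an add of 1, and a store back to the same slot.
    static inline void IncrementRegionCounter(uint64_t Index) {
      FooCounters[Index] = FooCounters[Index] + 1;
    }
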
+ Name->setSection(getNameSection()); + Name->setAlignment(1); + + uint64_t NumCounters = Inc->getNumCounters()->getZExtValue(); + LLVMContext &Ctx = M->getContext(); + ArrayType *CounterTy = ArrayType::get(Type::getInt64Ty(Ctx), NumCounters); + + // Create the counters variable. + auto *Counters = new GlobalVariable(*M, CounterTy, false, Name->getLinkage(), + Constant::getNullValue(CounterTy), + getVarName(Inc, "counters")); + Counters->setVisibility(Name->getVisibility()); + Counters->setSection(getCountersSection()); + Counters->setAlignment(8); + + RegionCounters[Inc->getName()] = Counters; + + // Create data variable. + auto *NameArrayTy = Name->getType()->getPointerElementType(); + auto *Int32Ty = Type::getInt32Ty(Ctx); + auto *Int64Ty = Type::getInt64Ty(Ctx); + auto *Int8PtrTy = Type::getInt8PtrTy(Ctx); + auto *Int64PtrTy = Type::getInt64PtrTy(Ctx); + + Type *DataTypes[] = {Int32Ty, Int32Ty, Int64Ty, Int8PtrTy, Int64PtrTy}; + auto *DataTy = StructType::get(Ctx, makeArrayRef(DataTypes)); + Constant *DataVals[] = { + ConstantInt::get(Int32Ty, NameArrayTy->getArrayNumElements()), + ConstantInt::get(Int32Ty, NumCounters), + ConstantInt::get(Int64Ty, Inc->getHash()->getZExtValue()), + ConstantExpr::getBitCast(Name, Int8PtrTy), + ConstantExpr::getBitCast(Counters, Int64PtrTy)}; + auto *Data = new GlobalVariable(*M, DataTy, true, Name->getLinkage(), + ConstantStruct::get(DataTy, DataVals), + getVarName(Inc, "data")); + Data->setVisibility(Name->getVisibility()); + Data->setSection(getDataSection()); + Data->setAlignment(8); + + // Mark the data variable as used so that it isn't stripped out. + UsedVars.push_back(Data); + + return Counters; +} + +void InstrProfiling::emitRegistration() { + // Don't do this for Darwin. compiler-rt uses linker magic. + if (Triple(M->getTargetTriple()).isOSDarwin()) + return; + + // Construct the function. + auto *VoidTy = Type::getVoidTy(M->getContext()); + auto *VoidPtrTy = Type::getInt8PtrTy(M->getContext()); + auto *RegisterFTy = FunctionType::get(VoidTy, false); + auto *RegisterF = Function::Create(RegisterFTy, GlobalValue::InternalLinkage, + "__llvm_profile_register_functions", M); + RegisterF->setUnnamedAddr(true); + if (Options.NoRedZone) + RegisterF->addFnAttr(Attribute::NoRedZone); + + auto *RuntimeRegisterTy = llvm::FunctionType::get(VoidTy, VoidPtrTy, false); + auto *RuntimeRegisterF = + Function::Create(RuntimeRegisterTy, GlobalVariable::ExternalLinkage, + "__llvm_profile_register_function", M); + + IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", RegisterF)); + for (Value *Data : UsedVars) + IRB.CreateCall(RuntimeRegisterF, IRB.CreateBitCast(Data, VoidPtrTy)); + IRB.CreateRetVoid(); +} + +void InstrProfiling::emitRuntimeHook() { + const char *const RuntimeVarName = "__llvm_profile_runtime"; + const char *const RuntimeUserName = "__llvm_profile_runtime_user"; + + // If the module's provided its own runtime, we don't need to do anything. + if (M->getGlobalVariable(RuntimeVarName)) + return; + + // Declare an external variable that will pull in the runtime initialization. + auto *Int32Ty = Type::getInt32Ty(M->getContext()); + auto *Var = + new GlobalVariable(*M, Int32Ty, false, GlobalValue::ExternalLinkage, + nullptr, RuntimeVarName); + + // Make a function that uses it. 
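getOrCreateRegionCounters() above also emits one constant __llvm_profile_data record per function, built from the five DataVals fields. A C++ mirror of that { i32, i32, i64, i8*, i64* } layout, with illustrative field names (the authoritative format lives in the profiling runtime):

    #include <cstdint>

    struct ProfileDataRecord {
      uint32_t NameSize;     // length of the function name array
      uint32_t NumCounters;  // number of i64 counters for this function
      uint64_t FuncHash;     // hash operand of the increment intrinsic
      const char *Name;      // points into the __llvm_prf_names section
      uint64_t *Counters;    // points into the __llvm_prf_cnts section
    };
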
+ auto *User = + Function::Create(FunctionType::get(Int32Ty, false), + GlobalValue::LinkOnceODRLinkage, RuntimeUserName, M); + User->addFnAttr(Attribute::NoInline); + if (Options.NoRedZone) + User->addFnAttr(Attribute::NoRedZone); + User->setVisibility(GlobalValue::HiddenVisibility); + + IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", User)); + auto *Load = IRB.CreateLoad(Var); + IRB.CreateRet(Load); + + // Mark the user variable as used so that it isn't stripped out. + UsedVars.push_back(User); +} + +void InstrProfiling::emitUses() { + if (UsedVars.empty()) + return; + + GlobalVariable *LLVMUsed = M->getGlobalVariable("llvm.used"); + std::vector<Constant*> MergedVars; + if (LLVMUsed) { + // Collect the existing members of llvm.used. + ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer()); + for (unsigned I = 0, E = Inits->getNumOperands(); I != E; ++I) + MergedVars.push_back(Inits->getOperand(I)); + LLVMUsed->eraseFromParent(); + } + + Type *i8PTy = Type::getInt8PtrTy(M->getContext()); + // Add uses for our data. + for (auto *Value : UsedVars) + MergedVars.push_back( + ConstantExpr::getBitCast(cast<llvm::Constant>(Value), i8PTy)); + + // Recreate llvm.used. + ArrayType *ATy = ArrayType::get(i8PTy, MergedVars.size()); + LLVMUsed = new llvm::GlobalVariable( + *M, ATy, false, llvm::GlobalValue::AppendingLinkage, + llvm::ConstantArray::get(ATy, MergedVars), "llvm.used"); + + LLVMUsed->setSection("llvm.metadata"); +} + +void InstrProfiling::emitInitialization() { + Constant *RegisterF = M->getFunction("__llvm_profile_register_functions"); + if (!RegisterF) + return; + + // Create the initialization function. + auto *VoidTy = Type::getVoidTy(M->getContext()); + auto *F = + Function::Create(FunctionType::get(VoidTy, false), + GlobalValue::InternalLinkage, "__llvm_profile_init", M); + F->setUnnamedAddr(true); + F->addFnAttr(Attribute::NoInline); + if (Options.NoRedZone) + F->addFnAttr(Attribute::NoRedZone); + + // Add the basic block and the necessary calls. 
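emitRuntimeHook() above pulls the profiling runtime into the link by loading an external variable from a small hidden, linkonce_odr, noinline function. The C++ below is only a sketch of that shape; it compiles on its own but, as intended, will not link unless something defines __llvm_profile_runtime.

    // Expected to be defined by the profiling runtime; referencing it forces
    // the runtime's object file to be included at link time.
    extern int __llvm_profile_runtime;

    // Rough counterpart of the generated __llvm_profile_runtime_user().
    int ProfileRuntimeUser() { return __llvm_profile_runtime; }
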
+ IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", F)); + IRB.CreateCall(RegisterF); + IRB.CreateRetVoid(); + + appendToGlobalCtors(*M, F, 0); +} diff --git a/lib/Transforms/Instrumentation/Instrumentation.cpp b/lib/Transforms/Instrumentation/Instrumentation.cpp index 8e95367..a91fc0e 100644 --- a/lib/Transforms/Instrumentation/Instrumentation.cpp +++ b/lib/Transforms/Instrumentation/Instrumentation.cpp @@ -25,6 +25,7 @@ void llvm::initializeInstrumentation(PassRegistry &Registry) { initializeAddressSanitizerModulePass(Registry); initializeBoundsCheckingPass(Registry); initializeGCOVProfilerPass(Registry); + initializeInstrProfilingPass(Registry); initializeMemorySanitizerPass(Registry); initializeThreadSanitizerPass(Registry); initializeSanitizerCoverageModulePass(Registry); diff --git a/lib/Transforms/Instrumentation/LLVMBuild.txt b/lib/Transforms/Instrumentation/LLVMBuild.txt index 99e95df..14c1743 100644 --- a/lib/Transforms/Instrumentation/LLVMBuild.txt +++ b/lib/Transforms/Instrumentation/LLVMBuild.txt @@ -19,4 +19,4 @@ type = Library name = Instrumentation parent = Transforms -required_libraries = Analysis Core Support Target TransformUtils +required_libraries = Analysis Core MC Support TransformUtils diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 1261259..4152679 100644 --- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -120,10 +120,7 @@ using namespace llvm; #define DEBUG_TYPE "msan" -static const uint64_t kShadowMask32 = 1ULL << 31; -static const uint64_t kShadowMask64 = 1ULL << 46; -static const uint64_t kOriginOffset32 = 1ULL << 30; -static const uint64_t kOriginOffset64 = 1ULL << 45; +static const unsigned kOriginSize = 4; static const unsigned kMinOriginAlignment = 4; static const unsigned kShadowTLSAlignment = 8; @@ -187,18 +184,6 @@ static cl::opt<int> ClInstrumentationWithCallThreshold( "inline checks (-1 means never use callbacks)."), cl::Hidden, cl::init(3500)); -// Experimental. Wraps all indirect calls in the instrumented code with -// a call to the given function. This is needed to assist the dynamic -// helper tool (MSanDR) to regain control on transition between instrumented and -// non-instrumented code. -static cl::opt<std::string> ClWrapIndirectCalls("msan-wrap-indirect-calls", - cl::desc("Wrap indirect calls with a given function"), - cl::Hidden); - -static cl::opt<bool> ClWrapIndirectCallsFast("msan-wrap-indirect-calls-fast", - cl::desc("Do not wrap indirect calls with target in the same module"), - cl::Hidden, cl::init(true)); - // This is an experiment to enable handling of cases where shadow is a non-zero // compile-time constant. For some unexplainable reason they were silently // ignored in the instrumentation. @@ -208,6 +193,77 @@ static cl::opt<bool> ClCheckConstantShadow("msan-check-constant-shadow", namespace { +// Memory map parameters used in application-to-shadow address calculation. 
+// Offset = (Addr & ~AndMask) ^ XorMask +// Shadow = ShadowBase + Offset +// Origin = OriginBase + Offset +struct MemoryMapParams { + uint64_t AndMask; + uint64_t XorMask; + uint64_t ShadowBase; + uint64_t OriginBase; +}; + +struct PlatformMemoryMapParams { + const MemoryMapParams *bits32; + const MemoryMapParams *bits64; +}; + +// i386 Linux +static const MemoryMapParams Linux_I386_MemoryMapParams = { + 0x000080000000, // AndMask + 0, // XorMask (not used) + 0, // ShadowBase (not used) + 0x000040000000, // OriginBase +}; + +// x86_64 Linux +static const MemoryMapParams Linux_X86_64_MemoryMapParams = { + 0x400000000000, // AndMask + 0, // XorMask (not used) + 0, // ShadowBase (not used) + 0x200000000000, // OriginBase +}; + +// mips64 Linux +static const MemoryMapParams Linux_MIPS64_MemoryMapParams = { + 0x004000000000, // AndMask + 0, // XorMask (not used) + 0, // ShadowBase (not used) + 0x002000000000, // OriginBase +}; + +// i386 FreeBSD +static const MemoryMapParams FreeBSD_I386_MemoryMapParams = { + 0x000180000000, // AndMask + 0x000040000000, // XorMask + 0x000020000000, // ShadowBase + 0x000700000000, // OriginBase +}; + +// x86_64 FreeBSD +static const MemoryMapParams FreeBSD_X86_64_MemoryMapParams = { + 0xc00000000000, // AndMask + 0x200000000000, // XorMask + 0x100000000000, // ShadowBase + 0x380000000000, // OriginBase +}; + +static const PlatformMemoryMapParams Linux_X86_MemoryMapParams = { + &Linux_I386_MemoryMapParams, + &Linux_X86_64_MemoryMapParams, +}; + +static const PlatformMemoryMapParams Linux_MIPS_MemoryMapParams = { + NULL, + &Linux_MIPS64_MemoryMapParams, +}; + +static const PlatformMemoryMapParams FreeBSD_X86_MemoryMapParams = { + &FreeBSD_I386_MemoryMapParams, + &FreeBSD_X86_64_MemoryMapParams, +}; + /// \brief An instrumentation pass implementing detection of uninitialized /// reads. /// @@ -219,8 +275,7 @@ class MemorySanitizer : public FunctionPass { : FunctionPass(ID), TrackOrigins(std::max(TrackOrigins, (int)ClTrackOrigins)), DL(nullptr), - WarningFn(nullptr), - WrapIndirectCalls(!ClWrapIndirectCalls.empty()) {} + WarningFn(nullptr) {} const char *getPassName() const override { return "MemorySanitizer"; } bool runOnFunction(Function &F) override; bool doInitialization(Module &M) override; @@ -254,9 +309,6 @@ class MemorySanitizer : public FunctionPass { /// function. GlobalVariable *OriginTLS; - GlobalVariable *MsandrModuleStart; - GlobalVariable *MsandrModuleEnd; - /// \brief The run-time callback to print a warning. Value *WarningFn; // These arrays are indexed by log2(AccessSize). @@ -274,27 +326,18 @@ class MemorySanitizer : public FunctionPass { /// \brief MSan runtime replacements for memmove, memcpy and memset. Value *MemmoveFn, *MemcpyFn, *MemsetFn; - /// \brief Address mask used in application-to-shadow address calculation. - /// ShadowAddr is computed as ApplicationAddr & ~ShadowMask. - uint64_t ShadowMask; - /// \brief Offset of the origin shadow from the "normal" shadow. - /// OriginAddr is computed as (ShadowAddr + OriginOffset) & ~3ULL - uint64_t OriginOffset; - /// \brief Branch weights for error reporting. + /// \brief Memory map parameters used in application-to-shadow calculation. + const MemoryMapParams *MapParams; + MDNode *ColdCallWeights; /// \brief Branch weights for origin store. MDNode *OriginStoreWeights; /// \brief An empty volatile inline asm that prevents callback merge. InlineAsm *EmptyAsm; - bool WrapIndirectCalls; - /// \brief Run-time wrapper for indirect calls. 
- Value *IndirectCallWrapperFn; - // Argument and return type of IndirectCallWrapperFn: void (*f)(void). - Type *AnyFunctionPtrTy; - friend struct MemorySanitizerVisitor; friend struct VarArgAMD64Helper; + friend struct VarArgMIPS64Helper; }; } // namespace @@ -400,24 +443,6 @@ void MemorySanitizer::initializeCallbacks(Module &M) { EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false), StringRef(""), StringRef(""), /*hasSideEffects=*/true); - - if (WrapIndirectCalls) { - AnyFunctionPtrTy = - PointerType::getUnqual(FunctionType::get(IRB.getVoidTy(), false)); - IndirectCallWrapperFn = M.getOrInsertFunction( - ClWrapIndirectCalls, AnyFunctionPtrTy, AnyFunctionPtrTy, nullptr); - } - - if (WrapIndirectCalls && ClWrapIndirectCallsFast) { - MsandrModuleStart = new GlobalVariable( - M, IRB.getInt32Ty(), false, GlobalValue::ExternalLinkage, - nullptr, "__executable_start"); - MsandrModuleStart->setVisibility(GlobalVariable::HiddenVisibility); - MsandrModuleEnd = new GlobalVariable( - M, IRB.getInt32Ty(), false, GlobalValue::ExternalLinkage, - nullptr, "_end"); - MsandrModuleEnd->setVisibility(GlobalVariable::HiddenVisibility); - } } /// \brief Module-level initialization. @@ -429,22 +454,41 @@ bool MemorySanitizer::doInitialization(Module &M) { report_fatal_error("data layout missing"); DL = &DLP->getDataLayout(); - C = &(M.getContext()); - unsigned PtrSize = DL->getPointerSizeInBits(/* AddressSpace */0); - switch (PtrSize) { - case 64: - ShadowMask = kShadowMask64; - OriginOffset = kOriginOffset64; + Triple TargetTriple(M.getTargetTriple()); + switch (TargetTriple.getOS()) { + case Triple::FreeBSD: + switch (TargetTriple.getArch()) { + case Triple::x86_64: + MapParams = FreeBSD_X86_MemoryMapParams.bits64; + break; + case Triple::x86: + MapParams = FreeBSD_X86_MemoryMapParams.bits32; + break; + default: + report_fatal_error("unsupported architecture"); + } break; - case 32: - ShadowMask = kShadowMask32; - OriginOffset = kOriginOffset32; + case Triple::Linux: + switch (TargetTriple.getArch()) { + case Triple::x86_64: + MapParams = Linux_X86_MemoryMapParams.bits64; + break; + case Triple::x86: + MapParams = Linux_X86_MemoryMapParams.bits32; + break; + case Triple::mips64: + case Triple::mips64el: + MapParams = Linux_MIPS_MemoryMapParams.bits64; + break; + default: + report_fatal_error("unsupported architecture"); + } break; default: - report_fatal_error("unsupported pointer size"); - break; + report_fatal_error("unsupported operating system"); } + C = &(M.getContext()); IRBuilder<> IRB(*C); IntptrTy = IRB.getIntPtrTy(DL); OriginTy = IRB.getInt32Ty(); @@ -537,12 +581,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { }; SmallVector<ShadowOriginAndInsertPoint, 16> InstrumentationList; SmallVector<Instruction*, 16> StoreList; - SmallVector<CallSite, 16> IndirectCallList; MemorySanitizerVisitor(Function &F, MemorySanitizer &MS) : F(F), MS(MS), VAHelper(CreateVarArgHelper(F, MS, *this)) { - bool SanitizeFunction = F.getAttributes().hasAttribute( - AttributeSet::FunctionIndex, Attribute::SanitizeMemory); + bool SanitizeFunction = F.hasFnAttribute(Attribute::SanitizeMemory); InsertChecks = SanitizeFunction; PropagateShadow = SanitizeFunction; PoisonStack = SanitizeFunction && ClPoisonStack; @@ -561,18 +603,63 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { return IRB.CreateCall(MS.MsanChainOriginFn, V); } + Value *originToIntptr(IRBuilder<> &IRB, Value *Origin) { + unsigned IntptrSize = MS.DL->getTypeStoreSize(MS.IntptrTy); + if 
(IntptrSize == kOriginSize) return Origin; + assert(IntptrSize == kOriginSize * 2); + Origin = IRB.CreateIntCast(Origin, MS.IntptrTy, /* isSigned */ false); + return IRB.CreateOr(Origin, IRB.CreateShl(Origin, kOriginSize * 8)); + } + + /// \brief Fill memory range with the given origin value. + void paintOrigin(IRBuilder<> &IRB, Value *Origin, Value *OriginPtr, + unsigned Size, unsigned Alignment) { + unsigned IntptrAlignment = MS.DL->getABITypeAlignment(MS.IntptrTy); + unsigned IntptrSize = MS.DL->getTypeStoreSize(MS.IntptrTy); + assert(IntptrAlignment >= kMinOriginAlignment); + assert(IntptrSize >= kOriginSize); + + unsigned Ofs = 0; + unsigned CurrentAlignment = Alignment; + if (Alignment >= IntptrAlignment && IntptrSize > kOriginSize) { + Value *IntptrOrigin = originToIntptr(IRB, Origin); + Value *IntptrOriginPtr = + IRB.CreatePointerCast(OriginPtr, PointerType::get(MS.IntptrTy, 0)); + for (unsigned i = 0; i < Size / IntptrSize; ++i) { + Value *Ptr = + i ? IRB.CreateConstGEP1_32(IntptrOriginPtr, i) : IntptrOriginPtr; + IRB.CreateAlignedStore(IntptrOrigin, Ptr, CurrentAlignment); + Ofs += IntptrSize / kOriginSize; + CurrentAlignment = IntptrAlignment; + } + } + + for (unsigned i = Ofs; i < (Size + kOriginSize - 1) / kOriginSize; ++i) { + Value *GEP = i ? IRB.CreateConstGEP1_32(OriginPtr, i) : OriginPtr; + IRB.CreateAlignedStore(Origin, GEP, CurrentAlignment); + CurrentAlignment = kMinOriginAlignment; + } + } + void storeOrigin(IRBuilder<> &IRB, Value *Addr, Value *Shadow, Value *Origin, unsigned Alignment, bool AsCall) { + unsigned OriginAlignment = std::max(kMinOriginAlignment, Alignment); + unsigned StoreSize = MS.DL->getTypeStoreSize(Shadow->getType()); if (isa<StructType>(Shadow->getType())) { - IRB.CreateAlignedStore(updateOrigin(Origin, IRB), getOriginPtr(Addr, IRB), - Alignment); + paintOrigin(IRB, updateOrigin(Origin, IRB), + getOriginPtr(Addr, IRB, Alignment), StoreSize, + OriginAlignment); } else { Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB); - // TODO(eugenis): handle non-zero constant shadow by inserting an - // unconditional check (can not simply fail compilation as this could - // be in the dead code). 
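paintOrigin() above fills an origin range with a 4-byte origin id, widening it to a pointer-sized value (the id replicated into both halves) so that suitably aligned regions are covered with 8-byte stores and only the tail uses 4-byte stores. A standalone C++ model of the same store pattern over an ordinary buffer, assuming a 64-bit host:

    #include <cstdint>
    #include <cstring>

    static const unsigned kOriginSize = 4;

    static void PaintOriginModel(uint8_t *Dst, uint32_t Origin, unsigned Size,
                                 unsigned Alignment) {
      const unsigned IntptrSize = sizeof(uintptr_t);
      unsigned Ofs = 0;  // progress, counted in 4-byte origin slots
      if (Alignment >= alignof(uintptr_t) && IntptrSize > kOriginSize) {
        // originToIntptr(): replicate the 32-bit id into a 64-bit word.
        uint64_t Wide = (uint64_t)Origin | ((uint64_t)Origin << (kOriginSize * 8));
        for (unsigned i = 0; i < Size / IntptrSize; ++i) {
          std::memcpy(Dst + i * IntptrSize, &Wide, IntptrSize);
          Ofs += IntptrSize / kOriginSize;
        }
      }
      // Tail (and the whole range when the wide path is skipped).
      for (unsigned i = Ofs; i < (Size + kOriginSize - 1) / kOriginSize; ++i)
        std::memcpy(Dst + i * kOriginSize, &Origin, kOriginSize);
    }
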
- if (!ClCheckConstantShadow) - if (isa<Constant>(ConvertedShadow)) return; + Constant *ConstantShadow = dyn_cast_or_null<Constant>(ConvertedShadow); + if (ConstantShadow) { + if (ClCheckConstantShadow && !ConstantShadow->isZeroValue()) + paintOrigin(IRB, updateOrigin(Origin, IRB), + getOriginPtr(Addr, IRB, Alignment), StoreSize, + OriginAlignment); + return; + } + unsigned TypeSizeInBits = MS.DL->getTypeSizeInBits(ConvertedShadow->getType()); unsigned SizeIndex = TypeSizeToSizeIndex(TypeSizeInBits); @@ -589,8 +676,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Instruction *CheckTerm = SplitBlockAndInsertIfThen( Cmp, IRB.GetInsertPoint(), false, MS.OriginStoreWeights); IRBuilder<> IRBNew(CheckTerm); - IRBNew.CreateAlignedStore(updateOrigin(Origin, IRBNew), - getOriginPtr(Addr, IRBNew), Alignment); + paintOrigin(IRBNew, updateOrigin(Origin, IRBNew), + getOriginPtr(Addr, IRBNew, Alignment), StoreSize, + OriginAlignment); } } } @@ -614,11 +702,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { if (SI.isAtomic()) SI.setOrdering(addReleaseOrdering(SI.getOrdering())); - if (MS.TrackOrigins) { - unsigned Alignment = std::max(kMinOriginAlignment, SI.getAlignment()); - storeOrigin(IRB, Addr, Shadow, getOrigin(Val), Alignment, + if (MS.TrackOrigins && !SI.isAtomic()) + storeOrigin(IRB, Addr, Shadow, getOrigin(Val), SI.getAlignment(), InstrumentWithCalls); - } } } @@ -628,9 +714,23 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { DEBUG(dbgs() << " SHAD0 : " << *Shadow << "\n"); Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB); DEBUG(dbgs() << " SHAD1 : " << *ConvertedShadow << "\n"); - // See the comment in storeOrigin(). - if (!ClCheckConstantShadow) - if (isa<Constant>(ConvertedShadow)) return; + + Constant *ConstantShadow = dyn_cast_or_null<Constant>(ConvertedShadow); + if (ConstantShadow) { + if (ClCheckConstantShadow && !ConstantShadow->isZeroValue()) { + if (MS.TrackOrigins) { + IRB.CreateStore(Origin ? (Value *)Origin : (Value *)IRB.getInt32(0), + MS.OriginTLS); + } + IRB.CreateCall(MS.WarningFn); + IRB.CreateCall(MS.EmptyAsm); + // FIXME: Insert UnreachableInst if !ClKeepGoing? + // This may invalidate some of the following checks and needs to be done + // at the very end. + } + return; + } + unsigned TypeSizeInBits = MS.DL->getTypeSizeInBits(ConvertedShadow->getType()); unsigned SizeIndex = TypeSizeToSizeIndex(TypeSizeInBits); @@ -669,47 +769,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { DEBUG(dbgs() << "DONE:\n" << F); } - void materializeIndirectCalls() { - for (auto &CS : IndirectCallList) { - Instruction *I = CS.getInstruction(); - BasicBlock *B = I->getParent(); - IRBuilder<> IRB(I); - Value *Fn0 = CS.getCalledValue(); - Value *Fn = IRB.CreateBitCast(Fn0, MS.AnyFunctionPtrTy); - - if (ClWrapIndirectCallsFast) { - // Check that call target is inside this module limits. 
- Value *Start = - IRB.CreateBitCast(MS.MsandrModuleStart, MS.AnyFunctionPtrTy); - Value *End = IRB.CreateBitCast(MS.MsandrModuleEnd, MS.AnyFunctionPtrTy); - - Value *NotInThisModule = IRB.CreateOr(IRB.CreateICmpULT(Fn, Start), - IRB.CreateICmpUGE(Fn, End)); - - PHINode *NewFnPhi = - IRB.CreatePHI(Fn0->getType(), 2, "msandr.indirect_target"); - - Instruction *CheckTerm = SplitBlockAndInsertIfThen( - NotInThisModule, NewFnPhi, - /* Unreachable */ false, MS.ColdCallWeights); - - IRB.SetInsertPoint(CheckTerm); - // Slow path: call wrapper function to possibly transform the call - // target. - Value *NewFn = IRB.CreateBitCast( - IRB.CreateCall(MS.IndirectCallWrapperFn, Fn), Fn0->getType()); - - NewFnPhi->addIncoming(Fn0, B); - NewFnPhi->addIncoming(NewFn, dyn_cast<Instruction>(NewFn)->getParent()); - CS.setCalledFunction(NewFnPhi); - } else { - Value *NewFn = IRB.CreateBitCast( - IRB.CreateCall(MS.IndirectCallWrapperFn, Fn), Fn0->getType()); - CS.setCalledFunction(NewFn); - } - } - } - /// \brief Add MemorySanitizer instrumentation to a function. bool runOnFunction() { MS.initializeCallbacks(*F.getParent()); @@ -752,9 +811,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // Insert shadow value checks. materializeChecks(InstrumentWithCalls); - // Wrap indirect calls. - materializeIndirectCalls(); - return true; } @@ -808,32 +864,57 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { return IRB.CreateBitCast(V, NoVecTy); } + /// \brief Compute the integer shadow offset that corresponds to a given + /// application address. + /// + /// Offset = (Addr & ~AndMask) ^ XorMask + Value *getShadowPtrOffset(Value *Addr, IRBuilder<> &IRB) { + uint64_t AndMask = MS.MapParams->AndMask; + assert(AndMask != 0 && "AndMask shall be specified"); + Value *OffsetLong = + IRB.CreateAnd(IRB.CreatePointerCast(Addr, MS.IntptrTy), + ConstantInt::get(MS.IntptrTy, ~AndMask)); + + uint64_t XorMask = MS.MapParams->XorMask; + if (XorMask != 0) + OffsetLong = IRB.CreateXor(OffsetLong, + ConstantInt::get(MS.IntptrTy, XorMask)); + return OffsetLong; + } + /// \brief Compute the shadow address that corresponds to a given application /// address. /// - /// Shadow = Addr & ~ShadowMask. + /// Shadow = ShadowBase + Offset Value *getShadowPtr(Value *Addr, Type *ShadowTy, IRBuilder<> &IRB) { - Value *ShadowLong = - IRB.CreateAnd(IRB.CreatePointerCast(Addr, MS.IntptrTy), - ConstantInt::get(MS.IntptrTy, ~MS.ShadowMask)); + Value *ShadowLong = getShadowPtrOffset(Addr, IRB); + uint64_t ShadowBase = MS.MapParams->ShadowBase; + if (ShadowBase != 0) + ShadowLong = + IRB.CreateAdd(ShadowLong, + ConstantInt::get(MS.IntptrTy, ShadowBase)); return IRB.CreateIntToPtr(ShadowLong, PointerType::get(ShadowTy, 0)); } /// \brief Compute the origin address that corresponds to a given application /// address. 
/// - /// OriginAddr = (ShadowAddr + OriginOffset) & ~3ULL - Value *getOriginPtr(Value *Addr, IRBuilder<> &IRB) { - Value *ShadowLong = - IRB.CreateAnd(IRB.CreatePointerCast(Addr, MS.IntptrTy), - ConstantInt::get(MS.IntptrTy, ~MS.ShadowMask)); - Value *Add = - IRB.CreateAdd(ShadowLong, - ConstantInt::get(MS.IntptrTy, MS.OriginOffset)); - Value *SecondAnd = - IRB.CreateAnd(Add, ConstantInt::get(MS.IntptrTy, ~3ULL)); - return IRB.CreateIntToPtr(SecondAnd, PointerType::get(IRB.getInt32Ty(), 0)); + /// OriginAddr = (OriginBase + Offset) & ~3ULL + Value *getOriginPtr(Value *Addr, IRBuilder<> &IRB, unsigned Alignment) { + Value *OriginLong = getShadowPtrOffset(Addr, IRB); + uint64_t OriginBase = MS.MapParams->OriginBase; + if (OriginBase != 0) + OriginLong = + IRB.CreateAdd(OriginLong, + ConstantInt::get(MS.IntptrTy, OriginBase)); + if (Alignment < kMinOriginAlignment) { + uint64_t Mask = kMinOriginAlignment - 1; + OriginLong = IRB.CreateAnd(OriginLong, + ConstantInt::get(MS.IntptrTy, ~Mask)); + } + return IRB.CreateIntToPtr(OriginLong, + PointerType::get(IRB.getInt32Ty(), 0)); } /// \brief Compute the shadow address for a given function argument. @@ -1006,6 +1087,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value *OriginPtr = getOriginPtrForArgument(&FArg, EntryIRB, ArgOffset); setOrigin(A, EntryIRB.CreateLoad(OriginPtr)); + } else { + setOrigin(A, getCleanOrigin()); } } ArgOffset += RoundUpToAlignment(Size, kShadowTLSAlignment); @@ -1025,15 +1108,13 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// \brief Get the origin for a value. Value *getOrigin(Value *V) { if (!MS.TrackOrigins) return nullptr; - if (isa<Instruction>(V) || isa<Argument>(V)) { - Value *Origin = OriginMap[V]; - if (!Origin) { - DEBUG(dbgs() << "NO ORIGIN: " << *V << "\n"); - Origin = getCleanOrigin(); - } - return Origin; - } - return getCleanOrigin(); + if (!PropagateShadow) return getCleanOrigin(); + if (isa<Constant>(V)) return getCleanOrigin(); + assert((isa<Instruction>(V) || isa<Argument>(V)) && + "Unexpected value type in getOrigin()"); + Value *Origin = OriginMap[V]; + assert(Origin && "Missing origin"); + return Origin; } /// \brief Get the origin for i-th argument of the instruction I. 
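With the MemoryMapParams scheme, the shadow and origin addresses computed by getShadowPtrOffset(), getShadowPtr() and getOriginPtr() above come down to a handful of integer operations. The standalone sketch below plugs in the Linux/x86_64 parameters from the table earlier in this patch; the application address is just an example value.

    #include <cstdint>
    #include <cstdio>

    // Linux x86_64 values from the MemoryMapParams table.
    static const uint64_t kAndMask            = 0x400000000000ULL;
    static const uint64_t kXorMask            = 0;  // unused on this target
    static const uint64_t kShadowBase         = 0;  // unused on this target
    static const uint64_t kOriginBase         = 0x200000000000ULL;
    static const uint64_t kMinOriginAlignment = 4;

    // Offset = (Addr & ~AndMask) ^ XorMask
    static uint64_t shadowOffset(uint64_t Addr) {
      return (Addr & ~kAndMask) ^ kXorMask;
    }

    // Shadow = ShadowBase + Offset
    static uint64_t shadowAddr(uint64_t Addr) {
      return kShadowBase + shadowOffset(Addr);
    }

    // Origin = OriginBase + Offset, rounded down to 4 bytes for small alignments.
    static uint64_t originAddr(uint64_t Addr, unsigned Alignment) {
      uint64_t Origin = kOriginBase + shadowOffset(Addr);
      if (Alignment < kMinOriginAlignment)
        Origin &= ~(kMinOriginAlignment - 1);
      return Origin;
    }

    int main() {
      uint64_t App = 0x7fff12345678ULL;  // arbitrary application address
      std::printf("shadow 0x%llx  origin 0x%llx\n",
                  (unsigned long long)shadowAddr(App),
                  (unsigned long long)originAddr(App, 1));
      return 0;
    }
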
@@ -1121,7 +1202,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { IRBuilder<> IRB(I.getNextNode()); Type *ShadowTy = getShadowTy(&I); Value *Addr = I.getPointerOperand(); - if (PropagateShadow) { + if (PropagateShadow && !I.getMetadata("nosanitize")) { Value *ShadowPtr = getShadowPtr(Addr, ShadowTy, IRB); setShadow(&I, IRB.CreateAlignedLoad(ShadowPtr, I.getAlignment(), "_msld")); @@ -1137,9 +1218,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { if (MS.TrackOrigins) { if (PropagateShadow) { - unsigned Alignment = std::max(kMinOriginAlignment, I.getAlignment()); - setOrigin(&I, - IRB.CreateAlignedLoad(getOriginPtr(Addr, IRB), Alignment)); + unsigned Alignment = I.getAlignment(); + unsigned OriginAlignment = std::max(kMinOriginAlignment, Alignment); + setOrigin(&I, IRB.CreateAlignedLoad(getOriginPtr(Addr, IRB, Alignment), + OriginAlignment)); } else { setOrigin(&I, getCleanOrigin()); } @@ -1173,6 +1255,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { IRB.CreateStore(getCleanShadow(&I), ShadowPtr); setShadow(&I, getCleanShadow(&I)); + setOrigin(&I, getCleanOrigin()); } void visitAtomicRMWInst(AtomicRMWInst &I) { @@ -1790,7 +1873,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // FIXME: use ClStoreCleanOrigin // FIXME: factor out common code from materializeStores if (MS.TrackOrigins) - IRB.CreateStore(getOrigin(&I, 1), getOriginPtr(Addr, IRB)); + IRB.CreateStore(getOrigin(&I, 1), getOriginPtr(Addr, IRB, 1)); return true; } @@ -1817,7 +1900,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { if (MS.TrackOrigins) { if (PropagateShadow) - setOrigin(&I, IRB.CreateLoad(getOriginPtr(Addr, IRB))); + setOrigin(&I, IRB.CreateLoad(getOriginPtr(Addr, IRB, 1))); else setOrigin(&I, getCleanOrigin()); } @@ -1981,6 +2064,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { setOrigin(&I, getOrigin(CopyOp)); } else { setShadow(&I, getCleanShadow(&I)); + setOrigin(&I, getCleanOrigin()); } } @@ -2179,15 +2263,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { case llvm::Intrinsic::x86_sse_cvttps2pi: handleVectorConvertIntrinsic(I, 2); break; - case llvm::Intrinsic::x86_avx512_psll_dq: - case llvm::Intrinsic::x86_avx512_psrl_dq: case llvm::Intrinsic::x86_avx2_psll_w: case llvm::Intrinsic::x86_avx2_psll_d: case llvm::Intrinsic::x86_avx2_psll_q: case llvm::Intrinsic::x86_avx2_pslli_w: case llvm::Intrinsic::x86_avx2_pslli_d: case llvm::Intrinsic::x86_avx2_pslli_q: - case llvm::Intrinsic::x86_avx2_psll_dq: case llvm::Intrinsic::x86_avx2_psrl_w: case llvm::Intrinsic::x86_avx2_psrl_d: case llvm::Intrinsic::x86_avx2_psrl_q: @@ -2198,14 +2279,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { case llvm::Intrinsic::x86_avx2_psrli_q: case llvm::Intrinsic::x86_avx2_psrai_w: case llvm::Intrinsic::x86_avx2_psrai_d: - case llvm::Intrinsic::x86_avx2_psrl_dq: case llvm::Intrinsic::x86_sse2_psll_w: case llvm::Intrinsic::x86_sse2_psll_d: case llvm::Intrinsic::x86_sse2_psll_q: case llvm::Intrinsic::x86_sse2_pslli_w: case llvm::Intrinsic::x86_sse2_pslli_d: case llvm::Intrinsic::x86_sse2_pslli_q: - case llvm::Intrinsic::x86_sse2_psll_dq: case llvm::Intrinsic::x86_sse2_psrl_w: case llvm::Intrinsic::x86_sse2_psrl_d: case llvm::Intrinsic::x86_sse2_psrl_q: @@ -2216,7 +2295,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { case 
llvm::Intrinsic::x86_sse2_psrli_q: case llvm::Intrinsic::x86_sse2_psrai_w: case llvm::Intrinsic::x86_sse2_psrai_d: - case llvm::Intrinsic::x86_sse2_psrl_dq: case llvm::Intrinsic::x86_mmx_psll_w: case llvm::Intrinsic::x86_mmx_psll_d: case llvm::Intrinsic::x86_mmx_psll_q: @@ -2248,14 +2326,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { handleVectorShiftIntrinsic(I, /* Variable */ true); break; - // Byte shifts are not implemented. - // case llvm::Intrinsic::x86_avx512_psll_dq_bs: - // case llvm::Intrinsic::x86_avx512_psrl_dq_bs: - // case llvm::Intrinsic::x86_avx2_psll_dq_bs: - // case llvm::Intrinsic::x86_avx2_psrl_dq_bs: - // case llvm::Intrinsic::x86_sse2_psll_dq_bs: - // case llvm::Intrinsic::x86_sse2_psrl_dq_bs: - case llvm::Intrinsic::x86_sse2_packsswb_128: case llvm::Intrinsic::x86_sse2_packssdw_128: case llvm::Intrinsic::x86_sse2_packuswb_128: @@ -2337,9 +2407,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { } IRBuilder<> IRB(&I); - if (MS.WrapIndirectCalls && !CS.getCalledFunction()) - IndirectCallList.push_back(CS); - unsigned ArgOffset = 0; DEBUG(dbgs() << " CallSite: " << I << "\n"); for (CallSite::arg_iterator ArgIt = CS.arg_begin(), End = CS.arg_end(); @@ -2448,6 +2515,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { IRBuilder<> IRB(&I); if (!PropagateShadow) { setShadow(&I, getCleanShadow(&I)); + setOrigin(&I, getCleanOrigin()); return; } @@ -2461,6 +2529,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { void visitAllocaInst(AllocaInst &I) { setShadow(&I, getCleanShadow(&I)); + setOrigin(&I, getCleanOrigin()); IRBuilder<> IRB(I.getNextNode()); uint64_t Size = MS.DL->getTypeAllocSize(I.getAllocatedType()); if (PoisonStack && ClPoisonStackWithCall) { @@ -2474,7 +2543,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { } if (PoisonStack && MS.TrackOrigins) { - setOrigin(&I, getCleanOrigin()); SmallString<2048> StackDescriptionStorage; raw_svector_ostream StackDescription(StackDescriptionStorage); // We create a string with a description of the stack allocation and @@ -2540,9 +2608,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { } // a = select b, c, d // Oa = Sb ? Ob : (b ? Oc : Od) - setOrigin(&I, IRB.CreateSelect( - Sb, getOrigin(I.getCondition()), - IRB.CreateSelect(B, getOrigin(C), getOrigin(D)))); + setOrigin( + &I, IRB.CreateSelect(Sb, getOrigin(I.getCondition()), + IRB.CreateSelect(B, getOrigin(I.getTrueValue()), + getOrigin(I.getFalseValue())))); } } @@ -2776,6 +2845,106 @@ struct VarArgAMD64Helper : public VarArgHelper { } }; +/// \brief MIPS64-specific implementation of VarArgHelper. 
+struct VarArgMIPS64Helper : public VarArgHelper { + Function &F; + MemorySanitizer &MS; + MemorySanitizerVisitor &MSV; + Value *VAArgTLSCopy; + Value *VAArgSize; + + SmallVector<CallInst*, 16> VAStartInstrumentationList; + + VarArgMIPS64Helper(Function &F, MemorySanitizer &MS, + MemorySanitizerVisitor &MSV) + : F(F), MS(MS), MSV(MSV), VAArgTLSCopy(nullptr), + VAArgSize(nullptr) {} + + void visitCallSite(CallSite &CS, IRBuilder<> &IRB) override { + unsigned VAArgOffset = 0; + for (CallSite::arg_iterator ArgIt = CS.arg_begin() + 1, End = CS.arg_end(); + ArgIt != End; ++ArgIt) { + Value *A = *ArgIt; + Value *Base; + uint64_t ArgSize = MS.DL->getTypeAllocSize(A->getType()); +#if defined(__MIPSEB__) || defined(MIPSEB) + // Adjusting the shadow for argument with size < 8 to match the placement + // of bits in big endian system + if (ArgSize < 8) + VAArgOffset += (8 - ArgSize); +#endif + Base = getShadowPtrForVAArgument(A->getType(), IRB, VAArgOffset); + VAArgOffset += ArgSize; + VAArgOffset = RoundUpToAlignment(VAArgOffset, 8); + IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment); + } + + Constant *TotalVAArgSize = ConstantInt::get(IRB.getInt64Ty(), VAArgOffset); + // Here using VAArgOverflowSizeTLS as VAArgSizeTLS to avoid creation of + // a new class member i.e. it is the total size of all VarArgs. + IRB.CreateStore(TotalVAArgSize, MS.VAArgOverflowSizeTLS); + } + + /// \brief Compute the shadow address for a given va_arg. + Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB, + int ArgOffset) { + Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy); + Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset)); + return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(Ty), 0), + "_msarg"); + } + + void visitVAStartInst(VAStartInst &I) override { + IRBuilder<> IRB(&I); + VAStartInstrumentationList.push_back(&I); + Value *VAListTag = I.getArgOperand(0); + Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB); + IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()), + /* size */8, /* alignment */8, false); + } + + void visitVACopyInst(VACopyInst &I) override { + IRBuilder<> IRB(&I); + Value *VAListTag = I.getArgOperand(0); + Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB); + // Unpoison the whole __va_list_tag. + // FIXME: magic ABI constants. + IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()), + /* size */8, /* alignment */8, false); + } + + void finalizeInstrumentation() override { + assert(!VAArgSize && !VAArgTLSCopy && + "finalizeInstrumentation called twice"); + IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI()); + VAArgSize = IRB.CreateLoad(MS.VAArgOverflowSizeTLS); + Value *CopySize = IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, 0), + VAArgSize); + + if (!VAStartInstrumentationList.empty()) { + // If there is a va_start in this function, make a backup copy of + // va_arg_tls somewhere in the function entry block. + VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize); + IRB.CreateMemCpy(VAArgTLSCopy, MS.VAArgTLS, CopySize, 8); + } + + // Instrument va_start. + // Copy va_list shadow from the backup copy of the TLS contents. 
+ for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) { + CallInst *OrigInst = VAStartInstrumentationList[i]; + IRBuilder<> IRB(OrigInst->getNextNode()); + Value *VAListTag = OrigInst->getArgOperand(0); + Value *RegSaveAreaPtrPtr = + IRB.CreateIntToPtr(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), + Type::getInt64PtrTy(*MS.C)); + Value *RegSaveAreaPtr = IRB.CreateLoad(RegSaveAreaPtrPtr); + Value *RegSaveAreaShadowPtr = + MSV.getShadowPtr(RegSaveAreaPtr, IRB.getInt8Ty(), IRB); + IRB.CreateMemCpy(RegSaveAreaShadowPtr, VAArgTLSCopy, CopySize, 8); + } + } +}; + /// \brief A no-op implementation of VarArgHelper. struct VarArgNoOpHelper : public VarArgHelper { VarArgNoOpHelper(Function &F, MemorySanitizer &MS, @@ -2797,6 +2966,9 @@ VarArgHelper *CreateVarArgHelper(Function &Func, MemorySanitizer &Msan, llvm::Triple TargetTriple(Func.getParent()->getTargetTriple()); if (TargetTriple.getArch() == llvm::Triple::x86_64) return new VarArgAMD64Helper(Func, Msan, Visitor); + else if (TargetTriple.getArch() == llvm::Triple::mips64 || + TargetTriple.getArch() == llvm::Triple::mips64el) + return new VarArgMIPS64Helper(Func, Msan, Visitor); else return new VarArgNoOpHelper(Func, Msan, Visitor); } diff --git a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index f882072..8c56e87 100644 --- a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -10,12 +10,11 @@ // Coverage instrumentation that works with AddressSanitizer // and potentially with other Sanitizers. // -// We create a Guard boolean variable with the same linkage +// We create a Guard variable with the same linkage // as the function and inject this code into the entry block (CoverageLevel=1) // or all blocks (CoverageLevel>=2): -// if (*Guard) { -// __sanitizer_cov(); -// *Guard = 1; +// if (Guard < 0) { +// __sanitizer_cov(&Guard); // } // The accesses to Guard are atomic. The rest of the logic is // in __sanitizer_cov (it's fine to call it more than once). @@ -38,6 +37,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" @@ -55,11 +55,12 @@ using namespace llvm; static const char *const kSanCovModuleInitName = "__sanitizer_cov_module_init"; static const char *const kSanCovName = "__sanitizer_cov"; +static const char *const kSanCovWithCheckName = "__sanitizer_cov_with_check"; static const char *const kSanCovIndirCallName = "__sanitizer_cov_indir_call16"; static const char *const kSanCovTraceEnter = "__sanitizer_cov_trace_func_enter"; static const char *const kSanCovTraceBB = "__sanitizer_cov_trace_basic_block"; static const char *const kSanCovModuleCtorName = "sancov.module_ctor"; -static const uint64_t kSanCtorAndDtorPriority = 1; +static const uint64_t kSanCtorAndDtorPriority = 2; static cl::opt<int> ClCoverageLevel("sanitizer-coverage-level", cl::desc("Sanitizer Coverage. 
0: none, 1: entry block, 2: all blocks, " @@ -67,11 +68,11 @@ static cl::opt<int> ClCoverageLevel("sanitizer-coverage-level", "4: above plus indirect calls"), cl::Hidden, cl::init(0)); -static cl::opt<int> ClCoverageBlockThreshold( +static cl::opt<unsigned> ClCoverageBlockThreshold( "sanitizer-coverage-block-threshold", - cl::desc("Add coverage instrumentation only to the entry block if there " - "are more than this number of blocks."), - cl::Hidden, cl::init(1500)); + cl::desc("Use a callback with a guard check inside it if there are" + " more than this number of blocks."), + cl::Hidden, cl::init(1000)); static cl::opt<bool> ClExperimentalTracing("sanitizer-coverage-experimental-tracing", @@ -102,15 +103,18 @@ class SanitizerCoverageModule : public ModulePass { ArrayRef<Instruction *> IndirCalls); bool InjectCoverage(Function &F, ArrayRef<BasicBlock *> AllBlocks, ArrayRef<Instruction *> IndirCalls); - bool InjectTracing(Function &F, ArrayRef<BasicBlock *> AllBlocks); - void InjectCoverageAtBlock(Function &F, BasicBlock &BB); + void InjectCoverageAtBlock(Function &F, BasicBlock &BB, bool UseCalls); Function *SanCovFunction; + Function *SanCovWithCheckFunction; Function *SanCovIndirCallFunction; Function *SanCovModuleInit; Function *SanCovTraceEnter, *SanCovTraceBB; + InlineAsm *EmptyAsm; Type *IntptrTy; LLVMContext *C; + GlobalVariable *GuardArray; + int CoverageLevel; }; @@ -132,6 +136,9 @@ bool SanitizerCoverageModule::runOnModule(Module &M) { DataLayoutPass *DLP = &getAnalysis<DataLayoutPass>(); IntptrTy = Type::getIntNTy(*C, DLP->getDataLayout().getPointerSizeInBits()); Type *VoidTy = Type::getVoidTy(*C); + IRBuilder<> IRB(*C); + Type *Int8PtrTy = PointerType::getUnqual(IRB.getInt8Ty()); + Type *Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty()); Function *CtorFunc = Function::Create(FunctionType::get(VoidTy, false), @@ -139,37 +146,73 @@ bool SanitizerCoverageModule::runOnModule(Module &M) { ReturnInst::Create(*C, BasicBlock::Create(*C, "", CtorFunc)); appendToGlobalCtors(M, CtorFunc, kSanCtorAndDtorPriority); - SanCovFunction = - checkInterfaceFunction(M.getOrInsertFunction(kSanCovName, VoidTy, nullptr)); + SanCovFunction = checkInterfaceFunction( + M.getOrInsertFunction(kSanCovName, VoidTy, Int32PtrTy, nullptr)); + SanCovWithCheckFunction = checkInterfaceFunction( + M.getOrInsertFunction(kSanCovWithCheckName, VoidTy, Int32PtrTy, nullptr)); SanCovIndirCallFunction = checkInterfaceFunction(M.getOrInsertFunction( kSanCovIndirCallName, VoidTy, IntptrTy, IntptrTy, nullptr)); - SanCovModuleInit = checkInterfaceFunction(M.getOrInsertFunction( - kSanCovModuleInitName, Type::getVoidTy(*C), IntptrTy, nullptr)); + SanCovModuleInit = checkInterfaceFunction( + M.getOrInsertFunction(kSanCovModuleInitName, Type::getVoidTy(*C), + Int32PtrTy, IntptrTy, Int8PtrTy, nullptr)); SanCovModuleInit->setLinkage(Function::ExternalLinkage); + // We insert an empty inline asm after cov callbacks to avoid callback merge. 
+ EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false), + StringRef(""), StringRef(""), + /*hasSideEffects=*/true); if (ClExperimentalTracing) { SanCovTraceEnter = checkInterfaceFunction( - M.getOrInsertFunction(kSanCovTraceEnter, VoidTy, IntptrTy, nullptr)); + M.getOrInsertFunction(kSanCovTraceEnter, VoidTy, Int32PtrTy, nullptr)); SanCovTraceBB = checkInterfaceFunction( - M.getOrInsertFunction(kSanCovTraceBB, VoidTy, IntptrTy, nullptr)); + M.getOrInsertFunction(kSanCovTraceBB, VoidTy, Int32PtrTy, nullptr)); } + // At this point we create a dummy array of guards because we don't + // know how many elements we will need. + Type *Int32Ty = IRB.getInt32Ty(); + GuardArray = + new GlobalVariable(M, Int32Ty, false, GlobalValue::ExternalLinkage, + nullptr, "__sancov_gen_cov_tmp"); + for (auto &F : M) runOnFunction(F); - IRBuilder<> IRB(CtorFunc->getEntryBlock().getTerminator()); - IRB.CreateCall(SanCovModuleInit, - ConstantInt::get(IntptrTy, SanCovFunction->getNumUses())); + // Now we know how many elements we need. Create an array of guards + // with one extra element at the beginning for the size. + Type *Int32ArrayNTy = + ArrayType::get(Int32Ty, SanCovFunction->getNumUses() + 1); + GlobalVariable *RealGuardArray = new GlobalVariable( + M, Int32ArrayNTy, false, GlobalValue::PrivateLinkage, + Constant::getNullValue(Int32ArrayNTy), "__sancov_gen_cov"); + + // Replace the dummy array with the real one. + GuardArray->replaceAllUsesWith( + IRB.CreatePointerCast(RealGuardArray, Int32PtrTy)); + GuardArray->eraseFromParent(); + + // Create variable for module (compilation unit) name + Constant *ModNameStrConst = + ConstantDataArray::getString(M.getContext(), M.getName(), true); + GlobalVariable *ModuleName = + new GlobalVariable(M, ModNameStrConst->getType(), true, + GlobalValue::PrivateLinkage, ModNameStrConst); + + // Call __sanitizer_cov_module_init + IRB.SetInsertPoint(CtorFunc->getEntryBlock().getTerminator()); + IRB.CreateCall3(SanCovModuleInit, + IRB.CreatePointerCast(RealGuardArray, Int32PtrTy), + ConstantInt::get(IntptrTy, SanCovFunction->getNumUses()), + IRB.CreatePointerCast(ModuleName, Int8PtrTy)); return true; } bool SanitizerCoverageModule::runOnFunction(Function &F) { if (F.empty()) return false; - // For now instrument only functions that will also be asan-instrumented. - if (!F.hasFnAttribute(Attribute::SanitizeAddress)) - return false; + if (F.getName().find(".module_ctor") != std::string::npos) + return false; // Should not instrument sanitizer init functions. if (CoverageLevel >= 3) - SplitAllCriticalEdges(F, this); + SplitAllCriticalEdges(F); SmallVector<Instruction*, 8> IndirCalls; SmallVector<BasicBlock*, 16> AllBlocks; for (auto &BB : F) { @@ -182,25 +225,6 @@ bool SanitizerCoverageModule::runOnFunction(Function &F) { } } InjectCoverage(F, AllBlocks, IndirCalls); - InjectTracing(F, AllBlocks); - return true; -} - -// Experimental support for tracing. -// Basicaly, insert a callback at the beginning of every basic block. -// Every callback gets a pointer to a uniqie global for internal storage. -bool SanitizerCoverageModule::InjectTracing(Function &F, - ArrayRef<BasicBlock *> AllBlocks) { - if (!ClExperimentalTracing) return false; - Type *Ty = ArrayType::get(IntptrTy, 1); // May need to use more words later. 
- for (auto BB : AllBlocks) { - IRBuilder<> IRB(BB->getFirstInsertionPt()); - GlobalVariable *TraceCache = new GlobalVariable( - *F.getParent(), Ty, false, GlobalValue::PrivateLinkage, - Constant::getNullValue(Ty), "__sancov_gen_trace_cache"); - IRB.CreateCall(&F.getEntryBlock() == BB ? SanCovTraceEnter : SanCovTraceBB, - IRB.CreatePointerCast(TraceCache, IntptrTy)); - } return true; } @@ -210,12 +234,12 @@ SanitizerCoverageModule::InjectCoverage(Function &F, ArrayRef<Instruction *> IndirCalls) { if (!CoverageLevel) return false; - if (CoverageLevel == 1 || - (unsigned)ClCoverageBlockThreshold < AllBlocks.size()) { - InjectCoverageAtBlock(F, F.getEntryBlock()); + if (CoverageLevel == 1) { + InjectCoverageAtBlock(F, F.getEntryBlock(), false); } else { for (auto BB : AllBlocks) - InjectCoverageAtBlock(F, *BB); + InjectCoverageAtBlock(F, *BB, + ClCoverageBlockThreshold < AllBlocks.size()); } InjectCoverageForIndirectCalls(F, IndirCalls); return true; @@ -249,8 +273,8 @@ void SanitizerCoverageModule::InjectCoverageForIndirectCalls( } } -void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, - BasicBlock &BB) { +void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB, + bool UseCalls) { BasicBlock::iterator IP = BB.getFirstInsertionPt(), BE = BB.end(); // Skip static allocas at the top of the entry block so they don't become // dynamic when we split the block. If we used our optimized stack layout, @@ -261,28 +285,41 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, break; } - DebugLoc EntryLoc = &BB == &F.getEntryBlock() - ? IP->getDebugLoc().getFnDebugLoc(*C) - : IP->getDebugLoc(); + bool IsEntryBB = &BB == &F.getEntryBlock(); + DebugLoc EntryLoc = + IsEntryBB ? IP->getDebugLoc().getFnDebugLoc(*C) : IP->getDebugLoc(); IRBuilder<> IRB(IP); IRB.SetCurrentDebugLocation(EntryLoc); - Type *Int8Ty = IRB.getInt8Ty(); - GlobalVariable *Guard = new GlobalVariable( - *F.getParent(), Int8Ty, false, GlobalValue::PrivateLinkage, - Constant::getNullValue(Int8Ty), "__sancov_gen_cov_" + F.getName()); - LoadInst *Load = IRB.CreateLoad(Guard); - Load->setAtomic(Monotonic); - Load->setAlignment(1); - Value *Cmp = IRB.CreateICmpEQ(Constant::getNullValue(Int8Ty), Load); - Instruction *Ins = SplitBlockAndInsertIfThen( - Cmp, IP, false, MDBuilder(*C).createBranchWeights(1, 100000)); - IRB.SetInsertPoint(Ins); - IRB.SetCurrentDebugLocation(EntryLoc); - // __sanitizer_cov gets the PC of the instruction using GET_CALLER_PC. 
- IRB.CreateCall(SanCovFunction); - StoreInst *Store = IRB.CreateStore(ConstantInt::get(Int8Ty, 1), Guard); - Store->setAtomic(Monotonic); - Store->setAlignment(1); + SmallVector<Value *, 1> Indices; + Value *GuardP = IRB.CreateAdd( + IRB.CreatePointerCast(GuardArray, IntptrTy), + ConstantInt::get(IntptrTy, (1 + SanCovFunction->getNumUses()) * 4)); + Type *Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty()); + GuardP = IRB.CreateIntToPtr(GuardP, Int32PtrTy); + if (UseCalls) { + IRB.CreateCall(SanCovWithCheckFunction, GuardP); + } else { + LoadInst *Load = IRB.CreateLoad(GuardP); + Load->setAtomic(Monotonic); + Load->setAlignment(4); + Load->setMetadata(F.getParent()->getMDKindID("nosanitize"), + MDNode::get(*C, None)); + Value *Cmp = IRB.CreateICmpSGE(Constant::getNullValue(Load->getType()), Load); + Instruction *Ins = SplitBlockAndInsertIfThen( + Cmp, IP, false, MDBuilder(*C).createBranchWeights(1, 100000)); + IRB.SetInsertPoint(Ins); + IRB.SetCurrentDebugLocation(EntryLoc); + // __sanitizer_cov gets the PC of the instruction using GET_CALLER_PC. + IRB.CreateCall(SanCovFunction, GuardP); + IRB.CreateCall(EmptyAsm); // Avoids callback merge. + } + + if (ClExperimentalTracing) { + // Experimental support for tracing. + // Insert a callback with the same guard variable as used for coverage. + IRB.SetInsertPoint(IP); + IRB.CreateCall(IsEntryBB ? SanCovTraceEnter : SanCovTraceBB, GuardP); + } } char SanitizerCoverageModule::ID = 0; diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index 8a56a1f..e4a4911 100644 --- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -19,6 +19,8 @@ // The rest is handled by the run-time library. 
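The SanitizerCoverage change above replaces the per-block i8 guard globals with slots in one module-wide int32 guard array: each instrumented block loads its slot and enters the runtime only while the slot is still non-positive, and blocks in very large functions call __sanitizer_cov_with_check instead of getting the inlined check. The following is an editorial sketch only, not part of the patch: a standalone C++ model of that fast path, with a stub callback and a local guards[] array standing in for the real __sanitizer_cov runtime and the generated __sancov_gen_cov array.

    #include <cstdint>
    #include <cstdio>

    // Stub standing in for the __sanitizer_cov / __sanitizer_cov_with_check
    // runtime callbacks (illustrative only).
    static void cov_callback(int32_t *guard) {
      std::printf("new block, guard slot %p\n", static_cast<void *>(guard));
      *guard = 1; // the real runtime marks the slot so the block reports once
    }

    // One int32 slot per instrumented block, plus the extra leading element
    // the pass reserves, loosely mirroring __sancov_gen_cov.
    static int32_t guards[1 + 2];

    // Roughly what the inlined fast path lowers to: load the guard and call
    // the runtime only while it is still <= 0 (the ICmpSGE(null, load) test).
    static void instrumented_block(int32_t *guard) {
      if (*guard <= 0)
        cov_callback(guard);
    }

    int main() {
      instrumented_block(&guards[1]); // first visit: callback fires
      instrumented_block(&guards[1]); // guard already set: no callback
      instrumented_block(&guards[2]); // a different block has its own slot
    }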
//===----------------------------------------------------------------------===// +#include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Transforms/Instrumentation.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallString.h" @@ -68,6 +70,7 @@ STATISTIC(NumInstrumentedVtableReads, "Number of vtable ptr reads"); STATISTIC(NumOmittedReadsFromConstantGlobals, "Number of reads from constant globals"); STATISTIC(NumOmittedReadsFromVtable, "Number of vtable reads"); +STATISTIC(NumOmittedNonCaptured, "Number of accesses ignored due to capturing"); namespace { @@ -99,6 +102,8 @@ struct ThreadSanitizer : public FunctionPass { static const size_t kNumberOfAccessSizes = 5; Function *TsanRead[kNumberOfAccessSizes]; Function *TsanWrite[kNumberOfAccessSizes]; + Function *TsanUnalignedRead[kNumberOfAccessSizes]; + Function *TsanUnalignedWrite[kNumberOfAccessSizes]; Function *TsanAtomicLoad[kNumberOfAccessSizes]; Function *TsanAtomicStore[kNumberOfAccessSizes]; Function *TsanAtomicRMW[AtomicRMWInst::LAST_BINOP + 1][kNumberOfAccessSizes]; @@ -150,6 +155,16 @@ void ThreadSanitizer::initializeCallbacks(Module &M) { TsanWrite[i] = checkInterfaceFunction(M.getOrInsertFunction( WriteName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr)); + SmallString<64> UnalignedReadName("__tsan_unaligned_read" + + itostr(ByteSize)); + TsanUnalignedRead[i] = checkInterfaceFunction(M.getOrInsertFunction( + UnalignedReadName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr)); + + SmallString<64> UnalignedWriteName("__tsan_unaligned_write" + + itostr(ByteSize)); + TsanUnalignedWrite[i] = checkInterfaceFunction(M.getOrInsertFunction( + UnalignedWriteName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr)); + Type *Ty = Type::getIntNTy(M.getContext(), BitSize); Type *PtrTy = Ty->getPointerTo(); SmallString<32> AtomicLoadName("__tsan_atomic" + itostr(BitSize) + @@ -260,6 +275,7 @@ bool ThreadSanitizer::addrPointsToConstantData(Value *Addr) { // Instrumenting some of the accesses may be proven redundant. // Currently handled: // - read-before-write (within same BB, no calls between) +// - not captured variables // // We do not handle some of the patterns that should not survive // after the classic compiler optimizations. @@ -291,6 +307,17 @@ void ThreadSanitizer::chooseInstructionsToInstrument( continue; } } + Value *Addr = isa<StoreInst>(*I) + ? cast<StoreInst>(I)->getPointerOperand() + : cast<LoadInst>(I)->getPointerOperand(); + if (isa<AllocaInst>(GetUnderlyingObject(Addr, nullptr)) && + !PointerMayBeCaptured(Addr, true, true)) { + // The variable is addressable but not captured, so it cannot be + // referenced from a different thread and participate in a data race + // (see llvm/Analysis/CaptureTracking.h for details). + NumOmittedNonCaptured++; + continue; + } All.push_back(I); } Local.clear(); @@ -412,7 +439,16 @@ bool ThreadSanitizer::instrumentLoadOrStore(Instruction *I) { NumInstrumentedVtableReads++; return true; } - Value *OnAccessFunc = IsWrite ? TsanWrite[Idx] : TsanRead[Idx]; + const unsigned Alignment = IsWrite + ? cast<StoreInst>(I)->getAlignment() + : cast<LoadInst>(I)->getAlignment(); + Type *OrigTy = cast<PointerType>(Addr->getType())->getElementType(); + const uint32_t TypeSize = DL->getTypeStoreSizeInBits(OrigTy); + Value *OnAccessFunc = nullptr; + if (Alignment == 0 || Alignment >= 8 || (Alignment % (TypeSize / 8)) == 0) + OnAccessFunc = IsWrite ? TsanWrite[Idx] : TsanRead[Idx]; + else + OnAccessFunc = IsWrite ? 
TsanUnalignedWrite[Idx] : TsanUnalignedRead[Idx]; IRB.CreateCall(OnAccessFunc, IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy())); if (IsWrite) NumInstrumentedWrites++; else NumInstrumentedReads++; @@ -422,7 +458,7 @@ bool ThreadSanitizer::instrumentLoadOrStore(Instruction *I) { static ConstantInt *createOrdering(IRBuilder<> *IRB, AtomicOrdering ord) { uint32_t v = 0; switch (ord) { - case NotAtomic: assert(false); + case NotAtomic: llvm_unreachable("unexpected atomic ordering!"); case Unordered: // Fall-through. case Monotonic: v = 0; break; // case Consume: v = 1; break; // Not specified yet. diff --git a/lib/Transforms/ObjCARC/ARCInstKind.cpp b/lib/Transforms/ObjCARC/ARCInstKind.cpp new file mode 100644 index 0000000..f1e9dce --- /dev/null +++ b/lib/Transforms/ObjCARC/ARCInstKind.cpp @@ -0,0 +1,645 @@ +//===- ARCInstKind.cpp - ObjC ARC Optimization ----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file defines several utility functions used by various ARC +/// optimizations which are IMHO too big to be in a header file. +/// +/// WARNING: This file knows about certain library functions. It recognizes them +/// by name, and hardwires knowledge of their semantics. +/// +/// WARNING: This file knows about how certain Objective-C library functions are +/// used. Naive LLVM IR transformations which would otherwise be +/// behavior-preserving may break these assumptions. +/// +//===----------------------------------------------------------------------===// + +#include "ObjCARC.h" +#include "llvm/IR/Intrinsics.h" + +using namespace llvm; +using namespace llvm::objcarc; + +raw_ostream &llvm::objcarc::operator<<(raw_ostream &OS, + const ARCInstKind Class) { + switch (Class) { + case ARCInstKind::Retain: + return OS << "ARCInstKind::Retain"; + case ARCInstKind::RetainRV: + return OS << "ARCInstKind::RetainRV"; + case ARCInstKind::RetainBlock: + return OS << "ARCInstKind::RetainBlock"; + case ARCInstKind::Release: + return OS << "ARCInstKind::Release"; + case ARCInstKind::Autorelease: + return OS << "ARCInstKind::Autorelease"; + case ARCInstKind::AutoreleaseRV: + return OS << "ARCInstKind::AutoreleaseRV"; + case ARCInstKind::AutoreleasepoolPush: + return OS << "ARCInstKind::AutoreleasepoolPush"; + case ARCInstKind::AutoreleasepoolPop: + return OS << "ARCInstKind::AutoreleasepoolPop"; + case ARCInstKind::NoopCast: + return OS << "ARCInstKind::NoopCast"; + case ARCInstKind::FusedRetainAutorelease: + return OS << "ARCInstKind::FusedRetainAutorelease"; + case ARCInstKind::FusedRetainAutoreleaseRV: + return OS << "ARCInstKind::FusedRetainAutoreleaseRV"; + case ARCInstKind::LoadWeakRetained: + return OS << "ARCInstKind::LoadWeakRetained"; + case ARCInstKind::StoreWeak: + return OS << "ARCInstKind::StoreWeak"; + case ARCInstKind::InitWeak: + return OS << "ARCInstKind::InitWeak"; + case ARCInstKind::LoadWeak: + return OS << "ARCInstKind::LoadWeak"; + case ARCInstKind::MoveWeak: + return OS << "ARCInstKind::MoveWeak"; + case ARCInstKind::CopyWeak: + return OS << "ARCInstKind::CopyWeak"; + case ARCInstKind::DestroyWeak: + return OS << "ARCInstKind::DestroyWeak"; + case ARCInstKind::StoreStrong: + return OS << "ARCInstKind::StoreStrong"; + case ARCInstKind::CallOrUser: + return OS << "ARCInstKind::CallOrUser"; + case ARCInstKind::Call: + return OS << 
"ARCInstKind::Call"; + case ARCInstKind::User: + return OS << "ARCInstKind::User"; + case ARCInstKind::IntrinsicUser: + return OS << "ARCInstKind::IntrinsicUser"; + case ARCInstKind::None: + return OS << "ARCInstKind::None"; + } + llvm_unreachable("Unknown instruction class!"); +} + +ARCInstKind llvm::objcarc::GetFunctionClass(const Function *F) { + Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end(); + + // No (mandatory) arguments. + if (AI == AE) + return StringSwitch<ARCInstKind>(F->getName()) + .Case("objc_autoreleasePoolPush", ARCInstKind::AutoreleasepoolPush) + .Case("clang.arc.use", ARCInstKind::IntrinsicUser) + .Default(ARCInstKind::CallOrUser); + + // One argument. + const Argument *A0 = AI++; + if (AI == AE) + // Argument is a pointer. + if (PointerType *PTy = dyn_cast<PointerType>(A0->getType())) { + Type *ETy = PTy->getElementType(); + // Argument is i8*. + if (ETy->isIntegerTy(8)) + return StringSwitch<ARCInstKind>(F->getName()) + .Case("objc_retain", ARCInstKind::Retain) + .Case("objc_retainAutoreleasedReturnValue", ARCInstKind::RetainRV) + .Case("objc_retainBlock", ARCInstKind::RetainBlock) + .Case("objc_release", ARCInstKind::Release) + .Case("objc_autorelease", ARCInstKind::Autorelease) + .Case("objc_autoreleaseReturnValue", ARCInstKind::AutoreleaseRV) + .Case("objc_autoreleasePoolPop", ARCInstKind::AutoreleasepoolPop) + .Case("objc_retainedObject", ARCInstKind::NoopCast) + .Case("objc_unretainedObject", ARCInstKind::NoopCast) + .Case("objc_unretainedPointer", ARCInstKind::NoopCast) + .Case("objc_retain_autorelease", + ARCInstKind::FusedRetainAutorelease) + .Case("objc_retainAutorelease", ARCInstKind::FusedRetainAutorelease) + .Case("objc_retainAutoreleaseReturnValue", + ARCInstKind::FusedRetainAutoreleaseRV) + .Case("objc_sync_enter", ARCInstKind::User) + .Case("objc_sync_exit", ARCInstKind::User) + .Default(ARCInstKind::CallOrUser); + + // Argument is i8** + if (PointerType *Pte = dyn_cast<PointerType>(ETy)) + if (Pte->getElementType()->isIntegerTy(8)) + return StringSwitch<ARCInstKind>(F->getName()) + .Case("objc_loadWeakRetained", ARCInstKind::LoadWeakRetained) + .Case("objc_loadWeak", ARCInstKind::LoadWeak) + .Case("objc_destroyWeak", ARCInstKind::DestroyWeak) + .Default(ARCInstKind::CallOrUser); + } + + // Two arguments, first is i8**. + const Argument *A1 = AI++; + if (AI == AE) + if (PointerType *PTy = dyn_cast<PointerType>(A0->getType())) + if (PointerType *Pte = dyn_cast<PointerType>(PTy->getElementType())) + if (Pte->getElementType()->isIntegerTy(8)) + if (PointerType *PTy1 = dyn_cast<PointerType>(A1->getType())) { + Type *ETy1 = PTy1->getElementType(); + // Second argument is i8* + if (ETy1->isIntegerTy(8)) + return StringSwitch<ARCInstKind>(F->getName()) + .Case("objc_storeWeak", ARCInstKind::StoreWeak) + .Case("objc_initWeak", ARCInstKind::InitWeak) + .Case("objc_storeStrong", ARCInstKind::StoreStrong) + .Default(ARCInstKind::CallOrUser); + // Second argument is i8**. + if (PointerType *Pte1 = dyn_cast<PointerType>(ETy1)) + if (Pte1->getElementType()->isIntegerTy(8)) + return StringSwitch<ARCInstKind>(F->getName()) + .Case("objc_moveWeak", ARCInstKind::MoveWeak) + .Case("objc_copyWeak", ARCInstKind::CopyWeak) + // Ignore annotation calls. This is important to stop the + // optimizer from treating annotations as uses which would + // make the state of the pointers they are attempting to + // elucidate to be incorrect. 
+ .Case("llvm.arc.annotation.topdown.bbstart", + ARCInstKind::None) + .Case("llvm.arc.annotation.topdown.bbend", + ARCInstKind::None) + .Case("llvm.arc.annotation.bottomup.bbstart", + ARCInstKind::None) + .Case("llvm.arc.annotation.bottomup.bbend", + ARCInstKind::None) + .Default(ARCInstKind::CallOrUser); + } + + // Anything else. + return ARCInstKind::CallOrUser; +} + +/// \brief Determine what kind of construct V is. +ARCInstKind llvm::objcarc::GetARCInstKind(const Value *V) { + if (const Instruction *I = dyn_cast<Instruction>(V)) { + // Any instruction other than bitcast and gep with a pointer operand have a + // use of an objc pointer. Bitcasts, GEPs, Selects, PHIs transfer a pointer + // to a subsequent use, rather than using it themselves, in this sense. + // As a short cut, several other opcodes are known to have no pointer + // operands of interest. And ret is never followed by a release, so it's + // not interesting to examine. + switch (I->getOpcode()) { + case Instruction::Call: { + const CallInst *CI = cast<CallInst>(I); + // Check for calls to special functions. + if (const Function *F = CI->getCalledFunction()) { + ARCInstKind Class = GetFunctionClass(F); + if (Class != ARCInstKind::CallOrUser) + return Class; + + // None of the intrinsic functions do objc_release. For intrinsics, the + // only question is whether or not they may be users. + switch (F->getIntrinsicID()) { + case Intrinsic::returnaddress: + case Intrinsic::frameaddress: + case Intrinsic::stacksave: + case Intrinsic::stackrestore: + case Intrinsic::vastart: + case Intrinsic::vacopy: + case Intrinsic::vaend: + case Intrinsic::objectsize: + case Intrinsic::prefetch: + case Intrinsic::stackprotector: + case Intrinsic::eh_return_i32: + case Intrinsic::eh_return_i64: + case Intrinsic::eh_typeid_for: + case Intrinsic::eh_dwarf_cfa: + case Intrinsic::eh_sjlj_lsda: + case Intrinsic::eh_sjlj_functioncontext: + case Intrinsic::init_trampoline: + case Intrinsic::adjust_trampoline: + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + case Intrinsic::invariant_start: + case Intrinsic::invariant_end: + // Don't let dbg info affect our results. + case Intrinsic::dbg_declare: + case Intrinsic::dbg_value: + // Short cut: Some intrinsics obviously don't use ObjC pointers. 
+ return ARCInstKind::None; + default: + break; + } + } + return GetCallSiteClass(CI); + } + case Instruction::Invoke: + return GetCallSiteClass(cast<InvokeInst>(I)); + case Instruction::BitCast: + case Instruction::GetElementPtr: + case Instruction::Select: + case Instruction::PHI: + case Instruction::Ret: + case Instruction::Br: + case Instruction::Switch: + case Instruction::IndirectBr: + case Instruction::Alloca: + case Instruction::VAArg: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::SDiv: + case Instruction::UDiv: + case Instruction::FDiv: + case Instruction::SRem: + case Instruction::URem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::SExt: + case Instruction::ZExt: + case Instruction::Trunc: + case Instruction::IntToPtr: + case Instruction::FCmp: + case Instruction::FPTrunc: + case Instruction::FPExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::UIToFP: + case Instruction::SIToFP: + case Instruction::InsertElement: + case Instruction::ExtractElement: + case Instruction::ShuffleVector: + case Instruction::ExtractValue: + break; + case Instruction::ICmp: + // Comparing a pointer with null, or any other constant, isn't an + // interesting use, because we don't care what the pointer points to, or + // about the values of any other dynamic reference-counted pointers. + if (IsPotentialRetainableObjPtr(I->getOperand(1))) + return ARCInstKind::User; + break; + default: + // For anything else, check all the operands. + // Note that this includes both operands of a Store: while the first + // operand isn't actually being dereferenced, it is being stored to + // memory where we can no longer track who might read it and dereference + // it, so we have to consider it potentially used. + for (User::const_op_iterator OI = I->op_begin(), OE = I->op_end(); + OI != OE; ++OI) + if (IsPotentialRetainableObjPtr(*OI)) + return ARCInstKind::User; + } + } + + // Otherwise, it's totally inert for ARC purposes. + return ARCInstKind::None; +} + +/// \brief Test if the given class is a kind of user. +bool llvm::objcarc::IsUser(ARCInstKind Class) { + switch (Class) { + case ARCInstKind::User: + case ARCInstKind::CallOrUser: + case ARCInstKind::IntrinsicUser: + return true; + case ARCInstKind::Retain: + case ARCInstKind::RetainRV: + case ARCInstKind::RetainBlock: + case ARCInstKind::Release: + case ARCInstKind::Autorelease: + case ARCInstKind::AutoreleaseRV: + case ARCInstKind::AutoreleasepoolPush: + case ARCInstKind::AutoreleasepoolPop: + case ARCInstKind::NoopCast: + case ARCInstKind::FusedRetainAutorelease: + case ARCInstKind::FusedRetainAutoreleaseRV: + case ARCInstKind::LoadWeakRetained: + case ARCInstKind::StoreWeak: + case ARCInstKind::InitWeak: + case ARCInstKind::LoadWeak: + case ARCInstKind::MoveWeak: + case ARCInstKind::CopyWeak: + case ARCInstKind::DestroyWeak: + case ARCInstKind::StoreStrong: + case ARCInstKind::Call: + case ARCInstKind::None: + return false; + } + llvm_unreachable("covered switch isn't covered?"); +} + +/// \brief Test if the given class is objc_retain or equivalent. 
+bool llvm::objcarc::IsRetain(ARCInstKind Class) { + switch (Class) { + case ARCInstKind::Retain: + case ARCInstKind::RetainRV: + return true; + // I believe we treat retain block as not a retain since it can copy its + // block. + case ARCInstKind::RetainBlock: + case ARCInstKind::Release: + case ARCInstKind::Autorelease: + case ARCInstKind::AutoreleaseRV: + case ARCInstKind::AutoreleasepoolPush: + case ARCInstKind::AutoreleasepoolPop: + case ARCInstKind::NoopCast: + case ARCInstKind::FusedRetainAutorelease: + case ARCInstKind::FusedRetainAutoreleaseRV: + case ARCInstKind::LoadWeakRetained: + case ARCInstKind::StoreWeak: + case ARCInstKind::InitWeak: + case ARCInstKind::LoadWeak: + case ARCInstKind::MoveWeak: + case ARCInstKind::CopyWeak: + case ARCInstKind::DestroyWeak: + case ARCInstKind::StoreStrong: + case ARCInstKind::IntrinsicUser: + case ARCInstKind::CallOrUser: + case ARCInstKind::Call: + case ARCInstKind::User: + case ARCInstKind::None: + return false; + } + llvm_unreachable("covered switch isn't covered?"); +} + +/// \brief Test if the given class is objc_autorelease or equivalent. +bool llvm::objcarc::IsAutorelease(ARCInstKind Class) { + switch (Class) { + case ARCInstKind::Autorelease: + case ARCInstKind::AutoreleaseRV: + return true; + case ARCInstKind::Retain: + case ARCInstKind::RetainRV: + case ARCInstKind::RetainBlock: + case ARCInstKind::Release: + case ARCInstKind::AutoreleasepoolPush: + case ARCInstKind::AutoreleasepoolPop: + case ARCInstKind::NoopCast: + case ARCInstKind::FusedRetainAutorelease: + case ARCInstKind::FusedRetainAutoreleaseRV: + case ARCInstKind::LoadWeakRetained: + case ARCInstKind::StoreWeak: + case ARCInstKind::InitWeak: + case ARCInstKind::LoadWeak: + case ARCInstKind::MoveWeak: + case ARCInstKind::CopyWeak: + case ARCInstKind::DestroyWeak: + case ARCInstKind::StoreStrong: + case ARCInstKind::IntrinsicUser: + case ARCInstKind::CallOrUser: + case ARCInstKind::Call: + case ARCInstKind::User: + case ARCInstKind::None: + return false; + } + llvm_unreachable("covered switch isn't covered?"); +} + +/// \brief Test if the given class represents instructions which return their +/// argument verbatim. +bool llvm::objcarc::IsForwarding(ARCInstKind Class) { + switch (Class) { + case ARCInstKind::Retain: + case ARCInstKind::RetainRV: + case ARCInstKind::Autorelease: + case ARCInstKind::AutoreleaseRV: + case ARCInstKind::NoopCast: + return true; + case ARCInstKind::RetainBlock: + case ARCInstKind::Release: + case ARCInstKind::AutoreleasepoolPush: + case ARCInstKind::AutoreleasepoolPop: + case ARCInstKind::FusedRetainAutorelease: + case ARCInstKind::FusedRetainAutoreleaseRV: + case ARCInstKind::LoadWeakRetained: + case ARCInstKind::StoreWeak: + case ARCInstKind::InitWeak: + case ARCInstKind::LoadWeak: + case ARCInstKind::MoveWeak: + case ARCInstKind::CopyWeak: + case ARCInstKind::DestroyWeak: + case ARCInstKind::StoreStrong: + case ARCInstKind::IntrinsicUser: + case ARCInstKind::CallOrUser: + case ARCInstKind::Call: + case ARCInstKind::User: + case ARCInstKind::None: + return false; + } + llvm_unreachable("covered switch isn't covered?"); +} + +/// \brief Test if the given class represents instructions which do nothing if +/// passed a null pointer. 
+bool llvm::objcarc::IsNoopOnNull(ARCInstKind Class) { + switch (Class) { + case ARCInstKind::Retain: + case ARCInstKind::RetainRV: + case ARCInstKind::Release: + case ARCInstKind::Autorelease: + case ARCInstKind::AutoreleaseRV: + case ARCInstKind::RetainBlock: + return true; + case ARCInstKind::AutoreleasepoolPush: + case ARCInstKind::AutoreleasepoolPop: + case ARCInstKind::FusedRetainAutorelease: + case ARCInstKind::FusedRetainAutoreleaseRV: + case ARCInstKind::LoadWeakRetained: + case ARCInstKind::StoreWeak: + case ARCInstKind::InitWeak: + case ARCInstKind::LoadWeak: + case ARCInstKind::MoveWeak: + case ARCInstKind::CopyWeak: + case ARCInstKind::DestroyWeak: + case ARCInstKind::StoreStrong: + case ARCInstKind::IntrinsicUser: + case ARCInstKind::CallOrUser: + case ARCInstKind::Call: + case ARCInstKind::User: + case ARCInstKind::None: + case ARCInstKind::NoopCast: + return false; + } + llvm_unreachable("covered switch isn't covered?"); +} + +/// \brief Test if the given class represents instructions which are always safe +/// to mark with the "tail" keyword. +bool llvm::objcarc::IsAlwaysTail(ARCInstKind Class) { + // ARCInstKind::RetainBlock may be given a stack argument. + switch (Class) { + case ARCInstKind::Retain: + case ARCInstKind::RetainRV: + case ARCInstKind::AutoreleaseRV: + return true; + case ARCInstKind::Release: + case ARCInstKind::Autorelease: + case ARCInstKind::RetainBlock: + case ARCInstKind::AutoreleasepoolPush: + case ARCInstKind::AutoreleasepoolPop: + case ARCInstKind::FusedRetainAutorelease: + case ARCInstKind::FusedRetainAutoreleaseRV: + case ARCInstKind::LoadWeakRetained: + case ARCInstKind::StoreWeak: + case ARCInstKind::InitWeak: + case ARCInstKind::LoadWeak: + case ARCInstKind::MoveWeak: + case ARCInstKind::CopyWeak: + case ARCInstKind::DestroyWeak: + case ARCInstKind::StoreStrong: + case ARCInstKind::IntrinsicUser: + case ARCInstKind::CallOrUser: + case ARCInstKind::Call: + case ARCInstKind::User: + case ARCInstKind::None: + case ARCInstKind::NoopCast: + return false; + } + llvm_unreachable("covered switch isn't covered?"); +} + +/// \brief Test if the given class represents instructions which are never safe +/// to mark with the "tail" keyword. +bool llvm::objcarc::IsNeverTail(ARCInstKind Class) { + /// It is never safe to tail call objc_autorelease since by tail calling + /// objc_autorelease: fast autoreleasing causing our object to be potentially + /// reclaimed from the autorelease pool which violates the semantics of + /// __autoreleasing types in ARC. 
+ switch (Class) { + case ARCInstKind::Autorelease: + return true; + case ARCInstKind::Retain: + case ARCInstKind::RetainRV: + case ARCInstKind::AutoreleaseRV: + case ARCInstKind::Release: + case ARCInstKind::RetainBlock: + case ARCInstKind::AutoreleasepoolPush: + case ARCInstKind::AutoreleasepoolPop: + case ARCInstKind::FusedRetainAutorelease: + case ARCInstKind::FusedRetainAutoreleaseRV: + case ARCInstKind::LoadWeakRetained: + case ARCInstKind::StoreWeak: + case ARCInstKind::InitWeak: + case ARCInstKind::LoadWeak: + case ARCInstKind::MoveWeak: + case ARCInstKind::CopyWeak: + case ARCInstKind::DestroyWeak: + case ARCInstKind::StoreStrong: + case ARCInstKind::IntrinsicUser: + case ARCInstKind::CallOrUser: + case ARCInstKind::Call: + case ARCInstKind::User: + case ARCInstKind::None: + case ARCInstKind::NoopCast: + return false; + } + llvm_unreachable("covered switch isn't covered?"); +} + +/// \brief Test if the given class represents instructions which are always safe +/// to mark with the nounwind attribute. +bool llvm::objcarc::IsNoThrow(ARCInstKind Class) { + // objc_retainBlock is not nounwind because it calls user copy constructors + // which could theoretically throw. + switch (Class) { + case ARCInstKind::Retain: + case ARCInstKind::RetainRV: + case ARCInstKind::Release: + case ARCInstKind::Autorelease: + case ARCInstKind::AutoreleaseRV: + case ARCInstKind::AutoreleasepoolPush: + case ARCInstKind::AutoreleasepoolPop: + return true; + case ARCInstKind::RetainBlock: + case ARCInstKind::FusedRetainAutorelease: + case ARCInstKind::FusedRetainAutoreleaseRV: + case ARCInstKind::LoadWeakRetained: + case ARCInstKind::StoreWeak: + case ARCInstKind::InitWeak: + case ARCInstKind::LoadWeak: + case ARCInstKind::MoveWeak: + case ARCInstKind::CopyWeak: + case ARCInstKind::DestroyWeak: + case ARCInstKind::StoreStrong: + case ARCInstKind::IntrinsicUser: + case ARCInstKind::CallOrUser: + case ARCInstKind::Call: + case ARCInstKind::User: + case ARCInstKind::None: + case ARCInstKind::NoopCast: + return false; + } + llvm_unreachable("covered switch isn't covered?"); +} + +/// Test whether the given instruction can autorelease any pointer or cause an +/// autoreleasepool pop. +/// +/// This means that it *could* interrupt the RV optimization. 
+bool llvm::objcarc::CanInterruptRV(ARCInstKind Class) { + switch (Class) { + case ARCInstKind::AutoreleasepoolPop: + case ARCInstKind::CallOrUser: + case ARCInstKind::Call: + case ARCInstKind::Autorelease: + case ARCInstKind::AutoreleaseRV: + case ARCInstKind::FusedRetainAutorelease: + case ARCInstKind::FusedRetainAutoreleaseRV: + return true; + case ARCInstKind::Retain: + case ARCInstKind::RetainRV: + case ARCInstKind::Release: + case ARCInstKind::AutoreleasepoolPush: + case ARCInstKind::RetainBlock: + case ARCInstKind::LoadWeakRetained: + case ARCInstKind::StoreWeak: + case ARCInstKind::InitWeak: + case ARCInstKind::LoadWeak: + case ARCInstKind::MoveWeak: + case ARCInstKind::CopyWeak: + case ARCInstKind::DestroyWeak: + case ARCInstKind::StoreStrong: + case ARCInstKind::IntrinsicUser: + case ARCInstKind::User: + case ARCInstKind::None: + case ARCInstKind::NoopCast: + return false; + } + llvm_unreachable("covered switch isn't covered?"); +} + +bool llvm::objcarc::CanDecrementRefCount(ARCInstKind Kind) { + switch (Kind) { + case ARCInstKind::Retain: + case ARCInstKind::RetainRV: + case ARCInstKind::Autorelease: + case ARCInstKind::AutoreleaseRV: + case ARCInstKind::NoopCast: + case ARCInstKind::FusedRetainAutorelease: + case ARCInstKind::FusedRetainAutoreleaseRV: + case ARCInstKind::IntrinsicUser: + case ARCInstKind::User: + case ARCInstKind::None: + return false; + + // The cases below are conservative. + + // RetainBlock can result in user defined copy constructors being called + // implying releases may occur. + case ARCInstKind::RetainBlock: + case ARCInstKind::Release: + case ARCInstKind::AutoreleasepoolPush: + case ARCInstKind::AutoreleasepoolPop: + case ARCInstKind::LoadWeakRetained: + case ARCInstKind::StoreWeak: + case ARCInstKind::InitWeak: + case ARCInstKind::LoadWeak: + case ARCInstKind::MoveWeak: + case ARCInstKind::CopyWeak: + case ARCInstKind::DestroyWeak: + case ARCInstKind::StoreStrong: + case ARCInstKind::CallOrUser: + case ARCInstKind::Call: + return true; + } + + llvm_unreachable("covered switch isn't covered?"); +} diff --git a/lib/Transforms/ObjCARC/ARCInstKind.h b/lib/Transforms/ObjCARC/ARCInstKind.h new file mode 100644 index 0000000..636c65c --- /dev/null +++ b/lib/Transforms/ObjCARC/ARCInstKind.h @@ -0,0 +1,123 @@ +//===--- ARCInstKind.h - ARC instruction equivalence classes -*- C++ -*----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_ARCINSTKIND_H +#define LLVM_LIB_TRANSFORMS_OBJCARC_ARCINSTKIND_H + +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Function.h" + +namespace llvm { +namespace objcarc { + +/// \enum ARCInstKind +/// +/// \brief Equivalence classes of instructions in the ARC Model. +/// +/// Since we do not have "instructions" to represent ARC concepts in LLVM IR, +/// we instead operate on equivalence classes of instructions. +/// +/// TODO: This should be split into two enums: a runtime entry point enum +/// (possibly united with the ARCRuntimeEntrypoint class) and an enum that deals +/// with effects of instructions in the ARC model (which would handle the notion +/// of a User or CallOrUser). 
+enum class ARCInstKind { + Retain, ///< objc_retain + RetainRV, ///< objc_retainAutoreleasedReturnValue + RetainBlock, ///< objc_retainBlock + Release, ///< objc_release + Autorelease, ///< objc_autorelease + AutoreleaseRV, ///< objc_autoreleaseReturnValue + AutoreleasepoolPush, ///< objc_autoreleasePoolPush + AutoreleasepoolPop, ///< objc_autoreleasePoolPop + NoopCast, ///< objc_retainedObject, etc. + FusedRetainAutorelease, ///< objc_retainAutorelease + FusedRetainAutoreleaseRV, ///< objc_retainAutoreleaseReturnValue + LoadWeakRetained, ///< objc_loadWeakRetained (primitive) + StoreWeak, ///< objc_storeWeak (primitive) + InitWeak, ///< objc_initWeak (derived) + LoadWeak, ///< objc_loadWeak (derived) + MoveWeak, ///< objc_moveWeak (derived) + CopyWeak, ///< objc_copyWeak (derived) + DestroyWeak, ///< objc_destroyWeak (derived) + StoreStrong, ///< objc_storeStrong (derived) + IntrinsicUser, ///< clang.arc.use + CallOrUser, ///< could call objc_release and/or "use" pointers + Call, ///< could call objc_release + User, ///< could "use" a pointer + None ///< anything that is inert from an ARC perspective. +}; + +raw_ostream &operator<<(raw_ostream &OS, const ARCInstKind Class); + +/// \brief Test if the given class is a kind of user. +bool IsUser(ARCInstKind Class); + +/// \brief Test if the given class is objc_retain or equivalent. +bool IsRetain(ARCInstKind Class); + +/// \brief Test if the given class is objc_autorelease or equivalent. +bool IsAutorelease(ARCInstKind Class); + +/// \brief Test if the given class represents instructions which return their +/// argument verbatim. +bool IsForwarding(ARCInstKind Class); + +/// \brief Test if the given class represents instructions which do nothing if +/// passed a null pointer. +bool IsNoopOnNull(ARCInstKind Class); + +/// \brief Test if the given class represents instructions which are always safe +/// to mark with the "tail" keyword. +bool IsAlwaysTail(ARCInstKind Class); + +/// \brief Test if the given class represents instructions which are never safe +/// to mark with the "tail" keyword. +bool IsNeverTail(ARCInstKind Class); + +/// \brief Test if the given class represents instructions which are always safe +/// to mark with the nounwind attribute. +bool IsNoThrow(ARCInstKind Class); + +/// Test whether the given instruction can autorelease any pointer or cause an +/// autoreleasepool pop. +bool CanInterruptRV(ARCInstKind Class); + +/// \brief Determine if F is one of the special known Functions. If it isn't, +/// return ARCInstKind::CallOrUser. +ARCInstKind GetFunctionClass(const Function *F); + +/// \brief Determine which objc runtime call instruction class V belongs to. +/// +/// This is similar to GetARCInstKind except that it only detects objc +/// runtime calls. This allows it to be faster. +/// +static inline ARCInstKind GetBasicARCInstKind(const Value *V) { + if (const CallInst *CI = dyn_cast<CallInst>(V)) { + if (const Function *F = CI->getCalledFunction()) + return GetFunctionClass(F); + // Otherwise, be conservative. + return ARCInstKind::CallOrUser; + } + + // Otherwise, be conservative. + return isa<InvokeInst>(V) ? ARCInstKind::CallOrUser : ARCInstKind::User; +} + +/// Map V to its ARCInstKind equivalence class. +ARCInstKind GetARCInstKind(const Value *V); + +/// Returns false if conservatively we can prove that any instruction mapped to +/// this kind can not decrement ref counts. Returns true otherwise. 
+bool CanDecrementRefCount(ARCInstKind Kind); + +} // end namespace objcarc +} // end namespace llvm + +#endif diff --git a/lib/Transforms/ObjCARC/Android.mk b/lib/Transforms/ObjCARC/Android.mk index cf45a95..97c5a9d 100644 --- a/lib/Transforms/ObjCARC/Android.mk +++ b/lib/Transforms/ObjCARC/Android.mk @@ -1,6 +1,7 @@ LOCAL_PATH:= $(call my-dir) transforms_objcarc_SRC_FILES := \ + ARCInstKind.cpp \ DependencyAnalysis.cpp \ ObjCARCAliasAnalysis.cpp \ ObjCARCAPElim.cpp \ @@ -8,7 +9,6 @@ transforms_objcarc_SRC_FILES := \ ObjCARC.cpp \ ObjCARCExpand.cpp \ ObjCARCOpts.cpp \ - ObjCARCUtil.cpp \ ProvenanceAnalysis.cpp \ ProvenanceAnalysisEvaluator.cpp diff --git a/lib/Transforms/ObjCARC/CMakeLists.txt b/lib/Transforms/ObjCARC/CMakeLists.txt index b449fac..2adea88 100644 --- a/lib/Transforms/ObjCARC/CMakeLists.txt +++ b/lib/Transforms/ObjCARC/CMakeLists.txt @@ -4,11 +4,14 @@ add_llvm_library(LLVMObjCARCOpts ObjCARCExpand.cpp ObjCARCAPElim.cpp ObjCARCAliasAnalysis.cpp - ObjCARCUtil.cpp + ARCInstKind.cpp ObjCARCContract.cpp DependencyAnalysis.cpp ProvenanceAnalysis.cpp ProvenanceAnalysisEvaluator.cpp + + ADDITIONAL_HEADER_DIRS + ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms ) add_dependencies(LLVMObjCARCOpts intrinsics_gen) diff --git a/lib/Transforms/ObjCARC/DependencyAnalysis.cpp b/lib/Transforms/ObjCARC/DependencyAnalysis.cpp index f6c236c..4985d0e 100644 --- a/lib/Transforms/ObjCARC/DependencyAnalysis.cpp +++ b/lib/Transforms/ObjCARC/DependencyAnalysis.cpp @@ -32,15 +32,14 @@ using namespace llvm::objcarc; /// Test whether the given instruction can result in a reference count /// modification (positive or negative) for the pointer's object. -bool -llvm::objcarc::CanAlterRefCount(const Instruction *Inst, const Value *Ptr, - ProvenanceAnalysis &PA, - InstructionClass Class) { +bool llvm::objcarc::CanAlterRefCount(const Instruction *Inst, const Value *Ptr, + ProvenanceAnalysis &PA, + ARCInstKind Class) { switch (Class) { - case IC_Autorelease: - case IC_AutoreleaseRV: - case IC_IntrinsicUser: - case IC_User: + case ARCInstKind::Autorelease: + case ARCInstKind::AutoreleaseRV: + case ARCInstKind::IntrinsicUser: + case ARCInstKind::User: // These operations never directly modify a reference count. return false; default: break; @@ -67,13 +66,25 @@ llvm::objcarc::CanAlterRefCount(const Instruction *Inst, const Value *Ptr, return true; } +bool llvm::objcarc::CanDecrementRefCount(const Instruction *Inst, + const Value *Ptr, + ProvenanceAnalysis &PA, + ARCInstKind Class) { + // First perform a quick check if Class can not touch ref counts. + if (!CanDecrementRefCount(Class)) + return false; + + // Otherwise, just use CanAlterRefCount for now. + return CanAlterRefCount(Inst, Ptr, PA, Class); +} + /// Test whether the given instruction can "use" the given pointer's object in a /// way that requires the reference count to be positive. -bool -llvm::objcarc::CanUse(const Instruction *Inst, const Value *Ptr, - ProvenanceAnalysis &PA, InstructionClass Class) { - // IC_Call operations (as opposed to IC_CallOrUser) never "use" objc pointers. - if (Class == IC_Call) +bool llvm::objcarc::CanUse(const Instruction *Inst, const Value *Ptr, + ProvenanceAnalysis &PA, ARCInstKind Class) { + // ARCInstKind::Call operations (as opposed to + // ARCInstKind::CallOrUser) never "use" objc pointers. 
+ if (Class == ARCInstKind::Call) return false; // Consider various instructions which may have pointer arguments which are @@ -123,11 +134,11 @@ llvm::objcarc::Depends(DependenceKind Flavor, Instruction *Inst, switch (Flavor) { case NeedsPositiveRetainCount: { - InstructionClass Class = GetInstructionClass(Inst); + ARCInstKind Class = GetARCInstKind(Inst); switch (Class) { - case IC_AutoreleasepoolPop: - case IC_AutoreleasepoolPush: - case IC_None: + case ARCInstKind::AutoreleasepoolPop: + case ARCInstKind::AutoreleasepoolPush: + case ARCInstKind::None: return false; default: return CanUse(Inst, Arg, PA, Class); @@ -135,10 +146,10 @@ llvm::objcarc::Depends(DependenceKind Flavor, Instruction *Inst, } case AutoreleasePoolBoundary: { - InstructionClass Class = GetInstructionClass(Inst); + ARCInstKind Class = GetARCInstKind(Inst); switch (Class) { - case IC_AutoreleasepoolPop: - case IC_AutoreleasepoolPush: + case ARCInstKind::AutoreleasepoolPop: + case ARCInstKind::AutoreleasepoolPush: // These mark the end and begin of an autorelease pool scope. return true; default: @@ -148,13 +159,13 @@ llvm::objcarc::Depends(DependenceKind Flavor, Instruction *Inst, } case CanChangeRetainCount: { - InstructionClass Class = GetInstructionClass(Inst); + ARCInstKind Class = GetARCInstKind(Inst); switch (Class) { - case IC_AutoreleasepoolPop: + case ARCInstKind::AutoreleasepoolPop: // Conservatively assume this can decrement any count. return true; - case IC_AutoreleasepoolPush: - case IC_None: + case ARCInstKind::AutoreleasepoolPush: + case ARCInstKind::None: return false; default: return CanAlterRefCount(Inst, Arg, PA, Class); @@ -162,28 +173,28 @@ llvm::objcarc::Depends(DependenceKind Flavor, Instruction *Inst, } case RetainAutoreleaseDep: - switch (GetBasicInstructionClass(Inst)) { - case IC_AutoreleasepoolPop: - case IC_AutoreleasepoolPush: + switch (GetBasicARCInstKind(Inst)) { + case ARCInstKind::AutoreleasepoolPop: + case ARCInstKind::AutoreleasepoolPush: // Don't merge an objc_autorelease with an objc_retain inside a different // autoreleasepool scope. return true; - case IC_Retain: - case IC_RetainRV: + case ARCInstKind::Retain: + case ARCInstKind::RetainRV: // Check for a retain of the same pointer for merging. - return GetObjCArg(Inst) == Arg; + return GetArgRCIdentityRoot(Inst) == Arg; default: // Nothing else matters for objc_retainAutorelease formation. return false; } case RetainAutoreleaseRVDep: { - InstructionClass Class = GetBasicInstructionClass(Inst); + ARCInstKind Class = GetBasicARCInstKind(Inst); switch (Class) { - case IC_Retain: - case IC_RetainRV: + case ARCInstKind::Retain: + case ARCInstKind::RetainRV: // Check for a retain of the same pointer for merging. - return GetObjCArg(Inst) == Arg; + return GetArgRCIdentityRoot(Inst) == Arg; default: // Anything that can autorelease interrupts // retainAutoreleaseReturnValue formation. 
@@ -192,7 +203,7 @@ llvm::objcarc::Depends(DependenceKind Flavor, Instruction *Inst, } case RetainRVDep: - return CanInterruptRV(GetBasicInstructionClass(Inst)); + return CanInterruptRV(GetBasicARCInstKind(Inst)); } llvm_unreachable("Invalid dependence flavor"); diff --git a/lib/Transforms/ObjCARC/DependencyAnalysis.h b/lib/Transforms/ObjCARC/DependencyAnalysis.h index 7b5601a..8e042d4 100644 --- a/lib/Transforms/ObjCARC/DependencyAnalysis.h +++ b/lib/Transforms/ObjCARC/DependencyAnalysis.h @@ -63,15 +63,24 @@ Depends(DependenceKind Flavor, Instruction *Inst, const Value *Arg, /// Test whether the given instruction can "use" the given pointer's object in a /// way that requires the reference count to be positive. -bool -CanUse(const Instruction *Inst, const Value *Ptr, ProvenanceAnalysis &PA, - InstructionClass Class); +bool CanUse(const Instruction *Inst, const Value *Ptr, ProvenanceAnalysis &PA, + ARCInstKind Class); /// Test whether the given instruction can result in a reference count /// modification (positive or negative) for the pointer's object. -bool -CanAlterRefCount(const Instruction *Inst, const Value *Ptr, - ProvenanceAnalysis &PA, InstructionClass Class); +bool CanAlterRefCount(const Instruction *Inst, const Value *Ptr, + ProvenanceAnalysis &PA, ARCInstKind Class); + +/// Returns true if we can not conservatively prove that Inst can not decrement +/// the reference count of Ptr. Returns false if we can. +bool CanDecrementRefCount(const Instruction *Inst, const Value *Ptr, + ProvenanceAnalysis &PA, ARCInstKind Class); + +static inline bool CanDecrementRefCount(const Instruction *Inst, + const Value *Ptr, + ProvenanceAnalysis &PA) { + return CanDecrementRefCount(Inst, Ptr, PA, GetARCInstKind(Inst)); +} } // namespace objcarc } // namespace llvm diff --git a/lib/Transforms/ObjCARC/ObjCARC.h b/lib/Transforms/ObjCARC/ObjCARC.h index 7a7eae8..df29f05 100644 --- a/lib/Transforms/ObjCARC/ObjCARC.h +++ b/lib/Transforms/ObjCARC/ObjCARC.h @@ -33,6 +33,7 @@ #include "llvm/Pass.h" #include "llvm/Transforms/ObjCARC.h" #include "llvm/Transforms/Utils/Local.h" +#include "ARCInstKind.h" namespace llvm { class raw_ostream; @@ -68,160 +69,13 @@ static inline bool ModuleHasARC(const Module &M) { M.getNamedValue("clang.arc.use"); } -/// \enum InstructionClass -/// \brief A simple classification for instructions. -enum InstructionClass { - IC_Retain, ///< objc_retain - IC_RetainRV, ///< objc_retainAutoreleasedReturnValue - IC_RetainBlock, ///< objc_retainBlock - IC_Release, ///< objc_release - IC_Autorelease, ///< objc_autorelease - IC_AutoreleaseRV, ///< objc_autoreleaseReturnValue - IC_AutoreleasepoolPush, ///< objc_autoreleasePoolPush - IC_AutoreleasepoolPop, ///< objc_autoreleasePoolPop - IC_NoopCast, ///< objc_retainedObject, etc. 
- IC_FusedRetainAutorelease, ///< objc_retainAutorelease - IC_FusedRetainAutoreleaseRV, ///< objc_retainAutoreleaseReturnValue - IC_LoadWeakRetained, ///< objc_loadWeakRetained (primitive) - IC_StoreWeak, ///< objc_storeWeak (primitive) - IC_InitWeak, ///< objc_initWeak (derived) - IC_LoadWeak, ///< objc_loadWeak (derived) - IC_MoveWeak, ///< objc_moveWeak (derived) - IC_CopyWeak, ///< objc_copyWeak (derived) - IC_DestroyWeak, ///< objc_destroyWeak (derived) - IC_StoreStrong, ///< objc_storeStrong (derived) - IC_IntrinsicUser, ///< clang.arc.use - IC_CallOrUser, ///< could call objc_release and/or "use" pointers - IC_Call, ///< could call objc_release - IC_User, ///< could "use" a pointer - IC_None ///< anything else -}; - -raw_ostream &operator<<(raw_ostream &OS, const InstructionClass Class); - -/// \brief Test if the given class is a kind of user. -inline static bool IsUser(InstructionClass Class) { - return Class == IC_User || - Class == IC_CallOrUser || - Class == IC_IntrinsicUser; -} - -/// \brief Test if the given class is objc_retain or equivalent. -static inline bool IsRetain(InstructionClass Class) { - return Class == IC_Retain || - Class == IC_RetainRV; -} - -/// \brief Test if the given class is objc_autorelease or equivalent. -static inline bool IsAutorelease(InstructionClass Class) { - return Class == IC_Autorelease || - Class == IC_AutoreleaseRV; -} - -/// \brief Test if the given class represents instructions which return their -/// argument verbatim. -static inline bool IsForwarding(InstructionClass Class) { - return Class == IC_Retain || - Class == IC_RetainRV || - Class == IC_Autorelease || - Class == IC_AutoreleaseRV || - Class == IC_NoopCast; -} - -/// \brief Test if the given class represents instructions which do nothing if -/// passed a null pointer. -static inline bool IsNoopOnNull(InstructionClass Class) { - return Class == IC_Retain || - Class == IC_RetainRV || - Class == IC_Release || - Class == IC_Autorelease || - Class == IC_AutoreleaseRV || - Class == IC_RetainBlock; -} - -/// \brief Test if the given class represents instructions which are always safe -/// to mark with the "tail" keyword. -static inline bool IsAlwaysTail(InstructionClass Class) { - // IC_RetainBlock may be given a stack argument. - return Class == IC_Retain || - Class == IC_RetainRV || - Class == IC_AutoreleaseRV; -} - -/// \brief Test if the given class represents instructions which are never safe -/// to mark with the "tail" keyword. -static inline bool IsNeverTail(InstructionClass Class) { - /// It is never safe to tail call objc_autorelease since by tail calling - /// objc_autorelease, we also tail call -[NSObject autorelease] which supports - /// fast autoreleasing causing our object to be potentially reclaimed from the - /// autorelease pool which violates the semantics of __autoreleasing types in - /// ARC. - return Class == IC_Autorelease; -} - -/// \brief Test if the given class represents instructions which are always safe -/// to mark with the nounwind attribute. -static inline bool IsNoThrow(InstructionClass Class) { - // objc_retainBlock is not nounwind because it calls user copy constructors - // which could theoretically throw. - return Class == IC_Retain || - Class == IC_RetainRV || - Class == IC_Release || - Class == IC_Autorelease || - Class == IC_AutoreleaseRV || - Class == IC_AutoreleasepoolPush || - Class == IC_AutoreleasepoolPop; -} - -/// Test whether the given instruction can autorelease any pointer or cause an -/// autoreleasepool pop. 
-static inline bool -CanInterruptRV(InstructionClass Class) { - switch (Class) { - case IC_AutoreleasepoolPop: - case IC_CallOrUser: - case IC_Call: - case IC_Autorelease: - case IC_AutoreleaseRV: - case IC_FusedRetainAutorelease: - case IC_FusedRetainAutoreleaseRV: - return true; - default: - return false; - } -} - -/// \brief Determine if F is one of the special known Functions. If it isn't, -/// return IC_CallOrUser. -InstructionClass GetFunctionClass(const Function *F); - -/// \brief Determine which objc runtime call instruction class V belongs to. -/// -/// This is similar to GetInstructionClass except that it only detects objc -/// runtime calls. This allows it to be faster. -/// -static inline InstructionClass GetBasicInstructionClass(const Value *V) { - if (const CallInst *CI = dyn_cast<CallInst>(V)) { - if (const Function *F = CI->getCalledFunction()) - return GetFunctionClass(F); - // Otherwise, be conservative. - return IC_CallOrUser; - } - - // Otherwise, be conservative. - return isa<InvokeInst>(V) ? IC_CallOrUser : IC_User; -} - -/// \brief Determine what kind of construct V is. -InstructionClass GetInstructionClass(const Value *V); - /// \brief This is a wrapper around getUnderlyingObject which also knows how to /// look through objc_retain and objc_autorelease calls, which we know to return /// their argument verbatim. static inline const Value *GetUnderlyingObjCPtr(const Value *V) { for (;;) { V = GetUnderlyingObject(V); - if (!IsForwarding(GetBasicInstructionClass(V))) + if (!IsForwarding(GetBasicARCInstKind(V))) break; V = cast<CallInst>(V)->getArgOperand(0); } @@ -229,37 +83,44 @@ static inline const Value *GetUnderlyingObjCPtr(const Value *V) { return V; } -/// \brief This is a wrapper around Value::stripPointerCasts which also knows -/// how to look through objc_retain and objc_autorelease calls, which we know to -/// return their argument verbatim. -static inline const Value *StripPointerCastsAndObjCCalls(const Value *V) { +/// The RCIdentity root of a value \p V is a dominating value U for which +/// retaining or releasing U is equivalent to retaining or releasing V. In other +/// words, ARC operations on \p V are equivalent to ARC operations on \p U. +/// +/// We use this in the ARC optimizer to make it easier to match up ARC +/// operations by always mapping ARC operations to RCIdentityRoots instead of +/// pointers themselves. +/// +/// The two ways that we see RCIdentical values in ObjC are via: +/// +/// 1. PointerCasts +/// 2. Forwarding Calls that return their argument verbatim. +/// +/// Thus this function strips off pointer casts and forwarding calls. *NOTE* +/// This implies that two RCIdentical values must alias. +static inline const Value *GetRCIdentityRoot(const Value *V) { for (;;) { V = V->stripPointerCasts(); - if (!IsForwarding(GetBasicInstructionClass(V))) + if (!IsForwarding(GetBasicARCInstKind(V))) break; V = cast<CallInst>(V)->getArgOperand(0); } return V; } -/// \brief This is a wrapper around Value::stripPointerCasts which also knows -/// how to look through objc_retain and objc_autorelease calls, which we know to -/// return their argument verbatim. -static inline Value *StripPointerCastsAndObjCCalls(Value *V) { - for (;;) { - V = V->stripPointerCasts(); - if (!IsForwarding(GetBasicInstructionClass(V))) - break; - V = cast<CallInst>(V)->getArgOperand(0); - } - return V; +/// Helper which calls const Value *GetRCIdentityRoot(const Value *V) and just +/// casts away the const of the result. 
For documentation about what an +/// RCIdentityRoot (and by extension GetRCIdentityRoot is) look at that +/// function. +static inline Value *GetRCIdentityRoot(Value *V) { + return const_cast<Value *>(GetRCIdentityRoot((const Value *)V)); } /// \brief Assuming the given instruction is one of the special calls such as -/// objc_retain or objc_release, return the argument value, stripped of no-op -/// casts and forwarding calls. -static inline Value *GetObjCArg(Value *Inst) { - return StripPointerCastsAndObjCCalls(cast<CallInst>(Inst)->getArgOperand(0)); +/// objc_retain or objc_release, return the RCIdentity root of the argument of +/// the call. +static inline Value *GetArgRCIdentityRoot(Value *Inst) { + return GetRCIdentityRoot(cast<CallInst>(Inst)->getArgOperand(0)); } static inline bool IsNullOrUndef(const Value *V) { @@ -286,8 +147,8 @@ static inline void EraseInstruction(Instruction *CI) { if (!Unused) { // Replace the return value with the argument. - assert((IsForwarding(GetBasicInstructionClass(CI)) || - (IsNoopOnNull(GetBasicInstructionClass(CI)) && + assert((IsForwarding(GetBasicARCInstKind(CI)) || + (IsNoopOnNull(GetBasicARCInstKind(CI)) && isa<ConstantPointerNull>(OldArg))) && "Can't delete non-forwarding instruction with users!"); CI->replaceAllUsesWith(OldArg); @@ -344,15 +205,15 @@ static inline bool IsPotentialRetainableObjPtr(const Value *Op, return true; } -/// \brief Helper for GetInstructionClass. Determines what kind of construct CS +/// \brief Helper for GetARCInstKind. Determines what kind of construct CS /// is. -static inline InstructionClass GetCallSiteClass(ImmutableCallSite CS) { +static inline ARCInstKind GetCallSiteClass(ImmutableCallSite CS) { for (ImmutableCallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E; ++I) if (IsPotentialRetainableObjPtr(*I)) - return CS.onlyReadsMemory() ? IC_User : IC_CallOrUser; + return CS.onlyReadsMemory() ? ARCInstKind::User : ARCInstKind::CallOrUser; - return CS.onlyReadsMemory() ? IC_None : IC_Call; + return CS.onlyReadsMemory() ? ARCInstKind::None : ARCInstKind::Call; } /// \brief Return true if this value refers to a distinct and identifiable @@ -371,7 +232,7 @@ static inline bool IsObjCIdentifiedObject(const Value *V) { if (const LoadInst *LI = dyn_cast<LoadInst>(V)) { const Value *Pointer = - StripPointerCastsAndObjCCalls(LI->getPointerOperand()); + GetRCIdentityRoot(LI->getPointerOperand()); if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Pointer)) { // A constant pointer can't be pointing to an object on the heap. It may // be reference-counted, but it won't be deleted. diff --git a/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp b/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp index 1a25391..d318643 100644 --- a/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp +++ b/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp @@ -97,11 +97,11 @@ bool ObjCARCAPElim::OptimizeBB(BasicBlock *BB) { Instruction *Push = nullptr; for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { Instruction *Inst = I++; - switch (GetBasicInstructionClass(Inst)) { - case IC_AutoreleasepoolPush: + switch (GetBasicARCInstKind(Inst)) { + case ARCInstKind::AutoreleasepoolPush: Push = Inst; break; - case IC_AutoreleasepoolPop: + case ARCInstKind::AutoreleasepoolPop: // If this pop matches a push and nothing in between can autorelease, // zap the pair. 
if (Push && cast<CallInst>(Inst)->getArgOperand(0) == Push) { @@ -115,7 +115,7 @@ bool ObjCARCAPElim::OptimizeBB(BasicBlock *BB) { } Push = nullptr; break; - case IC_CallOrUser: + case ARCInstKind::CallOrUser: if (MayAutorelease(ImmutableCallSite(Inst))) Push = nullptr; break; diff --git a/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp b/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp index c61b6b0..be291a0 100644 --- a/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp +++ b/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp @@ -59,8 +59,8 @@ ObjCARCAliasAnalysis::alias(const Location &LocA, const Location &LocB) { // First, strip off no-ops, including ObjC-specific no-ops, and try making a // precise alias query. - const Value *SA = StripPointerCastsAndObjCCalls(LocA.Ptr); - const Value *SB = StripPointerCastsAndObjCCalls(LocB.Ptr); + const Value *SA = GetRCIdentityRoot(LocA.Ptr); + const Value *SB = GetRCIdentityRoot(LocB.Ptr); AliasResult Result = AliasAnalysis::alias(Location(SA, LocA.Size, LocA.AATags), Location(SB, LocB.Size, LocB.AATags)); @@ -92,7 +92,7 @@ ObjCARCAliasAnalysis::pointsToConstantMemory(const Location &Loc, // First, strip off no-ops, including ObjC-specific no-ops, and try making // a precise alias query. - const Value *S = StripPointerCastsAndObjCCalls(Loc.Ptr); + const Value *S = GetRCIdentityRoot(Loc.Ptr); if (AliasAnalysis::pointsToConstantMemory(Location(S, Loc.Size, Loc.AATags), OrLocal)) return true; @@ -120,7 +120,7 @@ ObjCARCAliasAnalysis::getModRefBehavior(const Function *F) { return AliasAnalysis::getModRefBehavior(F); switch (GetFunctionClass(F)) { - case IC_NoopCast: + case ARCInstKind::NoopCast: return DoesNotAccessMemory; default: break; @@ -134,15 +134,15 @@ ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS, const Location &Loc) { if (!EnableARCOpts) return AliasAnalysis::getModRefInfo(CS, Loc); - switch (GetBasicInstructionClass(CS.getInstruction())) { - case IC_Retain: - case IC_RetainRV: - case IC_Autorelease: - case IC_AutoreleaseRV: - case IC_NoopCast: - case IC_AutoreleasepoolPush: - case IC_FusedRetainAutorelease: - case IC_FusedRetainAutoreleaseRV: + switch (GetBasicARCInstKind(CS.getInstruction())) { + case ARCInstKind::Retain: + case ARCInstKind::RetainRV: + case ARCInstKind::Autorelease: + case ARCInstKind::AutoreleaseRV: + case ARCInstKind::NoopCast: + case ARCInstKind::AutoreleasepoolPush: + case ARCInstKind::FusedRetainAutorelease: + case ARCInstKind::FusedRetainAutoreleaseRV: // These functions don't access any memory visible to the compiler. // Note that this doesn't include objc_retainBlock, because it updates // pointers when it copies block data. diff --git a/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/lib/Transforms/ObjCARC/ObjCARCContract.cpp index eb325eb..6473d3a 100644 --- a/lib/Transforms/ObjCARC/ObjCARCContract.cpp +++ b/lib/Transforms/ObjCARC/ObjCARCContract.cpp @@ -44,6 +44,10 @@ using namespace llvm::objcarc; STATISTIC(NumPeeps, "Number of calls peephole-optimized"); STATISTIC(NumStoreStrongs, "Number objc_storeStrong calls formed"); +//===----------------------------------------------------------------------===// +// Declarations +//===----------------------------------------------------------------------===// + namespace { /// \brief Late ARC optimizations /// @@ -68,17 +72,23 @@ namespace { /// "tail". SmallPtrSet<CallInst *, 8> StoreStrongCalls; - bool OptimizeRetainCall(Function &F, Instruction *Retain); + /// Returns true if we eliminated Inst. 
+ bool tryToPeepholeInstruction(Function &F, Instruction *Inst, + inst_iterator &Iter, + SmallPtrSetImpl<Instruction *> &DepInsts, + SmallPtrSetImpl<const BasicBlock *> &Visited, + bool &TailOkForStoreStrong); - bool ContractAutorelease(Function &F, Instruction *Autorelease, - InstructionClass Class, - SmallPtrSetImpl<Instruction *> - &DependingInstructions, - SmallPtrSetImpl<const BasicBlock *> - &Visited); + bool optimizeRetainCall(Function &F, Instruction *Retain); - void ContractRelease(Instruction *Release, - inst_iterator &Iter); + bool + contractAutorelease(Function &F, Instruction *Autorelease, + ARCInstKind Class, + SmallPtrSetImpl<Instruction *> &DependingInstructions, + SmallPtrSetImpl<const BasicBlock *> &Visited); + + void tryToContractReleaseIntoStoreStrong(Instruction *Release, + inst_iterator &Iter); void getAnalysisUsage(AnalysisUsage &AU) const override; bool doInitialization(Module &M) override; @@ -92,30 +102,15 @@ namespace { }; } -char ObjCARCContract::ID = 0; -INITIALIZE_PASS_BEGIN(ObjCARCContract, - "objc-arc-contract", "ObjC ARC contraction", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_END(ObjCARCContract, - "objc-arc-contract", "ObjC ARC contraction", false, false) - -Pass *llvm::createObjCARCContractPass() { - return new ObjCARCContract(); -} - -void ObjCARCContract::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<AliasAnalysis>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.setPreservesCFG(); -} +//===----------------------------------------------------------------------===// +// Implementation +//===----------------------------------------------------------------------===// /// Turn objc_retain into objc_retainAutoreleasedReturnValue if the operand is a /// return value. We do this late so we do not disrupt the dataflow analysis in /// ObjCARCOpt. -bool -ObjCARCContract::OptimizeRetainCall(Function &F, Instruction *Retain) { - ImmutableCallSite CS(GetObjCArg(Retain)); +bool ObjCARCContract::optimizeRetainCall(Function &F, Instruction *Retain) { + ImmutableCallSite CS(GetArgRCIdentityRoot(Retain)); const Instruction *Call = CS.getInstruction(); if (!Call) return false; @@ -147,19 +142,16 @@ ObjCARCContract::OptimizeRetainCall(Function &F, Instruction *Retain) { } /// Merge an autorelease with a retain into a fused call. -bool -ObjCARCContract::ContractAutorelease(Function &F, Instruction *Autorelease, - InstructionClass Class, - SmallPtrSetImpl<Instruction *> - &DependingInstructions, - SmallPtrSetImpl<const BasicBlock *> - &Visited) { - const Value *Arg = GetObjCArg(Autorelease); +bool ObjCARCContract::contractAutorelease( + Function &F, Instruction *Autorelease, ARCInstKind Class, + SmallPtrSetImpl<Instruction *> &DependingInstructions, + SmallPtrSetImpl<const BasicBlock *> &Visited) { + const Value *Arg = GetArgRCIdentityRoot(Autorelease); // Check that there are no instructions between the retain and the autorelease // (such as an autorelease_pop) which may change the count. 
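Editor's note: contractAutorelease, whose body follows, fuses a retain that is the sole dependency of a later autorelease on the same RC identity root into objc_retainAutorelease (or objc_retainAutoreleaseReturnValue for an autoreleaseRV) and erases the autorelease. A minimal sketch of that pairing, using toy records rather than LLVM IR; Kind, Op, and the backwards scan are illustrative simplifications of FindDependencies, not the real dependence analysis.

#include <cassert>
#include <string>
#include <vector>

// Toy instruction records (illustrative stand-ins, not LLVM classes).
enum class Kind { Retain, Autorelease, AutoreleaseRV, RetainAutorelease,
                  RetainAutoreleaseRV, PoolPop, Other };

struct Op {
  Kind K;
  std::string Root; // RC identity root of the argument, "" if none
};

// Sketch of the fusion: the autorelease at index AR folds into the retain that
// immediately precedes it on the same root.  In this toy model only unrelated
// ops of Kind::Other are transparent; anything else (a pool pop, any op on the
// same root, ...) blocks the fusion, matching the conservative intent above.
static bool contractAutorelease(std::vector<Op> &BB, size_t AR) {
  assert(BB[AR].K == Kind::Autorelease || BB[AR].K == Kind::AutoreleaseRV);
  for (size_t i = AR; i-- > 0;) {
    Op &Prev = BB[i];
    if (Prev.K == Kind::Other && Prev.Root != BB[AR].Root)
      continue; // unrelated and count-neutral in this model
    if (Prev.K != Kind::Retain || Prev.Root != BB[AR].Root)
      return false; // the dependency is not a matching retain: give up
    // Fuse: rewrite the retain, then erase the autorelease.
    Prev.K = BB[AR].K == Kind::AutoreleaseRV ? Kind::RetainAutoreleaseRV
                                             : Kind::RetainAutorelease;
    BB.erase(BB.begin() + AR);
    return true;
  }
  return false;
}

int main() {
  std::vector<Op> BB = {{Kind::Retain, "p"}, {Kind::Other, "q"},
                        {Kind::Autorelease, "p"}};
  return contractAutorelease(BB, 2) ? 0 : 1;
}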
CallInst *Retain = nullptr; - if (Class == IC_AutoreleaseRV) + if (Class == ARCInstKind::AutoreleaseRV) FindDependencies(RetainAutoreleaseRVDep, Arg, Autorelease->getParent(), Autorelease, DependingInstructions, Visited, PA); @@ -177,94 +169,208 @@ ObjCARCContract::ContractAutorelease(Function &F, Instruction *Autorelease, Retain = dyn_cast_or_null<CallInst>(*DependingInstructions.begin()); DependingInstructions.clear(); - if (!Retain || - GetBasicInstructionClass(Retain) != IC_Retain || - GetObjCArg(Retain) != Arg) + if (!Retain || GetBasicARCInstKind(Retain) != ARCInstKind::Retain || + GetArgRCIdentityRoot(Retain) != Arg) return false; Changed = true; ++NumPeeps; - DEBUG(dbgs() << "ObjCARCContract::ContractAutorelease: Fusing " - "retain/autorelease. Erasing: " << *Autorelease << "\n" - " Old Retain: " - << *Retain << "\n"); + DEBUG(dbgs() << " Fusing retain/autorelease!\n" + " Autorelease:" << *Autorelease << "\n" + " Retain: " << *Retain << "\n"); - Constant *Decl = EP.get(Class == IC_AutoreleaseRV ? - ARCRuntimeEntryPoints::EPT_RetainAutoreleaseRV : - ARCRuntimeEntryPoints::EPT_RetainAutorelease); + Constant *Decl = EP.get(Class == ARCInstKind::AutoreleaseRV + ? ARCRuntimeEntryPoints::EPT_RetainAutoreleaseRV + : ARCRuntimeEntryPoints::EPT_RetainAutorelease); Retain->setCalledFunction(Decl); - DEBUG(dbgs() << " New Retain: " - << *Retain << "\n"); + DEBUG(dbgs() << " New RetainAutorelease: " << *Retain << "\n"); EraseInstruction(Autorelease); return true; } -/// Attempt to merge an objc_release with a store, load, and objc_retain to form -/// an objc_storeStrong. This can be a little tricky because the instructions -/// don't always appear in order, and there may be unrelated intervening -/// instructions. -void ObjCARCContract::ContractRelease(Instruction *Release, - inst_iterator &Iter) { - LoadInst *Load = dyn_cast<LoadInst>(GetObjCArg(Release)); - if (!Load || !Load->isSimple()) return; +static StoreInst *findSafeStoreForStoreStrongContraction(LoadInst *Load, + Instruction *Release, + ProvenanceAnalysis &PA, + AliasAnalysis *AA) { + StoreInst *Store = nullptr; + bool SawRelease = false; - // For now, require everything to be in one basic block. - BasicBlock *BB = Release->getParent(); - if (Load->getParent() != BB) return; + // Get the location associated with Load. + AliasAnalysis::Location Loc = AA->getLocation(Load); // Walk down to find the store and the release, which may be in either order. - BasicBlock::iterator I = Load, End = BB->end(); - ++I; - AliasAnalysis::Location Loc = AA->getLocation(Load); - StoreInst *Store = nullptr; - bool SawRelease = false; - for (; !Store || !SawRelease; ++I) { - if (I == End) - return; + for (auto I = std::next(BasicBlock::iterator(Load)), + E = Load->getParent()->end(); + I != E; ++I) { + // If we found the store we were looking for and saw the release, + // break. There is no more work to be done. + if (Store && SawRelease) + break; - Instruction *Inst = I; + // Now we know that we have not seen either the store or the release. If I + // is the the release, mark that we saw the release and continue. + Instruction *Inst = &*I; if (Inst == Release) { SawRelease = true; continue; } - InstructionClass Class = GetBasicInstructionClass(Inst); + // Otherwise, we check if Inst is a "good" store. Grab the instruction class + // of Inst. + ARCInstKind Class = GetBasicARCInstKind(Inst); - // Unrelated retains are harmless. + // If Inst is an unrelated retain, we don't care about it. 
+ // + // TODO: This is one area where the optimization could be made more + // aggressive. if (IsRetain(Class)) continue; + // If we have seen the store, but not the release... if (Store) { - // The store is the point where we're going to put the objc_storeStrong, - // so make sure there are no uses after it. - if (CanUse(Inst, Load, PA, Class)) - return; - } else if (AA->getModRefInfo(Inst, Loc) & AliasAnalysis::Mod) { - // We are moving the load down to the store, so check for anything - // else which writes to the memory between the load and the store. - Store = dyn_cast<StoreInst>(Inst); - if (!Store || !Store->isSimple()) return; - if (Store->getPointerOperand() != Loc.Ptr) return; + // We need to make sure that it is safe to move the release from its + // current position to the store. This implies proving that any + // instruction in between Store and the Release conservatively can not use + // the RCIdentityRoot of Release. If we can prove we can ignore Inst, so + // continue... + if (!CanUse(Inst, Load, PA, Class)) { + continue; + } + + // Otherwise, be conservative and return nullptr. + return nullptr; } + + // Ok, now we know we have not seen a store yet. See if Inst can write to + // our load location, if it can not, just ignore the instruction. + if (!(AA->getModRefInfo(Inst, Loc) & AliasAnalysis::Mod)) + continue; + + Store = dyn_cast<StoreInst>(Inst); + + // If Inst can, then check if Inst is a simple store. If Inst is not a + // store or a store that is not simple, then we have some we do not + // understand writing to this memory implying we can not move the load + // over the write to any subsequent store that we may find. + if (!Store || !Store->isSimple()) + return nullptr; + + // Then make sure that the pointer we are storing to is Ptr. If so, we + // found our Store! + if (Store->getPointerOperand() == Loc.Ptr) + continue; + + // Otherwise, we have an unknown store to some other ptr that clobbers + // Loc.Ptr. Bail! + return nullptr; } - Value *New = StripPointerCastsAndObjCCalls(Store->getValueOperand()); + // If we did not find the store or did not see the release, fail. + if (!Store || !SawRelease) + return nullptr; + + // We succeeded! + return Store; +} - // Walk up to find the retain. - I = Store; - BasicBlock::iterator Begin = BB->begin(); - while (I != Begin && GetBasicInstructionClass(I) != IC_Retain) +static Instruction * +findRetainForStoreStrongContraction(Value *New, StoreInst *Store, + Instruction *Release, + ProvenanceAnalysis &PA) { + // Walk up from the Store to find the retain. + BasicBlock::iterator I = Store; + BasicBlock::iterator Begin = Store->getParent()->begin(); + while (I != Begin && GetBasicARCInstKind(I) != ARCInstKind::Retain) { + Instruction *Inst = &*I; + + // It is only safe to move the retain to the store if we can prove + // conservatively that nothing besides the release can decrement reference + // counts in between the retain and the store. + if (CanDecrementRefCount(Inst, New, PA) && Inst != Release) + return nullptr; --I; + } Instruction *Retain = I; - if (GetBasicInstructionClass(Retain) != IC_Retain) return; - if (GetObjCArg(Retain) != New) return; + if (GetBasicARCInstKind(Retain) != ARCInstKind::Retain) + return nullptr; + if (GetArgRCIdentityRoot(Retain) != New) + return nullptr; + return Retain; +} + +/// Attempt to merge an objc_release with a store, load, and objc_retain to form +/// an objc_storeStrong. 
An objc_storeStrong: +/// +/// objc_storeStrong(i8** %old_ptr, i8* new_value) +/// +/// is equivalent to the following IR sequence: +/// +/// ; Load old value. +/// %old_value = load i8** %old_ptr (1) +/// +/// ; Increment the new value and then release the old value. This must occur +/// ; in order in case old_value releases new_value in its destructor causing +/// ; us to potentially have a dangling ptr. +/// tail call i8* @objc_retain(i8* %new_value) (2) +/// tail call void @objc_release(i8* %old_value) (3) +/// +/// ; Store the new_value into old_ptr +/// store i8* %new_value, i8** %old_ptr (4) +/// +/// The safety of this optimization is based around the following +/// considerations: +/// +/// 1. We are forming the store strong at the store. Thus to perform this +/// optimization it must be safe to move the retain, load, and release to +/// (4). +/// 2. We need to make sure that any re-orderings of (1), (2), (3), (4) are +/// safe. +void ObjCARCContract::tryToContractReleaseIntoStoreStrong(Instruction *Release, + inst_iterator &Iter) { + // See if we are releasing something that we just loaded. + auto *Load = dyn_cast<LoadInst>(GetArgRCIdentityRoot(Release)); + if (!Load || !Load->isSimple()) + return; + + // For now, require everything to be in one basic block. + BasicBlock *BB = Release->getParent(); + if (Load->getParent() != BB) + return; + + // First scan down the BB from Load, looking for a store of the RCIdentityRoot + // of Load's + StoreInst *Store = + findSafeStoreForStoreStrongContraction(Load, Release, PA, AA); + // If we fail, bail. + if (!Store) + return; + + // Then find what new_value's RCIdentity Root is. + Value *New = GetRCIdentityRoot(Store->getValueOperand()); + + // Then walk up the BB and look for a retain on New without any intervening + // instructions which conservatively might decrement ref counts. + Instruction *Retain = + findRetainForStoreStrongContraction(New, Store, Release, PA); + + // If we fail, bail. + if (!Retain) + return; Changed = true; ++NumStoreStrongs; + DEBUG( + llvm::dbgs() << " Contracting retain, release into objc_storeStrong.\n" + << " Old:\n" + << " Store: " << *Store << "\n" + << " Release: " << *Release << "\n" + << " Retain: " << *Retain << "\n" + << " Load: " << *Load << "\n"); + LLVMContext &C = Release->getContext(); Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); Type *I8XX = PointerType::getUnqual(I8X); @@ -284,6 +390,8 @@ void ObjCARCContract::ContractRelease(Instruction *Release, // we can set the tail flag once we know it's safe. StoreStrongCalls.insert(StoreStrong); + DEBUG(llvm::dbgs() << " New Store Strong: " << *StoreStrong << "\n"); + if (&*Iter == Store) ++Iter; Store->eraseFromParent(); Release->eraseFromParent(); @@ -292,85 +400,34 @@ void ObjCARCContract::ContractRelease(Instruction *Release, Load->eraseFromParent(); } -bool ObjCARCContract::doInitialization(Module &M) { - // If nothing in the Module uses ARC, don't do anything. - Run = ModuleHasARC(M); - if (!Run) - return false; - - EP.Initialize(&M); - - // Initialize RetainRVMarker. 
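Editor's note: the comment block earlier in this hunk defines objc_storeStrong(%old_ptr, %new_value) as load-old / retain-new / release-old / store-new, in that order, and the ordering is the whole point: releasing the old value first could destroy the last reference keeping the new value alive. The self-contained toy below makes the hazard concrete with the degenerate case where old and new are the same object; Obj, retain, release and storeStrong are illustrative stand-ins, not the Objective-C runtime.

#include <cassert>
#include <cstdio>

// A manually reference-counted object (stand-in for an ObjC object).
struct Obj {
  int RefCount = 1;
  bool Alive = true;
};

static void retain(Obj *O) { if (O) ++O->RefCount; }
static void release(Obj *O) {
  if (O && --O->RefCount == 0)
    O->Alive = false; // "deallocated"
}

// Sketch of the contract documented above:
//   %old = load *Slot;  retain(New);  release(%old);  *Slot = New;
static void storeStrong(Obj **Slot, Obj *New) {
  Obj *Old = *Slot;   // (1) load the old value
  retain(New);        // (2) retain the new value first...
  release(Old);       // (3) ...so releasing the old value cannot free it
  *Slot = New;        // (4) store the new value
}

int main() {
  Obj A; // refcount 1
  Obj *Slot = &A;
  // Store the object into its own slot: with retain-before-release it
  // survives; releasing first would have dropped the count to zero early.
  storeStrong(&Slot, &A);
  assert(A.Alive && A.RefCount == 1);
  std::printf("alive=%d refcount=%d\n", (int)A.Alive, A.RefCount);
}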
- RetainRVMarker = nullptr; - if (NamedMDNode *NMD = - M.getNamedMetadata("clang.arc.retainAutoreleasedReturnValueMarker")) - if (NMD->getNumOperands() == 1) { - const MDNode *N = NMD->getOperand(0); - if (N->getNumOperands() == 1) - if (const MDString *S = dyn_cast<MDString>(N->getOperand(0))) - RetainRVMarker = S; - } - - return false; -} - -bool ObjCARCContract::runOnFunction(Function &F) { - if (!EnableARCOpts) - return false; - - // If nothing in the Module uses ARC, don't do anything. - if (!Run) - return false; - - Changed = false; - AA = &getAnalysis<AliasAnalysis>(); - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - - PA.setAA(&getAnalysis<AliasAnalysis>()); - - // Track whether it's ok to mark objc_storeStrong calls with the "tail" - // keyword. Be conservative if the function has variadic arguments. - // It seems that functions which "return twice" are also unsafe for the - // "tail" argument, because they are setjmp, which could need to - // return to an earlier stack state. - bool TailOkForStoreStrongs = !F.isVarArg() && - !F.callsFunctionThatReturnsTwice(); - - // For ObjC library calls which return their argument, replace uses of the - // argument with uses of the call return value, if it dominates the use. This - // reduces register pressure. - SmallPtrSet<Instruction *, 4> DependingInstructions; - SmallPtrSet<const BasicBlock *, 4> Visited; - for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) { - Instruction *Inst = &*I++; - - DEBUG(dbgs() << "ObjCARCContract: Visiting: " << *Inst << "\n"); - +bool ObjCARCContract::tryToPeepholeInstruction( + Function &F, Instruction *Inst, inst_iterator &Iter, + SmallPtrSetImpl<Instruction *> &DependingInsts, + SmallPtrSetImpl<const BasicBlock *> &Visited, + bool &TailOkForStoreStrongs) { // Only these library routines return their argument. In particular, // objc_retainBlock does not necessarily return its argument. - InstructionClass Class = GetBasicInstructionClass(Inst); + ARCInstKind Class = GetBasicARCInstKind(Inst); switch (Class) { - case IC_FusedRetainAutorelease: - case IC_FusedRetainAutoreleaseRV: - break; - case IC_Autorelease: - case IC_AutoreleaseRV: - if (ContractAutorelease(F, Inst, Class, DependingInstructions, Visited)) - continue; - break; - case IC_Retain: + case ARCInstKind::FusedRetainAutorelease: + case ARCInstKind::FusedRetainAutoreleaseRV: + return false; + case ARCInstKind::Autorelease: + case ARCInstKind::AutoreleaseRV: + return contractAutorelease(F, Inst, Class, DependingInsts, Visited); + case ARCInstKind::Retain: // Attempt to convert retains to retainrvs if they are next to function // calls. - if (!OptimizeRetainCall(F, Inst)) - break; + if (!optimizeRetainCall(F, Inst)) + return false; // If we succeed in our optimization, fall through. // FALLTHROUGH - case IC_RetainRV: { + case ARCInstKind::RetainRV: { // If we're compiling for a target which needs a special inline-asm // marker to do the retainAutoreleasedReturnValue optimization, // insert it now. 
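Editor's note: both optimizeRetainCall and the marker insertion that follows depend on the (retainRV) call sitting immediately after the call producing its argument, with at most no-op instructions in between; otherwise the runtime handshake cannot fire and the pass declines the optimization. A minimal standalone model of that adjacency check; Op and retainRVImmediatelyFollowsItsCall are illustrative names, and the real code additionally handles invokes reached through predecessors.

#include <string>
#include <vector>

// Illustrative stand-ins for instructions in a basic block.
struct Op {
  std::string Produces;  // value this op defines ("" if none)
  std::string Argument;  // for the retainRV: the value it retains
  bool IsNoop;           // e.g. a no-op bitcast
  bool IsCall;
};

// Walk backwards from the retainRV, skipping no-ops, and require the first
// real instruction to be the call that produces the retained value.
static bool retainRVImmediatelyFollowsItsCall(const std::vector<Op> &BB,
                                              size_t RV) {
  size_t I = RV;
  while (I > 0) {
    --I;
    if (BB[I].IsNoop)
      continue; // skip no-op casts, as the pass does
    return BB[I].IsCall && BB[I].Produces == BB[RV].Argument;
  }
  return false;
}

int main() {
  std::vector<Op> BB = {{"ret", "", false, true},   // call producing %ret
                        {"", "", true, false},      // no-op bitcast
                        {"", "ret", false, false}}; // retainRV(%ret)
  return retainRVImmediatelyFollowsItsCall(BB, 2) ? 0 : 1;
}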
if (!RetainRVMarker) - break; + return false; BasicBlock::iterator BBI = Inst; BasicBlock *InstParent = Inst->getParent(); @@ -388,8 +445,8 @@ bool ObjCARCContract::runOnFunction(Function &F) { --BBI; } while (IsNoopInstruction(BBI)); - if (&*BBI == GetObjCArg(Inst)) { - DEBUG(dbgs() << "ObjCARCContract: Adding inline asm marker for " + if (&*BBI == GetArgRCIdentityRoot(Inst)) { + DEBUG(dbgs() << "Adding inline asm marker for " "retainAutoreleasedReturnValue optimization.\n"); Changed = true; InlineAsm *IA = @@ -400,9 +457,9 @@ bool ObjCARCContract::runOnFunction(Function &F) { CallInst::Create(IA, "", Inst); } decline_rv_optimization: - break; + return false; } - case IC_InitWeak: { + case ARCInstKind::InitWeak: { // objc_initWeak(p, null) => *p = null CallInst *CI = cast<CallInst>(Inst); if (IsNullOrUndef(CI->getArgOperand(1))) { @@ -417,31 +474,80 @@ bool ObjCARCContract::runOnFunction(Function &F) { CI->replaceAllUsesWith(Null); CI->eraseFromParent(); } - continue; + return true; } - case IC_Release: - ContractRelease(Inst, I); - continue; - case IC_User: + case ARCInstKind::Release: + // Try to form an objc store strong from our release. If we fail, there is + // nothing further to do below, so continue. + tryToContractReleaseIntoStoreStrong(Inst, Iter); + return true; + case ARCInstKind::User: // Be conservative if the function has any alloca instructions. // Technically we only care about escaping alloca instructions, // but this is sufficient to handle some interesting cases. if (isa<AllocaInst>(Inst)) TailOkForStoreStrongs = false; - continue; - case IC_IntrinsicUser: + return true; + case ARCInstKind::IntrinsicUser: // Remove calls to @clang.arc.use(...). Inst->eraseFromParent(); - continue; + return true; default: - continue; + return true; } +} + +//===----------------------------------------------------------------------===// +// Top Level Driver +//===----------------------------------------------------------------------===// + +bool ObjCARCContract::runOnFunction(Function &F) { + if (!EnableARCOpts) + return false; + + // If nothing in the Module uses ARC, don't do anything. + if (!Run) + return false; - DEBUG(dbgs() << "ObjCARCContract: Finished List.\n\n"); + Changed = false; + AA = &getAnalysis<AliasAnalysis>(); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + + PA.setAA(&getAnalysis<AliasAnalysis>()); + + DEBUG(llvm::dbgs() << "**** ObjCARC Contract ****\n"); + + // Track whether it's ok to mark objc_storeStrong calls with the "tail" + // keyword. Be conservative if the function has variadic arguments. + // It seems that functions which "return twice" are also unsafe for the + // "tail" argument, because they are setjmp, which could need to + // return to an earlier stack state. + bool TailOkForStoreStrongs = + !F.isVarArg() && !F.callsFunctionThatReturnsTwice(); - // Don't use GetObjCArg because we don't want to look through bitcasts + // For ObjC library calls which return their argument, replace uses of the + // argument with uses of the call return value, if it dominates the use. This + // reduces register pressure. + SmallPtrSet<Instruction *, 4> DependingInstructions; + SmallPtrSet<const BasicBlock *, 4> Visited; + for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E;) { + Instruction *Inst = &*I++; + + DEBUG(dbgs() << "Visiting: " << *Inst << "\n"); + + // First try to peephole Inst. If there is nothing further we can do in + // terms of undoing objc-arc-expand, process the next inst. 
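Editor's note: the driver loop above advances the iterator before visiting the instruction (`Instruction *Inst = &*I++;`), and the storeStrong path bumps Iter past the Store before erasing it. Both are the standard advance-before-erase idiom for mutating a container while walking it; a small standalone illustration with std::list (the contents are arbitrary).

#include <cassert>
#include <list>

int main() {
  std::list<int> L = {1, 2, 3, 4, 5};

  // Advance the iterator before erasing the element it points at, the same
  // shape as `Instruction *Inst = &*I++;` above: once I has moved on,
  // deleting the current element cannot invalidate the loop.
  for (auto I = L.begin(), E = L.end(); I != E;) {
    auto Cur = I++;
    if (*Cur % 2 == 0)
      L.erase(Cur); // only Cur is invalidated; I and E stay valid
  }

  assert((L == std::list<int>{1, 3, 5}));
  return 0;
}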
+ if (tryToPeepholeInstruction(F, Inst, I, DependingInstructions, Visited, + TailOkForStoreStrongs)) + continue; + + // Otherwise, try to undo objc-arc-expand. + + // Don't use GetArgRCIdentityRoot because we don't want to look through bitcasts // and such; to do the replacement, the argument must have type i8*. Value *Arg = cast<CallInst>(Inst)->getArgOperand(0); + + // TODO: Change this to a do-while. for (;;) { // If we're compiling bugpointed code, don't get in trouble. if (!isa<Instruction>(Arg) && !isa<Argument>(Arg)) @@ -458,7 +564,7 @@ bool ObjCARCContract::runOnFunction(Function &F) { // reachability here because an unreachable call is considered to // trivially dominate itself, which would lead us to rewriting its // argument in terms of its return value, which would lead to - // infinite loops in GetObjCArg. + // infinite loops in GetArgRCIdentityRoot. if (DT->isReachableFromEntry(U) && DT->dominates(Inst, U)) { Changed = true; Instruction *Replacement = Inst; @@ -514,3 +620,45 @@ bool ObjCARCContract::runOnFunction(Function &F) { return Changed; } + +//===----------------------------------------------------------------------===// +// Misc Pass Manager +//===----------------------------------------------------------------------===// + +char ObjCARCContract::ID = 0; +INITIALIZE_PASS_BEGIN(ObjCARCContract, "objc-arc-contract", + "ObjC ARC contraction", false, false) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(ObjCARCContract, "objc-arc-contract", + "ObjC ARC contraction", false, false) + +void ObjCARCContract::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<AliasAnalysis>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.setPreservesCFG(); +} + +Pass *llvm::createObjCARCContractPass() { return new ObjCARCContract(); } + +bool ObjCARCContract::doInitialization(Module &M) { + // If nothing in the Module uses ARC, don't do anything. + Run = ModuleHasARC(M); + if (!Run) + return false; + + EP.Initialize(&M); + + // Initialize RetainRVMarker. + RetainRVMarker = nullptr; + if (NamedMDNode *NMD = + M.getNamedMetadata("clang.arc.retainAutoreleasedReturnValueMarker")) + if (NMD->getNumOperands() == 1) { + const MDNode *N = NMD->getOperand(0); + if (N->getNumOperands() == 1) + if (const MDString *S = dyn_cast<MDString>(N->getOperand(0))) + RetainRVMarker = S; + } + + return false; +} diff --git a/lib/Transforms/ObjCARC/ObjCARCExpand.cpp b/lib/Transforms/ObjCARC/ObjCARCExpand.cpp index bf9fcbb..53c19c3 100644 --- a/lib/Transforms/ObjCARC/ObjCARCExpand.cpp +++ b/lib/Transforms/ObjCARC/ObjCARCExpand.cpp @@ -99,13 +99,13 @@ bool ObjCARCExpand::runOnFunction(Function &F) { DEBUG(dbgs() << "ObjCARCExpand: Visiting: " << *Inst << "\n"); - switch (GetBasicInstructionClass(Inst)) { - case IC_Retain: - case IC_RetainRV: - case IC_Autorelease: - case IC_AutoreleaseRV: - case IC_FusedRetainAutorelease: - case IC_FusedRetainAutoreleaseRV: { + switch (GetBasicARCInstKind(Inst)) { + case ARCInstKind::Retain: + case ARCInstKind::RetainRV: + case ARCInstKind::Autorelease: + case ARCInstKind::AutoreleaseRV: + case ARCInstKind::FusedRetainAutorelease: + case ARCInstKind::FusedRetainAutoreleaseRV: { // These calls return their argument verbatim, as a low-level // optimization. However, this makes high-level optimizations // harder. 
Undo any uses of this optimization that the front-end diff --git a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp index 95c6674..f55b77f 100644 --- a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp +++ b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp @@ -144,7 +144,7 @@ namespace { /// \defgroup ARCUtilities Utility declarations/definitions specific to ARC. /// @{ -/// \brief This is similar to StripPointerCastsAndObjCCalls but it stops as soon +/// \brief This is similar to GetRCIdentityRoot but it stops as soon /// as it finds a value with multiple uses. static const Value *FindSingleUseIdentifiedObject(const Value *Arg) { if (Arg->hasOneUse()) { @@ -153,7 +153,7 @@ static const Value *FindSingleUseIdentifiedObject(const Value *Arg) { if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Arg)) if (GEP->hasAllZeroIndices()) return FindSingleUseIdentifiedObject(GEP->getPointerOperand()); - if (IsForwarding(GetBasicInstructionClass(Arg))) + if (IsForwarding(GetBasicARCInstKind(Arg))) return FindSingleUseIdentifiedObject( cast<CallInst>(Arg)->getArgOperand(0)); if (!IsObjCIdentifiedObject(Arg)) @@ -165,7 +165,7 @@ static const Value *FindSingleUseIdentifiedObject(const Value *Arg) { // trivial uses, we can still consider this to be a single-use value. if (IsObjCIdentifiedObject(Arg)) { for (const User *U : Arg->users()) - if (!U->use_empty() || StripPointerCastsAndObjCCalls(U) != Arg) + if (!U->use_empty() || GetRCIdentityRoot(U) != Arg) return nullptr; return Arg; @@ -880,11 +880,9 @@ static void AppendMDNodeToInstForPtr(unsigned NodeId, Sequence OldSeq, Sequence NewSeq) { MDNode *Node = nullptr; - Value *tmp[3] = {PtrSourceMDNodeID, - SequenceToMDString(Inst->getContext(), - OldSeq), - SequenceToMDString(Inst->getContext(), - NewSeq)}; + Metadata *tmp[3] = {PtrSourceMDNodeID, + SequenceToMDString(Inst->getContext(), OldSeq), + SequenceToMDString(Inst->getContext(), NewSeq)}; Node = MDNode::get(Inst->getContext(), tmp); Inst->setMetadata(NodeId, Node); @@ -1098,7 +1096,7 @@ namespace { bool OptimizeRetainRVCall(Function &F, Instruction *RetainRV); void OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV, - InstructionClass &Class); + ARCInstKind &Class); void OptimizeIndividualCalls(Function &F); void CheckForCFGHazards(const BasicBlock *BB, @@ -1193,7 +1191,7 @@ void ObjCARCOpt::getAnalysisUsage(AnalysisUsage &AU) const { bool ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { // Check for the argument being from an immediately preceding call or invoke. - const Value *Arg = GetObjCArg(RetainRV); + const Value *Arg = GetArgRCIdentityRoot(RetainRV); ImmutableCallSite CS(Arg); if (const Instruction *Call = CS.getInstruction()) { if (Call->getParent() == RetainRV->getParent()) { @@ -1218,8 +1216,8 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { BasicBlock::iterator I = RetainRV, Begin = RetainRV->getParent()->begin(); if (I != Begin) { do --I; while (I != Begin && IsNoopInstruction(I)); - if (GetBasicInstructionClass(I) == IC_AutoreleaseRV && - GetObjCArg(I) == Arg) { + if (GetBasicARCInstKind(I) == ARCInstKind::AutoreleaseRV && + GetArgRCIdentityRoot(I) == Arg) { Changed = true; ++NumPeeps; @@ -1250,17 +1248,17 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { /// Turn objc_autoreleaseReturnValue into objc_autorelease if the result is not /// used as a return value. 
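Editor's note: the doc comment above describes OptimizeAutoreleaseRVCall, whose definition follows: it walks the users of the autoreleased pointer, looking through bitcasts, and only if no user is a return or a retainRV does it demote objc_autoreleaseReturnValue to plain objc_autorelease (and clear the tail-call flag). A hedged standalone sketch of that user scan; ToyValue and UserKind are toy structures, not LLVM's use lists.

#include <vector>

// Toy value graph (illustrative only): each value records its users.
enum class UserKind { Return, RetainRV, Bitcast, Other };

struct ToyValue {
  UserKind Kind;
  std::vector<ToyValue *> Users;
};

// The pointer "is used as a return value" if some transitive user, looking
// through bitcasts, is a return or a retainRV.
static bool usedAsReturnValue(ToyValue *Ptr) {
  std::vector<ToyValue *> Worklist = {Ptr};
  while (!Worklist.empty()) {
    ToyValue *V = Worklist.back();
    Worklist.pop_back();
    for (ToyValue *U : V->Users) {
      if (U->Kind == UserKind::Return || U->Kind == UserKind::RetainRV)
        return true;
      if (U->Kind == UserKind::Bitcast)
        Worklist.push_back(U); // look through the cast at its users
    }
  }
  return false;
}

int main() {
  ToyValue Ret{UserKind::Return, {}};
  ToyValue Cast{UserKind::Bitcast, {&Ret}};
  ToyValue Ptr{UserKind::Other, {&Cast}};
  // Used (through a bitcast) as a return value: keep autoreleaseRV.
  return usedAsReturnValue(&Ptr) ? 0 : 1;
}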
-void -ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV, - InstructionClass &Class) { +void ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, + Instruction *AutoreleaseRV, + ARCInstKind &Class) { // Check for a return of the pointer value. - const Value *Ptr = GetObjCArg(AutoreleaseRV); + const Value *Ptr = GetArgRCIdentityRoot(AutoreleaseRV); SmallVector<const Value *, 2> Users; Users.push_back(Ptr); do { Ptr = Users.pop_back_val(); for (const User *U : Ptr->users()) { - if (isa<ReturnInst>(U) || GetBasicInstructionClass(U) == IC_RetainRV) + if (isa<ReturnInst>(U) || GetBasicARCInstKind(U) == ARCInstKind::RetainRV) return; if (isa<BitCastInst>(U)) Users.push_back(U); @@ -1279,7 +1277,7 @@ ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV, Constant *NewDecl = EP.get(ARCRuntimeEntryPoints::EPT_Autorelease); AutoreleaseRVCI->setCalledFunction(NewDecl); AutoreleaseRVCI->setTailCall(false); // Never tail call objc_autorelease. - Class = IC_Autorelease; + Class = ARCInstKind::Autorelease; DEBUG(dbgs() << "New: " << *AutoreleaseRV << "\n"); @@ -1296,7 +1294,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) { Instruction *Inst = &*I++; - InstructionClass Class = GetBasicInstructionClass(Inst); + ARCInstKind Class = GetBasicARCInstKind(Inst); DEBUG(dbgs() << "Visiting: Class: " << Class << "; " << *Inst << "\n"); @@ -1311,7 +1309,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { // There are gray areas here, as the ability to cast reference-counted // pointers to raw void* and back allows code to break ARC assumptions, // however these are currently considered to be unimportant. - case IC_NoopCast: + case ARCInstKind::NoopCast: Changed = true; ++NumNoops; DEBUG(dbgs() << "Erasing no-op cast: " << *Inst << "\n"); @@ -1319,11 +1317,11 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { continue; // If the pointer-to-weak-pointer is null, it's undefined behavior. - case IC_StoreWeak: - case IC_LoadWeak: - case IC_LoadWeakRetained: - case IC_InitWeak: - case IC_DestroyWeak: { + case ARCInstKind::StoreWeak: + case ARCInstKind::LoadWeak: + case ARCInstKind::LoadWeakRetained: + case ARCInstKind::InitWeak: + case ARCInstKind::DestroyWeak: { CallInst *CI = cast<CallInst>(Inst); if (IsNullOrUndef(CI->getArgOperand(0))) { Changed = true; @@ -1340,8 +1338,8 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { } break; } - case IC_CopyWeak: - case IC_MoveWeak: { + case ARCInstKind::CopyWeak: + case ARCInstKind::MoveWeak: { CallInst *CI = cast<CallInst>(Inst); if (IsNullOrUndef(CI->getArgOperand(0)) || IsNullOrUndef(CI->getArgOperand(1))) { @@ -1361,11 +1359,11 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { } break; } - case IC_RetainRV: + case ARCInstKind::RetainRV: if (OptimizeRetainRVCall(F, Inst)) continue; break; - case IC_AutoreleaseRV: + case ARCInstKind::AutoreleaseRV: OptimizeAutoreleaseRVCall(F, Inst, Class); break; } @@ -1393,7 +1391,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { EraseInstruction(Call); Inst = NewCall; - Class = IC_Release; + Class = ARCInstKind::Release; } } @@ -1424,11 +1422,11 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { } if (!IsNoopOnNull(Class)) { - UsedInThisFunction |= 1 << Class; + UsedInThisFunction |= 1 << unsigned(Class); continue; } - const Value *Arg = GetObjCArg(Inst); + const Value *Arg = GetArgRCIdentityRoot(Inst); // ARC calls with null are no-ops. Delete them. 
if (IsNullOrUndef(Arg)) { @@ -1442,7 +1440,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { // Keep track of which of retain, release, autorelease, and retain_block // are actually present in this function. - UsedInThisFunction |= 1 << Class; + UsedInThisFunction |= 1 << unsigned(Class); // If Arg is a PHI, and one or more incoming values to the // PHI are null, and the call is control-equivalent to the PHI, and there @@ -1465,7 +1463,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { bool HasCriticalEdges = false; for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { Value *Incoming = - StripPointerCastsAndObjCCalls(PN->getIncomingValue(i)); + GetRCIdentityRoot(PN->getIncomingValue(i)); if (IsNullOrUndef(Incoming)) HasNull = true; else if (cast<TerminatorInst>(PN->getIncomingBlock(i)->back()) @@ -1482,25 +1480,25 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { // Check that there is nothing that cares about the reference // count between the call and the phi. switch (Class) { - case IC_Retain: - case IC_RetainBlock: + case ARCInstKind::Retain: + case ARCInstKind::RetainBlock: // These can always be moved up. break; - case IC_Release: + case ARCInstKind::Release: // These can't be moved across things that care about the retain // count. FindDependencies(NeedsPositiveRetainCount, Arg, Inst->getParent(), Inst, DependingInstructions, Visited, PA); break; - case IC_Autorelease: + case ARCInstKind::Autorelease: // These can't be moved across autorelease pool scope boundaries. FindDependencies(AutoreleasePoolBoundary, Arg, Inst->getParent(), Inst, DependingInstructions, Visited, PA); break; - case IC_RetainRV: - case IC_AutoreleaseRV: + case ARCInstKind::RetainRV: + case ARCInstKind::AutoreleaseRV: // Don't move these; the RV optimization depends on the autoreleaseRV // being tail called, and the retainRV being immediately after a call // (which might still happen if we get lucky with codegen layout, but @@ -1519,7 +1517,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { Type *ParamTy = CInst->getArgOperand(0)->getType(); for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { Value *Incoming = - StripPointerCastsAndObjCCalls(PN->getIncomingValue(i)); + GetRCIdentityRoot(PN->getIncomingValue(i)); if (!IsNullOrUndef(Incoming)) { CallInst *Clone = cast<CallInst>(CInst->clone()); Value *Op = PN->getIncomingValue(i); @@ -1713,14 +1711,14 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, MapVector<Value *, RRInfo> &Retains, BBState &MyStates) { bool NestingDetected = false; - InstructionClass Class = GetInstructionClass(Inst); + ARCInstKind Class = GetARCInstKind(Inst); const Value *Arg = nullptr; DEBUG(dbgs() << "Class: " << Class << "\n"); switch (Class) { - case IC_Release: { - Arg = GetObjCArg(Inst); + case ARCInstKind::Release: { + Arg = GetArgRCIdentityRoot(Inst); PtrState &S = MyStates.getPtrBottomUpState(Arg); @@ -1747,14 +1745,14 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, S.SetKnownPositiveRefCount(); break; } - case IC_RetainBlock: + case ARCInstKind::RetainBlock: // In OptimizeIndividualCalls, we have strength reduced all optimizable // objc_retainBlocks to objc_retains. Thus at this point any // objc_retainBlocks that we see are not optimizable. 
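Editor's note: VisitInstructionBottomUp, shown in this hunk, drives a small per-pointer state machine: a release starts tracking a candidate pair, intervening uses and possible decrements advance the state, and a matching retain (other than a retainRV, which is better left next to its call) completes a retain/release pair. The sketch below models only that release-then-retain half of the lattice; Seq, Kind, and findPairsBottomUp are simplified stand-ins for PtrState/RRInfo and omit the KnownSafe and CFG-hazard machinery.

#include <cstdio>
#include <map>
#include <string>
#include <utility>
#include <vector>

// Simplified bottom-up states for one pointer (stand-in for PtrState).
enum class Seq { None, Release, Use, CanRelease };

enum class Kind { Retain, RetainRV, Release, MayUse, MayDecrement, Other };

struct Op {
  Kind K;
  std::string Root; // RC identity root this op applies to ("" for none)
};

// Walk the block bottom-up and report retain/release pairs that the sketch
// considers removable.
static std::vector<std::pair<size_t, size_t>>
findPairsBottomUp(const std::vector<Op> &BB) {
  std::map<std::string, Seq> State;
  std::map<std::string, size_t> PendingRelease;
  std::vector<std::pair<size_t, size_t>> Pairs;

  for (size_t i = BB.size(); i-- > 0;) {
    const Op &O = BB[i];
    switch (O.K) {
    case Kind::Release:
      State[O.Root] = Seq::Release; // start tracking a candidate pair
      PendingRelease[O.Root] = i;
      break;
    case Kind::Retain:
      if (State[O.Root] != Seq::None)
        Pairs.push_back({i, PendingRelease[O.Root]});
      State[O.Root] = Seq::None;
      break;
    case Kind::RetainRV:
      State[O.Root] = Seq::None; // leave retainRV in place, do not pair it
      break;
    case Kind::MayUse:
      if (State[O.Root] == Seq::Release)
        State[O.Root] = Seq::Use;
      break;
    case Kind::MayDecrement:
      if (State[O.Root] == Seq::Release || State[O.Root] == Seq::Use)
        State[O.Root] = Seq::CanRelease;
      break;
    case Kind::Other:
      break;
    }
  }
  return Pairs;
}

int main() {
  std::vector<Op> BB = {{Kind::Retain, "p"}, {Kind::MayUse, "p"},
                        {Kind::Release, "p"}};
  std::printf("pairs found: %zu\n", findPairsBottomUp(BB).size());
}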
break; - case IC_Retain: - case IC_RetainRV: { - Arg = GetObjCArg(Inst); + case ARCInstKind::Retain: + case ARCInstKind::RetainRV: { + Arg = GetArgRCIdentityRoot(Inst); PtrState &S = MyStates.getPtrBottomUpState(Arg); S.SetKnownPositiveRefCount(); @@ -1771,9 +1769,10 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, S.ClearReverseInsertPts(); // FALL THROUGH case S_CanRelease: - // Don't do retain+release tracking for IC_RetainRV, because it's + // Don't do retain+release tracking for ARCInstKind::RetainRV, + // because it's // better to let it remain as the first instruction after a call. - if (Class != IC_RetainRV) + if (Class != ARCInstKind::RetainRV) Retains[Inst] = S.GetRRInfo(); S.ClearSequenceProgress(); break; @@ -1786,15 +1785,15 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, // A retain moving bottom up can be a use. break; } - case IC_AutoreleasepoolPop: + case ARCInstKind::AutoreleasepoolPop: // Conservatively, clear MyStates for all known pointers. MyStates.clearBottomUpPointers(); return NestingDetected; - case IC_AutoreleasepoolPush: - case IC_None: + case ARCInstKind::AutoreleasepoolPush: + case ARCInstKind::None: // These are irrelevant. return NestingDetected; - case IC_User: + case ARCInstKind::User: // If we have a store into an alloca of a pointer we are tracking, the // pointer has multiple owners implying that we must be more conservative. // @@ -1810,7 +1809,7 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { if (AreAnyUnderlyingObjectsAnAlloca(SI->getPointerOperand())) { BBState::ptr_iterator I = MyStates.findPtrBottomUpState( - StripPointerCastsAndObjCCalls(SI->getValueOperand())); + GetRCIdentityRoot(SI->getValueOperand())); if (I != MyStates.bottom_up_ptr_end()) MultiOwnersSet.insert(I->first); } @@ -1969,24 +1968,25 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst, DenseMap<Value *, RRInfo> &Releases, BBState &MyStates) { bool NestingDetected = false; - InstructionClass Class = GetInstructionClass(Inst); + ARCInstKind Class = GetARCInstKind(Inst); const Value *Arg = nullptr; switch (Class) { - case IC_RetainBlock: + case ARCInstKind::RetainBlock: // In OptimizeIndividualCalls, we have strength reduced all optimizable // objc_retainBlocks to objc_retains. Thus at this point any // objc_retainBlocks that we see are not optimizable. break; - case IC_Retain: - case IC_RetainRV: { - Arg = GetObjCArg(Inst); + case ARCInstKind::Retain: + case ARCInstKind::RetainRV: { + Arg = GetArgRCIdentityRoot(Inst); PtrState &S = MyStates.getPtrTopDownState(Arg); - // Don't do retain+release tracking for IC_RetainRV, because it's + // Don't do retain+release tracking for ARCInstKind::RetainRV, because + // it's // better to let it remain as the first instruction after a call. - if (Class != IC_RetainRV) { + if (Class != ARCInstKind::RetainRV) { // If we see two retains in a row on the same pointer. If so, make // a note, and we'll cicle back to revisit it after we've // hopefully eliminated the second retain, which may allow us to @@ -2009,8 +2009,8 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst, // code below. 
break; } - case IC_Release: { - Arg = GetObjCArg(Inst); + case ARCInstKind::Release: { + Arg = GetArgRCIdentityRoot(Inst); PtrState &S = MyStates.getPtrTopDownState(Arg); S.ClearKnownPositiveRefCount(); @@ -2041,12 +2041,12 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst, } break; } - case IC_AutoreleasepoolPop: + case ARCInstKind::AutoreleasepoolPop: // Conservatively, clear MyStates for all known pointers. MyStates.clearTopDownPointers(); return NestingDetected; - case IC_AutoreleasepoolPush: - case IC_None: + case ARCInstKind::AutoreleasepoolPush: + case ARCInstKind::None: // These are irrelevant. return NestingDetected; default: @@ -2374,7 +2374,7 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> const RRInfo &NewRetainRRI = It->second; KnownSafeTD &= NewRetainRRI.KnownSafe; MultipleOwners = - MultipleOwners || MultiOwnersSet.count(GetObjCArg(NewRetain)); + MultipleOwners || MultiOwnersSet.count(GetArgRCIdentityRoot(NewRetain)); for (Instruction *NewRetainRelease : NewRetainRRI.Calls) { DenseMap<Value *, RRInfo>::const_iterator Jt = Releases.find(NewRetainRelease); @@ -2583,7 +2583,7 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> DEBUG(dbgs() << "Visiting: " << *Retain << "\n"); - Value *Arg = GetObjCArg(Retain); + Value *Arg = GetArgRCIdentityRoot(Retain); // If the object being released is in static or stack storage, we know it's // not being managed by ObjC reference counting, so we can delete pairs @@ -2595,7 +2595,7 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> if (const LoadInst *LI = dyn_cast<LoadInst>(Arg)) if (const GlobalVariable *GV = dyn_cast<GlobalVariable>( - StripPointerCastsAndObjCCalls(LI->getPointerOperand()))) + GetRCIdentityRoot(LI->getPointerOperand()))) if (GV->isConstant()) KnownSafe = true; @@ -2642,12 +2642,13 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) { DEBUG(dbgs() << "Visiting: " << *Inst << "\n"); - InstructionClass Class = GetBasicInstructionClass(Inst); - if (Class != IC_LoadWeak && Class != IC_LoadWeakRetained) + ARCInstKind Class = GetBasicARCInstKind(Inst); + if (Class != ARCInstKind::LoadWeak && + Class != ARCInstKind::LoadWeakRetained) continue; // Delete objc_loadWeak calls with no users. - if (Class == IC_LoadWeak && Inst->use_empty()) { + if (Class == ARCInstKind::LoadWeak && Inst->use_empty()) { Inst->eraseFromParent(); continue; } @@ -2662,10 +2663,10 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) { J = Current.getInstructionIterator(); J != B; --J) { Instruction *EarlierInst = &*std::prev(J); - InstructionClass EarlierClass = GetInstructionClass(EarlierInst); + ARCInstKind EarlierClass = GetARCInstKind(EarlierInst); switch (EarlierClass) { - case IC_LoadWeak: - case IC_LoadWeakRetained: { + case ARCInstKind::LoadWeak: + case ARCInstKind::LoadWeakRetained: { // If this is loading from the same pointer, replace this load's value // with that one. CallInst *Call = cast<CallInst>(Inst); @@ -2676,7 +2677,7 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) { case AliasAnalysis::MustAlias: Changed = true; // If the load has a builtin retain, insert a plain retain for it. 
- if (Class == IC_LoadWeakRetained) { + if (Class == ARCInstKind::LoadWeakRetained) { Constant *Decl = EP.get(ARCRuntimeEntryPoints::EPT_Retain); CallInst *CI = CallInst::Create(Decl, EarlierCall, "", Call); CI->setTailCall(); @@ -2693,8 +2694,8 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) { } break; } - case IC_StoreWeak: - case IC_InitWeak: { + case ARCInstKind::StoreWeak: + case ARCInstKind::InitWeak: { // If this is storing to the same pointer and has the same size etc. // replace this load's value with the stored value. CallInst *Call = cast<CallInst>(Inst); @@ -2705,7 +2706,7 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) { case AliasAnalysis::MustAlias: Changed = true; // If the load has a builtin retain, insert a plain retain for it. - if (Class == IC_LoadWeakRetained) { + if (Class == ARCInstKind::LoadWeakRetained) { Constant *Decl = EP.get(ARCRuntimeEntryPoints::EPT_Retain); CallInst *CI = CallInst::Create(Decl, EarlierCall, "", Call); CI->setTailCall(); @@ -2722,14 +2723,14 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) { } break; } - case IC_MoveWeak: - case IC_CopyWeak: + case ARCInstKind::MoveWeak: + case ARCInstKind::CopyWeak: // TOOD: Grab the copied value. goto clobbered; - case IC_AutoreleasepoolPush: - case IC_None: - case IC_IntrinsicUser: - case IC_User: + case ARCInstKind::AutoreleasepoolPush: + case ARCInstKind::None: + case ARCInstKind::IntrinsicUser: + case ARCInstKind::User: // Weak pointers are only modified through the weak entry points // (and arbitrary calls, which could call the weak entry points). break; @@ -2745,8 +2746,8 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) { // the alloca and all its users can be zapped. for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) { Instruction *Inst = &*I++; - InstructionClass Class = GetBasicInstructionClass(Inst); - if (Class != IC_DestroyWeak) + ARCInstKind Class = GetBasicARCInstKind(Inst); + if (Class != ARCInstKind::DestroyWeak) continue; CallInst *Call = cast<CallInst>(Inst); @@ -2754,10 +2755,10 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) { if (AllocaInst *Alloca = dyn_cast<AllocaInst>(Arg)) { for (User *U : Alloca->users()) { const Instruction *UserInst = cast<Instruction>(U); - switch (GetBasicInstructionClass(UserInst)) { - case IC_InitWeak: - case IC_StoreWeak: - case IC_DestroyWeak: + switch (GetBasicARCInstKind(UserInst)) { + case ARCInstKind::InitWeak: + case ARCInstKind::StoreWeak: + case ARCInstKind::DestroyWeak: continue; default: goto done; @@ -2766,13 +2767,13 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) { Changed = true; for (auto UI = Alloca->user_begin(), UE = Alloca->user_end(); UI != UE;) { CallInst *UserInst = cast<CallInst>(*UI++); - switch (GetBasicInstructionClass(UserInst)) { - case IC_InitWeak: - case IC_StoreWeak: + switch (GetBasicARCInstKind(UserInst)) { + case ARCInstKind::InitWeak: + case ARCInstKind::StoreWeak: // These functions return their second argument. UserInst->replaceAllUsesWith(UserInst->getArgOperand(1)); break; - case IC_DestroyWeak: + case ARCInstKind::DestroyWeak: // No return value. break; default: @@ -2835,8 +2836,8 @@ HasSafePathToPredecessorCall(const Value *Arg, Instruction *Retain, return false; // Check that the call is a regular call. 
- InstructionClass Class = GetBasicInstructionClass(Call); - if (Class != IC_CallOrUser && Class != IC_Call) + ARCInstKind Class = GetBasicARCInstKind(Call); + if (Class != ARCInstKind::CallOrUser && Class != ARCInstKind::Call) return false; return true; @@ -2860,9 +2861,8 @@ FindPredecessorRetainWithSafePath(const Value *Arg, BasicBlock *BB, dyn_cast_or_null<CallInst>(*DepInsts.begin()); // Check that we found a retain with the same argument. - if (!Retain || - !IsRetain(GetBasicInstructionClass(Retain)) || - GetObjCArg(Retain) != Arg) { + if (!Retain || !IsRetain(GetBasicARCInstKind(Retain)) || + GetArgRCIdentityRoot(Retain) != Arg) { return nullptr; } @@ -2887,10 +2887,10 @@ FindPredecessorAutoreleaseWithSafePath(const Value *Arg, BasicBlock *BB, dyn_cast_or_null<CallInst>(*DepInsts.begin()); if (!Autorelease) return nullptr; - InstructionClass AutoreleaseClass = GetBasicInstructionClass(Autorelease); + ARCInstKind AutoreleaseClass = GetBasicARCInstKind(Autorelease); if (!IsAutorelease(AutoreleaseClass)) return nullptr; - if (GetObjCArg(Autorelease) != Arg) + if (GetArgRCIdentityRoot(Autorelease) != Arg) return nullptr; return Autorelease; @@ -2921,7 +2921,7 @@ void ObjCARCOpt::OptimizeReturns(Function &F) { if (!Ret) continue; - const Value *Arg = StripPointerCastsAndObjCCalls(Ret->getOperand(0)); + const Value *Arg = GetRCIdentityRoot(Ret->getOperand(0)); // Look for an ``autorelease'' instruction that is a predecessor of Ret and // dependent on Arg such that there are no instructions dependent on Arg @@ -2976,13 +2976,13 @@ ObjCARCOpt::GatherStatistics(Function &F, bool AfterOptimization) { for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) { Instruction *Inst = &*I++; - switch (GetBasicInstructionClass(Inst)) { + switch (GetBasicARCInstKind(Inst)) { default: break; - case IC_Retain: + case ARCInstKind::Retain: ++NumRetains; break; - case IC_Release: + case ARCInstKind::Release: ++NumReleases; break; } @@ -3054,27 +3054,27 @@ bool ObjCARCOpt::runOnFunction(Function &F) { OptimizeIndividualCalls(F); // Optimizations for weak pointers. - if (UsedInThisFunction & ((1 << IC_LoadWeak) | - (1 << IC_LoadWeakRetained) | - (1 << IC_StoreWeak) | - (1 << IC_InitWeak) | - (1 << IC_CopyWeak) | - (1 << IC_MoveWeak) | - (1 << IC_DestroyWeak))) + if (UsedInThisFunction & ((1 << unsigned(ARCInstKind::LoadWeak)) | + (1 << unsigned(ARCInstKind::LoadWeakRetained)) | + (1 << unsigned(ARCInstKind::StoreWeak)) | + (1 << unsigned(ARCInstKind::InitWeak)) | + (1 << unsigned(ARCInstKind::CopyWeak)) | + (1 << unsigned(ARCInstKind::MoveWeak)) | + (1 << unsigned(ARCInstKind::DestroyWeak)))) OptimizeWeakCalls(F); // Optimizations for retain+release pairs. - if (UsedInThisFunction & ((1 << IC_Retain) | - (1 << IC_RetainRV) | - (1 << IC_RetainBlock))) - if (UsedInThisFunction & (1 << IC_Release)) + if (UsedInThisFunction & ((1 << unsigned(ARCInstKind::Retain)) | + (1 << unsigned(ARCInstKind::RetainRV)) | + (1 << unsigned(ARCInstKind::RetainBlock)))) + if (UsedInThisFunction & (1 << unsigned(ARCInstKind::Release))) // Run OptimizeSequences until it either stops making changes or // no retain+release pair nesting is detected. while (OptimizeSequences(F)) {} // Optimizations if objc_autorelease is used. - if (UsedInThisFunction & ((1 << IC_Autorelease) | - (1 << IC_AutoreleaseRV))) + if (UsedInThisFunction & ((1 << unsigned(ARCInstKind::Autorelease)) | + (1 << unsigned(ARCInstKind::AutoreleaseRV)))) OptimizeReturns(F); // Gather statistics after optimization. 
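Editor's note: most of the churn in this hunk is mechanical fallout of InstructionClass (an unscoped enum) becoming ARCInstKind (an enum class): a scoped enumerator no longer converts implicitly to an integer, so every UsedInThisFunction bit-mask now needs an explicit unsigned(...) cast. A minimal illustration of why; the ARCInstKind below is a local toy copy with arbitrary values, not the definition from the ObjCARC headers.

#include <cstdio>

// Local toy copy of a few enumerators; the real definition lives elsewhere.
enum class ARCInstKind : unsigned { Retain = 0, Release = 1, Autorelease = 2 };

int main() {
  unsigned UsedInThisFunction = 0;

  // With the old unscoped enum this was simply `1 << Class`; a scoped enum
  // has no implicit conversion to unsigned, so the shift needs a cast.
  ARCInstKind Class = ARCInstKind::Release;
  UsedInThisFunction |= 1u << unsigned(Class);

  bool UsesRelease =
      (UsedInThisFunction & (1u << unsigned(ARCInstKind::Release))) != 0;
  std::printf("uses objc_release: %d\n", (int)UsesRelease);
}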
diff --git a/lib/Transforms/ObjCARC/ObjCARCUtil.cpp b/lib/Transforms/ObjCARC/ObjCARCUtil.cpp deleted file mode 100644 index 53c077e..0000000 --- a/lib/Transforms/ObjCARC/ObjCARCUtil.cpp +++ /dev/null @@ -1,254 +0,0 @@ -//===- ObjCARCUtil.cpp - ObjC ARC Optimization ----------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -/// \file -/// This file defines several utility functions used by various ARC -/// optimizations which are IMHO too big to be in a header file. -/// -/// WARNING: This file knows about certain library functions. It recognizes them -/// by name, and hardwires knowledge of their semantics. -/// -/// WARNING: This file knows about how certain Objective-C library functions are -/// used. Naive LLVM IR transformations which would otherwise be -/// behavior-preserving may break these assumptions. -/// -//===----------------------------------------------------------------------===// - -#include "ObjCARC.h" -#include "llvm/IR/Intrinsics.h" - -using namespace llvm; -using namespace llvm::objcarc; - -raw_ostream &llvm::objcarc::operator<<(raw_ostream &OS, - const InstructionClass Class) { - switch (Class) { - case IC_Retain: - return OS << "IC_Retain"; - case IC_RetainRV: - return OS << "IC_RetainRV"; - case IC_RetainBlock: - return OS << "IC_RetainBlock"; - case IC_Release: - return OS << "IC_Release"; - case IC_Autorelease: - return OS << "IC_Autorelease"; - case IC_AutoreleaseRV: - return OS << "IC_AutoreleaseRV"; - case IC_AutoreleasepoolPush: - return OS << "IC_AutoreleasepoolPush"; - case IC_AutoreleasepoolPop: - return OS << "IC_AutoreleasepoolPop"; - case IC_NoopCast: - return OS << "IC_NoopCast"; - case IC_FusedRetainAutorelease: - return OS << "IC_FusedRetainAutorelease"; - case IC_FusedRetainAutoreleaseRV: - return OS << "IC_FusedRetainAutoreleaseRV"; - case IC_LoadWeakRetained: - return OS << "IC_LoadWeakRetained"; - case IC_StoreWeak: - return OS << "IC_StoreWeak"; - case IC_InitWeak: - return OS << "IC_InitWeak"; - case IC_LoadWeak: - return OS << "IC_LoadWeak"; - case IC_MoveWeak: - return OS << "IC_MoveWeak"; - case IC_CopyWeak: - return OS << "IC_CopyWeak"; - case IC_DestroyWeak: - return OS << "IC_DestroyWeak"; - case IC_StoreStrong: - return OS << "IC_StoreStrong"; - case IC_CallOrUser: - return OS << "IC_CallOrUser"; - case IC_Call: - return OS << "IC_Call"; - case IC_User: - return OS << "IC_User"; - case IC_IntrinsicUser: - return OS << "IC_IntrinsicUser"; - case IC_None: - return OS << "IC_None"; - } - llvm_unreachable("Unknown instruction class!"); -} - -InstructionClass llvm::objcarc::GetFunctionClass(const Function *F) { - Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end(); - - // No (mandatory) arguments. - if (AI == AE) - return StringSwitch<InstructionClass>(F->getName()) - .Case("objc_autoreleasePoolPush", IC_AutoreleasepoolPush) - .Case("clang.arc.use", IC_IntrinsicUser) - .Default(IC_CallOrUser); - - // One argument. - const Argument *A0 = AI++; - if (AI == AE) - // Argument is a pointer. - if (PointerType *PTy = dyn_cast<PointerType>(A0->getType())) { - Type *ETy = PTy->getElementType(); - // Argument is i8*. 
- if (ETy->isIntegerTy(8)) - return StringSwitch<InstructionClass>(F->getName()) - .Case("objc_retain", IC_Retain) - .Case("objc_retainAutoreleasedReturnValue", IC_RetainRV) - .Case("objc_retainBlock", IC_RetainBlock) - .Case("objc_release", IC_Release) - .Case("objc_autorelease", IC_Autorelease) - .Case("objc_autoreleaseReturnValue", IC_AutoreleaseRV) - .Case("objc_autoreleasePoolPop", IC_AutoreleasepoolPop) - .Case("objc_retainedObject", IC_NoopCast) - .Case("objc_unretainedObject", IC_NoopCast) - .Case("objc_unretainedPointer", IC_NoopCast) - .Case("objc_retain_autorelease", IC_FusedRetainAutorelease) - .Case("objc_retainAutorelease", IC_FusedRetainAutorelease) - .Case("objc_retainAutoreleaseReturnValue",IC_FusedRetainAutoreleaseRV) - .Case("objc_sync_enter", IC_User) - .Case("objc_sync_exit", IC_User) - .Default(IC_CallOrUser); - - // Argument is i8** - if (PointerType *Pte = dyn_cast<PointerType>(ETy)) - if (Pte->getElementType()->isIntegerTy(8)) - return StringSwitch<InstructionClass>(F->getName()) - .Case("objc_loadWeakRetained", IC_LoadWeakRetained) - .Case("objc_loadWeak", IC_LoadWeak) - .Case("objc_destroyWeak", IC_DestroyWeak) - .Default(IC_CallOrUser); - } - - // Two arguments, first is i8**. - const Argument *A1 = AI++; - if (AI == AE) - if (PointerType *PTy = dyn_cast<PointerType>(A0->getType())) - if (PointerType *Pte = dyn_cast<PointerType>(PTy->getElementType())) - if (Pte->getElementType()->isIntegerTy(8)) - if (PointerType *PTy1 = dyn_cast<PointerType>(A1->getType())) { - Type *ETy1 = PTy1->getElementType(); - // Second argument is i8* - if (ETy1->isIntegerTy(8)) - return StringSwitch<InstructionClass>(F->getName()) - .Case("objc_storeWeak", IC_StoreWeak) - .Case("objc_initWeak", IC_InitWeak) - .Case("objc_storeStrong", IC_StoreStrong) - .Default(IC_CallOrUser); - // Second argument is i8**. - if (PointerType *Pte1 = dyn_cast<PointerType>(ETy1)) - if (Pte1->getElementType()->isIntegerTy(8)) - return StringSwitch<InstructionClass>(F->getName()) - .Case("objc_moveWeak", IC_MoveWeak) - .Case("objc_copyWeak", IC_CopyWeak) - // Ignore annotation calls. This is important to stop the - // optimizer from treating annotations as uses which would - // make the state of the pointers they are attempting to - // elucidate to be incorrect. - .Case("llvm.arc.annotation.topdown.bbstart", IC_None) - .Case("llvm.arc.annotation.topdown.bbend", IC_None) - .Case("llvm.arc.annotation.bottomup.bbstart", IC_None) - .Case("llvm.arc.annotation.bottomup.bbend", IC_None) - .Default(IC_CallOrUser); - } - - // Anything else. - return IC_CallOrUser; -} - -/// \brief Determine what kind of construct V is. -InstructionClass -llvm::objcarc::GetInstructionClass(const Value *V) { - if (const Instruction *I = dyn_cast<Instruction>(V)) { - // Any instruction other than bitcast and gep with a pointer operand have a - // use of an objc pointer. Bitcasts, GEPs, Selects, PHIs transfer a pointer - // to a subsequent use, rather than using it themselves, in this sense. - // As a short cut, several other opcodes are known to have no pointer - // operands of interest. And ret is never followed by a release, so it's - // not interesting to examine. - switch (I->getOpcode()) { - case Instruction::Call: { - const CallInst *CI = cast<CallInst>(I); - // Check for calls to special functions. - if (const Function *F = CI->getCalledFunction()) { - InstructionClass Class = GetFunctionClass(F); - if (Class != IC_CallOrUser) - return Class; - - // None of the intrinsic functions do objc_release. 
For intrinsics, the - // only question is whether or not they may be users. - switch (F->getIntrinsicID()) { - case Intrinsic::returnaddress: case Intrinsic::frameaddress: - case Intrinsic::stacksave: case Intrinsic::stackrestore: - case Intrinsic::vastart: case Intrinsic::vacopy: case Intrinsic::vaend: - case Intrinsic::objectsize: case Intrinsic::prefetch: - case Intrinsic::stackprotector: - case Intrinsic::eh_return_i32: case Intrinsic::eh_return_i64: - case Intrinsic::eh_typeid_for: case Intrinsic::eh_dwarf_cfa: - case Intrinsic::eh_sjlj_lsda: case Intrinsic::eh_sjlj_functioncontext: - case Intrinsic::init_trampoline: case Intrinsic::adjust_trampoline: - case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: - case Intrinsic::invariant_start: case Intrinsic::invariant_end: - // Don't let dbg info affect our results. - case Intrinsic::dbg_declare: case Intrinsic::dbg_value: - // Short cut: Some intrinsics obviously don't use ObjC pointers. - return IC_None; - default: - break; - } - } - return GetCallSiteClass(CI); - } - case Instruction::Invoke: - return GetCallSiteClass(cast<InvokeInst>(I)); - case Instruction::BitCast: - case Instruction::GetElementPtr: - case Instruction::Select: case Instruction::PHI: - case Instruction::Ret: case Instruction::Br: - case Instruction::Switch: case Instruction::IndirectBr: - case Instruction::Alloca: case Instruction::VAArg: - case Instruction::Add: case Instruction::FAdd: - case Instruction::Sub: case Instruction::FSub: - case Instruction::Mul: case Instruction::FMul: - case Instruction::SDiv: case Instruction::UDiv: case Instruction::FDiv: - case Instruction::SRem: case Instruction::URem: case Instruction::FRem: - case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: - case Instruction::And: case Instruction::Or: case Instruction::Xor: - case Instruction::SExt: case Instruction::ZExt: case Instruction::Trunc: - case Instruction::IntToPtr: case Instruction::FCmp: - case Instruction::FPTrunc: case Instruction::FPExt: - case Instruction::FPToUI: case Instruction::FPToSI: - case Instruction::UIToFP: case Instruction::SIToFP: - case Instruction::InsertElement: case Instruction::ExtractElement: - case Instruction::ShuffleVector: - case Instruction::ExtractValue: - break; - case Instruction::ICmp: - // Comparing a pointer with null, or any other constant, isn't an - // interesting use, because we don't care what the pointer points to, or - // about the values of any other dynamic reference-counted pointers. - if (IsPotentialRetainableObjPtr(I->getOperand(1))) - return IC_User; - break; - default: - // For anything else, check all the operands. - // Note that this includes both operands of a Store: while the first - // operand isn't actually being dereferenced, it is being stored to - // memory where we can no longer track who might read it and dereference - // it, so we have to consider it potentially used. - for (User::const_op_iterator OI = I->op_begin(), OE = I->op_end(); - OI != OE; ++OI) - if (IsPotentialRetainableObjPtr(*OI)) - return IC_User; - } - } - - // Otherwise, it's totally inert for ARC purposes. 
- return IC_None; -} diff --git a/lib/Transforms/ObjCARC/ProvenanceAnalysis.h b/lib/Transforms/ObjCARC/ProvenanceAnalysis.h index 7820468..4b5f4d8 100644 --- a/lib/Transforms/ObjCARC/ProvenanceAnalysis.h +++ b/lib/Transforms/ObjCARC/ProvenanceAnalysis.h @@ -57,8 +57,8 @@ class ProvenanceAnalysis { bool relatedSelect(const SelectInst *A, const Value *B); bool relatedPHI(const PHINode *A, const Value *B); - void operator=(const ProvenanceAnalysis &) LLVM_DELETED_FUNCTION; - ProvenanceAnalysis(const ProvenanceAnalysis &) LLVM_DELETED_FUNCTION; + void operator=(const ProvenanceAnalysis &) = delete; + ProvenanceAnalysis(const ProvenanceAnalysis &) = delete; public: ProvenanceAnalysis() {} diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp index 3d91984..d6fc916 100644 --- a/lib/Transforms/Scalar/ADCE.cpp +++ b/lib/Transforms/Scalar/ADCE.cpp @@ -32,19 +32,18 @@ using namespace llvm; STATISTIC(NumRemoved, "Number of instructions removed"); namespace { - struct ADCE : public FunctionPass { - static char ID; // Pass identification, replacement for typeid - ADCE() : FunctionPass(ID) { - initializeADCEPass(*PassRegistry::getPassRegistry()); - } - - bool runOnFunction(Function& F) override; +struct ADCE : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + ADCE() : FunctionPass(ID) { + initializeADCEPass(*PassRegistry::getPassRegistry()); + } - void getAnalysisUsage(AnalysisUsage& AU) const override { - AU.setPreservesCFG(); - } + bool runOnFunction(Function& F) override; - }; + void getAnalysisUsage(AnalysisUsage& AU) const override { + AU.setPreservesCFG(); + } +}; } char ADCE::ID = 0; @@ -54,46 +53,45 @@ bool ADCE::runOnFunction(Function& F) { if (skipOptnoneFunction(F)) return false; - SmallPtrSet<Instruction*, 128> alive; - SmallVector<Instruction*, 128> worklist; + SmallPtrSet<Instruction*, 128> Alive; + SmallVector<Instruction*, 128> Worklist; // Collect the set of "root" instructions that are known live. - for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) - if (isa<TerminatorInst>(I.getInstructionIterator()) || - isa<DbgInfoIntrinsic>(I.getInstructionIterator()) || - isa<LandingPadInst>(I.getInstructionIterator()) || - I->mayHaveSideEffects()) { - alive.insert(I.getInstructionIterator()); - worklist.push_back(I.getInstructionIterator()); + for (Instruction &I : inst_range(F)) { + if (isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) || + isa<LandingPadInst>(I) || I.mayHaveSideEffects()) { + Alive.insert(&I); + Worklist.push_back(&I); } + } // Propagate liveness backwards to operands. - while (!worklist.empty()) { - Instruction* curr = worklist.pop_back_val(); - for (Instruction::op_iterator OI = curr->op_begin(), OE = curr->op_end(); - OI != OE; ++OI) - if (Instruction* Inst = dyn_cast<Instruction>(OI)) - if (alive.insert(Inst).second) - worklist.push_back(Inst); + while (!Worklist.empty()) { + Instruction *Curr = Worklist.pop_back_val(); + for (Use &OI : Curr->operands()) { + if (Instruction *Inst = dyn_cast<Instruction>(OI)) + if (Alive.insert(Inst).second) + Worklist.push_back(Inst); + } } // The inverse of the live set is the dead set. These are those instructions // which have no side effects and do not influence the control flow or return // value of the function, and may therefore be deleted safely. - // NOTE: We reuse the worklist vector here for memory efficiency. 
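Editor's note: the rewritten ADCE above is the classic SSA mark-and-sweep: seed the live set with roots (terminators, side-effecting instructions, landing pads), propagate liveness backwards through operands with a worklist, and delete everything left unmarked. A self-contained model over a toy dependence graph; ToyInst and eliminateDead are illustrative names, not llvm::Instruction or the pass itself.

#include <cstdio>
#include <set>
#include <vector>

// Toy instruction: operands point at other toy instructions.
struct ToyInst {
  const char *Name;
  bool IsRoot; // terminator / side effects / landing pad in the real pass
  std::vector<ToyInst *> Operands;
};

// Mark-and-sweep DCE in the same shape as the runOnFunction above.
static unsigned eliminateDead(std::vector<ToyInst *> &Func) {
  std::set<ToyInst *> Alive;
  std::vector<ToyInst *> Worklist;

  // Collect the root instructions that are known live.
  for (ToyInst *I : Func)
    if (I->IsRoot) {
      Alive.insert(I);
      Worklist.push_back(I);
    }

  // Propagate liveness backwards to operands.
  while (!Worklist.empty()) {
    ToyInst *Curr = Worklist.back();
    Worklist.pop_back();
    for (ToyInst *Op : Curr->Operands)
      if (Alive.insert(Op).second)
        Worklist.push_back(Op);
  }

  // Everything not marked alive is dead and can be removed.
  unsigned Removed = 0;
  std::vector<ToyInst *> Kept;
  for (ToyInst *I : Func) {
    if (Alive.count(I))
      Kept.push_back(I);
    else
      ++Removed;
  }
  Func = Kept;
  return Removed;
}

int main() {
  ToyInst A{"a", false, {}};      // feeds nothing live: dead
  ToyInst B{"b", false, {}};      // operand of the root: live
  ToyInst Ret{"ret", true, {&B}}; // root (terminator)
  std::vector<ToyInst *> Func = {&A, &B, &Ret};
  std::printf("removed %u instruction(s)\n", eliminateDead(Func));
}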
- for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) - if (!alive.count(I.getInstructionIterator())) { - worklist.push_back(I.getInstructionIterator()); - I->dropAllReferences(); + // NOTE: We reuse the Worklist vector here for memory efficiency. + for (Instruction &I : inst_range(F)) { + if (!Alive.count(&I)) { + Worklist.push_back(&I); + I.dropAllReferences(); } + } - for (SmallVectorImpl<Instruction *>::iterator I = worklist.begin(), - E = worklist.end(); I != E; ++I) { + for (Instruction *&I : Worklist) { ++NumRemoved; - (*I)->eraseFromParent(); + I->eraseFromParent(); } - return !worklist.empty(); + return !Worklist.empty(); } FunctionPass *llvm::createAggressiveDCEPass() { diff --git a/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp index 06c3dfd..5c74885 100644 --- a/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp +++ b/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp @@ -21,7 +21,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AssumptionTracker.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/ScalarEvolution.h" @@ -53,12 +53,12 @@ struct AlignmentFromAssumptions : public FunctionPass { bool runOnFunction(Function &F); virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<AssumptionTracker>(); + AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<ScalarEvolution>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.setPreservesCFG(); - AU.addPreserved<LoopInfo>(); + AU.addPreserved<LoopInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<ScalarEvolution>(); } @@ -69,7 +69,6 @@ struct AlignmentFromAssumptions : public FunctionPass { // another assumption later, then we may change the alignment at that point. 
DenseMap<MemTransferInst *, unsigned> NewDestAlignments, NewSrcAlignments; - AssumptionTracker *AT; ScalarEvolution *SE; DominatorTree *DT; const DataLayout *DL; @@ -84,7 +83,7 @@ char AlignmentFromAssumptions::ID = 0; static const char aip_name[] = "Alignment from assumptions"; INITIALIZE_PASS_BEGIN(AlignmentFromAssumptions, AA_NAME, aip_name, false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionTracker) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_END(AlignmentFromAssumptions, AA_NAME, @@ -411,7 +410,7 @@ bool AlignmentFromAssumptions::processAssumption(CallInst *ACall) { bool AlignmentFromAssumptions::runOnFunction(Function &F) { bool Changed = false; - AT = &getAnalysis<AssumptionTracker>(); + auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); SE = &getAnalysis<ScalarEvolution>(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); @@ -420,8 +419,9 @@ bool AlignmentFromAssumptions::runOnFunction(Function &F) { NewDestAlignments.clear(); NewSrcAlignments.clear(); - for (auto &I : AT->assumptions(&F)) - Changed |= processAssumption(I); + for (auto &AssumeVH : AC.assumptions()) + if (AssumeVH) + Changed |= processAssumption(cast<CallInst>(AssumeVH)); return Changed; } diff --git a/lib/Transforms/Scalar/Android.mk b/lib/Transforms/Scalar/Android.mk index 9028b42..ed803cd 100644 --- a/lib/Transforms/Scalar/Android.mk +++ b/lib/Transforms/Scalar/Android.mk @@ -2,6 +2,7 @@ LOCAL_PATH:= $(call my-dir) transforms_scalar_SRC_FILES := \ ADCE.cpp \ + BDCE.cpp \ AlignmentFromAssumptions.cpp \ ConstantProp.cpp \ ConstantHoisting.cpp \ @@ -12,6 +13,7 @@ transforms_scalar_SRC_FILES := \ FlattenCFGPass.cpp \ GVN.cpp \ IndVarSimplify.cpp \ + InductiveRangeCheckElimination.cpp \ JumpThreading.cpp \ LICM.cpp \ LoadCombine.cpp \ @@ -24,11 +26,14 @@ transforms_scalar_SRC_FILES := \ LoopUnrollPass.cpp \ LoopUnswitch.cpp \ LowerAtomic.cpp \ + LowerExpectIntrinsic.cpp \ MemCpyOptimizer.cpp \ MergedLoadStoreMotion.cpp \ PartiallyInlineLibCalls.cpp \ + PlaceSafepoints.cpp \ Reassociate.cpp \ Reg2Mem.cpp \ + RewriteStatepointsForGC.cpp \ SCCP.cpp \ SROA.cpp \ SampleProfile.cpp \ @@ -38,6 +43,7 @@ transforms_scalar_SRC_FILES := \ SeparateConstOffsetFromGEP.cpp \ SimplifyCFGPass.cpp \ Sink.cpp \ + StraightLineStrengthReduce.cpp \ StructurizeCFG.cpp \ TailRecursionElimination.cpp diff --git a/lib/Transforms/Scalar/BDCE.cpp b/lib/Transforms/Scalar/BDCE.cpp new file mode 100644 index 0000000..c7bd79d --- /dev/null +++ b/lib/Transforms/Scalar/BDCE.cpp @@ -0,0 +1,411 @@ +//===---- BDCE.cpp - Bit-tracking dead code elimination -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the Bit-Tracking Dead Code Elimination pass. Some +// instructions (shifts, some ands, ors, etc.) kill some of their input bits. +// We track these dead bits and remove instructions that compute only these +// dead bits. 
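A rough standalone illustration of the transfer function for addition, with plain uint32_t masks standing in for APInt (the helper name is made up, and __builtin_clz assumes a GCC/Clang-style compiler): carries only ripple towards the high end, so if only the low bits of a sum are ever used, the high bits of both addends are dead.

#include <cstdint>
#include <cstdio>

// Live bits required of either add/sub operand, given the live bits of the
// result: everything at or below the highest live result bit, because
// carries and borrows only propagate from low bits towards high bits.
static uint32_t liveAddOperandBits(uint32_t LiveOut) {
  if (LiveOut == 0)
    return 0;
  unsigned Highest = 31u - __builtin_clz(LiveOut); // index of top live bit
  return Highest == 31u ? ~0u : ((1u << (Highest + 1)) - 1u);
}

int main() {
  // If only bits 4..7 of a sum are live, only bits 0..7 of each addend are
  // live; an instruction feeding nothing but bits 8..31 computes dead bits.
  std::printf("0x%08x\n", liveAddOperandBits(0x000000f0u)); // prints 0x000000ff
}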
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "bdce" + +STATISTIC(NumRemoved, "Number of instructions removed (unused)"); +STATISTIC(NumSimplified, "Number of instructions trivialized (dead bits)"); + +namespace { +struct BDCE : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + BDCE() : FunctionPass(ID) { + initializeBDCEPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function& F) override; + + void getAnalysisUsage(AnalysisUsage& AU) const override { + AU.setPreservesCFG(); + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<DominatorTreeWrapperPass>(); + } + + void determineLiveOperandBits(const Instruction *UserI, + const Instruction *I, unsigned OperandNo, + const APInt &AOut, APInt &AB, + APInt &KnownZero, APInt &KnownOne, + APInt &KnownZero2, APInt &KnownOne2); + + AssumptionCache *AC; + const DataLayout *DL; + DominatorTree *DT; +}; +} + +char BDCE::ID = 0; +INITIALIZE_PASS_BEGIN(BDCE, "bdce", "Bit-Tracking Dead Code Elimination", + false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(BDCE, "bdce", "Bit-Tracking Dead Code Elimination", + false, false) + +static bool isAlwaysLive(Instruction *I) { + return isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) || + isa<LandingPadInst>(I) || I->mayHaveSideEffects(); +} + +void BDCE::determineLiveOperandBits(const Instruction *UserI, + const Instruction *I, unsigned OperandNo, + const APInt &AOut, APInt &AB, + APInt &KnownZero, APInt &KnownOne, + APInt &KnownZero2, APInt &KnownOne2) { + unsigned BitWidth = AB.getBitWidth(); + + // We're called once per operand, but for some instructions, we need to + // compute known bits of both operands in order to determine the live bits of + // either (when both operands are instructions themselves). We don't, + // however, want to do this twice, so we cache the result in APInts that live + // in the caller. For the two-relevant-operands case, both operand values are + // provided here. 
+ auto ComputeKnownBits = [&](unsigned BitWidth, const Value *V1, + const Value *V2) { + KnownZero = APInt(BitWidth, 0); + KnownOne = APInt(BitWidth, 0); + computeKnownBits(const_cast<Value*>(V1), KnownZero, KnownOne, DL, 0, AC, + UserI, DT); + + if (V2) { + KnownZero2 = APInt(BitWidth, 0); + KnownOne2 = APInt(BitWidth, 0); + computeKnownBits(const_cast<Value*>(V2), KnownZero2, KnownOne2, DL, 0, AC, + UserI, DT); + } + }; + + switch (UserI->getOpcode()) { + default: break; + case Instruction::Call: + case Instruction::Invoke: + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(UserI)) + switch (II->getIntrinsicID()) { + default: break; + case Intrinsic::bswap: + // The alive bits of the input are the swapped alive bits of + // the output. + AB = AOut.byteSwap(); + break; + case Intrinsic::ctlz: + if (OperandNo == 0) { + // We need some output bits, so we need all bits of the + // input to the left of, and including, the leftmost bit + // known to be one. + ComputeKnownBits(BitWidth, I, nullptr); + AB = APInt::getHighBitsSet(BitWidth, + std::min(BitWidth, KnownOne.countLeadingZeros()+1)); + } + break; + case Intrinsic::cttz: + if (OperandNo == 0) { + // We need some output bits, so we need all bits of the + // input to the right of, and including, the rightmost bit + // known to be one. + ComputeKnownBits(BitWidth, I, nullptr); + AB = APInt::getLowBitsSet(BitWidth, + std::min(BitWidth, KnownOne.countTrailingZeros()+1)); + } + break; + } + break; + case Instruction::Add: + case Instruction::Sub: + // Find the highest live output bit. We don't need any more input + // bits than that (adds, and thus subtracts, ripple only to the + // left). + AB = APInt::getLowBitsSet(BitWidth, AOut.getActiveBits()); + break; + case Instruction::Shl: + if (OperandNo == 0) + if (ConstantInt *CI = + dyn_cast<ConstantInt>(UserI->getOperand(1))) { + uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1); + AB = AOut.lshr(ShiftAmt); + + // If the shift is nuw/nsw, then the high bits are not dead + // (because we've promised that they *must* be zero). + const ShlOperator *S = cast<ShlOperator>(UserI); + if (S->hasNoSignedWrap()) + AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt+1); + else if (S->hasNoUnsignedWrap()) + AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt); + } + break; + case Instruction::LShr: + if (OperandNo == 0) + if (ConstantInt *CI = + dyn_cast<ConstantInt>(UserI->getOperand(1))) { + uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1); + AB = AOut.shl(ShiftAmt); + + // If the shift is exact, then the low bits are not dead + // (they must be zero). + if (cast<LShrOperator>(UserI)->isExact()) + AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt); + } + break; + case Instruction::AShr: + if (OperandNo == 0) + if (ConstantInt *CI = + dyn_cast<ConstantInt>(UserI->getOperand(1))) { + uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1); + AB = AOut.shl(ShiftAmt); + // Because the high input bit is replicated into the + // high-order bits of the result, if we need any of those + // bits, then we must keep the highest input bit. + if ((AOut & APInt::getHighBitsSet(BitWidth, ShiftAmt)) + .getBoolValue()) + AB.setBit(BitWidth-1); + + // If the shift is exact, then the low bits are not dead + // (they must be zero). 
+ if (cast<AShrOperator>(UserI)->isExact()) + AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt); + } + break; + case Instruction::And: + AB = AOut; + + // For bits that are known zero, the corresponding bits in the + // other operand are dead (unless they're both zero, in which + // case they can't both be dead, so just mark the LHS bits as + // dead). + if (OperandNo == 0) { + ComputeKnownBits(BitWidth, I, UserI->getOperand(1)); + AB &= ~KnownZero2; + } else { + if (!isa<Instruction>(UserI->getOperand(0))) + ComputeKnownBits(BitWidth, UserI->getOperand(0), I); + AB &= ~(KnownZero & ~KnownZero2); + } + break; + case Instruction::Or: + AB = AOut; + + // For bits that are known one, the corresponding bits in the + // other operand are dead (unless they're both one, in which + // case they can't both be dead, so just mark the LHS bits as + // dead). + if (OperandNo == 0) { + ComputeKnownBits(BitWidth, I, UserI->getOperand(1)); + AB &= ~KnownOne2; + } else { + if (!isa<Instruction>(UserI->getOperand(0))) + ComputeKnownBits(BitWidth, UserI->getOperand(0), I); + AB &= ~(KnownOne & ~KnownOne2); + } + break; + case Instruction::Xor: + case Instruction::PHI: + AB = AOut; + break; + case Instruction::Trunc: + AB = AOut.zext(BitWidth); + break; + case Instruction::ZExt: + AB = AOut.trunc(BitWidth); + break; + case Instruction::SExt: + AB = AOut.trunc(BitWidth); + // Because the high input bit is replicated into the + // high-order bits of the result, if we need any of those + // bits, then we must keep the highest input bit. + if ((AOut & APInt::getHighBitsSet(AOut.getBitWidth(), + AOut.getBitWidth() - BitWidth)) + .getBoolValue()) + AB.setBit(BitWidth-1); + break; + case Instruction::Select: + if (OperandNo != 0) + AB = AOut; + break; + } +} + +bool BDCE::runOnFunction(Function& F) { + if (skipOptnoneFunction(F)) + return false; + + AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + DL = F.getParent()->getDataLayout(); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + + DenseMap<Instruction *, APInt> AliveBits; + SmallVector<Instruction*, 128> Worklist; + + // The set of visited instructions (non-integer-typed only). + SmallPtrSet<Instruction*, 128> Visited; + + // Collect the set of "root" instructions that are known live. + for (Instruction &I : inst_range(F)) { + if (!isAlwaysLive(&I)) + continue; + + DEBUG(dbgs() << "BDCE: Root: " << I << "\n"); + // For integer-valued instructions, set up an initial empty set of alive + // bits and add the instruction to the work list. For other instructions + // add their operands to the work list (for integer values operands, mark + // all bits as live). + if (IntegerType *IT = dyn_cast<IntegerType>(I.getType())) { + if (!AliveBits.count(&I)) { + AliveBits[&I] = APInt(IT->getBitWidth(), 0); + Worklist.push_back(&I); + } + + continue; + } + + // Non-integer-typed instructions... + for (Use &OI : I.operands()) { + if (Instruction *J = dyn_cast<Instruction>(OI)) { + if (IntegerType *IT = dyn_cast<IntegerType>(J->getType())) + AliveBits[J] = APInt::getAllOnesValue(IT->getBitWidth()); + Worklist.push_back(J); + } + } + // To save memory, we don't add I to the Visited set here. Instead, we + // check isAlwaysLive on every instruction when searching for dead + // instructions later (we need to check isAlwaysLive for the + // integer-typed instructions anyway). + } + + // Propagate liveness backwards to operands. 
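Sketched below, under made-up toy types, is the backward fixed point this propagation computes: demanded-bit masks flow from users to operands, and an operand is only re-queued when its alive set actually grows. Only a single "and with constant mask" transfer function is modelled; the real pass dispatches on the opcode as in determineLiveOperandBits above.

#include <cstdint>
#include <cstdio>
#include <vector>

// Toy value graph: each node is either a root whose full 32 bits are
// demanded externally, or has one operand that it ANDs with a constant mask.
struct ToyNode {
  bool Root;       // all 32 bits demanded from outside
  int Operand;     // index of the single operand, or -1 if none
  uint32_t Mask;   // constant AND mask applied to that operand
};

static std::vector<uint32_t> aliveBits(const std::vector<ToyNode> &Nodes) {
  std::vector<uint32_t> Alive(Nodes.size(), 0);
  std::vector<int> Worklist;
  for (int i = 0, e = (int)Nodes.size(); i != e; ++i)
    if (Nodes[i].Root) {
      Alive[i] = ~0u;
      Worklist.push_back(i);
    }
  while (!Worklist.empty()) {
    int User = Worklist.back();
    Worklist.pop_back();
    int Op = Nodes[User].Operand;
    if (Op < 0)
      continue;
    // Transfer function for "and <op>, Mask": only the masked-in bits of the
    // operand can influence the alive bits of the result.
    uint32_t Demanded = Alive[User] & Nodes[User].Mask;
    uint32_t Grown = Alive[Op] | Demanded;
    if (Grown != Alive[Op]) {   // monotone: re-queue only when the set grows
      Alive[Op] = Grown;
      Worklist.push_back(Op);
    }
  }
  return Alive;
}

int main() {
  // n0 = <input>;  n1 = and n0, 0xff  (root): only n0's low byte is alive.
  std::vector<ToyNode> Nodes = {{false, -1, 0}, {true, 0, 0xffu}};
  std::vector<uint32_t> Alive = aliveBits(Nodes);
  std::printf("n0 alive bits: 0x%08x\n", Alive[0]); // prints 0x000000ff
}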
+ while (!Worklist.empty()) { + Instruction *UserI = Worklist.pop_back_val(); + + DEBUG(dbgs() << "BDCE: Visiting: " << *UserI); + APInt AOut; + if (UserI->getType()->isIntegerTy()) { + AOut = AliveBits[UserI]; + DEBUG(dbgs() << " Alive Out: " << AOut); + } + DEBUG(dbgs() << "\n"); + + if (!UserI->getType()->isIntegerTy()) + Visited.insert(UserI); + + APInt KnownZero, KnownOne, KnownZero2, KnownOne2; + // Compute the set of alive bits for each operand. These are anded into the + // existing set, if any, and if that changes the set of alive bits, the + // operand is added to the work-list. + for (Use &OI : UserI->operands()) { + if (Instruction *I = dyn_cast<Instruction>(OI)) { + if (IntegerType *IT = dyn_cast<IntegerType>(I->getType())) { + unsigned BitWidth = IT->getBitWidth(); + APInt AB = APInt::getAllOnesValue(BitWidth); + if (UserI->getType()->isIntegerTy() && !AOut && + !isAlwaysLive(UserI)) { + AB = APInt(BitWidth, 0); + } else { + // If all bits of the output are dead, then all bits of the input + // Bits of each operand that are used to compute alive bits of the + // output are alive, all others are dead. + determineLiveOperandBits(UserI, I, OI.getOperandNo(), AOut, AB, + KnownZero, KnownOne, + KnownZero2, KnownOne2); + } + + // If we've added to the set of alive bits (or the operand has not + // been previously visited), then re-queue the operand to be visited + // again. + APInt ABPrev(BitWidth, 0); + auto ABI = AliveBits.find(I); + if (ABI != AliveBits.end()) + ABPrev = ABI->second; + + APInt ABNew = AB | ABPrev; + if (ABNew != ABPrev || ABI == AliveBits.end()) { + AliveBits[I] = std::move(ABNew); + Worklist.push_back(I); + } + } else if (!Visited.count(I)) { + Worklist.push_back(I); + } + } + } + } + + bool Changed = false; + // The inverse of the live set is the dead set. These are those instructions + // which have no side effects and do not influence the control flow or return + // value of the function, and may therefore be deleted safely. + // NOTE: We reuse the Worklist vector here for memory efficiency. + for (Instruction &I : inst_range(F)) { + // For live instructions that have all dead bits, first make them dead by + // replacing all uses with something else. Then, if they don't need to + // remain live (because they have side effects, etc.) we can remove them. + if (I.getType()->isIntegerTy()) { + auto ABI = AliveBits.find(&I); + if (ABI != AliveBits.end()) { + if (ABI->second.getBoolValue()) + continue; + + DEBUG(dbgs() << "BDCE: Trivializing: " << I << " (all bits dead)\n"); + // FIXME: In theory we could substitute undef here instead of zero. + // This should be reconsidered once we settle on the semantics of + // undef, poison, etc. 
+ Value *Zero = ConstantInt::get(I.getType(), 0); + ++NumSimplified; + I.replaceAllUsesWith(Zero); + Changed = true; + } + } else if (Visited.count(&I)) { + continue; + } + + if (isAlwaysLive(&I)) + continue; + + Worklist.push_back(&I); + I.dropAllReferences(); + Changed = true; + } + + for (Instruction *&I : Worklist) { + ++NumRemoved; + I->eraseFromParent(); + } + + return Changed; +} + +FunctionPass *llvm::createBitTrackingDCEPass() { + return new BDCE(); +} + diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index b3ee11e..d297eb1 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -1,6 +1,7 @@ add_llvm_library(LLVMScalarOpts ADCE.cpp AlignmentFromAssumptions.cpp + BDCE.cpp ConstantHoisting.cpp ConstantProp.cpp CorrelatedValuePropagation.cpp @@ -9,6 +10,7 @@ add_llvm_library(LLVMScalarOpts EarlyCSE.cpp FlattenCFGPass.cpp GVN.cpp + InductiveRangeCheckElimination.cpp IndVarSimplify.cpp JumpThreading.cpp LICM.cpp @@ -22,11 +24,14 @@ add_llvm_library(LLVMScalarOpts LoopUnrollPass.cpp LoopUnswitch.cpp LowerAtomic.cpp + LowerExpectIntrinsic.cpp MemCpyOptimizer.cpp MergedLoadStoreMotion.cpp PartiallyInlineLibCalls.cpp + PlaceSafepoints.cpp Reassociate.cpp Reg2Mem.cpp + RewriteStatepointsForGC.cpp SCCP.cpp SROA.cpp SampleProfile.cpp @@ -36,8 +41,13 @@ add_llvm_library(LLVMScalarOpts SeparateConstOffsetFromGEP.cpp SimplifyCFGPass.cpp Sink.cpp + StraightLineStrengthReduce.cpp StructurizeCFG.cpp TailRecursionElimination.cpp + + ADDITIONAL_HEADER_DIRS + ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms + ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms/Scalar ) add_dependencies(LLVMScalarOpts intrinsics_gen) diff --git a/lib/Transforms/Scalar/ConstantHoisting.cpp b/lib/Transforms/Scalar/ConstantHoisting.cpp index 27c177a..e3aab4b 100644 --- a/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -131,14 +131,14 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<TargetTransformInfo>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); } private: /// \brief Initialize the pass. void setup(Function &Fn) { DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - TTI = &getAnalysis<TargetTransformInfo>(); + TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn); Entry = &Fn.getEntryBlock(); } @@ -176,7 +176,7 @@ char ConstantHoisting::ID = 0; INITIALIZE_PASS_BEGIN(ConstantHoisting, "consthoist", "Constant Hoisting", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(ConstantHoisting, "consthoist", "Constant Hoisting", false, false) @@ -186,6 +186,9 @@ FunctionPass *llvm::createConstantHoistingPass() { /// \brief Perform the constant hoisting optimization for the given function. 
bool ConstantHoisting::runOnFunction(Function &Fn) { + if (skipOptnoneFunction(Fn)) + return false; + DEBUG(dbgs() << "********** Begin Constant Hoisting **********\n"); DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n'); diff --git a/lib/Transforms/Scalar/ConstantProp.cpp b/lib/Transforms/Scalar/ConstantProp.cpp index dd51ce1..29d4e05 100644 --- a/lib/Transforms/Scalar/ConstantProp.cpp +++ b/lib/Transforms/Scalar/ConstantProp.cpp @@ -26,7 +26,7 @@ #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" #include "llvm/Pass.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include <set> using namespace llvm; @@ -45,7 +45,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<TargetLibraryInfo>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } }; } @@ -53,7 +53,7 @@ namespace { char ConstantPropagation::ID = 0; INITIALIZE_PASS_BEGIN(ConstantPropagation, "constprop", "Simple constant propagation", false, false) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(ConstantPropagation, "constprop", "Simple constant propagation", false, false) @@ -70,7 +70,8 @@ bool ConstantPropagation::runOnFunction(Function &F) { bool Changed = false; DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr; - TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); + TargetLibraryInfo *TLI = + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); while (!WorkList.empty()) { Instruction *I = *WorkList.begin(); diff --git a/lib/Transforms/Scalar/DCE.cpp b/lib/Transforms/Scalar/DCE.cpp index 99fac75..3b262a2 100644 --- a/lib/Transforms/Scalar/DCE.cpp +++ b/lib/Transforms/Scalar/DCE.cpp @@ -21,7 +21,7 @@ #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" #include "llvm/Pass.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -42,7 +42,8 @@ namespace { bool runOnBasicBlock(BasicBlock &BB) override { if (skipOptnoneFunction(BB)) return false; - TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); + auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); + TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr; bool Changed = false; for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) { Instruction *Inst = DI++; @@ -95,7 +96,8 @@ bool DCE::runOnFunction(Function &F) { if (skipOptnoneFunction(F)) return false; - TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); + auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); + TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr; // Start out with all of the instructions in the worklist... 
std::vector<Instruction*> WorkList; diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp index a1ddc00..c2ce1d5 100644 --- a/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -33,7 +33,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp index cd2ecad..9309623 100644 --- a/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/lib/Transforms/Scalar/EarlyCSE.cpp @@ -12,12 +12,13 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/EarlyCSE.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/ScopedHashTable.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AssumptionTracker.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" @@ -26,7 +27,8 @@ #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/RecyclingAllocator.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" #include <deque> using namespace llvm; @@ -40,49 +42,44 @@ STATISTIC(NumCSELoad, "Number of load instructions CSE'd"); STATISTIC(NumCSECall, "Number of call instructions CSE'd"); STATISTIC(NumDSE, "Number of trivial dead stores removed"); -static unsigned getHash(const void *V) { - return DenseMapInfo<const void*>::getHashValue(V); -} - //===----------------------------------------------------------------------===// // SimpleValue //===----------------------------------------------------------------------===// namespace { - /// SimpleValue - Instances of this struct represent available values in the - /// scoped hash table. - struct SimpleValue { - Instruction *Inst; +/// \brief Struct representing the available values in the scoped hash table. +struct SimpleValue { + Instruction *Inst; - SimpleValue(Instruction *I) : Inst(I) { - assert((isSentinel() || canHandle(I)) && "Inst can't be handled!"); - } + SimpleValue(Instruction *I) : Inst(I) { + assert((isSentinel() || canHandle(I)) && "Inst can't be handled!"); + } - bool isSentinel() const { - return Inst == DenseMapInfo<Instruction*>::getEmptyKey() || - Inst == DenseMapInfo<Instruction*>::getTombstoneKey(); - } + bool isSentinel() const { + return Inst == DenseMapInfo<Instruction *>::getEmptyKey() || + Inst == DenseMapInfo<Instruction *>::getTombstoneKey(); + } - static bool canHandle(Instruction *Inst) { - // This can only handle non-void readnone functions. - if (CallInst *CI = dyn_cast<CallInst>(Inst)) - return CI->doesNotAccessMemory() && !CI->getType()->isVoidTy(); - return isa<CastInst>(Inst) || isa<BinaryOperator>(Inst) || - isa<GetElementPtrInst>(Inst) || isa<CmpInst>(Inst) || - isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) || - isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst) || - isa<ExtractValueInst>(Inst) || isa<InsertValueInst>(Inst); - } - }; + static bool canHandle(Instruction *Inst) { + // This can only handle non-void readnone functions. 
+ if (CallInst *CI = dyn_cast<CallInst>(Inst)) + return CI->doesNotAccessMemory() && !CI->getType()->isVoidTy(); + return isa<CastInst>(Inst) || isa<BinaryOperator>(Inst) || + isa<GetElementPtrInst>(Inst) || isa<CmpInst>(Inst) || + isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) || + isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst) || + isa<ExtractValueInst>(Inst) || isa<InsertValueInst>(Inst); + } +}; } namespace llvm { -template<> struct DenseMapInfo<SimpleValue> { +template <> struct DenseMapInfo<SimpleValue> { static inline SimpleValue getEmptyKey() { - return DenseMapInfo<Instruction*>::getEmptyKey(); + return DenseMapInfo<Instruction *>::getEmptyKey(); } static inline SimpleValue getTombstoneKey() { - return DenseMapInfo<Instruction*>::getTombstoneKey(); + return DenseMapInfo<Instruction *>::getTombstoneKey(); } static unsigned getHashValue(SimpleValue Val); static bool isEqual(SimpleValue LHS, SimpleValue RHS); @@ -92,7 +89,7 @@ template<> struct DenseMapInfo<SimpleValue> { unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) { Instruction *Inst = Val.Inst; // Hash in all of the operands as pointers. - if (BinaryOperator* BinOp = dyn_cast<BinaryOperator>(Inst)) { + if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst)) { Value *LHS = BinOp->getOperand(0); Value *RHS = BinOp->getOperand(1); if (BinOp->isCommutative() && BinOp->getOperand(0) > BinOp->getOperand(1)) @@ -101,8 +98,9 @@ unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) { if (isa<OverflowingBinaryOperator>(BinOp)) { // Hash the overflow behavior unsigned Overflow = - BinOp->hasNoSignedWrap() * OverflowingBinaryOperator::NoSignedWrap | - BinOp->hasNoUnsignedWrap() * OverflowingBinaryOperator::NoUnsignedWrap; + BinOp->hasNoSignedWrap() * OverflowingBinaryOperator::NoSignedWrap | + BinOp->hasNoUnsignedWrap() * + OverflowingBinaryOperator::NoUnsignedWrap; return hash_combine(BinOp->getOpcode(), Overflow, LHS, RHS); } @@ -135,12 +133,13 @@ unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) { assert((isa<CallInst>(Inst) || isa<BinaryOperator>(Inst) || isa<GetElementPtrInst>(Inst) || isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) || - isa<ShuffleVectorInst>(Inst)) && "Invalid/unknown instruction"); + isa<ShuffleVectorInst>(Inst)) && + "Invalid/unknown instruction"); // Mix in the opcode. 
- return hash_combine(Inst->getOpcode(), - hash_combine_range(Inst->value_op_begin(), - Inst->value_op_end())); + return hash_combine( + Inst->getOpcode(), + hash_combine_range(Inst->value_op_begin(), Inst->value_op_end())); } bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) { @@ -149,22 +148,24 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) { if (LHS.isSentinel() || RHS.isSentinel()) return LHSI == RHSI; - if (LHSI->getOpcode() != RHSI->getOpcode()) return false; - if (LHSI->isIdenticalTo(RHSI)) return true; + if (LHSI->getOpcode() != RHSI->getOpcode()) + return false; + if (LHSI->isIdenticalTo(RHSI)) + return true; // If we're not strictly identical, we still might be a commutable instruction if (BinaryOperator *LHSBinOp = dyn_cast<BinaryOperator>(LHSI)) { if (!LHSBinOp->isCommutative()) return false; - assert(isa<BinaryOperator>(RHSI) - && "same opcode, but different instruction type?"); + assert(isa<BinaryOperator>(RHSI) && + "same opcode, but different instruction type?"); BinaryOperator *RHSBinOp = cast<BinaryOperator>(RHSI); // Check overflow attributes if (isa<OverflowingBinaryOperator>(LHSBinOp)) { - assert(isa<OverflowingBinaryOperator>(RHSBinOp) - && "same opcode, but different operator type?"); + assert(isa<OverflowingBinaryOperator>(RHSBinOp) && + "same opcode, but different operator type?"); if (LHSBinOp->hasNoUnsignedWrap() != RHSBinOp->hasNoUnsignedWrap() || LHSBinOp->hasNoSignedWrap() != RHSBinOp->hasNoSignedWrap()) return false; @@ -172,16 +173,16 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) { // Commuted equality return LHSBinOp->getOperand(0) == RHSBinOp->getOperand(1) && - LHSBinOp->getOperand(1) == RHSBinOp->getOperand(0); + LHSBinOp->getOperand(1) == RHSBinOp->getOperand(0); } if (CmpInst *LHSCmp = dyn_cast<CmpInst>(LHSI)) { - assert(isa<CmpInst>(RHSI) - && "same opcode, but different instruction type?"); + assert(isa<CmpInst>(RHSI) && + "same opcode, but different instruction type?"); CmpInst *RHSCmp = cast<CmpInst>(RHSI); // Commuted equality return LHSCmp->getOperand(0) == RHSCmp->getOperand(1) && - LHSCmp->getOperand(1) == RHSCmp->getOperand(0) && - LHSCmp->getSwappedPredicate() == RHSCmp->getPredicate(); + LHSCmp->getOperand(1) == RHSCmp->getOperand(0) && + LHSCmp->getSwappedPredicate() == RHSCmp->getPredicate(); } return false; @@ -192,57 +193,52 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) { //===----------------------------------------------------------------------===// namespace { - /// CallValue - Instances of this struct represent available call values in - /// the scoped hash table. - struct CallValue { - Instruction *Inst; +/// \brief Struct representing the available call values in the scoped hash +/// table. +struct CallValue { + Instruction *Inst; - CallValue(Instruction *I) : Inst(I) { - assert((isSentinel() || canHandle(I)) && "Inst can't be handled!"); - } + CallValue(Instruction *I) : Inst(I) { + assert((isSentinel() || canHandle(I)) && "Inst can't be handled!"); + } - bool isSentinel() const { - return Inst == DenseMapInfo<Instruction*>::getEmptyKey() || - Inst == DenseMapInfo<Instruction*>::getTombstoneKey(); - } + bool isSentinel() const { + return Inst == DenseMapInfo<Instruction *>::getEmptyKey() || + Inst == DenseMapInfo<Instruction *>::getTombstoneKey(); + } - static bool canHandle(Instruction *Inst) { - // Don't value number anything that returns void. 
- if (Inst->getType()->isVoidTy()) - return false; + static bool canHandle(Instruction *Inst) { + // Don't value number anything that returns void. + if (Inst->getType()->isVoidTy()) + return false; - CallInst *CI = dyn_cast<CallInst>(Inst); - if (!CI || !CI->onlyReadsMemory()) - return false; - return true; - } - }; + CallInst *CI = dyn_cast<CallInst>(Inst); + if (!CI || !CI->onlyReadsMemory()) + return false; + return true; + } +}; } namespace llvm { - template<> struct DenseMapInfo<CallValue> { - static inline CallValue getEmptyKey() { - return DenseMapInfo<Instruction*>::getEmptyKey(); - } - static inline CallValue getTombstoneKey() { - return DenseMapInfo<Instruction*>::getTombstoneKey(); - } - static unsigned getHashValue(CallValue Val); - static bool isEqual(CallValue LHS, CallValue RHS); - }; +template <> struct DenseMapInfo<CallValue> { + static inline CallValue getEmptyKey() { + return DenseMapInfo<Instruction *>::getEmptyKey(); + } + static inline CallValue getTombstoneKey() { + return DenseMapInfo<Instruction *>::getTombstoneKey(); + } + static unsigned getHashValue(CallValue Val); + static bool isEqual(CallValue LHS, CallValue RHS); +}; } + unsigned DenseMapInfo<CallValue>::getHashValue(CallValue Val) { Instruction *Inst = Val.Inst; - // Hash in all of the operands as pointers. - unsigned Res = 0; - for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i) { - assert(!Inst->getOperand(i)->getType()->isMetadataTy() && - "Cannot value number calls with metadata operands"); - Res ^= getHash(Inst->getOperand(i)) << (i & 0xF); - } - - // Mix in the opcode. - return (Res << 1) ^ Inst->getOpcode(); + // Hash all of the operands as pointers and mix in the opcode. + return hash_combine( + Inst->getOpcode(), + hash_combine_range(Inst->value_op_begin(), Inst->value_op_end())); } bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) { @@ -252,103 +248,106 @@ bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) { return LHSI->isIdenticalTo(RHSI); } - //===----------------------------------------------------------------------===// -// EarlyCSE pass. +// EarlyCSE implementation //===----------------------------------------------------------------------===// namespace { - -/// EarlyCSE - This pass does a simple depth-first walk over the dominator -/// tree, eliminating trivially redundant instructions and using instsimplify -/// to canonicalize things as it goes. It is intended to be fast and catch -/// obvious cases so that instcombine and other passes are more effective. It -/// is expected that a later pass of GVN will catch the interesting/hard -/// cases. -class EarlyCSE : public FunctionPass { +/// \brief A simple and fast domtree-based CSE pass. +/// +/// This pass does a simple depth-first walk over the dominator tree, +/// eliminating trivially redundant instructions and using instsimplify to +/// canonicalize things as it goes. It is intended to be fast and catch obvious +/// cases so that instcombine and other passes are more effective. It is +/// expected that a later pass of GVN will catch the interesting/hard cases. 
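A minimal sketch of the scoped-hash-table idea this pass is built on, using std::unordered_map plus an undo log (LLVM's ScopedHashTable additionally supports shadowing an outer entry, which this toy version does not): entering a dominator-tree node pushes a scope, and leaving it removes exactly the values that were only available inside that subtree.

#include <cstdio>
#include <string>
#include <unordered_map>
#include <vector>

class ScopedMap {
  std::unordered_map<std::string, int> Map;
  std::vector<std::string> Undo;    // keys inserted, in insertion order
  std::vector<size_t> ScopeStarts;  // undo-log size at each scope entry
public:
  void enterScope() { ScopeStarts.push_back(Undo.size()); }
  void exitScope() {
    size_t Start = ScopeStarts.back();
    ScopeStarts.pop_back();
    while (Undo.size() > Start) {   // drop everything this scope made available
      Map.erase(Undo.back());
      Undo.pop_back();
    }
  }
  void insert(const std::string &Expr, int Value) {
    if (Map.emplace(Expr, Value).second) // simplified: first definition wins
      Undo.push_back(Expr);
  }
  const int *lookup(const std::string &Expr) const {
    auto It = Map.find(Expr);
    return It == Map.end() ? nullptr : &It->second;
  }
};

int main() {
  ScopedMap Available;
  Available.enterScope();              // entry block
  Available.insert("add a b", 1);      // value computed in the dominator
  Available.enterScope();              // a dominated block
  std::printf("%s\n", Available.lookup("add a b") ? "reuse" : "recompute");
  Available.exitScope();               // leaving the subtree forgets nothing
  Available.exitScope();               // that was visible above it
}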
+class EarlyCSE { public: + Function &F; const DataLayout *DL; - const TargetLibraryInfo *TLI; - DominatorTree *DT; - AssumptionTracker *AT; - typedef RecyclingAllocator<BumpPtrAllocator, - ScopedHashTableVal<SimpleValue, Value*> > AllocatorTy; - typedef ScopedHashTable<SimpleValue, Value*, DenseMapInfo<SimpleValue>, + const TargetLibraryInfo &TLI; + const TargetTransformInfo &TTI; + DominatorTree &DT; + AssumptionCache &AC; + typedef RecyclingAllocator< + BumpPtrAllocator, ScopedHashTableVal<SimpleValue, Value *>> AllocatorTy; + typedef ScopedHashTable<SimpleValue, Value *, DenseMapInfo<SimpleValue>, AllocatorTy> ScopedHTType; - /// AvailableValues - This scoped hash table contains the current values of - /// all of our simple scalar expressions. As we walk down the domtree, we - /// look to see if instructions are in this: if so, we replace them with what - /// we find, otherwise we insert them so that dominated values can succeed in - /// their lookup. - ScopedHTType *AvailableValues; - - /// AvailableLoads - This scoped hash table contains the current values - /// of loads. This allows us to get efficient access to dominating loads when - /// we have a fully redundant load. In addition to the most recent load, we - /// keep track of a generation count of the read, which is compared against - /// the current generation count. The current generation count is - /// incremented after every possibly writing memory operation, which ensures - /// that we only CSE loads with other loads that have no intervening store. - typedef RecyclingAllocator<BumpPtrAllocator, - ScopedHashTableVal<Value*, std::pair<Value*, unsigned> > > LoadMapAllocator; - typedef ScopedHashTable<Value*, std::pair<Value*, unsigned>, - DenseMapInfo<Value*>, LoadMapAllocator> LoadHTType; - LoadHTType *AvailableLoads; - - /// AvailableCalls - This scoped hash table contains the current values - /// of read-only call values. It uses the same generation count as loads. - typedef ScopedHashTable<CallValue, std::pair<Value*, unsigned> > CallHTType; - CallHTType *AvailableCalls; - - /// CurrentGeneration - This is the current generation of the memory value. + /// \brief A scoped hash table of the current values of all of our simple + /// scalar expressions. + /// + /// As we walk down the domtree, we look to see if instructions are in this: + /// if so, we replace them with what we find, otherwise we insert them so + /// that dominated values can succeed in their lookup. + ScopedHTType AvailableValues; + + /// \brief A scoped hash table of the current values of loads. + /// + /// This allows us to get efficient access to dominating loads when we have + /// a fully redundant load. In addition to the most recent load, we keep + /// track of a generation count of the read, which is compared against the + /// current generation count. The current generation count is incremented + /// after every possibly writing memory operation, which ensures that we only + /// CSE loads with other loads that have no intervening store. + typedef RecyclingAllocator< + BumpPtrAllocator, + ScopedHashTableVal<Value *, std::pair<Value *, unsigned>>> + LoadMapAllocator; + typedef ScopedHashTable<Value *, std::pair<Value *, unsigned>, + DenseMapInfo<Value *>, LoadMapAllocator> LoadHTType; + LoadHTType AvailableLoads; + + /// \brief A scoped hash table of the current values of read-only call + /// values. + /// + /// It uses the same generation count as loads. 
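A rough sketch of how the generation count described above guards load CSE (toy types; the string "p" stands in for a pointer operand): a cached load is only reused while the generation recorded with it still matches the current one, and any possibly-writing instruction bumps the current generation.

#include <cstdio>
#include <string>
#include <unordered_map>
#include <utility>

// Cached loads are only reusable while no possibly-writing instruction has
// executed since they were recorded; the generation counter encodes that.
struct LoadCache {
  unsigned CurrentGeneration = 0;
  std::unordered_map<std::string, std::pair<int, unsigned>> Available;

  void recordLoad(const std::string &Ptr, int Value) {
    Available[Ptr] = {Value, CurrentGeneration};
  }
  void noteMayWrite() { ++CurrentGeneration; } // store, or call that may write
  const int *lookup(const std::string &Ptr) const {
    auto It = Available.find(Ptr);
    if (It == Available.end() || It->second.second != CurrentGeneration)
      return nullptr;                          // stale: a write intervened
    return &It->second.first;
  }
};

int main() {
  LoadCache C;
  C.recordLoad("p", 42);
  std::printf("%s\n", C.lookup("p") ? "CSE'd" : "reload"); // CSE'd
  C.noteMayWrite();                                        // e.g. a store
  std::printf("%s\n", C.lookup("p") ? "CSE'd" : "reload"); // reload
}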
+ typedef ScopedHashTable<CallValue, std::pair<Value *, unsigned>> CallHTType; + CallHTType AvailableCalls; + + /// \brief This is the current generation of the memory value. unsigned CurrentGeneration; - static char ID; - explicit EarlyCSE() : FunctionPass(ID) { - initializeEarlyCSEPass(*PassRegistry::getPassRegistry()); + /// \brief Set up the EarlyCSE runner for a particular function. + EarlyCSE(Function &F, const DataLayout *DL, const TargetLibraryInfo &TLI, + const TargetTransformInfo &TTI, DominatorTree &DT, + AssumptionCache &AC) + : F(F), DL(DL), TLI(TLI), TTI(TTI), DT(DT), AC(AC), CurrentGeneration(0) { } - bool runOnFunction(Function &F) override; + bool run(); private: - - // NodeScope - almost a POD, but needs to call the constructors for the - // scoped hash tables so that a new scope gets pushed on. These are RAII so - // that the scope gets popped when the NodeScope is destroyed. + // Almost a POD, but needs to call the constructors for the scoped hash + // tables so that a new scope gets pushed on. These are RAII so that the + // scope gets popped when the NodeScope is destroyed. class NodeScope { - public: - NodeScope(ScopedHTType *availableValues, - LoadHTType *availableLoads, - CallHTType *availableCalls) : - Scope(*availableValues), - LoadScope(*availableLoads), - CallScope(*availableCalls) {} - - private: - NodeScope(const NodeScope&) LLVM_DELETED_FUNCTION; - void operator=(const NodeScope&) LLVM_DELETED_FUNCTION; + public: + NodeScope(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads, + CallHTType &AvailableCalls) + : Scope(AvailableValues), LoadScope(AvailableLoads), + CallScope(AvailableCalls) {} + + private: + NodeScope(const NodeScope &) = delete; + void operator=(const NodeScope &) = delete; ScopedHTType::ScopeTy Scope; LoadHTType::ScopeTy LoadScope; CallHTType::ScopeTy CallScope; }; - // StackNode - contains all the needed information to create a stack for - // doing a depth first tranversal of the tree. This includes scopes for - // values, loads, and calls as well as the generation. There is a child - // iterator so that the children do not need to be store spearately. + // Contains all the needed information to create a stack for doing a depth + // first tranversal of the tree. This includes scopes for values, loads, and + // calls as well as the generation. There is a child iterator so that the + // children do not need to be store spearately. class StackNode { - public: - StackNode(ScopedHTType *availableValues, - LoadHTType *availableLoads, - CallHTType *availableCalls, - unsigned cg, DomTreeNode *n, - DomTreeNode::iterator child, DomTreeNode::iterator end) : - CurrentGeneration(cg), ChildGeneration(cg), Node(n), - ChildIter(child), EndIter(end), - Scopes(availableValues, availableLoads, availableCalls), - Processed(false) {} + public: + StackNode(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads, + CallHTType &AvailableCalls, unsigned cg, DomTreeNode *n, + DomTreeNode::iterator child, DomTreeNode::iterator end) + : CurrentGeneration(cg), ChildGeneration(cg), Node(n), ChildIter(child), + EndIter(end), Scopes(AvailableValues, AvailableLoads, AvailableCalls), + Processed(false) {} // Accessors. 
unsigned currentGeneration() { return CurrentGeneration; } @@ -365,9 +364,9 @@ private: bool isProcessed() { return Processed; } void process() { Processed = true; } - private: - StackNode(const StackNode&) LLVM_DELETED_FUNCTION; - void operator=(const StackNode&) LLVM_DELETED_FUNCTION; + private: + StackNode(const StackNode &) = delete; + void operator=(const StackNode &) = delete; // Members. unsigned CurrentGeneration; @@ -379,31 +378,78 @@ private: bool Processed; }; + /// \brief Wrapper class to handle memory instructions, including loads, + /// stores and intrinsic loads and stores defined by the target. + class ParseMemoryInst { + public: + ParseMemoryInst(Instruction *Inst, const TargetTransformInfo &TTI) + : Load(false), Store(false), Vol(false), MayReadFromMemory(false), + MayWriteToMemory(false), MatchingId(-1), Ptr(nullptr) { + MayReadFromMemory = Inst->mayReadFromMemory(); + MayWriteToMemory = Inst->mayWriteToMemory(); + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { + MemIntrinsicInfo Info; + if (!TTI.getTgtMemIntrinsic(II, Info)) + return; + if (Info.NumMemRefs == 1) { + Store = Info.WriteMem; + Load = Info.ReadMem; + MatchingId = Info.MatchingId; + MayReadFromMemory = Info.ReadMem; + MayWriteToMemory = Info.WriteMem; + Vol = Info.Vol; + Ptr = Info.PtrVal; + } + } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { + Load = true; + Vol = !LI->isSimple(); + Ptr = LI->getPointerOperand(); + } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + Store = true; + Vol = !SI->isSimple(); + Ptr = SI->getPointerOperand(); + } + } + bool isLoad() { return Load; } + bool isStore() { return Store; } + bool isVolatile() { return Vol; } + bool isMatchingMemLoc(const ParseMemoryInst &Inst) { + return Ptr == Inst.Ptr && MatchingId == Inst.MatchingId; + } + bool isValid() { return Ptr != nullptr; } + int getMatchingId() { return MatchingId; } + Value *getPtr() { return Ptr; } + bool mayReadFromMemory() { return MayReadFromMemory; } + bool mayWriteToMemory() { return MayWriteToMemory; } + + private: + bool Load; + bool Store; + bool Vol; + bool MayReadFromMemory; + bool MayWriteToMemory; + // For regular (non-intrinsic) loads/stores, this is set to -1. For + // intrinsic loads/stores, the id is retrieved from the corresponding + // field in the MemIntrinsicInfo structure. That field contains + // non-negative values only. + int MatchingId; + Value *Ptr; + }; + bool processNode(DomTreeNode *Node); - // This transformation requires dominator postdominator info - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionTracker>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<TargetLibraryInfo>(); - AU.setPreservesCFG(); + Value *getOrCreateResult(Value *Inst, Type *ExpectedType) const { + if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) + return LI; + else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) + return SI->getValueOperand(); + assert(isa<IntrinsicInst>(Inst) && "Instruction not supported"); + return TTI.getOrCreateResultFromMemIntrinsic(cast<IntrinsicInst>(Inst), + ExpectedType); } }; } -char EarlyCSE::ID = 0; - -// createEarlyCSEPass - The public interface to this file. 
-FunctionPass *llvm::createEarlyCSEPass() { - return new EarlyCSE(); -} - -INITIALIZE_PASS_BEGIN(EarlyCSE, "early-cse", "Early CSE", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionTracker) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) -INITIALIZE_PASS_END(EarlyCSE, "early-cse", "Early CSE", false, false) - bool EarlyCSE::processNode(DomTreeNode *Node) { BasicBlock *BB = Node->getBlock(); @@ -420,17 +466,17 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { /// as long as there in no instruction that reads memory. If we see a store /// to the same location, we delete the dead store. This zaps trivial dead /// stores which can occur in bitfield code among other things. - StoreInst *LastStore = nullptr; + Instruction *LastStore = nullptr; bool Changed = false; // See if any instructions in the block can be eliminated. If so, do it. If // not, add them to AvailableValues. - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { Instruction *Inst = I++; // Dead instructions should just be removed. - if (isInstructionTriviallyDead(Inst, TLI)) { + if (isInstructionTriviallyDead(Inst, &TLI)) { DEBUG(dbgs() << "EarlyCSE DCE: " << *Inst << '\n'); Inst->eraseFromParent(); Changed = true; @@ -449,7 +495,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // If the instruction can be simplified (e.g. X+0 = X) then replace it with // its simpler value. - if (Value *V = SimplifyInstruction(Inst, DL, TLI, DT, AT)) { + if (Value *V = SimplifyInstruction(Inst, DL, &TLI, &DT, &AC)) { DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << " to: " << *V << '\n'); Inst->replaceAllUsesWith(V); Inst->eraseFromParent(); @@ -461,7 +507,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // If this is a simple instruction that we can value number, process it. if (SimpleValue::canHandle(Inst)) { // See if the instruction has an available value. If so, use it. - if (Value *V = AvailableValues->lookup(Inst)) { + if (Value *V = AvailableValues.lookup(Inst)) { DEBUG(dbgs() << "EarlyCSE CSE: " << *Inst << " to: " << *V << '\n'); Inst->replaceAllUsesWith(V); Inst->eraseFromParent(); @@ -471,52 +517,66 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { } // Otherwise, just remember that this value is available. - AvailableValues->insert(Inst, Inst); + AvailableValues.insert(Inst, Inst); continue; } + ParseMemoryInst MemInst(Inst, TTI); // If this is a non-volatile load, process it. - if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { + if (MemInst.isValid() && MemInst.isLoad()) { // Ignore volatile loads. - if (!LI->isSimple()) { + if (MemInst.isVolatile()) { LastStore = nullptr; + // Don't CSE across synchronization boundaries. + if (Inst->mayWriteToMemory()) + ++CurrentGeneration; continue; } // If we have an available version of this load, and if it is the right // generation, replace this instruction. 
- std::pair<Value*, unsigned> InVal = - AvailableLoads->lookup(Inst->getOperand(0)); + std::pair<Value *, unsigned> InVal = + AvailableLoads.lookup(MemInst.getPtr()); if (InVal.first != nullptr && InVal.second == CurrentGeneration) { - DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst << " to: " - << *InVal.first << '\n'); - if (!Inst->use_empty()) Inst->replaceAllUsesWith(InVal.first); - Inst->eraseFromParent(); - Changed = true; - ++NumCSELoad; - continue; + Value *Op = getOrCreateResult(InVal.first, Inst->getType()); + if (Op != nullptr) { + DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst + << " to: " << *InVal.first << '\n'); + if (!Inst->use_empty()) + Inst->replaceAllUsesWith(Op); + Inst->eraseFromParent(); + Changed = true; + ++NumCSELoad; + continue; + } } // Otherwise, remember that we have this instruction. - AvailableLoads->insert(Inst->getOperand(0), - std::pair<Value*, unsigned>(Inst, CurrentGeneration)); + AvailableLoads.insert(MemInst.getPtr(), std::pair<Value *, unsigned>( + Inst, CurrentGeneration)); LastStore = nullptr; continue; } // If this instruction may read from memory, forget LastStore. - if (Inst->mayReadFromMemory()) + // Load/store intrinsics will indicate both a read and a write to + // memory. The target may override this (e.g. so that a store intrinsic + // does not read from memory, and thus will be treated the same as a + // regular store for commoning purposes). + if (Inst->mayReadFromMemory() && + !(MemInst.isValid() && !MemInst.mayReadFromMemory())) LastStore = nullptr; // If this is a read-only call, process it. if (CallValue::canHandle(Inst)) { // If we have an available version of this call, and if it is the right // generation, replace this instruction. - std::pair<Value*, unsigned> InVal = AvailableCalls->lookup(Inst); + std::pair<Value *, unsigned> InVal = AvailableCalls.lookup(Inst); if (InVal.first != nullptr && InVal.second == CurrentGeneration) { - DEBUG(dbgs() << "EarlyCSE CSE CALL: " << *Inst << " to: " - << *InVal.first << '\n'); - if (!Inst->use_empty()) Inst->replaceAllUsesWith(InVal.first); + DEBUG(dbgs() << "EarlyCSE CSE CALL: " << *Inst + << " to: " << *InVal.first << '\n'); + if (!Inst->use_empty()) + Inst->replaceAllUsesWith(InVal.first); Inst->eraseFromParent(); Changed = true; ++NumCSECall; @@ -524,8 +584,8 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { } // Otherwise, remember that we have this instruction. - AvailableCalls->insert(Inst, - std::pair<Value*, unsigned>(Inst, CurrentGeneration)); + AvailableCalls.insert( + Inst, std::pair<Value *, unsigned>(Inst, CurrentGeneration)); continue; } @@ -535,17 +595,19 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { if (Inst->mayWriteToMemory()) { ++CurrentGeneration; - if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + if (MemInst.isValid() && MemInst.isStore()) { // We do a trivial form of DSE if there are two stores to the same // location with no intervening loads. Delete the earlier store. 
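A standalone sketch of that last-store rule over a toy operation sequence (made-up struct names; the real code additionally matches target memory-intrinsic ids through ParseMemoryInst): a store makes the previous store to the same location dead only if nothing that may read memory appeared in between.

#include <cstdio>
#include <string>
#include <vector>

struct ToyOp {
  enum KindTy { Load, Store };
  KindTy Kind;
  std::string Ptr;
};

// Within a single block: remember the most recent store; a later store to
// the same location with no intervening may-read operation makes the earlier
// store dead. Any load clears the candidate.
static std::vector<bool> trivialDSE(const std::vector<ToyOp> &Ops) {
  std::vector<bool> Dead(Ops.size(), false);
  int LastStore = -1;
  for (int i = 0, e = (int)Ops.size(); i != e; ++i) {
    if (Ops[i].Kind == ToyOp::Load) {
      LastStore = -1;
    } else { // a store
      if (LastStore != -1 && Ops[LastStore].Ptr == Ops[i].Ptr)
        Dead[LastStore] = true;
      LastStore = i;
    }
  }
  return Dead;
}

int main() {
  // store p; store p            -> the first store to p is dead
  // store q; load q; store q    -> neither store to q is dead
  std::vector<ToyOp> Ops = {{ToyOp::Store, "p"}, {ToyOp::Store, "p"},
                            {ToyOp::Store, "q"}, {ToyOp::Load, "q"},
                            {ToyOp::Store, "q"}};
  std::vector<bool> Dead = trivialDSE(Ops);
  for (int i = 0, e = (int)Ops.size(); i != e; ++i)
    std::printf("op %d: %s\n", i, Dead[i] ? "dead store" : "kept");
}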
- if (LastStore && - LastStore->getPointerOperand() == SI->getPointerOperand()) { - DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore << " due to: " - << *Inst << '\n'); - LastStore->eraseFromParent(); - Changed = true; - ++NumDSE; - LastStore = nullptr; + if (LastStore) { + ParseMemoryInst LastStoreMemInst(LastStore, TTI); + if (LastStoreMemInst.isMatchingMemLoc(MemInst)) { + DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore + << " due to: " << *Inst << '\n'); + LastStore->eraseFromParent(); + Changed = true; + ++NumDSE; + LastStore = nullptr; + } // fallthrough - we can exploit information about this store } @@ -554,12 +616,12 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // version of the pointer. It is safe to forward from volatile stores // to non-volatile loads, so we don't have to check for volatility of // the store. - AvailableLoads->insert(SI->getPointerOperand(), - std::pair<Value*, unsigned>(SI->getValueOperand(), CurrentGeneration)); + AvailableLoads.insert(MemInst.getPtr(), std::pair<Value *, unsigned>( + Inst, CurrentGeneration)); // Remember that this was the last store we saw for DSE. - if (SI->isSimple()) - LastStore = SI; + if (!MemInst.isVolatile()) + LastStore = Inst; } } } @@ -567,40 +629,20 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { return Changed; } - -bool EarlyCSE::runOnFunction(Function &F) { - if (skipOptnoneFunction(F)) - return false; - - // Note, deque is being used here because there is significant performance gains - // over vector when the container becomes very large due to the specific access - // patterns. For more information see the mailing list discussion on this: +bool EarlyCSE::run() { + // Note, deque is being used here because there is significant performance + // gains over vector when the container becomes very large due to the + // specific access patterns. For more information see the mailing list + // discussion on this: // http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20120116/135228.html std::deque<StackNode *> nodesToProcess; - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : nullptr; - TLI = &getAnalysis<TargetLibraryInfo>(); - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - AT = &getAnalysis<AssumptionTracker>(); - - // Tables that the pass uses when walking the domtree. - ScopedHTType AVTable; - AvailableValues = &AVTable; - LoadHTType LoadTable; - AvailableLoads = &LoadTable; - CallHTType CallTable; - AvailableCalls = &CallTable; - - CurrentGeneration = 0; bool Changed = false; // Process the root node. - nodesToProcess.push_back( - new StackNode(AvailableValues, AvailableLoads, AvailableCalls, - CurrentGeneration, DT->getRootNode(), - DT->getRootNode()->begin(), - DT->getRootNode()->end())); + nodesToProcess.push_back(new StackNode( + AvailableValues, AvailableLoads, AvailableCalls, CurrentGeneration, + DT.getRootNode(), DT.getRootNode()->begin(), DT.getRootNode()->end())); // Save the current generation. unsigned LiveOutGeneration = CurrentGeneration; @@ -624,11 +666,9 @@ bool EarlyCSE::runOnFunction(Function &F) { // Push the next child onto the stack. 
DomTreeNode *child = NodeToProcess->nextChild(); nodesToProcess.push_back( - new StackNode(AvailableValues, - AvailableLoads, - AvailableCalls, - NodeToProcess->childGeneration(), child, - child->begin(), child->end())); + new StackNode(AvailableValues, AvailableLoads, AvailableCalls, + NodeToProcess->childGeneration(), child, child->begin(), + child->end())); } else { // It has been processed, and there are no more children to process, // so delete it and pop it off the stack. @@ -642,3 +682,78 @@ bool EarlyCSE::runOnFunction(Function &F) { return Changed; } + +PreservedAnalyses EarlyCSEPass::run(Function &F, + AnalysisManager<Function> *AM) { + const DataLayout *DL = F.getParent()->getDataLayout(); + + auto &TLI = AM->getResult<TargetLibraryAnalysis>(F); + auto &TTI = AM->getResult<TargetIRAnalysis>(F); + auto &DT = AM->getResult<DominatorTreeAnalysis>(F); + auto &AC = AM->getResult<AssumptionAnalysis>(F); + + EarlyCSE CSE(F, DL, TLI, TTI, DT, AC); + + if (!CSE.run()) + return PreservedAnalyses::all(); + + // CSE preserves the dominator tree because it doesn't mutate the CFG. + // FIXME: Bundle this with other CFG-preservation. + PreservedAnalyses PA; + PA.preserve<DominatorTreeAnalysis>(); + return PA; +} + +namespace { +/// \brief A simple and fast domtree-based CSE pass. +/// +/// This pass does a simple depth-first walk over the dominator tree, +/// eliminating trivially redundant instructions and using instsimplify to +/// canonicalize things as it goes. It is intended to be fast and catch obvious +/// cases so that instcombine and other passes are more effective. It is +/// expected that a later pass of GVN will catch the interesting/hard cases. +class EarlyCSELegacyPass : public FunctionPass { +public: + static char ID; + + EarlyCSELegacyPass() : FunctionPass(ID) { + initializeEarlyCSELegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + if (skipOptnoneFunction(F)) + return false; + + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + auto *DL = DLP ? 
&DLP->getDataLayout() : nullptr; + auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + + EarlyCSE CSE(F, DL, TLI, TTI, DT, AC); + + return CSE.run(); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.setPreservesCFG(); + } +}; +} + +char EarlyCSELegacyPass::ID = 0; + +FunctionPass *llvm::createEarlyCSEPass() { return new EarlyCSELegacyPass(); } + +INITIALIZE_PASS_BEGIN(EarlyCSELegacyPass, "early-cse", "Early CSE", false, + false) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(EarlyCSELegacyPass, "early-cse", "Early CSE", false, false) diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 7dba4e2..73a1f25 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -20,11 +20,12 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/MapVector.h" +#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/AssumptionTracker.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" @@ -44,7 +45,7 @@ #include "llvm/Support/Allocator.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" @@ -457,7 +458,7 @@ uint32_t ValueTable::lookup_or_add(Value *V) { return e; } -/// lookup - Returns the value number of the specified value. Fails if +/// Returns the value number of the specified value. Fails if /// the value has not yet been numbered. uint32_t ValueTable::lookup(Value *V) const { DenseMap<Value*, uint32_t>::const_iterator VI = valueNumbering.find(V); @@ -465,7 +466,7 @@ uint32_t ValueTable::lookup(Value *V) const { return VI->second; } -/// lookup_or_add_cmp - Returns the value number of the given comparison, +/// Returns the value number of the given comparison, /// assigning it a new number if it did not have one before. Useful when /// we deduced the result of a comparison, but don't immediately have an /// instruction realizing that comparison to hand. @@ -478,14 +479,14 @@ uint32_t ValueTable::lookup_or_add_cmp(unsigned Opcode, return e; } -/// clear - Remove all entries from the ValueTable. +/// Remove all entries from the ValueTable. void ValueTable::clear() { valueNumbering.clear(); expressionNumbering.clear(); nextValueNumber = 1; } -/// erase - Remove a value from the value numbering. +/// Remove a value from the value numbering. 
void ValueTable::erase(Value *V) { valueNumbering.erase(V); } @@ -581,8 +582,8 @@ namespace { return cast<MemIntrinsic>(Val.getPointer()); } - /// MaterializeAdjustedValue - Emit code into this block to adjust the value - /// defined here to the specified type. This handles various coercion cases. + /// Emit code into this block to adjust the value defined here to the + /// specified type. This handles various coercion cases. Value *MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) const; }; @@ -592,12 +593,12 @@ namespace { DominatorTree *DT; const DataLayout *DL; const TargetLibraryInfo *TLI; - AssumptionTracker *AT; + AssumptionCache *AC; SetVector<BasicBlock *> DeadBlocks; ValueTable VN; - /// LeaderTable - A mapping from value numbers to lists of Value*'s that + /// A mapping from value numbers to lists of Value*'s that /// have that value number. Use findLeader to query it. struct LeaderTableEntry { Value *Val; @@ -622,7 +623,7 @@ namespace { bool runOnFunction(Function &F) override; - /// markInstructionForDeletion - This removes the specified instruction from + /// This removes the specified instruction from /// our various maps and marks it for deletion. void markInstructionForDeletion(Instruction *I) { VN.erase(I); @@ -634,8 +635,7 @@ namespace { AliasAnalysis *getAliasAnalysis() const { return VN.getAliasAnalysis(); } MemoryDependenceAnalysis &getMemDep() const { return *MD; } private: - /// addToLeaderTable - Push a new Value to the LeaderTable onto the list for - /// its value number. + /// Push a new Value to the LeaderTable onto the list for its value number. void addToLeaderTable(uint32_t N, Value *V, const BasicBlock *BB) { LeaderTableEntry &Curr = LeaderTable[N]; if (!Curr.Val) { @@ -651,7 +651,7 @@ namespace { Curr.Next = Node; } - /// removeFromLeaderTable - Scan the list of values corresponding to a given + /// Scan the list of values corresponding to a given /// value number, and remove the given instruction if encountered. void removeFromLeaderTable(uint32_t N, Instruction *I, BasicBlock *BB) { LeaderTableEntry* Prev = nullptr; @@ -682,9 +682,9 @@ namespace { // This transformation requires dominator postdominator info void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionTracker>(); + AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<TargetLibraryInfo>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); if (!NoLoads) AU.addRequired<MemoryDependenceAnalysis>(); AU.addRequired<AliasAnalysis>(); @@ -709,6 +709,9 @@ namespace { void dump(DenseMap<uint32_t, Value*> &d); bool iterateOnFunction(Function &F); bool performPRE(Function &F); + bool performScalarPRE(Instruction *I); + bool performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, + unsigned int ValNo); Value *findLeader(const BasicBlock *BB, uint32_t num); void cleanupGlobalSets(); void verifyRemoved(const Instruction *I) const; @@ -725,16 +728,16 @@ namespace { char GVN::ID = 0; } -// createGVNPass - The public interface to this file... +// The public interface to this file... 
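// Minimal sketch (hypothetical types; GVN's real LeaderTable is an intrusive,
// bump-allocated linked list) of the leader-table idea described above: each
// value number maps to the (Value, BasicBlock) pairs known to compute it, and
// a findLeader-style query returns one whose defining block dominates the
// block where the value is to be reused.
#include <utility>
#include <vector>

template <typename ValueT, typename BlockT, typename DominatesFn>
ValueT *findLeaderIn(
    const std::vector<std::pair<ValueT *, const BlockT *>> &Leaders,
    const BlockT *BB, DominatesFn Dominates) {
  for (const auto &Entry : Leaders)
    if (Dominates(Entry.second, BB)) // Entry's definition is visible in BB
      return Entry.first;
  return nullptr; // no dominating leader: nothing can be replaced in BB
}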
FunctionPass *llvm::createGVNPass(bool NoLoads) { return new GVN(NoLoads); } INITIALIZE_PASS_BEGIN(GVN, "gvn", "Global Value Numbering", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionTracker) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_END(GVN, "gvn", "Global Value Numbering", false, false) @@ -750,7 +753,7 @@ void GVN::dump(DenseMap<uint32_t, Value*>& d) { } #endif -/// IsValueFullyAvailableInBlock - Return true if we can prove that the value +/// Return true if we can prove that the value /// we're analyzing is fully available in the specified block. As we go, keep /// track of which blocks we know are fully alive in FullyAvailableBlocks. This /// map is actually a tri-state map with the following values: @@ -796,7 +799,7 @@ static bool IsValueFullyAvailableInBlock(BasicBlock *BB, return true; -// SpeculationFailure - If we get here, we found out that this is not, after +// If we get here, we found out that this is not, after // all, a fully-available block. We have a problem if we speculated on this and // used the speculation to mark other blocks as available. SpeculationFailure: @@ -831,8 +834,7 @@ SpeculationFailure: } -/// CanCoerceMustAliasedValueToLoad - Return true if -/// CoerceAvailableValueToLoadType will succeed. +/// Return true if CoerceAvailableValueToLoadType will succeed. static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy, const DataLayout &DL) { @@ -851,7 +853,7 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, return true; } -/// CoerceAvailableValueToLoadType - If we saw a store of a value to memory, and +/// If we saw a store of a value to memory, and /// then a load from a must-aliased pointer of a different type, try to coerce /// the stored value. LoadedTy is the type of the load we want to replace and /// InsertPt is the place to insert new instructions. @@ -936,7 +938,7 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, return new BitCastInst(StoredVal, LoadedTy, "bitcast", InsertPt); } -/// AnalyzeLoadFromClobberingWrite - This function is called when we have a +/// This function is called when we have a /// memdep query of a load that ends up being a clobbering memory write (store, /// memset, memcpy, memmove). This means that the write *may* provide bits used /// by the load but we can't be sure because the pointers don't mustalias. @@ -1016,7 +1018,7 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr, return LoadOffset-StoreOffset; } -/// AnalyzeLoadFromClobberingStore - This function is called when we have a +/// This function is called when we have a /// memdep query of a load that ends up being a clobbering store. static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr, StoreInst *DepSI, @@ -1032,7 +1034,7 @@ static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr, StorePtr, StoreSize, DL); } -/// AnalyzeLoadFromClobberingLoad - This function is called when we have a +/// This function is called when we have a /// memdep query of a load that ends up being clobbered by another load. See if /// the other load can feed into the second load. 
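// Simplified sketch (hypothetical helper, byte offsets only) of the overlap
// test behind the AnalyzeLoadFromClobbering* helpers documented above:
// forwarding is possible only when the loaded bytes are fully contained in
// the clobbering write, and the useful result is the load's byte offset into
// the stored value.
inline int loadOffsetIntoClobberingWrite(long long StoreOffset,
                                         unsigned StoreSize,
                                         long long LoadOffset,
                                         unsigned LoadSize) {
  if (LoadOffset < StoreOffset ||
      LoadOffset + LoadSize > StoreOffset + StoreSize)
    return -1; // load not fully covered by the write; no forwarding
  return static_cast<int>(LoadOffset - StoreOffset);
}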
static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, @@ -1108,7 +1110,7 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, } -/// GetStoreValueForLoad - This function is called when we have a +/// This function is called when we have a /// memdep query of a load that ends up being a clobbering store. This means /// that the store provides bits used by the load but we the pointers don't /// mustalias. Check this case to see if there is anything more we can do @@ -1147,7 +1149,7 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, return CoerceAvailableValueToLoadType(SrcVal, LoadTy, InsertPt, DL); } -/// GetLoadValueForLoad - This function is called when we have a +/// This function is called when we have a /// memdep query of a load that ends up being a clobbering load. This means /// that the load *may* provide bits used by the load but we can't be sure /// because the pointers don't mustalias. Check this case to see if there is @@ -1210,7 +1212,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, } -/// GetMemInstValueForLoad - This function is called when we have a +/// This function is called when we have a /// memdep query of a load that ends up being a clobbering mem intrinsic. static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, Type *LoadTy, Instruction *InsertPt, @@ -1267,7 +1269,7 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, } -/// ConstructSSAForLoadSet - Given a set of loads specified by ValuesPerBlock, +/// Given a set of loads specified by ValuesPerBlock, /// construct SSA form, allowing us to eliminate LI. This returns the value /// that should be used at LI's definition site. static Value *ConstructSSAForLoadSet(LoadInst *LI, @@ -1621,7 +1623,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // If all preds have a single successor, then we know it is safe to insert // the load on the pred (?!?), so we can insert code to materialize the // pointer if it is not available. - PHITransAddr Address(LI->getPointerOperand(), DL, AT); + PHITransAddr Address(LI->getPointerOperand(), DL, AC); Value *LoadPtr = nullptr; LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred, *DT, NewInsts); @@ -1702,13 +1704,12 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, return true; } -/// processNonLocalLoad - Attempt to eliminate a load whose dependencies are +/// Attempt to eliminate a load whose dependencies are /// non-local by performing PHI construction. bool GVN::processNonLocalLoad(LoadInst *LI) { // Step 1: Find the non-local dependencies of the load. LoadDepVect Deps; - AliasAnalysis::Location Loc = VN.getAliasAnalysis()->getLocation(LI); - MD->getNonLocalPointerDependency(Loc, true, LI->getParent(), Deps); + MD->getNonLocalPointerDependency(LI, Deps); // If we had to process more than one hundred blocks to find the // dependencies, this load isn't worth worrying about. Optimizing @@ -1729,6 +1730,15 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { return false; } + // If this load follows a GEP, see if we can PRE the indices before analyzing. 
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LI->getOperand(0))) { + for (GetElementPtrInst::op_iterator OI = GEP->idx_begin(), + OE = GEP->idx_end(); + OI != OE; ++OI) + if (Instruction *I = dyn_cast<Instruction>(OI->get())) + performScalarPRE(I); + } + // Step 2: Analyze the availability of the load AvailValInBlkVect ValuesPerBlock; UnavailBlkVect UnavailableBlocks; @@ -1807,7 +1817,7 @@ static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) { I->replaceAllUsesWith(Repl); } -/// processLoad - Attempt to eliminate a load, first by eliminating it +/// Attempt to eliminate a load, first by eliminating it /// locally, and then attempting non-local elimination if that fails. bool GVN::processLoad(LoadInst *L) { if (!MD) @@ -2006,7 +2016,7 @@ bool GVN::processLoad(LoadInst *L) { return false; } -// findLeader - In order to find a leader for a given value number at a +// In order to find a leader for a given value number at a // specific basic block, we first obtain the list of all Values for that number, // and then scan the list to find one whose block dominates the block in // question. This is fast because dominator tree queries consist of only @@ -2034,9 +2044,8 @@ Value *GVN::findLeader(const BasicBlock *BB, uint32_t num) { return Val; } -/// replaceAllDominatedUsesWith - Replace all uses of 'From' with 'To' if the -/// use is dominated by the given basic block. Returns the number of uses that -/// were replaced. +/// Replace all uses of 'From' with 'To' if the use is dominated by the given +/// basic block. Returns the number of uses that were replaced. unsigned GVN::replaceAllDominatedUsesWith(Value *From, Value *To, const BasicBlockEdge &Root) { unsigned Count = 0; @@ -2052,7 +2061,7 @@ unsigned GVN::replaceAllDominatedUsesWith(Value *From, Value *To, return Count; } -/// isOnlyReachableViaThisEdge - There is an edge from 'Src' to 'Dst'. Return +/// There is an edge from 'Src' to 'Dst'. Return /// true if every path from the entry block to 'Dst' passes via this edge. In /// particular 'Dst' must not be reachable via another edge from 'Src'. static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E, @@ -2069,7 +2078,7 @@ static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E, return Pred != nullptr; } -/// propagateEquality - The given values are known to be equal in every block +/// The given values are known to be equal in every block /// dominated by 'Root'. Exploit this, for example by replacing 'LHS' with /// 'RHS' everywhere in the scope. Returns whether a change was made. bool GVN::propagateEquality(Value *LHS, Value *RHS, @@ -2096,15 +2105,15 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, std::swap(LHS, RHS); assert((isa<Argument>(LHS) || isa<Instruction>(LHS)) && "Unexpected value!"); - // If there is no obvious reason to prefer the left-hand side over the right- - // hand side, ensure the longest lived term is on the right-hand side, so the - // shortest lived term will be replaced by the longest lived. This tends to - // expose more simplifications. + // If there is no obvious reason to prefer the left-hand side over the + // right-hand side, ensure the longest lived term is on the right-hand side, + // so the shortest lived term will be replaced by the longest lived. + // This tends to expose more simplifications. 
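// Worked example (hypothetical values) of the ordering rule explained above
// and implemented just below: value numbers are handed out in program order,
// so a smaller number is a proxy for an older, longer-lived definition. If
// %old has value number 7 and %new has value number 42, the swap keeps %old
// on the right-hand side, and dominated uses of %new are rewritten to %old
// rather than the other way around.
#include <utility>
inline void orderByAge(unsigned &LHSNum, unsigned &RHSNum) {
  if (LHSNum < RHSNum) // LHS is older; make it the replacement (RHS) side
    std::swap(LHSNum, RHSNum);
}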
uint32_t LVN = VN.lookup_or_add(LHS); if ((isa<Argument>(LHS) && isa<Argument>(RHS)) || (isa<Instruction>(LHS) && isa<Instruction>(RHS))) { - // Move the 'oldest' value to the right-hand side, using the value number as - // a proxy for age. + // Move the 'oldest' value to the right-hand side, using the value number + // as a proxy for age. uint32_t RVN = VN.lookup_or_add(RHS); if (LVN < RVN) { std::swap(LHS, RHS); @@ -2133,10 +2142,10 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, NumGVNEqProp += NumReplacements; } - // Now try to deduce additional equalities from this one. For example, if the - // known equality was "(A != B)" == "false" then it follows that A and B are - // equal in the scope. Only boolean equalities with an explicit true or false - // RHS are currently supported. + // Now try to deduce additional equalities from this one. For example, if + // the known equality was "(A != B)" == "false" then it follows that A and B + // are equal in the scope. Only boolean equalities with an explicit true or + // false RHS are currently supported. if (!RHS->getType()->isIntegerTy(1)) // Not a boolean equality - bail out. continue; @@ -2161,7 +2170,7 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, // If we are propagating an equality like "(A == B)" == "true" then also // propagate the equality A == B. When propagating a comparison such as // "(A >= B)" == "true", replace all instances of "A < B" with "false". - if (ICmpInst *Cmp = dyn_cast<ICmpInst>(LHS)) { + if (CmpInst *Cmp = dyn_cast<CmpInst>(LHS)) { Value *Op0 = Cmp->getOperand(0), *Op1 = Cmp->getOperand(1); // If "A == B" is known true, or "A != B" is known false, then replace @@ -2170,12 +2179,28 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, (isKnownFalse && Cmp->getPredicate() == CmpInst::ICMP_NE)) Worklist.push_back(std::make_pair(Op0, Op1)); + // Handle the floating point versions of equality comparisons too. + if ((isKnownTrue && Cmp->getPredicate() == CmpInst::FCMP_OEQ) || + (isKnownFalse && Cmp->getPredicate() == CmpInst::FCMP_UNE)) { + + // Floating point -0.0 and 0.0 compare equal, so we can only + // propagate values if we know that we have a constant and that + // its value is non-zero. + + // FIXME: We should do this optimization if 'no signed zeros' is + // applicable via an instruction-level fast-math-flag or some other + // indicator that relaxed FP semantics are being used. + + if (isa<ConstantFP>(Op1) && !cast<ConstantFP>(Op1)->isZero()) + Worklist.push_back(std::make_pair(Op0, Op1)); + } + // If "A >= B" is known true, replace "A < B" with false everywhere. CmpInst::Predicate NotPred = Cmp->getInversePredicate(); Constant *NotVal = ConstantInt::get(Cmp->getType(), isKnownFalse); - // Since we don't have the instruction "A < B" immediately to hand, work out - // the value number that it would have and use that to find an appropriate - // instruction (if any). + // Since we don't have the instruction "A < B" immediately to hand, work + // out the value number that it would have and use that to find an + // appropriate instruction (if any). 
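// Worked example of why the FCMP_OEQ propagation above insists on a non-zero
// ConstantFP: +0.0 and -0.0 compare oeq-equal yet are not interchangeable
// values, whereas equality with a non-zero constant pins the value down
// exactly (NaNs never compare equal), so substitution is safe there.
// (IEEE-754 double semantics assumed for the division below.)
#include <cassert>
inline void signedZeroCounterexample() {
  double pz = 0.0, nz = -0.0;
  assert(pz == nz);             // the equality GVN would learn
  assert(1.0 / pz != 1.0 / nz); // +inf vs -inf: replacing nz with pz is wrong
}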
uint32_t NextNum = VN.getNextUnusedValueNumber(); uint32_t Num = VN.lookup_or_add_cmp(Cmp->getOpcode(), NotPred, Op0, Op1); // If the number we were assigned was brand new then there is no point in @@ -2203,7 +2228,7 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, return Changed; } -/// processInstruction - When calculating availability, handle an instruction +/// When calculating availability, handle an instruction /// by inserting it into the appropriate sets bool GVN::processInstruction(Instruction *I) { // Ignore dbg info intrinsics. @@ -2214,7 +2239,7 @@ bool GVN::processInstruction(Instruction *I) { // to value numbering it. Value numbering often exposes redundancies, for // example if it determines that %y is equal to %x then the instruction // "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify. - if (Value *V = SimplifyInstruction(I, DL, TLI, DT, AT)) { + if (Value *V = SimplifyInstruction(I, DL, TLI, DT, AC)) { I->replaceAllUsesWith(V); if (MD && V->getType()->getScalarType()->isPointerTy()) MD->invalidateCachedPointerInfo(V); @@ -2334,8 +2359,8 @@ bool GVN::runOnFunction(Function& F) { DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); DL = DLP ? &DLP->getDataLayout() : nullptr; - AT = &getAnalysis<AssumptionTracker>(); - TLI = &getAnalysis<TargetLibraryInfo>(); + AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); VN.setAliasAnalysis(&getAnalysis<AliasAnalysis>()); VN.setMemDep(MD); VN.setDomTree(DT); @@ -2348,7 +2373,8 @@ bool GVN::runOnFunction(Function& F) { for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ) { BasicBlock *BB = FI++; - bool removedBlock = MergeBlockIntoPredecessor(BB, this); + bool removedBlock = MergeBlockIntoPredecessor( + BB, DT, /* LoopInfo */ nullptr, VN.getAliasAnalysis(), MD); if (removedBlock) ++NumGVNBlocks; Changed |= removedBlock; @@ -2431,175 +2457,204 @@ bool GVN::processBlock(BasicBlock *BB) { return ChangedFunction; } -/// performPRE - Perform a purely local form of PRE that looks for diamond -/// control flow patterns and attempts to perform simple PRE at the join point. -bool GVN::performPRE(Function &F) { - bool Changed = false; +// Instantiate an expression in a predecessor that lacked it. +bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, + unsigned int ValNo) { + // Because we are going top-down through the block, all value numbers + // will be available in the predecessor by the time we need them. Any + // that weren't originally present will have been instantiated earlier + // in this loop. + bool success = true; + for (unsigned i = 0, e = Instr->getNumOperands(); i != e; ++i) { + Value *Op = Instr->getOperand(i); + if (isa<Argument>(Op) || isa<Constant>(Op) || isa<GlobalValue>(Op)) + continue; + + if (Value *V = findLeader(Pred, VN.lookup(Op))) { + Instr->setOperand(i, V); + } else { + success = false; + break; + } + } + + // Fail out if we encounter an operand that is not available in + // the PRE predecessor. This is typically because of loads which + // are not value numbered precisely. + if (!success) + return false; + + Instr->insertBefore(Pred->getTerminator()); + Instr->setName(Instr->getName() + ".pre"); + Instr->setDebugLoc(Instr->getDebugLoc()); + VN.add(Instr, ValNo); + + // Update the availability map to include the new instruction. 
+ addToLeaderTable(ValNo, Instr, Pred); + return true; +} + +bool GVN::performScalarPRE(Instruction *CurInst) { SmallVector<std::pair<Value*, BasicBlock*>, 8> predMap; - for (BasicBlock *CurrentBlock : depth_first(&F.getEntryBlock())) { - // Nothing to PRE in the entry block. - if (CurrentBlock == &F.getEntryBlock()) continue; - // Don't perform PRE on a landing pad. - if (CurrentBlock->isLandingPad()) continue; + if (isa<AllocaInst>(CurInst) || isa<TerminatorInst>(CurInst) || + isa<PHINode>(CurInst) || CurInst->getType()->isVoidTy() || + CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() || + isa<DbgInfoIntrinsic>(CurInst)) + return false; - for (BasicBlock::iterator BI = CurrentBlock->begin(), - BE = CurrentBlock->end(); BI != BE; ) { - Instruction *CurInst = BI++; + // Don't do PRE on compares. The PHI would prevent CodeGenPrepare from + // sinking the compare again, and it would force the code generator to + // move the i1 from processor flags or predicate registers into a general + // purpose register. + if (isa<CmpInst>(CurInst)) + return false; - if (isa<AllocaInst>(CurInst) || - isa<TerminatorInst>(CurInst) || isa<PHINode>(CurInst) || - CurInst->getType()->isVoidTy() || - CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() || - isa<DbgInfoIntrinsic>(CurInst)) - continue; + // We don't currently value number ANY inline asm calls. + if (CallInst *CallI = dyn_cast<CallInst>(CurInst)) + if (CallI->isInlineAsm()) + return false; - // Don't do PRE on compares. The PHI would prevent CodeGenPrepare from - // sinking the compare again, and it would force the code generator to - // move the i1 from processor flags or predicate registers into a general - // purpose register. - if (isa<CmpInst>(CurInst)) - continue; + uint32_t ValNo = VN.lookup(CurInst); + + // Look for the predecessors for PRE opportunities. We're + // only trying to solve the basic diamond case, where + // a value is computed in the successor and one predecessor, + // but not the other. We also explicitly disallow cases + // where the successor is its own predecessor, because they're + // more complicated to get right. + unsigned NumWith = 0; + unsigned NumWithout = 0; + BasicBlock *PREPred = nullptr; + BasicBlock *CurrentBlock = CurInst->getParent(); + predMap.clear(); + + for (pred_iterator PI = pred_begin(CurrentBlock), PE = pred_end(CurrentBlock); + PI != PE; ++PI) { + BasicBlock *P = *PI; + // We're not interested in PRE where the block is its + // own predecessor, or in blocks with predecessors + // that are not reachable. + if (P == CurrentBlock) { + NumWithout = 2; + break; + } else if (!DT->isReachableFromEntry(P)) { + NumWithout = 2; + break; + } - // We don't currently value number ANY inline asm calls. - if (CallInst *CallI = dyn_cast<CallInst>(CurInst)) - if (CallI->isInlineAsm()) - continue; + Value *predV = findLeader(P, ValNo); + if (!predV) { + predMap.push_back(std::make_pair(static_cast<Value *>(nullptr), P)); + PREPred = P; + ++NumWithout; + } else if (predV == CurInst) { + /* CurInst dominates this predecessor. */ + NumWithout = 2; + break; + } else { + predMap.push_back(std::make_pair(predV, P)); + ++NumWith; + } + } - uint32_t ValNo = VN.lookup(CurInst); - - // Look for the predecessors for PRE opportunities. We're - // only trying to solve the basic diamond case, where - // a value is computed in the successor and one predecessor, - // but not the other. 
We also explicitly disallow cases - // where the successor is its own predecessor, because they're - // more complicated to get right. - unsigned NumWith = 0; - unsigned NumWithout = 0; - BasicBlock *PREPred = nullptr; - predMap.clear(); - - for (pred_iterator PI = pred_begin(CurrentBlock), - PE = pred_end(CurrentBlock); PI != PE; ++PI) { - BasicBlock *P = *PI; - // We're not interested in PRE where the block is its - // own predecessor, or in blocks with predecessors - // that are not reachable. - if (P == CurrentBlock) { - NumWithout = 2; - break; - } else if (!DT->isReachableFromEntry(P)) { - NumWithout = 2; - break; - } + // Don't do PRE when it might increase code size, i.e. when + // we would need to insert instructions in more than one pred. + if (NumWithout > 1 || NumWith == 0) + return false; - Value* predV = findLeader(P, ValNo); - if (!predV) { - predMap.push_back(std::make_pair(static_cast<Value *>(nullptr), P)); - PREPred = P; - ++NumWithout; - } else if (predV == CurInst) { - /* CurInst dominates this predecessor. */ - NumWithout = 2; - break; - } else { - predMap.push_back(std::make_pair(predV, P)); - ++NumWith; - } - } + // We may have a case where all predecessors have the instruction, + // and we just need to insert a phi node. Otherwise, perform + // insertion. + Instruction *PREInstr = nullptr; - // Don't do PRE when it might increase code size, i.e. when - // we would need to insert instructions in more than one pred. - if (NumWithout != 1 || NumWith == 0) - continue; + if (NumWithout != 0) { + // Don't do PRE across indirect branch. + if (isa<IndirectBrInst>(PREPred->getTerminator())) + return false; - // Don't do PRE across indirect branch. - if (isa<IndirectBrInst>(PREPred->getTerminator())) - continue; + // We can't do PRE safely on a critical edge, so instead we schedule + // the edge to be split and perform the PRE the next time we iterate + // on the function. + unsigned SuccNum = GetSuccessorNumber(PREPred, CurrentBlock); + if (isCriticalEdge(PREPred->getTerminator(), SuccNum)) { + toSplit.push_back(std::make_pair(PREPred->getTerminator(), SuccNum)); + return false; + } + // We need to insert somewhere, so let's give it a shot + PREInstr = CurInst->clone(); + if (!performScalarPREInsertion(PREInstr, PREPred, ValNo)) { + // If we failed insertion, make sure we remove the instruction. + DEBUG(verifyRemoved(PREInstr)); + delete PREInstr; + return false; + } + } - // We can't do PRE safely on a critical edge, so instead we schedule - // the edge to be split and perform the PRE the next time we iterate - // on the function. - unsigned SuccNum = GetSuccessorNumber(PREPred, CurrentBlock); - if (isCriticalEdge(PREPred->getTerminator(), SuccNum)) { - toSplit.push_back(std::make_pair(PREPred->getTerminator(), SuccNum)); - continue; - } + // Either we should have filled in the PRE instruction, or we should + // not have needed insertions. + assert (PREInstr != nullptr || NumWithout == 0); - // Instantiate the expression in the predecessor that lacked it. - // Because we are going top-down through the block, all value numbers - // will be available in the predecessor by the time we need them. Any - // that weren't originally present will have been instantiated earlier - // in this loop. 
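// Hypothetical source-level view of the diamond case performScalarPRE
// handles: the expression is available in exactly one predecessor of the
// join block, so a single copy is inserted in the predecessor that lacks it
// and the use at the join is rewired through a phi of the two.
inline int preBefore(bool C, int A, int B) {
  int T = 0;
  if (C)
    T = A + B;        // available in this predecessor only
  return (A + B) + T; // partially redundant at the join
}
inline int preAfter(bool C, int A, int B) {
  int T = 0, Pre;
  if (C)
    Pre = T = A + B;  // already available
  else
    Pre = A + B;      // the inserted ".pre" copy
  return Pre + T;     // the former redundancy now reuses the merged value
}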
- Instruction *PREInstr = CurInst->clone(); - bool success = true; - for (unsigned i = 0, e = CurInst->getNumOperands(); i != e; ++i) { - Value *Op = PREInstr->getOperand(i); - if (isa<Argument>(Op) || isa<Constant>(Op) || isa<GlobalValue>(Op)) - continue; + ++NumGVNPRE; - if (Value *V = findLeader(PREPred, VN.lookup(Op))) { - PREInstr->setOperand(i, V); - } else { - success = false; - break; - } - } + // Create a PHI to make the value available in this block. + PHINode *Phi = + PHINode::Create(CurInst->getType(), predMap.size(), + CurInst->getName() + ".pre-phi", CurrentBlock->begin()); + for (unsigned i = 0, e = predMap.size(); i != e; ++i) { + if (Value *V = predMap[i].first) + Phi->addIncoming(V, predMap[i].second); + else + Phi->addIncoming(PREInstr, PREPred); + } + + VN.add(Phi, ValNo); + addToLeaderTable(ValNo, Phi, CurrentBlock); + Phi->setDebugLoc(CurInst->getDebugLoc()); + CurInst->replaceAllUsesWith(Phi); + if (Phi->getType()->getScalarType()->isPointerTy()) { + // Because we have added a PHI-use of the pointer value, it has now + // "escaped" from alias analysis' perspective. We need to inform + // AA of this. + for (unsigned ii = 0, ee = Phi->getNumIncomingValues(); ii != ee; ++ii) { + unsigned jj = PHINode::getOperandNumForIncomingValue(ii); + VN.getAliasAnalysis()->addEscapingUse(Phi->getOperandUse(jj)); + } - // Fail out if we encounter an operand that is not available in - // the PRE predecessor. This is typically because of loads which - // are not value numbered precisely. - if (!success) { - DEBUG(verifyRemoved(PREInstr)); - delete PREInstr; - continue; - } + if (MD) + MD->invalidateCachedPointerInfo(Phi); + } + VN.erase(CurInst); + removeFromLeaderTable(ValNo, CurInst, CurrentBlock); - PREInstr->insertBefore(PREPred->getTerminator()); - PREInstr->setName(CurInst->getName() + ".pre"); - PREInstr->setDebugLoc(CurInst->getDebugLoc()); - VN.add(PREInstr, ValNo); - ++NumGVNPRE; - - // Update the availability map to include the new instruction. - addToLeaderTable(ValNo, PREInstr, PREPred); - - // Create a PHI to make the value available in this block. - PHINode* Phi = PHINode::Create(CurInst->getType(), predMap.size(), - CurInst->getName() + ".pre-phi", - CurrentBlock->begin()); - for (unsigned i = 0, e = predMap.size(); i != e; ++i) { - if (Value *V = predMap[i].first) - Phi->addIncoming(V, predMap[i].second); - else - Phi->addIncoming(PREInstr, PREPred); - } + DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n'); + if (MD) + MD->removeInstruction(CurInst); + DEBUG(verifyRemoved(CurInst)); + CurInst->eraseFromParent(); + ++NumGVNInstr; + + return true; +} - VN.add(Phi, ValNo); - addToLeaderTable(ValNo, Phi, CurrentBlock); - Phi->setDebugLoc(CurInst->getDebugLoc()); - CurInst->replaceAllUsesWith(Phi); - if (Phi->getType()->getScalarType()->isPointerTy()) { - // Because we have added a PHI-use of the pointer value, it has now - // "escaped" from alias analysis' perspective. We need to inform - // AA of this. - for (unsigned ii = 0, ee = Phi->getNumIncomingValues(); ii != ee; - ++ii) { - unsigned jj = PHINode::getOperandNumForIncomingValue(ii); - VN.getAliasAnalysis()->addEscapingUse(Phi->getOperandUse(jj)); - } +/// Perform a purely local form of PRE that looks for diamond +/// control flow patterns and attempts to perform simple PRE at the join point. +bool GVN::performPRE(Function &F) { + bool Changed = false; + for (BasicBlock *CurrentBlock : depth_first(&F.getEntryBlock())) { + // Nothing to PRE in the entry block. 
+ if (CurrentBlock == &F.getEntryBlock()) + continue; - if (MD) - MD->invalidateCachedPointerInfo(Phi); - } - VN.erase(CurInst); - removeFromLeaderTable(ValNo, CurInst, CurrentBlock); + // Don't perform PRE on a landing pad. + if (CurrentBlock->isLandingPad()) + continue; - DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n'); - if (MD) MD->removeInstruction(CurInst); - DEBUG(verifyRemoved(CurInst)); - CurInst->eraseFromParent(); - Changed = true; + for (BasicBlock::iterator BI = CurrentBlock->begin(), + BE = CurrentBlock->end(); + BI != BE;) { + Instruction *CurInst = BI++; + Changed = performScalarPRE(CurInst); } } @@ -2612,50 +2667,48 @@ bool GVN::performPRE(Function &F) { /// Split the critical edge connecting the given two blocks, and return /// the block inserted to the critical edge. BasicBlock *GVN::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) { - BasicBlock *BB = SplitCriticalEdge(Pred, Succ, this); + BasicBlock *BB = SplitCriticalEdge( + Pred, Succ, CriticalEdgeSplittingOptions(getAliasAnalysis(), DT)); if (MD) MD->invalidateCachedPredecessors(); return BB; } -/// splitCriticalEdges - Split critical edges found during the previous +/// Split critical edges found during the previous /// iteration that may enable further optimization. bool GVN::splitCriticalEdges() { if (toSplit.empty()) return false; do { std::pair<TerminatorInst*, unsigned> Edge = toSplit.pop_back_val(); - SplitCriticalEdge(Edge.first, Edge.second, this); + SplitCriticalEdge(Edge.first, Edge.second, + CriticalEdgeSplittingOptions(getAliasAnalysis(), DT)); } while (!toSplit.empty()); if (MD) MD->invalidateCachedPredecessors(); return true; } -/// iterateOnFunction - Executes one iteration of GVN +/// Executes one iteration of GVN bool GVN::iterateOnFunction(Function &F) { cleanupGlobalSets(); // Top-down walk of the dominator tree bool Changed = false; -#if 0 - // Needed for value numbering with phi construction to work. - ReversePostOrderTraversal<Function*> RPOT(&F); - for (ReversePostOrderTraversal<Function*>::rpo_iterator RI = RPOT.begin(), - RE = RPOT.end(); RI != RE; ++RI) - Changed |= processBlock(*RI); -#else // Save the blocks this function have before transformation begins. GVN may // split critical edge, and hence may invalidate the RPO/DT iterator. // std::vector<BasicBlock *> BBVect; BBVect.reserve(256); - for (DomTreeNode *X : depth_first(DT->getRootNode())) - BBVect.push_back(X->getBlock()); + // Needed for value numbering with phi construction to work. + ReversePostOrderTraversal<Function *> RPOT(&F); + for (ReversePostOrderTraversal<Function *>::rpo_iterator RI = RPOT.begin(), + RE = RPOT.end(); + RI != RE; ++RI) + BBVect.push_back(*RI); for (std::vector<BasicBlock *>::iterator I = BBVect.begin(), E = BBVect.end(); I != E; I++) Changed |= processBlock(*I); -#endif return Changed; } @@ -2666,7 +2719,7 @@ void GVN::cleanupGlobalSets() { TableAllocator.Reset(); } -/// verifyRemoved - Verify that the specified instruction does not occur in our +/// Verify that the specified instruction does not occur in our /// internal data structures. void GVN::verifyRemoved(const Instruction *Inst) const { VN.verifyRemoved(Inst); @@ -2685,11 +2738,10 @@ void GVN::verifyRemoved(const Instruction *Inst) const { } } -// BB is declared dead, which implied other blocks become dead as well. This -// function is to add all these blocks to "DeadBlocks". For the dead blocks' -// live successors, update their phi nodes by replacing the operands -// corresponding to dead blocks with UndefVal. 
-// +/// BB is declared dead, which implied other blocks become dead as well. This +/// function is to add all these blocks to "DeadBlocks". For the dead blocks' +/// live successors, update their phi nodes by replacing the operands +/// corresponding to dead blocks with UndefVal. void GVN::addDeadBlock(BasicBlock *BB) { SmallVector<BasicBlock *, 4> NewDead; SmallSetVector<BasicBlock *, 4> DF; diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index c01f57f..f99ebbc 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -44,7 +44,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SimplifyIndVar.h" @@ -91,7 +91,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfo>(); + AU.addRequired<LoopInfoWrapperPass>(); AU.addRequired<ScalarEvolution>(); AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); @@ -126,7 +126,7 @@ char IndVarSimplify::ID = 0; INITIALIZE_PASS_BEGIN(IndVarSimplify, "indvars", "Induction Variable Simplification", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) @@ -1929,13 +1929,15 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { if (!L->isLoopSimplifyForm()) return false; - LI = &getAnalysis<LoopInfo>(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); SE = &getAnalysis<ScalarEvolution>(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); DL = DLP ? &DLP->getDataLayout() : nullptr; - TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); - TTI = getAnalysisIfAvailable<TargetTransformInfo>(); + auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); + TLI = TLIP ? &TLIP->getTLI() : nullptr; + auto *TTIP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>(); + TTI = TTIP ? &TTIP->getTTI(*L->getHeader()->getParent()) : nullptr; DeadInsts.clear(); Changed = false; diff --git a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp new file mode 100644 index 0000000..8559e63 --- /dev/null +++ b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp @@ -0,0 +1,1422 @@ +//===-- InductiveRangeCheckElimination.cpp - ------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// The InductiveRangeCheckElimination pass splits a loop's iteration space into +// three disjoint ranges. It does that in a way such that the loop running in +// the middle loop provably does not need range checks. 
As an example, it will +// convert +// +// len = < known positive > +// for (i = 0; i < n; i++) { +// if (0 <= i && i < len) { +// do_something(); +// } else { +// throw_out_of_bounds(); +// } +// } +// +// to +// +// len = < known positive > +// limit = smin(n, len) +// // no first segment +// for (i = 0; i < limit; i++) { +// if (0 <= i && i < len) { // this check is fully redundant +// do_something(); +// } else { +// throw_out_of_bounds(); +// } +// } +// for (i = limit; i < n; i++) { +// if (0 <= i && i < len) { +// do_something(); +// } else { +// throw_out_of_bounds(); +// } +// } +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/Optional.h" + +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/ValueTracking.h" + +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/IR/Verifier.h" + +#include "llvm/Support/Debug.h" + +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/SimplifyIndVar.h" +#include "llvm/Transforms/Utils/UnrollLoop.h" + +#include "llvm/Pass.h" + +#include <array> + +using namespace llvm; + +static cl::opt<unsigned> LoopSizeCutoff("irce-loop-size-cutoff", cl::Hidden, + cl::init(64)); + +static cl::opt<bool> PrintChangedLoops("irce-print-changed-loops", cl::Hidden, + cl::init(false)); + +static cl::opt<int> MaxExitProbReciprocal("irce-max-exit-prob-reciprocal", + cl::Hidden, cl::init(10)); + +#define DEBUG_TYPE "irce" + +namespace { + +/// An inductive range check is conditional branch in a loop with +/// +/// 1. a very cold successor (i.e. the branch jumps to that successor very +/// rarely) +/// +/// and +/// +/// 2. a condition that is provably true for some range of values taken by the +/// containing loop's induction variable. +/// +/// Currently all inductive range checks are branches conditional on an +/// expression of the form +/// +/// 0 <= (Offset + Scale * I) < Length +/// +/// where `I' is the canonical induction variable of a loop to which Offset and +/// Scale are loop invariant, and Length is >= 0. Currently the 'false' branch +/// is considered cold, looking at profiling data to verify that is a TODO. 
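// Hypothetical source-level shape of a branch matching the documented form
// 0 <= (Offset + Scale * I) < Length, with a rarely taken failing successor:
#include <cstdlib>
void copyStrided(int *Dst, const int *Src, int N, int Len) {
  for (int I = 0; I < N; ++I) {
    int Idx = 2 * I + 1;       // Scale = 2, Offset = 1
    if (Idx >= 0 && Idx < Len) // the inductive range check
      Dst[Idx] = Src[Idx];
    else
      std::abort();            // cold "out of bounds" successor
  }
}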
+ +class InductiveRangeCheck { + const SCEV *Offset; + const SCEV *Scale; + Value *Length; + BranchInst *Branch; + + InductiveRangeCheck() : + Offset(nullptr), Scale(nullptr), Length(nullptr), Branch(nullptr) { } + +public: + const SCEV *getOffset() const { return Offset; } + const SCEV *getScale() const { return Scale; } + Value *getLength() const { return Length; } + + void print(raw_ostream &OS) const { + OS << "InductiveRangeCheck:\n"; + OS << " Offset: "; + Offset->print(OS); + OS << " Scale: "; + Scale->print(OS); + OS << " Length: "; + Length->print(OS); + OS << " Branch: "; + getBranch()->print(OS); + OS << "\n"; + } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void dump() { + print(dbgs()); + } +#endif + + BranchInst *getBranch() const { return Branch; } + + /// Represents an signed integer range [Range.getBegin(), Range.getEnd()). If + /// R.getEnd() sle R.getBegin(), then R denotes the empty range. + + class Range { + const SCEV *Begin; + const SCEV *End; + + public: + Range(const SCEV *Begin, const SCEV *End) : Begin(Begin), End(End) { + assert(Begin->getType() == End->getType() && "ill-typed range!"); + } + + Type *getType() const { return Begin->getType(); } + const SCEV *getBegin() const { return Begin; } + const SCEV *getEnd() const { return End; } + }; + + typedef SpecificBumpPtrAllocator<InductiveRangeCheck> AllocatorTy; + + /// This is the value the condition of the branch needs to evaluate to for the + /// branch to take the hot successor (see (1) above). + bool getPassingDirection() { return true; } + + /// Computes a range for the induction variable (IndVar) in which the range + /// check is redundant and can be constant-folded away. The induction + /// variable is not required to be the canonical {0,+,1} induction variable. + Optional<Range> computeSafeIterationSpace(ScalarEvolution &SE, + const SCEVAddRecExpr *IndVar, + IRBuilder<> &B) const; + + /// Create an inductive range check out of BI if possible, else return + /// nullptr. 
+ static InductiveRangeCheck *create(AllocatorTy &Alloc, BranchInst *BI, + Loop *L, ScalarEvolution &SE, + BranchProbabilityInfo &BPI); +}; + +class InductiveRangeCheckElimination : public LoopPass { + InductiveRangeCheck::AllocatorTy Allocator; + +public: + static char ID; + InductiveRangeCheckElimination() : LoopPass(ID) { + initializeInductiveRangeCheckEliminationPass( + *PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LoopInfoWrapperPass>(); + AU.addRequiredID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + AU.addRequired<ScalarEvolution>(); + AU.addRequired<BranchProbabilityInfo>(); + } + + bool runOnLoop(Loop *L, LPPassManager &LPM) override; +}; + +char InductiveRangeCheckElimination::ID = 0; +} + +INITIALIZE_PASS(InductiveRangeCheckElimination, "irce", + "Inductive range check elimination", false, false) + +static bool IsLowerBoundCheck(Value *Check, Value *&IndexV) { + using namespace llvm::PatternMatch; + + ICmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE; + Value *LHS = nullptr, *RHS = nullptr; + + if (!match(Check, m_ICmp(Pred, m_Value(LHS), m_Value(RHS)))) + return false; + + switch (Pred) { + default: + return false; + + case ICmpInst::ICMP_SLE: + std::swap(LHS, RHS); + // fallthrough + case ICmpInst::ICMP_SGE: + if (!match(RHS, m_ConstantInt<0>())) + return false; + IndexV = LHS; + return true; + + case ICmpInst::ICMP_SLT: + std::swap(LHS, RHS); + // fallthrough + case ICmpInst::ICMP_SGT: + if (!match(RHS, m_ConstantInt<-1>())) + return false; + IndexV = LHS; + return true; + } +} + +static bool IsUpperBoundCheck(Value *Check, Value *Index, Value *&UpperLimit) { + using namespace llvm::PatternMatch; + + ICmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE; + Value *LHS = nullptr, *RHS = nullptr; + + if (!match(Check, m_ICmp(Pred, m_Value(LHS), m_Value(RHS)))) + return false; + + switch (Pred) { + default: + return false; + + case ICmpInst::ICMP_SGT: + std::swap(LHS, RHS); + // fallthrough + case ICmpInst::ICMP_SLT: + if (LHS != Index) + return false; + UpperLimit = RHS; + return true; + + case ICmpInst::ICMP_UGT: + std::swap(LHS, RHS); + // fallthrough + case ICmpInst::ICMP_ULT: + if (LHS != Index) + return false; + UpperLimit = RHS; + return true; + } +} + +/// Split a condition into something semantically equivalent to (0 <= I < +/// Limit), both comparisons signed and Len loop invariant on L and positive. +/// On success, return true and set Index to I and UpperLimit to Limit. Return +/// false on failure (we may still write to UpperLimit and Index on failure). +/// It does not try to interpret I as a loop index. +/// +static bool SplitRangeCheckCondition(Loop *L, ScalarEvolution &SE, + Value *Condition, const SCEV *&Index, + Value *&UpperLimit) { + + // TODO: currently this catches some silly cases like comparing "%idx slt 1". + // Our transformations are still correct, but less likely to be profitable in + // those cases. We have to come up with some heuristics that pick out the + // range checks that are more profitable to clone a loop for. This function + // in general can be made more robust. + + using namespace llvm::PatternMatch; + + Value *A = nullptr; + Value *B = nullptr; + ICmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE; + + // In these early checks we assume that the matched UpperLimit is positive. + // We'll verify that fact later, before returning true. 
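// Worked example (hypothetical helper names) for the single-icmp form
// accepted below: when Limit is known non-negative, the unsigned comparison
// I u< Limit already encodes both bounds of (0 <= I < Limit), because the
// conversion of a negative I to unsigned is modular and yields a huge value.
inline bool twoSidedCheck(int I, int Limit) { return 0 <= I && I < Limit; }
inline bool unsignedCheck(int I, int Limit) {
  return static_cast<unsigned>(I) < static_cast<unsigned>(Limit);
}
// For any non-negative Limit the two agree, e.g. I = -1, Limit = 10: both
// return false; I = 3, Limit = 10: both return true.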
+ + if (match(Condition, m_And(m_Value(A), m_Value(B)))) { + Value *IndexV = nullptr; + Value *ExpectedUpperBoundCheck = nullptr; + + if (IsLowerBoundCheck(A, IndexV)) + ExpectedUpperBoundCheck = B; + else if (IsLowerBoundCheck(B, IndexV)) + ExpectedUpperBoundCheck = A; + else + return false; + + if (!IsUpperBoundCheck(ExpectedUpperBoundCheck, IndexV, UpperLimit)) + return false; + + Index = SE.getSCEV(IndexV); + + if (isa<SCEVCouldNotCompute>(Index)) + return false; + + } else if (match(Condition, m_ICmp(Pred, m_Value(A), m_Value(B)))) { + switch (Pred) { + default: + return false; + + case ICmpInst::ICMP_SGT: + std::swap(A, B); + // fall through + case ICmpInst::ICMP_SLT: + UpperLimit = B; + Index = SE.getSCEV(A); + if (isa<SCEVCouldNotCompute>(Index) || !SE.isKnownNonNegative(Index)) + return false; + break; + + case ICmpInst::ICMP_UGT: + std::swap(A, B); + // fall through + case ICmpInst::ICMP_ULT: + UpperLimit = B; + Index = SE.getSCEV(A); + if (isa<SCEVCouldNotCompute>(Index)) + return false; + break; + } + } else { + return false; + } + + const SCEV *UpperLimitSCEV = SE.getSCEV(UpperLimit); + if (isa<SCEVCouldNotCompute>(UpperLimitSCEV) || + !SE.isKnownNonNegative(UpperLimitSCEV)) + return false; + + if (SE.getLoopDisposition(UpperLimitSCEV, L) != + ScalarEvolution::LoopInvariant) { + DEBUG(dbgs() << " in function: " << L->getHeader()->getParent()->getName() + << " "; + dbgs() << " UpperLimit is not loop invariant: " + << UpperLimit->getName() << "\n";); + return false; + } + + return true; +} + + +InductiveRangeCheck * +InductiveRangeCheck::create(InductiveRangeCheck::AllocatorTy &A, BranchInst *BI, + Loop *L, ScalarEvolution &SE, + BranchProbabilityInfo &BPI) { + + if (BI->isUnconditional() || BI->getParent() == L->getLoopLatch()) + return nullptr; + + BranchProbability LikelyTaken(15, 16); + + if (BPI.getEdgeProbability(BI->getParent(), (unsigned) 0) < LikelyTaken) + return nullptr; + + Value *Length = nullptr; + const SCEV *IndexSCEV = nullptr; + + if (!SplitRangeCheckCondition(L, SE, BI->getCondition(), IndexSCEV, Length)) + return nullptr; + + assert(IndexSCEV && Length && "contract with SplitRangeCheckCondition!"); + + const SCEVAddRecExpr *IndexAddRec = dyn_cast<SCEVAddRecExpr>(IndexSCEV); + bool IsAffineIndex = + IndexAddRec && (IndexAddRec->getLoop() == L) && IndexAddRec->isAffine(); + + if (!IsAffineIndex) + return nullptr; + + InductiveRangeCheck *IRC = new (A.Allocate()) InductiveRangeCheck; + IRC->Length = Length; + IRC->Offset = IndexAddRec->getStart(); + IRC->Scale = IndexAddRec->getStepRecurrence(SE); + IRC->Branch = BI; + return IRC; +} + +namespace { + +// Keeps track of the structure of a loop. This is similar to llvm::Loop, +// except that it is more lightweight and can track the state of a loop through +// changing and potentially invalid IR. This structure also formalizes the +// kinds of loops we can deal with -- ones that have a single latch that is also +// an exiting block *and* have a canonical induction variable. +struct LoopStructure { + const char *Tag; + + BasicBlock *Header; + BasicBlock *Latch; + + // `Latch's terminator instruction is `LatchBr', and it's `LatchBrExitIdx'th + // successor is `LatchExit', the exit block of the loop. 
+ BranchInst *LatchBr; + BasicBlock *LatchExit; + unsigned LatchBrExitIdx; + + Value *IndVarNext; + Value *IndVarStart; + Value *LoopExitAt; + bool IndVarIncreasing; + + LoopStructure() + : Tag(""), Header(nullptr), Latch(nullptr), LatchBr(nullptr), + LatchExit(nullptr), LatchBrExitIdx(-1), IndVarNext(nullptr), + IndVarStart(nullptr), LoopExitAt(nullptr), IndVarIncreasing(false) {} + + template <typename M> LoopStructure map(M Map) const { + LoopStructure Result; + Result.Tag = Tag; + Result.Header = cast<BasicBlock>(Map(Header)); + Result.Latch = cast<BasicBlock>(Map(Latch)); + Result.LatchBr = cast<BranchInst>(Map(LatchBr)); + Result.LatchExit = cast<BasicBlock>(Map(LatchExit)); + Result.LatchBrExitIdx = LatchBrExitIdx; + Result.IndVarNext = Map(IndVarNext); + Result.IndVarStart = Map(IndVarStart); + Result.LoopExitAt = Map(LoopExitAt); + Result.IndVarIncreasing = IndVarIncreasing; + return Result; + } + + static Optional<LoopStructure> parseLoopStructure(ScalarEvolution &, + BranchProbabilityInfo &BPI, + Loop &, + const char *&); +}; + +/// This class is used to constrain loops to run within a given iteration space. +/// The algorithm this class implements is given a Loop and a range [Begin, +/// End). The algorithm then tries to break out a "main loop" out of the loop +/// it is given in a way that the "main loop" runs with the induction variable +/// in a subset of [Begin, End). The algorithm emits appropriate pre and post +/// loops to run any remaining iterations. The pre loop runs any iterations in +/// which the induction variable is < Begin, and the post loop runs any +/// iterations in which the induction variable is >= End. +/// +class LoopConstrainer { + // The representation of a clone of the original loop we started out with. + struct ClonedLoop { + // The cloned blocks + std::vector<BasicBlock *> Blocks; + + // `Map` maps values in the clonee into values in the cloned version + ValueToValueMapTy Map; + + // An instance of `LoopStructure` for the cloned loop + LoopStructure Structure; + }; + + // Result of rewriting the range of a loop. See changeIterationSpaceEnd for + // more details on what these fields mean. + struct RewrittenRangeInfo { + BasicBlock *PseudoExit; + BasicBlock *ExitSelector; + std::vector<PHINode *> PHIValuesAtPseudoExit; + PHINode *IndVarEnd; + + RewrittenRangeInfo() + : PseudoExit(nullptr), ExitSelector(nullptr), IndVarEnd(nullptr) {} + }; + + // Calculated subranges we restrict the iteration space of the main loop to. + // See the implementation of `calculateSubRanges' for more details on how + // these fields are computed. `LowLimit` is None if there is no restriction + // on low end of the restricted iteration space of the main loop. `HighLimit` + // is None if there is no restriction on high end of the restricted iteration + // space of the main loop. + + struct SubRanges { + Optional<const SCEV *> LowLimit; + Optional<const SCEV *> HighLimit; + }; + + // A utility function that does a `replaceUsesOfWith' on the incoming block + // set of a `PHINode' -- replaces instances of `Block' in the `PHINode's + // incoming block list with `ReplaceBy'. + static void replacePHIBlock(PHINode *PN, BasicBlock *Block, + BasicBlock *ReplaceBy); + + // Compute a safe set of limits for the main loop to run in -- effectively the + // intersection of `Range' and the iteration space of the original loop. + // Return None if unable to compute the set of subranges. 
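// Simplified sketch (hypothetical helper, plain integers instead of SCEVs) of
// the split described in the LoopConstrainer comment above: the main loop
// runs the intersection of the original iteration space [Start, N) with the
// safe range [SafeBegin, SafeEnd); the pre and post loops pick up the rest.
#include <algorithm>
struct IterRange { int Begin, End; }; // empty when End <= Begin
inline void splitIterationSpace(int Start, int N, int SafeBegin, int SafeEnd,
                                IterRange &Pre, IterRange &Main,
                                IterRange &Post) {
  int MainBegin = std::min(N, std::max(Start, SafeBegin));
  int MainEnd = std::max(MainBegin, std::min(N, SafeEnd));
  Pre = {Start, MainBegin};    // range checks may still fail here
  Main = {MainBegin, MainEnd}; // range checks provably pass here
  Post = {MainEnd, N};         // range checks may still fail here
}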
+ // + Optional<SubRanges> calculateSubRanges() const; + + // Clone `OriginalLoop' and return the result in CLResult. The IR after + // running `cloneLoop' is well formed except for the PHI nodes in CLResult -- + // the PHI nodes say that there is an incoming edge from `OriginalPreheader` + // but there is no such edge. + // + void cloneLoop(ClonedLoop &CLResult, const char *Tag) const; + + // Rewrite the iteration space of the loop denoted by (LS, Preheader). The + // iteration space of the rewritten loop ends at ExitLoopAt. The start of the + // iteration space is not changed. `ExitLoopAt' is assumed to be slt + // `OriginalHeaderCount'. + // + // If there are iterations left to execute, control is made to jump to + // `ContinuationBlock', otherwise they take the normal loop exit. The + // returned `RewrittenRangeInfo' object is populated as follows: + // + // .PseudoExit is a basic block that unconditionally branches to + // `ContinuationBlock'. + // + // .ExitSelector is a basic block that decides, on exit from the loop, + // whether to branch to the "true" exit or to `PseudoExit'. + // + // .PHIValuesAtPseudoExit are PHINodes in `PseudoExit' that compute the value + // for each PHINode in the loop header on taking the pseudo exit. + // + // After changeIterationSpaceEnd, `Preheader' is no longer a legitimate + // preheader because it is made to branch to the loop header only + // conditionally. + // + RewrittenRangeInfo + changeIterationSpaceEnd(const LoopStructure &LS, BasicBlock *Preheader, + Value *ExitLoopAt, + BasicBlock *ContinuationBlock) const; + + // The loop denoted by `LS' has `OldPreheader' as its preheader. This + // function creates a new preheader for `LS' and returns it. + // + BasicBlock *createPreheader(const LoopStructure &LS, BasicBlock *OldPreheader, + const char *Tag) const; + + // `ContinuationBlockAndPreheader' was the continuation block for some call to + // `changeIterationSpaceEnd' and is the preheader to the loop denoted by `LS'. + // This function rewrites the PHI nodes in `LS.Header' to start with the + // correct value. + void rewriteIncomingValuesForPHIs( + LoopStructure &LS, BasicBlock *ContinuationBlockAndPreheader, + const LoopConstrainer::RewrittenRangeInfo &RRI) const; + + // Even though we do not preserve any passes at this time, we at least need to + // keep the parent loop structure consistent. The `LPPassManager' seems to + // verify this after running a loop pass. This function adds the list of + // blocks denoted by BBs to this loops parent loop if required. + void addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs); + + // Some global state. + Function &F; + LLVMContext &Ctx; + ScalarEvolution &SE; + + // Information about the original loop we started out with. + Loop &OriginalLoop; + LoopInfo &OriginalLoopInfo; + const SCEV *LatchTakenCount; + BasicBlock *OriginalPreheader; + + // The preheader of the main loop. This may or may not be different from + // `OriginalPreheader'. + BasicBlock *MainLoopPreheader; + + // The range we need to run the main loop in. 
+ InductiveRangeCheck::Range Range; + + // The structure of the main loop (see comment at the beginning of this class + // for a definition) + LoopStructure MainLoopStructure; + +public: + LoopConstrainer(Loop &L, LoopInfo &LI, const LoopStructure &LS, + ScalarEvolution &SE, InductiveRangeCheck::Range R) + : F(*L.getHeader()->getParent()), Ctx(L.getHeader()->getContext()), + SE(SE), OriginalLoop(L), OriginalLoopInfo(LI), LatchTakenCount(nullptr), + OriginalPreheader(nullptr), MainLoopPreheader(nullptr), Range(R), + MainLoopStructure(LS) {} + + // Entry point for the algorithm. Returns true on success. + bool run(); +}; + +} + +void LoopConstrainer::replacePHIBlock(PHINode *PN, BasicBlock *Block, + BasicBlock *ReplaceBy) { + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) + if (PN->getIncomingBlock(i) == Block) + PN->setIncomingBlock(i, ReplaceBy); +} + +static bool CanBeSMax(ScalarEvolution &SE, const SCEV *S) { + APInt SMax = + APInt::getSignedMaxValue(cast<IntegerType>(S->getType())->getBitWidth()); + return SE.getSignedRange(S).contains(SMax) && + SE.getUnsignedRange(S).contains(SMax); +} + +static bool CanBeSMin(ScalarEvolution &SE, const SCEV *S) { + APInt SMin = + APInt::getSignedMinValue(cast<IntegerType>(S->getType())->getBitWidth()); + return SE.getSignedRange(S).contains(SMin) && + SE.getUnsignedRange(S).contains(SMin); +} + +Optional<LoopStructure> +LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BPI, + Loop &L, const char *&FailureReason) { + assert(L.isLoopSimplifyForm() && "should follow from addRequired<>"); + + BasicBlock *Latch = L.getLoopLatch(); + if (!L.isLoopExiting(Latch)) { + FailureReason = "no loop latch"; + return None; + } + + BasicBlock *Header = L.getHeader(); + BasicBlock *Preheader = L.getLoopPreheader(); + if (!Preheader) { + FailureReason = "no preheader"; + return None; + } + + BranchInst *LatchBr = dyn_cast<BranchInst>(&*Latch->rbegin()); + if (!LatchBr || LatchBr->isUnconditional()) { + FailureReason = "latch terminator not conditional branch"; + return None; + } + + unsigned LatchBrExitIdx = LatchBr->getSuccessor(0) == Header ? 1 : 0; + + BranchProbability ExitProbability = + BPI.getEdgeProbability(LatchBr->getParent(), LatchBrExitIdx); + + if (ExitProbability > BranchProbability(1, MaxExitProbReciprocal)) { + FailureReason = "short running loop, not profitable"; + return None; + } + + ICmpInst *ICI = dyn_cast<ICmpInst>(LatchBr->getCondition()); + if (!ICI || !isa<IntegerType>(ICI->getOperand(0)->getType())) { + FailureReason = "latch terminator branch not conditional on integral icmp"; + return None; + } + + const SCEV *LatchCount = SE.getExitCount(&L, Latch); + if (isa<SCEVCouldNotCompute>(LatchCount)) { + FailureReason = "could not compute latch count"; + return None; + } + + ICmpInst::Predicate Pred = ICI->getPredicate(); + Value *LeftValue = ICI->getOperand(0); + const SCEV *LeftSCEV = SE.getSCEV(LeftValue); + IntegerType *IndVarTy = cast<IntegerType>(LeftValue->getType()); + + Value *RightValue = ICI->getOperand(1); + const SCEV *RightSCEV = SE.getSCEV(RightValue); + + // We canonicalize `ICI` such that `LeftSCEV` is an add recurrence. 
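// Illustrative sketch, not part of the patch: the kind of loop
// parseLoopStructure is looking for.  After loop rotation and induction
// variable simplification, a loop like the one below usually ends in a latch
// whose conditional branch tests `I + 1 < N` -- an slt on the incremented
// induction variable -- which is the add recurrence the canonicalization
// below expects to find on one side of the icmp.
void recognizable(int *A, int N) {
  // Assumes A points to at least N ints.
  for (int I = 0; I < N; ++I)
    A[I] = I;
}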
+ if (!isa<SCEVAddRecExpr>(LeftSCEV)) { + if (isa<SCEVAddRecExpr>(RightSCEV)) { + std::swap(LeftSCEV, RightSCEV); + std::swap(LeftValue, RightValue); + Pred = ICmpInst::getSwappedPredicate(Pred); + } else { + FailureReason = "no add recurrences in the icmp"; + return None; + } + } + + auto IsInductionVar = [&SE](const SCEVAddRecExpr *AR, bool &IsIncreasing) { + if (!AR->isAffine()) + return false; + + IntegerType *Ty = cast<IntegerType>(AR->getType()); + IntegerType *WideTy = + IntegerType::get(Ty->getContext(), Ty->getBitWidth() * 2); + + // Currently we only work with induction variables that have been proved to + // not wrap. This restriction can potentially be lifted in the future. + + const SCEVAddRecExpr *ExtendAfterOp = + dyn_cast<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy)); + if (!ExtendAfterOp) + return false; + + const SCEV *ExtendedStart = SE.getSignExtendExpr(AR->getStart(), WideTy); + const SCEV *ExtendedStep = + SE.getSignExtendExpr(AR->getStepRecurrence(SE), WideTy); + + bool NoSignedWrap = ExtendAfterOp->getStart() == ExtendedStart && + ExtendAfterOp->getStepRecurrence(SE) == ExtendedStep; + + if (!NoSignedWrap) + return false; + + if (const SCEVConstant *StepExpr = + dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE))) { + ConstantInt *StepCI = StepExpr->getValue(); + if (StepCI->isOne() || StepCI->isMinusOne()) { + IsIncreasing = StepCI->isOne(); + return true; + } + } + + return false; + }; + + // `ICI` is interpreted as taking the backedge if the *next* value of the + // induction variable satisfies some constraint. + + const SCEVAddRecExpr *IndVarNext = cast<SCEVAddRecExpr>(LeftSCEV); + bool IsIncreasing = false; + if (!IsInductionVar(IndVarNext, IsIncreasing)) { + FailureReason = "LHS in icmp not induction variable"; + return None; + } + + ConstantInt *One = ConstantInt::get(IndVarTy, 1); + // TODO: generalize the predicates here to also match their unsigned variants. + if (IsIncreasing) { + bool FoundExpectedPred = + (Pred == ICmpInst::ICMP_SLT && LatchBrExitIdx == 1) || + (Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 0); + + if (!FoundExpectedPred) { + FailureReason = "expected icmp slt semantically, found something else"; + return None; + } + + if (LatchBrExitIdx == 0) { + if (CanBeSMax(SE, RightSCEV)) { + // TODO: this restriction is easily removable -- we just have to + // remember that the icmp was an slt and not an sle. + FailureReason = "limit may overflow when coercing sle to slt"; + return None; + } + + IRBuilder<> B(&*Preheader->rbegin()); + RightValue = B.CreateAdd(RightValue, One); + } + + } else { + bool FoundExpectedPred = + (Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 1) || + (Pred == ICmpInst::ICMP_SLT && LatchBrExitIdx == 0); + + if (!FoundExpectedPred) { + FailureReason = "expected icmp sgt semantically, found something else"; + return None; + } + + if (LatchBrExitIdx == 0) { + if (CanBeSMin(SE, RightSCEV)) { + // TODO: this restriction is easily removable -- we just have to + // remember that the icmp was an sgt and not an sge. 
+ FailureReason = "limit may overflow when coercing sge to sgt"; + return None; + } + + IRBuilder<> B(&*Preheader->rbegin()); + RightValue = B.CreateSub(RightValue, One); + } + } + + const SCEV *StartNext = IndVarNext->getStart(); + const SCEV *Addend = SE.getNegativeSCEV(IndVarNext->getStepRecurrence(SE)); + const SCEV *IndVarStart = SE.getAddExpr(StartNext, Addend); + + BasicBlock *LatchExit = LatchBr->getSuccessor(LatchBrExitIdx); + + assert(SE.getLoopDisposition(LatchCount, &L) == + ScalarEvolution::LoopInvariant && + "loop variant exit count doesn't make sense!"); + + assert(!L.contains(LatchExit) && "expected an exit block!"); + + Value *IndVarStartV = SCEVExpander(SE, "irce").expandCodeFor( + IndVarStart, IndVarTy, &*Preheader->rbegin()); + IndVarStartV->setName("indvar.start"); + + LoopStructure Result; + + Result.Tag = "main"; + Result.Header = Header; + Result.Latch = Latch; + Result.LatchBr = LatchBr; + Result.LatchExit = LatchExit; + Result.LatchBrExitIdx = LatchBrExitIdx; + Result.IndVarStart = IndVarStartV; + Result.IndVarNext = LeftValue; + Result.IndVarIncreasing = IsIncreasing; + Result.LoopExitAt = RightValue; + + FailureReason = nullptr; + + return Result; +} + +Optional<LoopConstrainer::SubRanges> +LoopConstrainer::calculateSubRanges() const { + IntegerType *Ty = cast<IntegerType>(LatchTakenCount->getType()); + + if (Range.getType() != Ty) + return None; + + LoopConstrainer::SubRanges Result; + + // I think we can be more aggressive here and make this nuw / nsw if the + // addition that feeds into the icmp for the latch's terminating branch is nuw + // / nsw. In any case, a wrapping 2's complement addition is safe. + ConstantInt *One = ConstantInt::get(Ty, 1); + const SCEV *Start = SE.getSCEV(MainLoopStructure.IndVarStart); + const SCEV *End = SE.getSCEV(MainLoopStructure.LoopExitAt); + + bool Increasing = MainLoopStructure.IndVarIncreasing; + // We compute `Smallest` and `Greatest` such that [Smallest, Greatest) is the + // range of values the induction variable takes. + const SCEV *Smallest = + Increasing ? Start : SE.getAddExpr(End, SE.getSCEV(One)); + const SCEV *Greatest = + Increasing ? 
End : SE.getAddExpr(Start, SE.getSCEV(One)); + + auto Clamp = [this, Smallest, Greatest](const SCEV *S) { + return SE.getSMaxExpr(Smallest, SE.getSMinExpr(Greatest, S)); + }; + + // In some cases we can prove that we don't need a pre or post loop + + bool ProvablyNoPreloop = + SE.isKnownPredicate(ICmpInst::ICMP_SLE, Range.getBegin(), Smallest); + if (!ProvablyNoPreloop) + Result.LowLimit = Clamp(Range.getBegin()); + + bool ProvablyNoPostLoop = + SE.isKnownPredicate(ICmpInst::ICMP_SLE, Greatest, Range.getEnd()); + if (!ProvablyNoPostLoop) + Result.HighLimit = Clamp(Range.getEnd()); + + return Result; +} + +void LoopConstrainer::cloneLoop(LoopConstrainer::ClonedLoop &Result, + const char *Tag) const { + for (BasicBlock *BB : OriginalLoop.getBlocks()) { + BasicBlock *Clone = CloneBasicBlock(BB, Result.Map, Twine(".") + Tag, &F); + Result.Blocks.push_back(Clone); + Result.Map[BB] = Clone; + } + + auto GetClonedValue = [&Result](Value *V) { + assert(V && "null values not in domain!"); + auto It = Result.Map.find(V); + if (It == Result.Map.end()) + return V; + return static_cast<Value *>(It->second); + }; + + Result.Structure = MainLoopStructure.map(GetClonedValue); + Result.Structure.Tag = Tag; + + for (unsigned i = 0, e = Result.Blocks.size(); i != e; ++i) { + BasicBlock *ClonedBB = Result.Blocks[i]; + BasicBlock *OriginalBB = OriginalLoop.getBlocks()[i]; + + assert(Result.Map[OriginalBB] == ClonedBB && "invariant!"); + + for (Instruction &I : *ClonedBB) + RemapInstruction(&I, Result.Map, + RF_NoModuleLevelChanges | RF_IgnoreMissingEntries); + + // Exit blocks will now have one more predecessor and their PHI nodes need + // to be edited to reflect that. No phi nodes need to be introduced because + // the loop is in LCSSA. + + for (auto SBBI = succ_begin(OriginalBB), SBBE = succ_end(OriginalBB); + SBBI != SBBE; ++SBBI) { + + if (OriginalLoop.contains(*SBBI)) + continue; // not an exit block + + for (Instruction &I : **SBBI) { + if (!isa<PHINode>(&I)) + break; + + PHINode *PN = cast<PHINode>(&I); + Value *OldIncoming = PN->getIncomingValueForBlock(OriginalBB); + PN->addIncoming(GetClonedValue(OldIncoming), ClonedBB); + } + } + } +} + +LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd( + const LoopStructure &LS, BasicBlock *Preheader, Value *ExitSubloopAt, + BasicBlock *ContinuationBlock) const { + + // We start with a loop with a single latch: + // + // +--------------------+ + // | | + // | preheader | + // | | + // +--------+-----------+ + // | ----------------\ + // | / | + // +--------v----v------+ | + // | | | + // | header | | + // | | | + // +--------------------+ | + // | + // ..... | + // | + // +--------------------+ | + // | | | + // | latch >----------/ + // | | + // +-------v------------+ + // | + // | + // | +--------------------+ + // | | | + // +---> original exit | + // | | + // +--------------------+ + // + // We change the control flow to look like + // + // + // +--------------------+ + // | | + // | preheader >-------------------------+ + // | | | + // +--------v-----------+ | + // | /-------------+ | + // | / | | + // +--------v--v--------+ | | + // | | | | + // | header | | +--------+ | + // | | | | | | + // +--------------------+ | | +-----v-----v-----------+ + // | | | | + // | | | .pseudo.exit | + // | | | | + // | | +-----------v-----------+ + // | | | + // ..... 
| | | + // | | +--------v-------------+ + // +--------------------+ | | | | + // | | | | | ContinuationBlock | + // | latch >------+ | | | + // | | | +----------------------+ + // +---------v----------+ | + // | | + // | | + // | +---------------^-----+ + // | | | + // +-----> .exit.selector | + // | | + // +----------v----------+ + // | + // +--------------------+ | + // | | | + // | original exit <----+ + // | | + // +--------------------+ + // + + RewrittenRangeInfo RRI; + + auto BBInsertLocation = std::next(Function::iterator(LS.Latch)); + RRI.ExitSelector = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".exit.selector", + &F, BBInsertLocation); + RRI.PseudoExit = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".pseudo.exit", &F, + BBInsertLocation); + + BranchInst *PreheaderJump = cast<BranchInst>(&*Preheader->rbegin()); + bool Increasing = LS.IndVarIncreasing; + + IRBuilder<> B(PreheaderJump); + + // EnterLoopCond - is it okay to start executing this `LS'? + Value *EnterLoopCond = Increasing + ? B.CreateICmpSLT(LS.IndVarStart, ExitSubloopAt) + : B.CreateICmpSGT(LS.IndVarStart, ExitSubloopAt); + + B.CreateCondBr(EnterLoopCond, LS.Header, RRI.PseudoExit); + PreheaderJump->eraseFromParent(); + + LS.LatchBr->setSuccessor(LS.LatchBrExitIdx, RRI.ExitSelector); + B.SetInsertPoint(LS.LatchBr); + Value *TakeBackedgeLoopCond = + Increasing ? B.CreateICmpSLT(LS.IndVarNext, ExitSubloopAt) + : B.CreateICmpSGT(LS.IndVarNext, ExitSubloopAt); + Value *CondForBranch = LS.LatchBrExitIdx == 1 + ? TakeBackedgeLoopCond + : B.CreateNot(TakeBackedgeLoopCond); + + LS.LatchBr->setCondition(CondForBranch); + + B.SetInsertPoint(RRI.ExitSelector); + + // IterationsLeft - are there any more iterations left, given the original + // upper bound on the induction variable? If not, we branch to the "real" + // exit. + Value *IterationsLeft = Increasing + ? B.CreateICmpSLT(LS.IndVarNext, LS.LoopExitAt) + : B.CreateICmpSGT(LS.IndVarNext, LS.LoopExitAt); + B.CreateCondBr(IterationsLeft, RRI.PseudoExit, LS.LatchExit); + + BranchInst *BranchToContinuation = + BranchInst::Create(ContinuationBlock, RRI.PseudoExit); + + // We emit PHI nodes into `RRI.PseudoExit' that compute the "latest" value of + // each of the PHI nodes in the loop header. This feeds into the initial + // value of the same PHI nodes if/when we continue execution. + for (Instruction &I : *LS.Header) { + if (!isa<PHINode>(&I)) + break; + + PHINode *PN = cast<PHINode>(&I); + + PHINode *NewPHI = PHINode::Create(PN->getType(), 2, PN->getName() + ".copy", + BranchToContinuation); + + NewPHI->addIncoming(PN->getIncomingValueForBlock(Preheader), Preheader); + NewPHI->addIncoming(PN->getIncomingValueForBlock(LS.Latch), + RRI.ExitSelector); + RRI.PHIValuesAtPseudoExit.push_back(NewPHI); + } + + RRI.IndVarEnd = PHINode::Create(LS.IndVarNext->getType(), 2, "indvar.end", + BranchToContinuation); + RRI.IndVarEnd->addIncoming(LS.IndVarStart, Preheader); + RRI.IndVarEnd->addIncoming(LS.IndVarNext, RRI.ExitSelector); + + // The latch exit now has a branch from `RRI.ExitSelector' instead of + // `LS.Latch'. The PHI nodes need to be updated to reflect that. 
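// Illustrative sketch, not part of the patch: the three comparisons built by
// changeIterationSpaceEnd (EnterLoopCond, TakeBackedgeLoopCond and
// IterationsLeft) written as source-level control flow for an increasing
// induction variable.  The names mirror the locals above; everything else is
// hypothetical.
void exitSelectorSketch(int IndVarStart, int ExitSubloopAt, int LoopExitAt) {
  int IndVar = IndVarStart;
  if (IndVar < ExitSubloopAt) {          // EnterLoopCond, in the preheader
    do {
      // ... original loop body ...
      ++IndVar;                          // IndVarNext
    } while (IndVar < ExitSubloopAt);    // TakeBackedgeLoopCond, in the latch
    if (!(IndVar < LoopExitAt))          // IterationsLeft, in .exit.selector
      return;                            // take the original exit
  }
  // .pseudo.exit: fall through to the continuation block, which runs any
  // remaining iterations.
}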
+ for (Instruction &I : *LS.LatchExit) { + if (PHINode *PN = dyn_cast<PHINode>(&I)) + replacePHIBlock(PN, LS.Latch, RRI.ExitSelector); + else + break; + } + + return RRI; +} + +void LoopConstrainer::rewriteIncomingValuesForPHIs( + LoopStructure &LS, BasicBlock *ContinuationBlock, + const LoopConstrainer::RewrittenRangeInfo &RRI) const { + + unsigned PHIIndex = 0; + for (Instruction &I : *LS.Header) { + if (!isa<PHINode>(&I)) + break; + + PHINode *PN = cast<PHINode>(&I); + + for (unsigned i = 0, e = PN->getNumIncomingValues(); i < e; ++i) + if (PN->getIncomingBlock(i) == ContinuationBlock) + PN->setIncomingValue(i, RRI.PHIValuesAtPseudoExit[PHIIndex++]); + } + + LS.IndVarStart = RRI.IndVarEnd; +} + +BasicBlock *LoopConstrainer::createPreheader(const LoopStructure &LS, + BasicBlock *OldPreheader, + const char *Tag) const { + + BasicBlock *Preheader = BasicBlock::Create(Ctx, Tag, &F, LS.Header); + BranchInst::Create(LS.Header, Preheader); + + for (Instruction &I : *LS.Header) { + if (!isa<PHINode>(&I)) + break; + + PHINode *PN = cast<PHINode>(&I); + for (unsigned i = 0, e = PN->getNumIncomingValues(); i < e; ++i) + replacePHIBlock(PN, OldPreheader, Preheader); + } + + return Preheader; +} + +void LoopConstrainer::addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs) { + Loop *ParentLoop = OriginalLoop.getParentLoop(); + if (!ParentLoop) + return; + + for (BasicBlock *BB : BBs) + ParentLoop->addBasicBlockToLoop(BB, OriginalLoopInfo); +} + +bool LoopConstrainer::run() { + BasicBlock *Preheader = nullptr; + LatchTakenCount = SE.getExitCount(&OriginalLoop, MainLoopStructure.Latch); + Preheader = OriginalLoop.getLoopPreheader(); + assert(!isa<SCEVCouldNotCompute>(LatchTakenCount) && Preheader != nullptr && + "preconditions!"); + + OriginalPreheader = Preheader; + MainLoopPreheader = Preheader; + + Optional<SubRanges> MaybeSR = calculateSubRanges(); + if (!MaybeSR.hasValue()) { + DEBUG(dbgs() << "irce: could not compute subranges\n"); + return false; + } + + SubRanges SR = MaybeSR.getValue(); + bool Increasing = MainLoopStructure.IndVarIncreasing; + IntegerType *IVTy = + cast<IntegerType>(MainLoopStructure.IndVarNext->getType()); + + SCEVExpander Expander(SE, "irce"); + Instruction *InsertPt = OriginalPreheader->getTerminator(); + + // It would have been better to make `PreLoop' and `PostLoop' + // `Optional<ClonedLoop>'s, but `ValueToValueMapTy' does not have a copy + // constructor. + ClonedLoop PreLoop, PostLoop; + bool NeedsPreLoop = + Increasing ? SR.LowLimit.hasValue() : SR.HighLimit.hasValue(); + bool NeedsPostLoop = + Increasing ? SR.HighLimit.hasValue() : SR.LowLimit.hasValue(); + + Value *ExitPreLoopAt = nullptr; + Value *ExitMainLoopAt = nullptr; + const SCEVConstant *MinusOneS = + cast<SCEVConstant>(SE.getConstant(IVTy, -1, true /* isSigned */)); + + if (NeedsPreLoop) { + const SCEV *ExitPreLoopAtSCEV = nullptr; + + if (Increasing) + ExitPreLoopAtSCEV = *SR.LowLimit; + else { + if (CanBeSMin(SE, *SR.HighLimit)) { + DEBUG(dbgs() << "irce: could not prove no-overflow when computing " + << "preloop exit limit. 
HighLimit = " << *(*SR.HighLimit) + << "\n"); + return false; + } + ExitPreLoopAtSCEV = SE.getAddExpr(*SR.HighLimit, MinusOneS); + } + + ExitPreLoopAt = Expander.expandCodeFor(ExitPreLoopAtSCEV, IVTy, InsertPt); + ExitPreLoopAt->setName("exit.preloop.at"); + } + + if (NeedsPostLoop) { + const SCEV *ExitMainLoopAtSCEV = nullptr; + + if (Increasing) + ExitMainLoopAtSCEV = *SR.HighLimit; + else { + if (CanBeSMin(SE, *SR.LowLimit)) { + DEBUG(dbgs() << "irce: could not prove no-overflow when computing " + << "mainloop exit limit. LowLimit = " << *(*SR.LowLimit) + << "\n"); + return false; + } + ExitMainLoopAtSCEV = SE.getAddExpr(*SR.LowLimit, MinusOneS); + } + + ExitMainLoopAt = Expander.expandCodeFor(ExitMainLoopAtSCEV, IVTy, InsertPt); + ExitMainLoopAt->setName("exit.mainloop.at"); + } + + // We clone these ahead of time so that we don't have to deal with changing + // and temporarily invalid IR as we transform the loops. + if (NeedsPreLoop) + cloneLoop(PreLoop, "preloop"); + if (NeedsPostLoop) + cloneLoop(PostLoop, "postloop"); + + RewrittenRangeInfo PreLoopRRI; + + if (NeedsPreLoop) { + Preheader->getTerminator()->replaceUsesOfWith(MainLoopStructure.Header, + PreLoop.Structure.Header); + + MainLoopPreheader = + createPreheader(MainLoopStructure, Preheader, "mainloop"); + PreLoopRRI = changeIterationSpaceEnd(PreLoop.Structure, Preheader, + ExitPreLoopAt, MainLoopPreheader); + rewriteIncomingValuesForPHIs(MainLoopStructure, MainLoopPreheader, + PreLoopRRI); + } + + BasicBlock *PostLoopPreheader = nullptr; + RewrittenRangeInfo PostLoopRRI; + + if (NeedsPostLoop) { + PostLoopPreheader = + createPreheader(PostLoop.Structure, Preheader, "postloop"); + PostLoopRRI = changeIterationSpaceEnd(MainLoopStructure, MainLoopPreheader, + ExitMainLoopAt, PostLoopPreheader); + rewriteIncomingValuesForPHIs(PostLoop.Structure, PostLoopPreheader, + PostLoopRRI); + } + + BasicBlock *NewMainLoopPreheader = + MainLoopPreheader != Preheader ? MainLoopPreheader : nullptr; + BasicBlock *NewBlocks[] = {PostLoopPreheader, PreLoopRRI.PseudoExit, + PreLoopRRI.ExitSelector, PostLoopRRI.PseudoExit, + PostLoopRRI.ExitSelector, NewMainLoopPreheader}; + + // Some of the above may be nullptr, filter them out before passing to + // addToParentLoopIfNeeded. + auto NewBlocksEnd = + std::remove(std::begin(NewBlocks), std::end(NewBlocks), nullptr); + + addToParentLoopIfNeeded(makeArrayRef(std::begin(NewBlocks), NewBlocksEnd)); + addToParentLoopIfNeeded(PreLoop.Blocks); + addToParentLoopIfNeeded(PostLoop.Blocks); + + return true; +} + +/// Computes and returns a range of values for the induction variable (IndVar) +/// in which the range check can be safely elided. If it cannot compute such a +/// range, returns None. +Optional<InductiveRangeCheck::Range> +InductiveRangeCheck::computeSafeIterationSpace(ScalarEvolution &SE, + const SCEVAddRecExpr *IndVar, + IRBuilder<> &) const { + // IndVar is of the form "A + B * I" (where "I" is the canonical induction + // variable, that may or may not exist as a real llvm::Value in the loop) and + // this inductive range check is a range check on the "C + D * I" ("C" is + // getOffset() and "D" is getScale()). We rewrite the value being range + // checked to "M + N * IndVar" where "N" = "D * B^(-1)" and "M" = "C - NA". + // Currently we support this only for "B" = "D" = { 1 or -1 }, but the code + // can be generalized as needed. + // + // The actual inequalities we solve are of the form + // + // 0 <= M + 1 * IndVar < L given L >= 0 (i.e. 
N == 1) + // + // The inequality is satisfied by -M <= IndVar < (L - M) [^1]. All additions + // and subtractions are twos-complement wrapping and comparisons are signed. + // + // Proof: + // + // If there exists IndVar such that -M <= IndVar < (L - M) then it follows + // that -M <= (-M + L) [== Eq. 1]. Since L >= 0, if (-M + L) sign-overflows + // then (-M + L) < (-M). Hence by [Eq. 1], (-M + L) could not have + // overflown. + // + // This means IndVar = t + (-M) for t in [0, L). Hence (IndVar + M) = t. + // Hence 0 <= (IndVar + M) < L + + // [^1]: Note that the solution does _not_ apply if L < 0; consider values M = + // 127, IndVar = 126 and L = -2 in an i8 world. + + if (!IndVar->isAffine()) + return None; + + const SCEV *A = IndVar->getStart(); + const SCEVConstant *B = dyn_cast<SCEVConstant>(IndVar->getStepRecurrence(SE)); + if (!B) + return None; + + const SCEV *C = getOffset(); + const SCEVConstant *D = dyn_cast<SCEVConstant>(getScale()); + if (D != B) + return None; + + ConstantInt *ConstD = D->getValue(); + if (!(ConstD->isMinusOne() || ConstD->isOne())) + return None; + + const SCEV *M = SE.getMinusSCEV(C, A); + + const SCEV *Begin = SE.getNegativeSCEV(M); + const SCEV *End = SE.getMinusSCEV(SE.getSCEV(getLength()), M); + + return InductiveRangeCheck::Range(Begin, End); +} + +static Optional<InductiveRangeCheck::Range> +IntersectRange(ScalarEvolution &SE, + const Optional<InductiveRangeCheck::Range> &R1, + const InductiveRangeCheck::Range &R2, IRBuilder<> &B) { + if (!R1.hasValue()) + return R2; + auto &R1Value = R1.getValue(); + + // TODO: we could widen the smaller range and have this work; but for now we + // bail out to keep things simple. + if (R1Value.getType() != R2.getType()) + return None; + + const SCEV *NewBegin = SE.getSMaxExpr(R1Value.getBegin(), R2.getBegin()); + const SCEV *NewEnd = SE.getSMinExpr(R1Value.getEnd(), R2.getEnd()); + + return InductiveRangeCheck::Range(NewBegin, NewEnd); +} + +bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) { + if (L->getBlocks().size() >= LoopSizeCutoff) { + DEBUG(dbgs() << "irce: giving up constraining loop, too large\n";); + return false; + } + + BasicBlock *Preheader = L->getLoopPreheader(); + if (!Preheader) { + DEBUG(dbgs() << "irce: loop has no preheader, leaving\n"); + return false; + } + + LLVMContext &Context = Preheader->getContext(); + InductiveRangeCheck::AllocatorTy IRCAlloc; + SmallVector<InductiveRangeCheck *, 16> RangeChecks; + ScalarEvolution &SE = getAnalysis<ScalarEvolution>(); + BranchProbabilityInfo &BPI = getAnalysis<BranchProbabilityInfo>(); + + for (auto BBI : L->getBlocks()) + if (BranchInst *TBI = dyn_cast<BranchInst>(BBI->getTerminator())) + if (InductiveRangeCheck *IRC = + InductiveRangeCheck::create(IRCAlloc, TBI, L, SE, BPI)) + RangeChecks.push_back(IRC); + + if (RangeChecks.empty()) + return false; + + DEBUG(dbgs() << "irce: looking at loop "; L->print(dbgs()); + dbgs() << "irce: loop has " << RangeChecks.size() + << " inductive range checks: \n"; + for (InductiveRangeCheck *IRC : RangeChecks) + IRC->print(dbgs()); + ); + + const char *FailureReason = nullptr; + Optional<LoopStructure> MaybeLoopStructure = + LoopStructure::parseLoopStructure(SE, BPI, *L, FailureReason); + if (!MaybeLoopStructure.hasValue()) { + DEBUG(dbgs() << "irce: could not parse loop structure: " << FailureReason + << "\n";); + return false; + } + LoopStructure LS = MaybeLoopStructure.getValue(); + bool Increasing = LS.IndVarIncreasing; + const SCEV *MinusOne = + 
SE.getConstant(LS.IndVarNext->getType(), Increasing ? -1 : 1, true); + const SCEVAddRecExpr *IndVar = + cast<SCEVAddRecExpr>(SE.getAddExpr(SE.getSCEV(LS.IndVarNext), MinusOne)); + + Optional<InductiveRangeCheck::Range> SafeIterRange; + Instruction *ExprInsertPt = Preheader->getTerminator(); + + SmallVector<InductiveRangeCheck *, 4> RangeChecksToEliminate; + + IRBuilder<> B(ExprInsertPt); + for (InductiveRangeCheck *IRC : RangeChecks) { + auto Result = IRC->computeSafeIterationSpace(SE, IndVar, B); + if (Result.hasValue()) { + auto MaybeSafeIterRange = + IntersectRange(SE, SafeIterRange, Result.getValue(), B); + if (MaybeSafeIterRange.hasValue()) { + RangeChecksToEliminate.push_back(IRC); + SafeIterRange = MaybeSafeIterRange.getValue(); + } + } + } + + if (!SafeIterRange.hasValue()) + return false; + + LoopConstrainer LC(*L, getAnalysis<LoopInfoWrapperPass>().getLoopInfo(), LS, + SE, SafeIterRange.getValue()); + bool Changed = LC.run(); + + if (Changed) { + auto PrintConstrainedLoopInfo = [L]() { + dbgs() << "irce: in function "; + dbgs() << L->getHeader()->getParent()->getName() << ": "; + dbgs() << "constrained "; + L->print(dbgs()); + }; + + DEBUG(PrintConstrainedLoopInfo()); + + if (PrintChangedLoops) + PrintConstrainedLoopInfo(); + + // Optimize away the now-redundant range checks. + + for (InductiveRangeCheck *IRC : RangeChecksToEliminate) { + ConstantInt *FoldedRangeCheck = IRC->getPassingDirection() + ? ConstantInt::getTrue(Context) + : ConstantInt::getFalse(Context); + IRC->getBranch()->setCondition(FoldedRangeCheck); + } + } + + return Changed; +} + +Pass *llvm::createInductiveRangeCheckEliminationPass() { + return new InductiveRangeCheckElimination; +} diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index 60a4925..8b54abd 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -32,7 +32,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" @@ -115,7 +115,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<LazyValueInfo>(); AU.addPreserved<LazyValueInfo>(); - AU.addRequired<TargetLibraryInfo>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } void FindLoopHeaders(Function &F); @@ -145,7 +145,7 @@ char JumpThreading::ID = 0; INITIALIZE_PASS_BEGIN(JumpThreading, "jump-threading", "Jump Threading", false, false) INITIALIZE_PASS_DEPENDENCY(LazyValueInfo) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(JumpThreading, "jump-threading", "Jump Threading", false, false) @@ -161,7 +161,7 @@ bool JumpThreading::runOnFunction(Function &F) { DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n"); DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); DL = DLP ? &DLP->getDataLayout() : nullptr; - TLI = &getAnalysis<TargetLibraryInfo>(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); LVI = &getAnalysis<LazyValueInfo>(); // Remove unreachable blocks from function as they may result in infinite @@ -188,7 +188,7 @@ bool JumpThreading::runOnFunction(Function &F) { // If the block is trivially dead, zap it. 
This eliminates the successor // edges which simplifies the CFG. - if (pred_begin(BB) == pred_end(BB) && + if (pred_empty(BB) && BB != &BB->getParent()->getEntryBlock()) { DEBUG(dbgs() << " JT: Deleting dead block '" << BB->getName() << "' with terminator: " << *BB->getTerminator() << '\n'); @@ -662,7 +662,7 @@ static bool hasAddressTakenAndUsed(BasicBlock *BB) { bool JumpThreading::ProcessBlock(BasicBlock *BB) { // If the block is trivially dead, just return and let the caller nuke it. // This simplifies other transformations. - if (pred_begin(BB) == pred_end(BB) && + if (pred_empty(BB) && BB != &BB->getParent()->getEntryBlock()) return false; @@ -797,7 +797,7 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { } } else if (CondBr && CondConst && CondBr->isConditional()) { - // There might be an invairant in the same block with the conditional + // There might be an invariant in the same block with the conditional // that can determine the predicate. LazyValueInfo::Tristate Ret = @@ -902,8 +902,8 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // only happen in dead loops. if (AvailableVal == LI) AvailableVal = UndefValue::get(LI->getType()); if (AvailableVal->getType() != LI->getType()) - AvailableVal = CastInst::Create(CastInst::BitCast, AvailableVal, - LI->getType(), "", LI); + AvailableVal = + CastInst::CreateBitOrPointerCast(AvailableVal, LI->getType(), "", LI); LI->replaceAllUsesWith(AvailableVal); LI->eraseFromParent(); return true; @@ -993,7 +993,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // Split them out to their own block. UnavailablePred = - SplitBlockPredecessors(LoadBB, PredsToSplit, "thread-pre-split", this); + SplitBlockPredecessors(LoadBB, PredsToSplit, "thread-pre-split"); } // If the value isn't available in all predecessors, then there will be @@ -1040,8 +1040,8 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // predecessor use the same bitcast. Value *&PredV = I->second; if (PredV->getType() != LI->getType()) - PredV = CastInst::Create(CastInst::BitCast, PredV, LI->getType(), "", - P->getTerminator()); + PredV = CastInst::CreateBitOrPointerCast(PredV, LI->getType(), "", + P->getTerminator()); PN->addIncoming(PredV, I->first); } @@ -1418,7 +1418,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, else { DEBUG(dbgs() << " Factoring out " << PredBBs.size() << " common predecessors.\n"); - PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm", this); + PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm"); } // And finally, do it! @@ -1561,7 +1561,7 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, else { DEBUG(dbgs() << " Factoring out " << PredBBs.size() << " common predecessors.\n"); - PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm", this); + PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm"); } // Okay, we decided to do this! 
Clone all the instructions in BB onto the end @@ -1575,7 +1575,7 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, BranchInst *OldPredBranch = dyn_cast<BranchInst>(PredBB->getTerminator()); if (!OldPredBranch || !OldPredBranch->isUnconditional()) { - PredBB = SplitEdge(PredBB, BB, this); + PredBB = SplitEdge(PredBB, BB); OldPredBranch = cast<BranchInst>(PredBB->getTerminator()); } diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index 5f00bb9..14af38b 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -52,7 +52,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" @@ -71,6 +71,27 @@ static cl::opt<bool> DisablePromotion("disable-licm-promotion", cl::Hidden, cl::desc("Disable memory promotion in LICM pass")); +static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI); +static bool isNotUsedInLoop(Instruction &I, Loop *CurLoop); +static bool hoist(Instruction &I, BasicBlock *Preheader); +static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, + Loop *CurLoop, AliasSetTracker *CurAST ); +static bool isGuaranteedToExecute(Instruction &Inst, DominatorTree *DT, + Loop *CurLoop, LICMSafetyInfo * SafetyInfo); +static bool isSafeToExecuteUnconditionally(Instruction &Inst,DominatorTree *DT, + const DataLayout *DL, Loop *CurLoop, + LICMSafetyInfo * SafetyInfo); +static bool pointerInvalidatedByLoop(Value *V, uint64_t Size, + const AAMDNodes &AAInfo, + AliasSetTracker *CurAST); +static Instruction *CloneInstructionInExitBlock(Instruction &I, + BasicBlock &ExitBlock, + PHINode &PN, LoopInfo *LI); +static bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, + DominatorTree *DT, const DataLayout *DL, + Loop *CurLoop, AliasSetTracker *CurAST, + LICMSafetyInfo * SafetyInfo); + namespace { struct LICM : public LoopPass { static char ID; // Pass identification, replacement for typeid @@ -86,7 +107,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfo>(); + AU.addRequired<LoopInfoWrapperPass>(); AU.addRequiredID(LoopSimplifyID); AU.addPreservedID(LoopSimplifyID); AU.addRequiredID(LCSSAID); @@ -94,7 +115,7 @@ namespace { AU.addRequired<AliasAnalysis>(); AU.addPreserved<AliasAnalysis>(); AU.addPreserved<ScalarEvolution>(); - AU.addRequired<TargetLibraryInfo>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } using llvm::Pass::doFinalization; @@ -117,9 +138,6 @@ namespace { BasicBlock *Preheader; // The preheader block of the current loop... Loop *CurLoop; // The current loop we are working on... AliasSetTracker *CurAST; // AliasSet information for the current loop... - bool MayThrow; // The current loop contains an instruction which - // may throw, thus preventing code motion of - // instructions with side effects. DenseMap<Loop*, AliasSetTracker*> LoopToAliasSetMap; /// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info. @@ -132,88 +150,17 @@ namespace { /// Simple Analysis hook. Delete loop L from alias set map. 
void deleteAnalysisLoop(Loop *L) override; - - /// SinkRegion - Walk the specified region of the CFG (defined by all blocks - /// dominated by the specified block, and that are in the current loop) in - /// reverse depth first order w.r.t the DominatorTree. This allows us to - /// visit uses before definitions, allowing us to sink a loop body in one - /// pass without iteration. - /// - void SinkRegion(DomTreeNode *N); - - /// HoistRegion - Walk the specified region of the CFG (defined by all - /// blocks dominated by the specified block, and that are in the current - /// loop) in depth first order w.r.t the DominatorTree. This allows us to - /// visit definitions before uses, allowing us to hoist a loop body in one - /// pass without iteration. - /// - void HoistRegion(DomTreeNode *N); - - /// inSubLoop - Little predicate that returns true if the specified basic - /// block is in a subloop of the current one, not the current one itself. - /// - bool inSubLoop(BasicBlock *BB) { - assert(CurLoop->contains(BB) && "Only valid if BB is IN the loop"); - return LI->getLoopFor(BB) != CurLoop; - } - - /// sink - When an instruction is found to only be used outside of the loop, - /// this function moves it to the exit blocks and patches up SSA form as - /// needed. - /// - void sink(Instruction &I); - - /// hoist - When an instruction is found to only use loop invariant operands - /// that is safe to hoist, this instruction is called to do the dirty work. - /// - void hoist(Instruction &I); - - /// isSafeToExecuteUnconditionally - Only sink or hoist an instruction if it - /// is not a trapping instruction or if it is a trapping instruction and is - /// guaranteed to execute. - /// - bool isSafeToExecuteUnconditionally(Instruction &I); - - /// isGuaranteedToExecute - Check that the instruction is guaranteed to - /// execute. - /// - bool isGuaranteedToExecute(Instruction &I); - - /// pointerInvalidatedByLoop - Return true if the body of this loop may - /// store into the memory location pointed to by V. - /// - bool pointerInvalidatedByLoop(Value *V, uint64_t Size, - const AAMDNodes &AAInfo) { - // Check to see if any of the basic blocks in CurLoop invalidate *V. - return CurAST->getAliasSetForPointer(V, Size, AAInfo).isMod(); - } - - bool canSinkOrHoistInst(Instruction &I); - bool isNotUsedInLoop(Instruction &I); - - void PromoteAliasSet(AliasSet &AS, - SmallVectorImpl<BasicBlock*> &ExitBlocks, - SmallVectorImpl<Instruction*> &InsertPts, - PredIteratorCache &PIC); - - /// \brief Create a copy of the instruction in the exit block and patch up - /// SSA. - /// PN is a user of I in ExitBlock that can be used to get the number and - /// list of predecessors fast. - Instruction *CloneInstructionInExitBlock(Instruction &I, - BasicBlock &ExitBlock, - PHINode &PN); }; } char LICM::ID = 0; INITIALIZE_PASS_BEGIN(LICM, "licm", "Loop Invariant Code Motion", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_END(LICM, "licm", "Loop Invariant Code Motion", false, false) @@ -230,13 +177,13 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { Changed = false; // Get our Loop and Alias Analysis information... 
- LI = &getAnalysis<LoopInfo>(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); AA = &getAnalysis<AliasAnalysis>(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); DL = DLP ? &DLP->getDataLayout() : nullptr; - TLI = &getAnalysis<TargetLibraryInfo>(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form."); @@ -273,14 +220,9 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { CurAST->add(*BB); // Incorporate the specified basic block } - MayThrow = false; - // TODO: We've already searched for instructions which may throw in subloops. - // We may want to reuse this information. - for (Loop::block_iterator BB = L->block_begin(), BBE = L->block_end(); - (BB != BBE) && !MayThrow ; ++BB) - for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); - (I != E) && !MayThrow; ++I) - MayThrow |= I->mayThrow(); + // Compute loop safety information. + LICMSafetyInfo SafetyInfo; + computeLICMSafetyInfo(&SafetyInfo, CurLoop); // We want to visit all of the instructions in this loop... that are not parts // of our subloops (they have already had their invariants hoisted out of @@ -293,9 +235,11 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { // instructions, we perform another pass to hoist them out of the loop. // if (L->hasDedicatedExits()) - SinkRegion(DT->getNode(L->getHeader())); + Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, DL, TLI, + CurLoop, CurAST, &SafetyInfo); if (Preheader) - HoistRegion(DT->getNode(L->getHeader())); + Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, DL, TLI, + CurLoop, CurAST, &SafetyInfo); // Now that all loop invariants have been removed from the loop, promote any // memory references to scalars that we can. @@ -307,7 +251,9 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { // Loop over all of the alias sets in the tracker object. for (AliasSetTracker::iterator I = CurAST->begin(), E = CurAST->end(); I != E; ++I) - PromoteAliasSet(*I, ExitBlocks, InsertPts, PIC); + Changed |= promoteLoopAccessesToScalars(*I, ExitBlocks, InsertPts, + PIC, LI, DT, CurLoop, + CurAST, &SafetyInfo); // Once we have promoted values across the loop body we have to recursively // reform LCSSA as any nested loop may now have values defined within the @@ -316,7 +262,8 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { // SSAUpdater strategy during promotion that was LCSSA aware and reformed // it as it went. if (Changed) - formLCSSARecursively(*L, *DT, getAnalysisIfAvailable<ScalarEvolution>()); + formLCSSARecursively(*L, *DT, LI, + getAnalysisIfAvailable<ScalarEvolution>()); } // Check that neither this loop nor its parent have had LCSSA broken. LICM is @@ -339,27 +286,36 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { return Changed; } -/// SinkRegion - Walk the specified region of the CFG (defined by all blocks -/// dominated by the specified block, and that are in the current loop) in -/// reverse depth first order w.r.t the DominatorTree. This allows us to visit -/// uses before definitions, allowing us to sink a loop body in one pass without -/// iteration. +/// Walk the specified region of the CFG (defined by all blocks dominated by +/// the specified block, and that are in the current loop) in reverse depth +/// first order w.r.t the DominatorTree. This allows us to visit uses before +/// definitions, allowing us to sink a loop body in one pass without iteration. 
/// -void LICM::SinkRegion(DomTreeNode *N) { - assert(N != nullptr && "Null dominator tree node?"); +bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, + DominatorTree *DT, const DataLayout *DL, + TargetLibraryInfo *TLI, Loop *CurLoop, + AliasSetTracker *CurAST, LICMSafetyInfo * SafetyInfo) { + + // Verify inputs. + assert(N != nullptr && AA != nullptr && LI != nullptr && + DT != nullptr && CurLoop != nullptr && CurAST != nullptr && + SafetyInfo != nullptr && "Unexpected input to sinkRegion"); + + // Set changed as false. + bool Changed = false; + // Get basic block BasicBlock *BB = N->getBlock(); - // If this subregion is not in the top level loop at all, exit. - if (!CurLoop->contains(BB)) return; + if (!CurLoop->contains(BB)) return Changed; // We are processing blocks in reverse dfo, so process children first. const std::vector<DomTreeNode*> &Children = N->getChildren(); for (unsigned i = 0, e = Children.size(); i != e; ++i) - SinkRegion(Children[i]); - + Changed |= sinkRegion(Children[i], AA, LI, DT, DL, TLI, CurLoop, + CurAST, SafetyInfo); // Only need to process the contents of this block if it is not part of a // subloop (which would already have been processed). - if (inSubLoop(BB)) return; + if (inSubLoop(BB,CurLoop,LI)) return Changed; for (BasicBlock::iterator II = BB->end(); II != BB->begin(); ) { Instruction &I = *--II; @@ -380,31 +336,39 @@ void LICM::SinkRegion(DomTreeNode *N) { // outside of the loop. In this case, it doesn't even matter if the // operands of the instruction are loop invariant. // - if (isNotUsedInLoop(I) && canSinkOrHoistInst(I)) { + if (isNotUsedInLoop(I, CurLoop) && + canSinkOrHoistInst(I, AA, DT, DL, CurLoop, CurAST, SafetyInfo)) { ++II; - sink(I); + Changed |= sink(I, LI, DT, CurLoop, CurAST); } } + return Changed; } -/// HoistRegion - Walk the specified region of the CFG (defined by all blocks -/// dominated by the specified block, and that are in the current loop) in depth -/// first order w.r.t the DominatorTree. This allows us to visit definitions -/// before uses, allowing us to hoist a loop body in one pass without iteration. +/// Walk the specified region of the CFG (defined by all blocks dominated by +/// the specified block, and that are in the current loop) in depth first +/// order w.r.t the DominatorTree. This allows us to visit definitions before +/// uses, allowing us to hoist a loop body in one pass without iteration. /// -void LICM::HoistRegion(DomTreeNode *N) { - assert(N != nullptr && "Null dominator tree node?"); +bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, + DominatorTree *DT, const DataLayout *DL, + TargetLibraryInfo *TLI, Loop *CurLoop, + AliasSetTracker *CurAST, LICMSafetyInfo *SafetyInfo) { + // Verify inputs. + assert(N != nullptr && AA != nullptr && LI != nullptr && + DT != nullptr && CurLoop != nullptr && CurAST != nullptr && + SafetyInfo != nullptr && "Unexpected input to hoistRegion"); + // Set changed as false. + bool Changed = false; + // Get basic block BasicBlock *BB = N->getBlock(); - // If this subregion is not in the top level loop at all, exit. - if (!CurLoop->contains(BB)) return; - + if (!CurLoop->contains(BB)) return Changed; // Only need to process the contents of this block if it is not part of a // subloop (which would already have been processed). - if (!inSubLoop(BB)) + if (!inSubLoop(BB, CurLoop, LI)) for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ) { Instruction &I = *II++; - // Try constant folding this instruction. 
If all the operands are // constants, it is technically hoistable, but it would be better to just // fold it. @@ -421,20 +385,49 @@ void LICM::HoistRegion(DomTreeNode *N) { // if all of the operands of the instruction are loop invariant and if it // is safe to hoist the instruction. // - if (CurLoop->hasLoopInvariantOperands(&I) && canSinkOrHoistInst(I) && - isSafeToExecuteUnconditionally(I)) - hoist(I); + if (CurLoop->hasLoopInvariantOperands(&I) && + canSinkOrHoistInst(I, AA, DT, DL, CurLoop, CurAST, SafetyInfo) && + isSafeToExecuteUnconditionally(I, DT, DL, CurLoop, SafetyInfo)) + Changed |= hoist(I, CurLoop->getLoopPreheader()); } const std::vector<DomTreeNode*> &Children = N->getChildren(); for (unsigned i = 0, e = Children.size(); i != e; ++i) - HoistRegion(Children[i]); + Changed |= hoistRegion(Children[i], AA, LI, DT, DL, TLI, CurLoop, + CurAST, SafetyInfo); + return Changed; +} + +/// Computes loop safety information, checks loop body & header +/// for the possiblity of may throw exception. +/// +void llvm::computeLICMSafetyInfo(LICMSafetyInfo * SafetyInfo, Loop * CurLoop) { + assert(CurLoop != nullptr && "CurLoop cant be null"); + BasicBlock *Header = CurLoop->getHeader(); + // Setting default safety values. + SafetyInfo->MayThrow = false; + SafetyInfo->HeaderMayThrow = false; + // Iterate over header and compute dafety info. + for (BasicBlock::iterator I = Header->begin(), E = Header->end(); + (I != E) && !SafetyInfo->HeaderMayThrow; ++I) + SafetyInfo->HeaderMayThrow |= I->mayThrow(); + + SafetyInfo->MayThrow = SafetyInfo->HeaderMayThrow; + // Iterate over loop instructions and compute safety info. + for (Loop::block_iterator BB = CurLoop->block_begin(), + BBE = CurLoop->block_end(); (BB != BBE) && !SafetyInfo->MayThrow ; ++BB) + for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); + (I != E) && !SafetyInfo->MayThrow; ++I) + SafetyInfo->MayThrow |= I->mayThrow(); } /// canSinkOrHoistInst - Return true if the hoister and sinker can handle this /// instruction. /// -bool LICM::canSinkOrHoistInst(Instruction &I) { +bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, + DominatorTree *DT, const DataLayout *DL, + Loop *CurLoop, AliasSetTracker *CurAST, + LICMSafetyInfo * SafetyInfo) { // Loads have extra constraints we have to verify before we can hoist them. if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { if (!LI->isUnordered()) @@ -455,7 +448,7 @@ bool LICM::canSinkOrHoistInst(Instruction &I) { AAMDNodes AAInfo; LI->getAAMetadata(AAInfo); - return !pointerInvalidatedByLoop(LI->getOperand(0), Size, AAInfo); + return !pointerInvalidatedByLoop(LI->getOperand(0), Size, AAInfo, CurAST); } else if (CallInst *CI = dyn_cast<CallInst>(&I)) { // Don't sink or hoist dbg info; it's legal, but not useful. if (isa<DbgInfoIntrinsic>(I)) @@ -494,14 +487,14 @@ bool LICM::canSinkOrHoistInst(Instruction &I) { !isa<InsertValueInst>(I)) return false; - return isSafeToExecuteUnconditionally(I); + return isSafeToExecuteUnconditionally(I, DT, DL, CurLoop, SafetyInfo); } -/// \brief Returns true if a PHINode is a trivially replaceable with an +/// Returns true if a PHINode is a trivially replaceable with an /// Instruction. +/// This is true when all incoming values are that instruction. +/// This pattern occurs most often with LCSSA PHI nodes. /// -/// This is true when all incoming values are that instruction. This pattern -/// occurs most often with LCSSA PHI nodes. 
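// Illustrative sketch, not part of the patch: why computeLICMSafetyInfo keeps
// a separate HeaderMayThrow flag.  mayThrow() is a hypothetical external call;
// N is assumed to be at least 1 and A to point to at least N ints.  Num / Den
// has loop-invariant operands but may trap, so it is only hoistable when it is
// guaranteed to execute; with a possibly-throwing call in the same header
// block, the patch conservatively keeps it in the loop.
void mayThrow();
int safetySketch(const int *A, int N, int Num, int Den) {
  int Sum = 0, I = 0;
  do {                          // single-block loop: this block is the header
    mayThrow();                 // makes HeaderMayThrow (and MayThrow) true
    Sum += A[I] + Num / Den;    // not hoisted despite invariant operands
  } while (++I < N);
  return Sum;
}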
static bool isTriviallyReplacablePHI(PHINode &PN, Instruction &I) { for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) if (PN.getIncomingValue(i) != &I) @@ -510,11 +503,11 @@ static bool isTriviallyReplacablePHI(PHINode &PN, Instruction &I) { return true; } -/// isNotUsedInLoop - Return true if the only users of this instruction are -/// outside of the loop. If this is true, we can sink the instruction to the -/// exit blocks of the loop. +/// Return true if the only users of this instruction are outside of +/// the loop. If this is true, we can sink the instruction to the exit +/// blocks of the loop. /// -bool LICM::isNotUsedInLoop(Instruction &I) { +static bool isNotUsedInLoop(Instruction &I, Loop *CurLoop) { for (User *U : I.users()) { Instruction *UI = cast<Instruction>(U); if (PHINode *PN = dyn_cast<PHINode>(UI)) { @@ -545,9 +538,9 @@ bool LICM::isNotUsedInLoop(Instruction &I) { return true; } -Instruction *LICM::CloneInstructionInExitBlock(Instruction &I, - BasicBlock &ExitBlock, - PHINode &PN) { +static Instruction *CloneInstructionInExitBlock(Instruction &I, + BasicBlock &ExitBlock, + PHINode &PN, LoopInfo *LI) { Instruction *New = I.clone(); ExitBlock.getInstList().insert(ExitBlock.getFirstInsertionPt(), New); if (!I.getName().empty()) New->setName(I.getName() + ".le"); @@ -574,14 +567,15 @@ Instruction *LICM::CloneInstructionInExitBlock(Instruction &I, return New; } -/// sink - When an instruction is found to only be used outside of the loop, -/// this function moves it to the exit blocks and patches up SSA form as needed. +/// When an instruction is found to only be used outside of the loop, this +/// function moves it to the exit blocks and patches up SSA form as needed. /// This method is guaranteed to remove the original instruction from its /// position, and may either delete it or move it to outside of the loop. /// -void LICM::sink(Instruction &I) { +static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, + Loop *CurLoop, AliasSetTracker *CurAST ) { DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n"); - + bool Changed = false; if (isa<LoadInst>(I)) ++NumMovedLoads; else if (isa<CallInst>(I)) ++NumMovedCalls; ++NumSunk; @@ -590,7 +584,8 @@ void LICM::sink(Instruction &I) { #ifndef NDEBUG SmallVector<BasicBlock *, 32> ExitBlocks; CurLoop->getUniqueExitBlocks(ExitBlocks); - SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(), ExitBlocks.end()); + SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(), + ExitBlocks.end()); #endif // Clones of this instruction. Don't create more than one per exit block! @@ -618,7 +613,7 @@ void LICM::sink(Instruction &I) { New = It->second; else New = SunkCopies[ExitBlock] = - CloneInstructionInExitBlock(I, *ExitBlock, *PN); + CloneInstructionInExitBlock(I, *ExitBlock, *PN, LI); PN->replaceAllUsesWith(New); PN->eraseFromParent(); @@ -626,44 +621,41 @@ void LICM::sink(Instruction &I) { CurAST->deleteValue(&I); I.eraseFromParent(); + return Changed; } -/// hoist - When an instruction is found to only use loop invariant operands -/// that is safe to hoist, this instruction is called to do the dirty work. +/// When an instruction is found to only use loop invariant operands that +/// is safe to hoist, this instruction is called to do the dirty work. /// -void LICM::hoist(Instruction &I) { +static bool hoist(Instruction &I, BasicBlock *Preheader) { DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": " << I << "\n"); - // Move the new node to the Preheader, before its terminator. 
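// Illustrative sketch, not part of the patch: the source-level effect of
// hoist().  X * Y has loop-invariant operands and is safe to execute
// unconditionally, so it is computed once in the preheader instead of on
// every iteration.  A is assumed to point to at least N ints.
int hoistSketch(const int *A, int N, int X, int Y) {
  int Sum = 0;
  for (int I = 0; I < N; ++I)
    Sum += A[I] + X * Y;   // X * Y moves to the preheader; the add stays
  return Sum;
}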
I.moveBefore(Preheader->getTerminator()); if (isa<LoadInst>(I)) ++NumMovedLoads; else if (isa<CallInst>(I)) ++NumMovedCalls; ++NumHoisted; - Changed = true; + return true; } -/// isSafeToExecuteUnconditionally - Only sink or hoist an instruction if it is -/// not a trapping instruction or if it is a trapping instruction and is -/// guaranteed to execute. +/// Only sink or hoist an instruction if it is not a trapping instruction +/// or if it is a trapping instruction and is guaranteed to execute. /// -bool LICM::isSafeToExecuteUnconditionally(Instruction &Inst) { +static bool isSafeToExecuteUnconditionally(Instruction &Inst, DominatorTree *DT, + const DataLayout *DL, Loop *CurLoop, + LICMSafetyInfo * SafetyInfo) { // If it is not a trapping instruction, it is always safe to hoist. if (isSafeToSpeculativelyExecute(&Inst, DL)) return true; - return isGuaranteedToExecute(Inst); + return isGuaranteedToExecute(Inst, DT, CurLoop, SafetyInfo); } -bool LICM::isGuaranteedToExecute(Instruction &Inst) { - - // Somewhere in this loop there is an instruction which may throw and make us - // exit the loop. - if (MayThrow) - return false; +static bool isGuaranteedToExecute(Instruction &Inst, DominatorTree *DT, + Loop *CurLoop, LICMSafetyInfo * SafetyInfo) { - // Otherwise we have to check to make sure that the instruction dominates all + // We have to check to make sure that the instruction dominates all // of the exit blocks. If it doesn't, then there is a path out of the loop // which does not execute this instruction, so we can't hoist it. @@ -671,7 +663,14 @@ bool LICM::isGuaranteedToExecute(Instruction &Inst) { // common), it is always guaranteed to dominate the exit blocks. Since this // is a common case, and can save some work, check it now. if (Inst.getParent() == CurLoop->getHeader()) - return true; + // If there's a throw in the header block, we can't guarantee we'll reach + // Inst. + return !SafetyInfo->HeaderMayThrow; + + // Somewhere in this loop there is an instruction which may throw and make us + // exit the loop. + if (SafetyInfo->MayThrow) + return false; // Get the exit blocks for the current loop. SmallVector<BasicBlock*, 8> ExitBlocks; @@ -768,25 +767,37 @@ namespace { }; } // end anon namespace -/// PromoteAliasSet - Try to promote memory values to scalars by sinking -/// stores out of the loop and moving loads to before the loop. We do this by -/// looping over the stores in the loop, looking for stores to Must pointers -/// which are loop invariant. +/// Try to promote memory values to scalars by sinking stores out of the +/// loop and moving loads to before the loop. We do this by looping over +/// the stores in the loop, looking for stores to Must pointers which are +/// loop invariant. /// -void LICM::PromoteAliasSet(AliasSet &AS, - SmallVectorImpl<BasicBlock*> &ExitBlocks, - SmallVectorImpl<Instruction*> &InsertPts, - PredIteratorCache &PIC) { +bool llvm::promoteLoopAccessesToScalars(AliasSet &AS, + SmallVectorImpl<BasicBlock*>&ExitBlocks, + SmallVectorImpl<Instruction*>&InsertPts, + PredIteratorCache &PIC, LoopInfo *LI, + DominatorTree *DT, Loop *CurLoop, + AliasSetTracker *CurAST, + LICMSafetyInfo * SafetyInfo) { + // Verify inputs. + assert(LI != nullptr && DT != nullptr && + CurLoop != nullptr && CurAST != nullptr && + SafetyInfo != nullptr && + "Unexpected Input to promoteLoopAccessesToScalars"); + // Initially set Changed status to false. 
+ bool Changed = false; // We can promote this alias set if it has a store, if it is a "Must" alias // set, if the pointer is loop invariant, and if we are not eliminating any // volatile loads or stores. if (AS.isForwardingAliasSet() || !AS.isMod() || !AS.isMustAlias() || AS.isVolatile() || !CurLoop->isLoopInvariant(AS.begin()->getValue())) - return; + return Changed; assert(!AS.empty() && "Must alias set should have at least one pointer element in it!"); + Value *SomePtr = AS.begin()->getValue(); + BasicBlock * Preheader = CurLoop->getLoopPreheader(); // It isn't safe to promote a load/store from the loop if the load/store is // conditional. For example, turning: @@ -810,6 +821,7 @@ void LICM::PromoteAliasSet(AliasSet &AS, // us to prove better alignment. unsigned Alignment = 1; AAMDNodes AATags; + bool HasDedicatedExits = CurLoop->hasDedicatedExits(); // Check that all of the pointers in the alias set have the same type. We // cannot (yet) promote a memory location that is loaded and stored in @@ -822,7 +834,7 @@ void LICM::PromoteAliasSet(AliasSet &AS, // cannot (yet) promote a memory location that is loaded and stored in // different sizes. if (SomePtr->getType() != ASIV->getType()) - return; + return Changed; for (User *U : ASIV->users()) { // Ignore instructions that are outside the loop. @@ -835,7 +847,7 @@ void LICM::PromoteAliasSet(AliasSet &AS, if (LoadInst *load = dyn_cast<LoadInst>(UI)) { assert(!load->isVolatile() && "AST broken"); if (!load->isSimple()) - return; + return Changed; } else if (StoreInst *store = dyn_cast<StoreInst>(UI)) { // Stores *of* the pointer are not interesting, only stores *to* the // pointer. @@ -843,7 +855,14 @@ void LICM::PromoteAliasSet(AliasSet &AS, continue; assert(!store->isVolatile() && "AST broken"); if (!store->isSimple()) - return; + return Changed; + // Don't sink stores from loops without dedicated block exits. Exits + // containing indirect branches are not transformed by loop simplify, + // make sure we catch that. An additional load may be generated in the + // preheader for SSA updater, so also avoid sinking when no preheader + // is available. + if (!HasDedicatedExits || !Preheader) + return Changed; // Note that we only check GuaranteedToExecute inside the store case // so that we do not introduce stores where they did not exist before @@ -855,16 +874,17 @@ void LICM::PromoteAliasSet(AliasSet &AS, // Larger is better, with the exception of 0 being the best alignment. unsigned InstAlignment = store->getAlignment(); if ((InstAlignment > Alignment || InstAlignment == 0) && Alignment != 0) - if (isGuaranteedToExecute(*UI)) { + if (isGuaranteedToExecute(*UI, DT, CurLoop, SafetyInfo)) { GuaranteedToExecute = true; Alignment = InstAlignment; } if (!GuaranteedToExecute) - GuaranteedToExecute = isGuaranteedToExecute(*UI); + GuaranteedToExecute = isGuaranteedToExecute(*UI, DT, + CurLoop, SafetyInfo); } else - return; // Not a load or store. + return Changed; // Not a load or store. // Merge the AA tags. if (LoopUses.empty()) { @@ -880,7 +900,7 @@ void LICM::PromoteAliasSet(AliasSet &AS, // If there isn't a guaranteed-to-execute instruction, we can't promote. if (!GuaranteedToExecute) - return; + return Changed; // Otherwise, this is safe to promote, lets do it! DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " <<*SomePtr<<'\n'); @@ -925,10 +945,12 @@ void LICM::PromoteAliasSet(AliasSet &AS, // If the SSAUpdater didn't use the load in the preheader, just zap it now. 
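// Illustrative sketch, not part of the patch: the source-level effect of
// promoteLoopAccessesToScalars.  P is assumed not to alias A, and A to point
// to at least N ints; the store through P is to a loop-invariant, must-alias
// location and is guaranteed to execute once the (rotated, guarded) loop is
// entered, so the location is kept in a register for the whole loop.
void promoteSketch(int *P, const int *A, int N) {
  for (int I = 0; I < N; ++I)
    *P += A[I];   // before promotion: a load and a store of *P per iteration
}
// Roughly, for the guarded form LICM actually sees:
//   if (N > 0) {
//     int Tmp = *P;                 // single load in the preheader
//     for (int I = 0; I < N; ++I)
//       Tmp += A[I];
//     *P = Tmp;                     // single store in the exit block
//   }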
if (PreheaderLoad->use_empty()) PreheaderLoad->eraseFromParent(); -} + return Changed; +} -/// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info. +/// Simple Analysis hook. Clone alias set info. +/// void LICM::cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, Loop *L) { AliasSetTracker *AST = LoopToAliasSetMap.lookup(L); if (!AST) @@ -937,8 +959,8 @@ void LICM::cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, Loop *L) { AST->copyValue(From, To); } -/// deleteAnalysisValue - Simple Analysis hook. Delete value V from alias -/// set. +/// Simple Analysis hook. Delete value V from alias set +/// void LICM::deleteAnalysisValue(Value *V, Loop *L) { AliasSetTracker *AST = LoopToAliasSetMap.lookup(L); if (!AST) @@ -948,6 +970,7 @@ void LICM::deleteAnalysisValue(Value *V, Loop *L) { } /// Simple Analysis hook. Delete value L from alias set map. +/// void LICM::deleteAnalysisLoop(Loop *L) { AliasSetTracker *AST = LoopToAliasSetMap.lookup(L); if (!AST) @@ -956,3 +979,23 @@ void LICM::deleteAnalysisLoop(Loop *L) { delete AST; LoopToAliasSetMap.erase(L); } + + +/// Return true if the body of this loop may store into the memory +/// location pointed to by V. +/// +static bool pointerInvalidatedByLoop(Value *V, uint64_t Size, + const AAMDNodes &AAInfo, + AliasSetTracker *CurAST) { + // Check to see if any of the basic blocks in CurLoop invalidate *V. + return CurAST->getAliasSetForPointer(V, Size, AAInfo).isMod(); +} + +/// Little predicate that returns true if the specified basic block is in +/// a subloop of the current one, not the current one itself. +/// +static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI) { + assert(CurLoop->contains(BB) && "Only valid if BB is IN the loop"); + return LI->getLoopFor(BB) != CurLoop; +} + diff --git a/lib/Transforms/Scalar/LLVMBuild.txt b/lib/Transforms/Scalar/LLVMBuild.txt index 2bb49a3..deea9e2 100644 --- a/lib/Transforms/Scalar/LLVMBuild.txt +++ b/lib/Transforms/Scalar/LLVMBuild.txt @@ -20,4 +20,4 @@ type = Library name = Scalar parent = Transforms library_name = ScalarOpts -required_libraries = Analysis Core InstCombine ProfileData Support Target TransformUtils +required_libraries = Analysis Core InstCombine ProfileData Support TransformUtils diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp index 1d1f33a..98b068e 100644 --- a/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/lib/Transforms/Scalar/LoopDeletion.cpp @@ -39,14 +39,14 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfo>(); + AU.addRequired<LoopInfoWrapperPass>(); AU.addRequired<ScalarEvolution>(); AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); AU.addPreserved<ScalarEvolution>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<LoopInfo>(); + AU.addPreserved<LoopInfoWrapperPass>(); AU.addPreservedID(LoopSimplifyID); AU.addPreservedID(LCSSAID); } @@ -63,7 +63,7 @@ char LoopDeletion::ID = 0; INITIALIZE_PASS_BEGIN(LoopDeletion, "loop-deletion", "Delete dead loops", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) @@ -236,7 +236,7 @@ bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &LPM) { // Finally, the blocks from loopinfo. 
This has to happen late because // otherwise our loop iterators won't work. - LoopInfo &loopInfo = getAnalysis<LoopInfo>(); + LoopInfo &loopInfo = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); SmallPtrSet<BasicBlock*, 8> blocks; blocks.insert(L->block_begin(), L->block_end()); for (BasicBlock *BB : blocks) diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index a12f5a7..243c624 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -56,7 +56,7 @@ #include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -163,8 +163,8 @@ namespace { /// loop preheaders be inserted into the CFG. /// void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<LoopInfo>(); - AU.addPreserved<LoopInfo>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); AU.addRequiredID(LoopSimplifyID); AU.addPreservedID(LoopSimplifyID); AU.addRequiredID(LCSSAID); @@ -175,8 +175,8 @@ namespace { AU.addPreserved<ScalarEvolution>(); AU.addPreserved<DominatorTreeWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<TargetLibraryInfo>(); - AU.addRequired<TargetTransformInfo>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); } const DataLayout *getDataLayout() { @@ -197,11 +197,16 @@ namespace { } TargetLibraryInfo *getTargetLibraryInfo() { - return TLI ? TLI : (TLI = &getAnalysis<TargetLibraryInfo>()); + if (!TLI) + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + + return TLI; } const TargetTransformInfo *getTargetTransformInfo() { - return TTI ? TTI : (TTI = &getAnalysis<TargetTransformInfo>()); + return TTI ? TTI + : (TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( + *CurLoop->getHeader()->getParent())); } Loop *getLoop() const { return CurLoop; } @@ -215,14 +220,14 @@ namespace { char LoopIdiomRecognize::ID = 0; INITIALIZE_PASS_BEGIN(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms", false, false) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) -INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms", false, false) @@ -232,44 +237,13 @@ Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognize(); } /// and zero out all the operands of this instruction. If any of them become /// dead, delete them and the computation tree that feeds them. /// -static void deleteDeadInstruction(Instruction *I, ScalarEvolution &SE, +static void deleteDeadInstruction(Instruction *I, const TargetLibraryInfo *TLI) { - SmallVector<Instruction*, 32> NowDeadInsts; - - NowDeadInsts.push_back(I); - - // Before we touch this instruction, remove it from SE! - do { - Instruction *DeadInst = NowDeadInsts.pop_back_val(); - - // This instruction is dead, zap it, in stages. Start by removing it from - // SCEV. 
- SE.forgetValue(DeadInst); - - for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) { - Value *Op = DeadInst->getOperand(op); - DeadInst->setOperand(op, nullptr); - - // If this operand just became dead, add it to the NowDeadInsts list. - if (!Op->use_empty()) continue; - - if (Instruction *OpI = dyn_cast<Instruction>(Op)) - if (isInstructionTriviallyDead(OpI, TLI)) - NowDeadInsts.push_back(OpI); - } - - DeadInst->eraseFromParent(); - - } while (!NowDeadInsts.empty()); -} - -/// deleteIfDeadInstruction - If the specified value is a dead instruction, -/// delete it and any recursively used instructions. -static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE, - const TargetLibraryInfo *TLI) { - if (Instruction *I = dyn_cast<Instruction>(V)) - if (isInstructionTriviallyDead(I, TLI)) - deleteDeadInstruction(I, SE, TLI); + SmallVector<Value *, 16> Operands(I->value_op_begin(), I->value_op_end()); + I->replaceAllUsesWith(UndefValue::get(I->getType())); + I->eraseFromParent(); + for (Value *Op : Operands) + RecursivelyDeleteTriviallyDeadInstructions(Op, TLI); } //===----------------------------------------------------------------------===// @@ -285,7 +259,7 @@ static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE, // the concern of breaking data dependence. bool LIRUtil::isAlmostEmpty(BasicBlock *BB) { if (BranchInst *Br = getBranch(BB)) { - return Br->isUnconditional() && BB->size() == 1; + return Br->isUnconditional() && Br == BB->begin(); } return false; } @@ -542,7 +516,7 @@ void NclPopcountRecognize::transform(Instruction *CntInst, cast<ICmpInst>(Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1)); PreCond->replaceAllUsesWith(NewPreCond); - deleteDeadInstruction(PreCond, *SE, TLI); + RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI); } // Step 3: Note that the population count is exactly the trip count of the @@ -592,15 +566,7 @@ void NclPopcountRecognize::transform(Instruction *CntInst, // Step 4: All the references to the original population counter outside // the loop are replaced with the NewCount -- the value returned from // __builtin_ctpop(). - { - SmallVector<Value *, 4> CntUses; - for (User *U : CntInst->users()) - if (cast<Instruction>(U)->getParent() != Body) - CntUses.push_back(U); - for (unsigned Idx = 0; Idx < CntUses.size(); Idx++) { - (cast<Instruction>(CntUses[Idx]))->replaceUsesOfWith(CntInst, NewCount); - } - } + CntInst->replaceUsesOutsideBlock(NewCount, Body); // step 5: Forget the "non-computable" trip-count SCEV associated with the // loop. The loop would otherwise not be deleted even if it becomes empty. @@ -666,8 +632,8 @@ bool LoopIdiomRecognize::runOnCountableLoop() { // set DT (void)getDominatorTree(); - LoopInfo &LI = getAnalysis<LoopInfo>(); - TLI = &getAnalysis<TargetLibraryInfo>(); + LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); // set TLI (void)getTargetLibraryInfo(); @@ -997,7 +963,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, StoreSize, getAnalysis<AliasAnalysis>(), TheStore)) { Expander.clear(); // If we generated new code for the base pointer, clean up. - deleteIfDeadInstruction(BasePtr, *SE, TLI); + RecursivelyDeleteTriviallyDeadInstructions(BasePtr, TLI); return false; } @@ -1053,7 +1019,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, // Okay, the memset has been formed. Zap the original store and anything that // feeds into it. 
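For illustration only, not part of the patch: the strided-store idiom that processLoopStridedStore turns into a memset, written at source level with invented names. Once the library call is formed, the original store and anything feeding only it is removed via deleteDeadInstruction / RecursivelyDeleteTriviallyDeadInstructions.

#include <cstring>
#include <cstddef>

// Before: a unit-strided loop storing a splatable byte value.
void zero_loop(unsigned char *p, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i)
    p[i] = 0;
}

// After idiom recognition (conceptually):
void zero_memset(unsigned char *p, std::size_t n) {
  if (n != 0)
    std::memset(p, 0, n);
}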
- deleteDeadInstruction(TheStore, *SE, TLI); + deleteDeadInstruction(TheStore, TLI); ++NumMemSet; return true; } @@ -1094,7 +1060,7 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, getAnalysis<AliasAnalysis>(), SI)) { Expander.clear(); // If we generated new code for the base pointer, clean up. - deleteIfDeadInstruction(StoreBasePtr, *SE, TLI); + RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI); return false; } @@ -1109,8 +1075,8 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, StoreSize, getAnalysis<AliasAnalysis>(), SI)) { Expander.clear(); // If we generated new code for the base pointer, clean up. - deleteIfDeadInstruction(LoadBasePtr, *SE, TLI); - deleteIfDeadInstruction(StoreBasePtr, *SE, TLI); + RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI); + RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI); return false; } @@ -1143,7 +1109,7 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, // Okay, the memset has been formed. Zap the original store and anything that // feeds into it. - deleteDeadInstruction(SI, *SE, TLI); + deleteDeadInstruction(SI, TLI); ++NumMemCpy; return true; } diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp index 8fd7c8f..6dc600e 100644 --- a/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -14,15 +14,16 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AssumptionTracker.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/Support/Debug.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -42,13 +43,13 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<AssumptionTracker>(); - AU.addRequired<LoopInfo>(); + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<LoopInfoWrapperPass>(); AU.addRequiredID(LoopSimplifyID); AU.addPreservedID(LoopSimplifyID); AU.addPreservedID(LCSSAID); - AU.addPreserved("scalar-evolution"); - AU.addRequired<TargetLibraryInfo>(); + AU.addPreserved<ScalarEvolution>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } }; } @@ -56,10 +57,10 @@ namespace { char LoopInstSimplify::ID = 0; INITIALIZE_PASS_BEGIN(LoopInstSimplify, "loop-instsimplify", "Simplify instructions in loops", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionTracker) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_END(LoopInstSimplify, "loop-instsimplify", "Simplify instructions in loops", false, false) @@ -75,11 +76,13 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { DominatorTreeWrapperPass *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); DominatorTree *DT = DTWP ? 
&DTWP->getDomTree() : nullptr; - LoopInfo *LI = &getAnalysis<LoopInfo>(); + LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr; - const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); - AssumptionTracker *AT = &getAnalysis<AssumptionTracker>(); + const TargetLibraryInfo *TLI = + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache( + *L->getHeader()->getParent()); SmallVector<BasicBlock*, 8> ExitBlocks; L->getUniqueExitBlocks(ExitBlocks); @@ -120,7 +123,7 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // Don't bother simplifying unused instructions. if (!I->use_empty()) { - Value *V = SimplifyInstruction(I, DL, TLI, DT, AT); + Value *V = SimplifyInstruction(I, DL, TLI, DT, &AC); if (V && LI->replacementPreservesLCSSAForm(I, V)) { // Mark all uses for resimplification next time round the loop. for (User *U : I->users()) diff --git a/lib/Transforms/Scalar/LoopRerollPass.cpp b/lib/Transforms/Scalar/LoopRerollPass.cpp index 8f12204..fdf7e3b 100644 --- a/lib/Transforms/Scalar/LoopRerollPass.cpp +++ b/lib/Transforms/Scalar/LoopRerollPass.cpp @@ -12,7 +12,9 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -28,7 +30,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" @@ -43,6 +45,12 @@ static cl::opt<unsigned> MaxInc("max-reroll-increment", cl::init(2048), cl::Hidden, cl::desc("The maximum increment for loop rerolling")); +static cl::opt<unsigned> +NumToleratedFailedMatches("reroll-num-tolerated-failed-matches", cl::init(400), + cl::Hidden, + cl::desc("The maximum number of failures to tolerate" + " during fuzzy matching. (default: 400)")); + // This loop re-rolling transformation aims to transform loops like this: // // int foo(int a); @@ -119,6 +127,16 @@ MaxInc("max-reroll-increment", cl::init(2048), cl::Hidden, // br %cmp, header, exit namespace { + enum IterationLimits { + /// The maximum number of iterations that we'll try and reroll. This + /// has to be less than 25 in order to fit into a SmallBitVector. + IL_MaxRerollIterations = 16, + /// The bitvector index used by loop induction variables and other + /// instructions that belong to all iterations. 
+ IL_All, + IL_End + }; + class LoopReroll : public LoopPass { public: static char ID; // Pass ID, replacement for typeid @@ -130,15 +148,15 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AliasAnalysis>(); - AU.addRequired<LoopInfo>(); - AU.addPreserved<LoopInfo>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); AU.addRequired<ScalarEvolution>(); - AU.addRequired<TargetLibraryInfo>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } -protected: + protected: AliasAnalysis *AA; LoopInfo *LI; ScalarEvolution *SE; @@ -311,26 +329,116 @@ protected: DenseSet<int> Reds; }; + // A DAGRootSet models an induction variable being used in a rerollable + // loop. For example, + // + // x[i*3+0] = y1 + // x[i*3+1] = y2 + // x[i*3+2] = y3 + // + // Base instruction -> i*3 + // +---+----+ + // / | \ + // ST[y1] +1 +2 <-- Roots + // | | + // ST[y2] ST[y3] + // + // There may be multiple DAGRoots, for example: + // + // x[i*2+0] = ... (1) + // x[i*2+1] = ... (1) + // x[i*2+4] = ... (2) + // x[i*2+5] = ... (2) + // x[(i+1234)*2+5678] = ... (3) + // x[(i+1234)*2+5679] = ... (3) + // + // The loop will be rerolled by adding a new loop induction variable, + // one for the Base instruction in each DAGRootSet. + // + struct DAGRootSet { + Instruction *BaseInst; + SmallInstructionVector Roots; + // The instructions between IV and BaseInst (but not including BaseInst). + SmallInstructionSet SubsumedInsts; + }; + + // The set of all DAG roots, and state tracking of all roots + // for a particular induction variable. + struct DAGRootTracker { + DAGRootTracker(LoopReroll *Parent, Loop *L, Instruction *IV, + ScalarEvolution *SE, AliasAnalysis *AA, + TargetLibraryInfo *TLI, const DataLayout *DL) + : Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI), + DL(DL), IV(IV) { + } + + /// Stage 1: Find all the DAG roots for the induction variable. + bool findRoots(); + /// Stage 2: Validate if the found roots are valid. + bool validate(ReductionTracker &Reductions); + /// Stage 3: Assuming validate() returned true, perform the + /// replacement. + /// @param IterCount The maximum iteration count of L. + void replace(const SCEV *IterCount); + + protected: + typedef MapVector<Instruction*, SmallBitVector> UsesTy; + + bool findRootsRecursive(Instruction *IVU, + SmallInstructionSet SubsumedInsts); + bool findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts); + bool collectPossibleRoots(Instruction *Base, + std::map<int64_t,Instruction*> &Roots); + + bool collectUsedInstructions(SmallInstructionSet &PossibleRedSet); + void collectInLoopUserSet(const SmallInstructionVector &Roots, + const SmallInstructionSet &Exclude, + const SmallInstructionSet &Final, + DenseSet<Instruction *> &Users); + void collectInLoopUserSet(Instruction *Root, + const SmallInstructionSet &Exclude, + const SmallInstructionSet &Final, + DenseSet<Instruction *> &Users); + + UsesTy::iterator nextInstr(int Val, UsesTy &In, + const SmallInstructionSet &Exclude, + UsesTy::iterator *StartI=nullptr); + bool isBaseInst(Instruction *I); + bool isRootInst(Instruction *I); + bool instrDependsOn(Instruction *I, + UsesTy::iterator Start, + UsesTy::iterator End); + + LoopReroll *Parent; + + // Members of Parent, replicated here for brevity. + Loop *L; + ScalarEvolution *SE; + AliasAnalysis *AA; + TargetLibraryInfo *TLI; + const DataLayout *DL; + + // The loop induction variable. 
+ Instruction *IV; + // Loop step amount. + uint64_t Inc; + // Loop reroll count; if Inc == 1, this records the scaling applied + // to the indvar: a[i*2+0] = ...; a[i*2+1] = ... ; + // If Inc is not 1, Scale = Inc. + uint64_t Scale; + // The roots themselves. + SmallVector<DAGRootSet,16> RootSets; + // All increment instructions for IV. + SmallInstructionVector LoopIncs; + // Map of all instructions in the loop (in order) to the iterations + // they are used in (or specially, IL_All for instructions + // used in the loop increment mechanism). + UsesTy Uses; + }; + void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs); void collectPossibleReductions(Loop *L, ReductionTracker &Reductions); - void collectInLoopUserSet(Loop *L, - const SmallInstructionVector &Roots, - const SmallInstructionSet &Exclude, - const SmallInstructionSet &Final, - DenseSet<Instruction *> &Users); - void collectInLoopUserSet(Loop *L, - Instruction * Root, - const SmallInstructionSet &Exclude, - const SmallInstructionSet &Final, - DenseSet<Instruction *> &Users); - bool findScaleFromMul(Instruction *RealIV, uint64_t &Scale, - Instruction *&IV, - SmallInstructionVector &LoopIncs); - bool collectAllRoots(Loop *L, uint64_t Inc, uint64_t Scale, Instruction *IV, - SmallVector<SmallInstructionVector, 32> &Roots, - SmallInstructionSet &AllRoots, - SmallInstructionVector &LoopIncs); bool reroll(Instruction *IV, Loop *L, BasicBlock *Header, const SCEV *IterCount, ReductionTracker &Reductions); }; @@ -339,10 +447,10 @@ protected: char LoopReroll::ID = 0; INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(LoopReroll, "loop-reroll", "Reroll loops", false, false) Pass *llvm::createLoopRerollPass() { @@ -353,10 +461,10 @@ Pass *llvm::createLoopRerollPass() { // This operates like Instruction::isUsedOutsideOfBlock, but considers PHIs in // non-loop blocks to be outside the loop. static bool hasUsesOutsideLoop(Instruction *I, Loop *L) { - for (User *U : I->users()) + for (User *U : I->users()) { if (!L->contains(cast<Instruction>(U))) return true; - + } return false; } @@ -403,6 +511,8 @@ void LoopReroll::SimpleLoopReduction::add(Loop *L) { // (including the PHI), except for the last value (which is used by the PHI // and also outside the loop). Instruction *C = Instructions.front(); + if (C->user_empty()) + return; do { C = cast<Instruction>(*C->user_begin()); @@ -424,11 +534,12 @@ void LoopReroll::SimpleLoopReduction::add(Loop *L) { return; // C is now the (potential) last instruction in the reduction chain. - for (User *U : C->users()) + for (User *U : C->users()) { // The only in-loop user can be the initial PHI. if (L->contains(cast<Instruction>(U))) if (cast<Instruction>(U) != Instructions.front()) return; + } Instructions.push_back(C); Valid = true; @@ -467,7 +578,7 @@ void LoopReroll::collectPossibleReductions(Loop *L, // if they are users, but their users are not added. This is used, for // example, to prevent a reduction update from forcing all later reduction // updates into the use set. 
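For illustration only, not part of the patch: the loop shape a DAGRootSet models, using the pass's own foo example at source level. The base instruction is the multiply i*3, the +1 and +2 additions are the roots, and rerolling collapses the three isomorphic bodies while scaling the trip count.

extern int foo(int a);

// Before rerolling: three unrolled "iterations" per trip through the loop.
void unrolled() {
  for (int i = 0; i < 500; ++i) {
    foo(3 * i + 0);   // user of the base i*3 (offset 0)
    foo(3 * i + 1);   // root at offset +1
    foo(3 * i + 2);   // root at offset +2
  }
}

// After rerolling: one body, trip count scaled by 3.
void rerolled() {
  for (int i = 0; i < 1500; ++i)
    foo(i);
}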
-void LoopReroll::collectInLoopUserSet(Loop *L, +void LoopReroll::DAGRootTracker::collectInLoopUserSet( Instruction *Root, const SmallInstructionSet &Exclude, const SmallInstructionSet &Final, DenseSet<Instruction *> &Users) { @@ -504,14 +615,14 @@ void LoopReroll::collectInLoopUserSet(Loop *L, // Collect all of the users of all of the provided root instructions (combined // into a single set). -void LoopReroll::collectInLoopUserSet(Loop *L, +void LoopReroll::DAGRootTracker::collectInLoopUserSet( const SmallInstructionVector &Roots, const SmallInstructionSet &Exclude, const SmallInstructionSet &Final, DenseSet<Instruction *> &Users) { for (SmallInstructionVector::const_iterator I = Roots.begin(), IE = Roots.end(); I != IE; ++I) - collectInLoopUserSet(L, *I, Exclude, Final, Users); + collectInLoopUserSet(*I, Exclude, Final, Users); } static bool isSimpleLoadStore(Instruction *I) { @@ -524,289 +635,372 @@ static bool isSimpleLoadStore(Instruction *I) { return false; } -// Recognize loops that are setup like this: -// -// %iv = phi [ (preheader, ...), (body, %iv.next) ] -// %scaled.iv = mul %iv, scale -// f(%scaled.iv) -// %scaled.iv.1 = add %scaled.iv, 1 -// f(%scaled.iv.1) -// %scaled.iv.2 = add %scaled.iv, 2 -// f(%scaled.iv.2) -// %scaled.iv.scale_m_1 = add %scaled.iv, scale-1 -// f(%scaled.iv.scale_m_1) -// ... -// %iv.next = add %iv, 1 -// %cmp = icmp(%iv, ...) -// br %cmp, header, exit -// -// and, if found, set IV = %scaled.iv, and add %iv.next to LoopIncs. -bool LoopReroll::findScaleFromMul(Instruction *RealIV, uint64_t &Scale, - Instruction *&IV, - SmallInstructionVector &LoopIncs) { - // This is a special case: here we're looking for all uses (except for - // the increment) to be multiplied by a common factor. The increment must - // be by one. This is to capture loops like: - // for (int i = 0; i < 500; ++i) { - // foo(3*i); foo(3*i+1); foo(3*i+2); - // } - if (RealIV->getNumUses() != 2) - return false; - const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(RealIV)); - Instruction *User1 = cast<Instruction>(*RealIV->user_begin()), - *User2 = cast<Instruction>(*std::next(RealIV->user_begin())); - if (!SE->isSCEVable(User1->getType()) || !SE->isSCEVable(User2->getType())) - return false; - const SCEVAddRecExpr *User1SCEV = - dyn_cast<SCEVAddRecExpr>(SE->getSCEV(User1)), - *User2SCEV = - dyn_cast<SCEVAddRecExpr>(SE->getSCEV(User2)); - if (!User1SCEV || !User1SCEV->isAffine() || - !User2SCEV || !User2SCEV->isAffine()) +/// Return true if IVU is a "simple" arithmetic operation. +/// This is used for narrowing the search space for DAGRoots; only arithmetic +/// and GEPs can be part of a DAGRoot. +static bool isSimpleArithmeticOp(User *IVU) { + if (Instruction *I = dyn_cast<Instruction>(IVU)) { + switch (I->getOpcode()) { + default: return false; + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + case Instruction::Shl: + case Instruction::AShr: + case Instruction::LShr: + case Instruction::GetElementPtr: + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + return true; + } + } + return false; +} + +static bool isLoopIncrement(User *U, Instruction *IV) { + BinaryOperator *BO = dyn_cast<BinaryOperator>(U); + if (!BO || BO->getOpcode() != Instruction::Add) return false; - // We assume below that User1 is the scale multiply and User2 is the - // increment. If this can't be true, then swap them. 
- if (User1SCEV == RealIVSCEV->getPostIncExpr(*SE)) { - std::swap(User1, User2); - std::swap(User1SCEV, User2SCEV); + for (auto *UU : BO->users()) { + PHINode *PN = dyn_cast<PHINode>(UU); + if (PN && PN == IV) + return true; } + return false; +} + +bool LoopReroll::DAGRootTracker:: +collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) { + SmallInstructionVector BaseUsers; + + for (auto *I : Base->users()) { + ConstantInt *CI = nullptr; + + if (isLoopIncrement(I, IV)) { + LoopIncs.push_back(cast<Instruction>(I)); + continue; + } + + // The root nodes must be either GEPs, ORs or ADDs. + if (auto *BO = dyn_cast<BinaryOperator>(I)) { + if (BO->getOpcode() == Instruction::Add || + BO->getOpcode() == Instruction::Or) + CI = dyn_cast<ConstantInt>(BO->getOperand(1)); + } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) { + Value *LastOperand = GEP->getOperand(GEP->getNumOperands()-1); + CI = dyn_cast<ConstantInt>(LastOperand); + } + + if (!CI) { + if (Instruction *II = dyn_cast<Instruction>(I)) { + BaseUsers.push_back(II); + continue; + } else { + DEBUG(dbgs() << "LRR: Aborting due to non-instruction: " << *I << "\n"); + return false; + } + } + + int64_t V = CI->getValue().getSExtValue(); + if (Roots.find(V) != Roots.end()) + // No duplicates, please. + return false; - if (User2SCEV != RealIVSCEV->getPostIncExpr(*SE)) + // FIXME: Add support for negative values. + if (V < 0) { + DEBUG(dbgs() << "LRR: Aborting due to negative value: " << V << "\n"); + return false; + } + + Roots[V] = cast<Instruction>(I); + } + + if (Roots.empty()) return false; - assert(User2SCEV->getStepRecurrence(*SE)->isOne() && - "Invalid non-unit step for multiplicative scaling"); - LoopIncs.push_back(User2); - - if (const SCEVConstant *MulScale = - dyn_cast<SCEVConstant>(User1SCEV->getStepRecurrence(*SE))) { - // Make sure that both the start and step have the same multiplier. - if (RealIVSCEV->getStart()->getType() != MulScale->getType()) + + // If we found non-loop-inc, non-root users of Base, assume they are + // for the zeroth root index. This is because "add %a, 0" gets optimized + // away. + if (BaseUsers.size()) { + if (Roots.find(0) != Roots.end()) { + DEBUG(dbgs() << "LRR: Multiple roots found for base - aborting!\n"); return false; - if (SE->getMulExpr(RealIVSCEV->getStart(), MulScale) != - User1SCEV->getStart()) + } + Roots[0] = Base; + } + + // Calculate the number of users of the base, or lowest indexed, iteration. + unsigned NumBaseUses = BaseUsers.size(); + if (NumBaseUses == 0) + NumBaseUses = Roots.begin()->second->getNumUses(); + + // Check that every node has the same number of users. + for (auto &KV : Roots) { + if (KV.first == 0) + continue; + if (KV.second->getNumUses() != NumBaseUses) { + DEBUG(dbgs() << "LRR: Aborting - Root and Base #users not the same: " + << "#Base=" << NumBaseUses << ", #Root=" << + KV.second->getNumUses() << "\n"); return false; + } + } + + return true; +} - ConstantInt *MulScaleCI = MulScale->getValue(); - if (!MulScaleCI->uge(2) || MulScaleCI->uge(MaxInc)) +bool LoopReroll::DAGRootTracker:: +findRootsRecursive(Instruction *I, SmallInstructionSet SubsumedInsts) { + // Does the user look like it could be part of a root set? + // All its users must be simple arithmetic ops. 
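For illustration only, not part of the patch: a small standalone sketch of the bookkeeping collectPossibleRoots performs, with invented values. Offsets are keyed in an ordered std::map, and plain users of the base stand in for offset 0 because an "add %x, 0" has already been folded away; findRootsBase later relies on the ordered traversal to split the offsets into consecutive runs.

#include <cassert>
#include <map>
#include <string>

bool sketchCollectRoots() {
  // Users of the base value i*3: two sit behind "+1"/"+2" additions, one
  // uses the base directly (its "+0" was folded).
  std::map<long, std::string> roots;   // constant offset -> root user
  roots[1] = "store to x[i*3+1]";
  roots[2] = "store to x[i*3+2]";
  bool haveDirectBaseUsers = true;     // e.g. the store to x[i*3+0]

  if (haveDirectBaseUsers) {
    if (roots.count(0))
      return false;                    // multiple roots for the base: abort
    roots[0] = "store to x[i*3+0]";    // the base itself is the offset-0 root
  }
  assert(roots.begin()->first == 0 && roots.size() == 3);
  return true;                         // each offset has the same #users
}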
+ if (I->getNumUses() > IL_MaxRerollIterations) + return false; + + if ((I->getOpcode() == Instruction::Mul || + I->getOpcode() == Instruction::PHI) && + I != IV && + findRootsBase(I, SubsumedInsts)) + return true; + + SubsumedInsts.insert(I); + + for (User *V : I->users()) { + Instruction *I = dyn_cast<Instruction>(V); + if (std::find(LoopIncs.begin(), LoopIncs.end(), I) != LoopIncs.end()) + continue; + + if (!I || !isSimpleArithmeticOp(I) || + !findRootsRecursive(I, SubsumedInsts)) return false; - Scale = MulScaleCI->getZExtValue(); - IV = User1; - } else + } + return true; +} + +bool LoopReroll::DAGRootTracker:: +findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts) { + + // The base instruction needs to be a multiply so + // that we can erase it. + if (IVU->getOpcode() != Instruction::Mul && + IVU->getOpcode() != Instruction::PHI) return false; - DEBUG(dbgs() << "LRR: Found possible scaling " << *User1 << "\n"); + std::map<int64_t, Instruction*> V; + if (!collectPossibleRoots(IVU, V)) + return false; + + // If we didn't get a root for index zero, then IVU must be + // subsumed. + if (V.find(0) == V.end()) + SubsumedInsts.insert(IVU); + + // Partition the vector into monotonically increasing indexes. + DAGRootSet DRS; + DRS.BaseInst = nullptr; + + for (auto &KV : V) { + if (!DRS.BaseInst) { + DRS.BaseInst = KV.second; + DRS.SubsumedInsts = SubsumedInsts; + } else if (DRS.Roots.empty()) { + DRS.Roots.push_back(KV.second); + } else if (V.find(KV.first - 1) != V.end()) { + DRS.Roots.push_back(KV.second); + } else { + // Linear sequence terminated. + RootSets.push_back(DRS); + DRS.BaseInst = KV.second; + DRS.SubsumedInsts = SubsumedInsts; + DRS.Roots.clear(); + } + } + RootSets.push_back(DRS); + return true; } -// Collect all root increments with respect to the provided induction variable -// (normally the PHI, but sometimes a multiply). A root increment is an -// instruction, normally an add, with a positive constant less than Scale. In a -// rerollable loop, each of these increments is the root of an instruction -// graph isomorphic to the others. Also, we collect the final induction -// increment (the increment equal to the Scale), and its users in LoopIncs. 
-bool LoopReroll::collectAllRoots(Loop *L, uint64_t Inc, uint64_t Scale, - Instruction *IV, - SmallVector<SmallInstructionVector, 32> &Roots, - SmallInstructionSet &AllRoots, - SmallInstructionVector &LoopIncs) { - for (User *U : IV->users()) { - Instruction *UI = cast<Instruction>(U); - if (!SE->isSCEVable(UI->getType())) - continue; - if (UI->getType() != IV->getType()) - continue; - if (!L->contains(UI)) - continue; - if (hasUsesOutsideLoop(UI, L)) - continue; +bool LoopReroll::DAGRootTracker::findRoots() { - if (const SCEVConstant *Diff = dyn_cast<SCEVConstant>(SE->getMinusSCEV( - SE->getSCEV(UI), SE->getSCEV(IV)))) { - uint64_t Idx = Diff->getValue()->getValue().getZExtValue(); - if (Idx > 0 && Idx < Scale) { - Roots[Idx-1].push_back(UI); - AllRoots.insert(UI); - } else if (Idx == Scale && Inc > 1) { - LoopIncs.push_back(UI); - } + const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(IV)); + Inc = cast<SCEVConstant>(RealIVSCEV->getOperand(1))-> + getValue()->getZExtValue(); + + assert(RootSets.empty() && "Unclean state!"); + if (Inc == 1) { + for (auto *IVU : IV->users()) { + if (isLoopIncrement(IVU, IV)) + LoopIncs.push_back(cast<Instruction>(IVU)); } + if (!findRootsRecursive(IV, SmallInstructionSet())) + return false; + LoopIncs.push_back(IV); + } else { + if (!findRootsBase(IV, SmallInstructionSet())) + return false; } - if (Roots[0].empty()) + // Ensure all sets have the same size. + if (RootSets.empty()) { + DEBUG(dbgs() << "LRR: Aborting because no root sets found!\n"); return false; - bool AllSame = true; - for (unsigned i = 1; i < Scale-1; ++i) - if (Roots[i].size() != Roots[0].size()) { - AllSame = false; - break; + } + for (auto &V : RootSets) { + if (V.Roots.empty() || V.Roots.size() != RootSets[0].Roots.size()) { + DEBUG(dbgs() + << "LRR: Aborting because not all root sets have the same size\n"); + return false; } + } - if (!AllSame) + // And ensure all loop iterations are consecutive. We rely on std::map + // providing ordered traversal. + for (auto &V : RootSets) { + const auto *ADR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(V.BaseInst)); + if (!ADR) + return false; + + // Consider a DAGRootSet with N-1 roots (so N different values including + // BaseInst). + // Define d = Roots[0] - BaseInst, which should be the same as + // Roots[I] - Roots[I-1] for all I in [1..N). + // Define D = BaseInst@J - BaseInst@J-1, where "@J" means the value at the + // loop iteration J. + // + // Now, For the loop iterations to be consecutive: + // D = d * N + + unsigned N = V.Roots.size() + 1; + const SCEV *StepSCEV = SE->getMinusSCEV(SE->getSCEV(V.Roots[0]), ADR); + const SCEV *ScaleSCEV = SE->getConstant(StepSCEV->getType(), N); + if (ADR->getStepRecurrence(*SE) != SE->getMulExpr(StepSCEV, ScaleSCEV)) { + DEBUG(dbgs() << "LRR: Aborting because iterations are not consecutive\n"); + return false; + } + } + Scale = RootSets[0].Roots.size() + 1; + + if (Scale > IL_MaxRerollIterations) { + DEBUG(dbgs() << "LRR: Aborting - too many iterations found. " + << "#Found=" << Scale << ", #Max=" << IL_MaxRerollIterations + << "\n"); return false; + } + + DEBUG(dbgs() << "LRR: Successfully found roots: Scale=" << Scale << "\n"); return true; } -// Validate the selected reductions. All iterations must have an isomorphic -// part of the reduction chain and, for non-associative reductions, the chain -// entries must appear in order. -bool LoopReroll::ReductionTracker::validateSelected() { - // For a non-associative reduction, the chain entries must appear in order. 
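For illustration only, not part of the patch: a worked instance of the D = d * N consecutiveness check above, with the numbers spelled out for two small patterns.

#include <cassert>

void sketchConsecutiveCheck() {
  // Pattern x[i*2+0], x[i*2+1]: Base = i*2 with one root at +1.
  long d = 1;    // Roots[0] - BaseInst within one iteration
  long N = 2;    // number of values per iteration (base + roots)
  long D = 2;    // BaseInst step across iterations: (i+1)*2 - i*2
  assert(D == d * N);   // consecutive, so rerolling may proceed

  // Pattern x[i*3+0], x[i*3+1] (only two values, but the base steps by 3):
  // d = 1, N = 2, D = 3, so D != d * N and findRoots() rejects the loop as
  // non-consecutive.
}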
- for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end(); - RI != RIE; ++RI) { - int i = *RI; - int PrevIter = 0, BaseCount = 0, Count = 0; - for (Instruction *J : PossibleReds[i]) { - // Note that all instructions in the chain must have been found because - // all instructions in the function must have been assigned to some - // iteration. - int Iter = PossibleRedIter[J]; - if (Iter != PrevIter && Iter != PrevIter + 1 && - !PossibleReds[i].getReducedValue()->isAssociative()) { - DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " << - J << "\n"); - return false; - } +bool LoopReroll::DAGRootTracker::collectUsedInstructions(SmallInstructionSet &PossibleRedSet) { + // Populate the MapVector with all instructions in the block, in order first, + // so we can iterate over the contents later in perfect order. + for (auto &I : *L->getHeader()) { + Uses[&I].resize(IL_End); + } - if (Iter != PrevIter) { - if (Count != BaseCount) { - DEBUG(dbgs() << "LRR: Iteration " << PrevIter << - " reduction use count " << Count << - " is not equal to the base use count " << - BaseCount << "\n"); - return false; - } + SmallInstructionSet Exclude; + for (auto &DRS : RootSets) { + Exclude.insert(DRS.Roots.begin(), DRS.Roots.end()); + Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end()); + Exclude.insert(DRS.BaseInst); + } + Exclude.insert(LoopIncs.begin(), LoopIncs.end()); - Count = 0; + for (auto &DRS : RootSets) { + DenseSet<Instruction*> VBase; + collectInLoopUserSet(DRS.BaseInst, Exclude, PossibleRedSet, VBase); + for (auto *I : VBase) { + Uses[I].set(0); + } + + unsigned Idx = 1; + for (auto *Root : DRS.Roots) { + DenseSet<Instruction*> V; + collectInLoopUserSet(Root, Exclude, PossibleRedSet, V); + + // While we're here, check the use sets are the same size. + if (V.size() != VBase.size()) { + DEBUG(dbgs() << "LRR: Aborting - use sets are different sizes\n"); + return false; } - ++Count; - if (Iter == 0) - ++BaseCount; + for (auto *I : V) { + Uses[I].set(Idx); + } + ++Idx; + } - PrevIter = Iter; + // Make sure our subsumed instructions are remembered too. + for (auto *I : DRS.SubsumedInsts) { + Uses[I].set(IL_All); } } - return true; -} - -// For all selected reductions, remove all parts except those in the first -// iteration (and the PHI). Replace outside uses of the reduced value with uses -// of the first-iteration reduced value (in other words, reroll the selected -// reductions). -void LoopReroll::ReductionTracker::replaceSelected() { - // Fixup reductions to refer to the last instruction associated with the - // first iteration (not the last). - for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end(); - RI != RIE; ++RI) { - int i = *RI; - int j = 0; - for (int e = PossibleReds[i].size(); j != e; ++j) - if (PossibleRedIter[PossibleReds[i][j]] != 0) { - --j; - break; - } + // Make sure the loop increments are also accounted for. - // Replace users with the new end-of-chain value. 
- SmallInstructionVector Users; - for (User *U : PossibleReds[i].getReducedValue()->users()) - Users.push_back(cast<Instruction>(U)); + Exclude.clear(); + for (auto &DRS : RootSets) { + Exclude.insert(DRS.Roots.begin(), DRS.Roots.end()); + Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end()); + Exclude.insert(DRS.BaseInst); + } - for (SmallInstructionVector::iterator J = Users.begin(), - JE = Users.end(); J != JE; ++J) - (*J)->replaceUsesOfWith(PossibleReds[i].getReducedValue(), - PossibleReds[i][j]); + DenseSet<Instruction*> V; + collectInLoopUserSet(LoopIncs, Exclude, PossibleRedSet, V); + for (auto *I : V) { + Uses[I].set(IL_All); } -} -// Reroll the provided loop with respect to the provided induction variable. -// Generally, we're looking for a loop like this: -// -// %iv = phi [ (preheader, ...), (body, %iv.next) ] -// f(%iv) -// %iv.1 = add %iv, 1 <-- a root increment -// f(%iv.1) -// %iv.2 = add %iv, 2 <-- a root increment -// f(%iv.2) -// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment -// f(%iv.scale_m_1) -// ... -// %iv.next = add %iv, scale -// %cmp = icmp(%iv, ...) -// br %cmp, header, exit -// -// Notably, we do not require that f(%iv), f(%iv.1), etc. be isolated groups of -// instructions. In other words, the instructions in f(%iv), f(%iv.1), etc. can -// be intermixed with eachother. The restriction imposed by this algorithm is -// that the relative order of the isomorphic instructions in f(%iv), f(%iv.1), -// etc. be the same. -// -// First, we collect the use set of %iv, excluding the other increment roots. -// This gives us f(%iv). Then we iterate over the loop instructions (scale-1) -// times, having collected the use set of f(%iv.(i+1)), during which we: -// - Ensure that the next unmatched instruction in f(%iv) is isomorphic to -// the next unmatched instruction in f(%iv.(i+1)). -// - Ensure that both matched instructions don't have any external users -// (with the exception of last-in-chain reduction instructions). -// - Track the (aliasing) write set, and other side effects, of all -// instructions that belong to future iterations that come before the matched -// instructions. If the matched instructions read from that write set, then -// f(%iv) or f(%iv.(i+1)) has some dependency on instructions in -// f(%iv.(j+1)) for some j > i, and we cannot reroll the loop. Similarly, -// if any of these future instructions had side effects (could not be -// speculatively executed), and so do the matched instructions, when we -// cannot reorder those side-effect-producing instructions, and rerolling -// fails. -// -// Finally, we make sure that all loop instructions are either loop increment -// roots, belong to simple latch code, parts of validated reductions, part of -// f(%iv) or part of some f(%iv.i). If all of that is true (and all reductions -// have been validated), then we reroll the loop. -bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, - const SCEV *IterCount, - ReductionTracker &Reductions) { - const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(IV)); - uint64_t Inc = cast<SCEVConstant>(RealIVSCEV->getOperand(1))-> - getValue()->getZExtValue(); - // The collection of loop increment instructions. - SmallInstructionVector LoopIncs; - uint64_t Scale = Inc; - - // The effective induction variable, IV, is normally also the real induction - // variable. 
When we're dealing with a loop like: - // for (int i = 0; i < 500; ++i) - // x[3*i] = ...; - // x[3*i+1] = ...; - // x[3*i+2] = ...; - // then the real IV is still i, but the effective IV is (3*i). - Instruction *RealIV = IV; - if (Inc == 1 && !findScaleFromMul(RealIV, Scale, IV, LoopIncs)) - return false; + return true; - assert(Scale <= MaxInc && "Scale is too large"); - assert(Scale > 1 && "Scale must be at least 2"); +} - // The set of increment instructions for each increment value. - SmallVector<SmallInstructionVector, 32> Roots(Scale-1); - SmallInstructionSet AllRoots; - if (!collectAllRoots(L, Inc, Scale, IV, Roots, AllRoots, LoopIncs)) - return false; +/// Get the next instruction in "In" that is a member of set Val. +/// Start searching from StartI, and do not return anything in Exclude. +/// If StartI is not given, start from In.begin(). +LoopReroll::DAGRootTracker::UsesTy::iterator +LoopReroll::DAGRootTracker::nextInstr(int Val, UsesTy &In, + const SmallInstructionSet &Exclude, + UsesTy::iterator *StartI) { + UsesTy::iterator I = StartI ? *StartI : In.begin(); + while (I != In.end() && (I->second.test(Val) == 0 || + Exclude.count(I->first) != 0)) + ++I; + return I; +} - DEBUG(dbgs() << "LRR: Found all root induction increments for: " << - *RealIV << "\n"); +bool LoopReroll::DAGRootTracker::isBaseInst(Instruction *I) { + for (auto &DRS : RootSets) { + if (DRS.BaseInst == I) + return true; + } + return false; +} - // An array of just the possible reductions for this scale factor. When we - // collect the set of all users of some root instructions, these reduction - // instructions are treated as 'final' (their uses are not considered). - // This is important because we don't want the root use set to search down - // the reduction chain. - SmallInstructionSet PossibleRedSet; - SmallInstructionSet PossibleRedLastSet, PossibleRedPHISet; - Reductions.restrictToScale(Scale, PossibleRedSet, PossibleRedPHISet, - PossibleRedLastSet); +bool LoopReroll::DAGRootTracker::isRootInst(Instruction *I) { + for (auto &DRS : RootSets) { + if (std::find(DRS.Roots.begin(), DRS.Roots.end(), I) != DRS.Roots.end()) + return true; + } + return false; +} +/// Return true if instruction I depends on any instruction between +/// Start and End. +bool LoopReroll::DAGRootTracker::instrDependsOn(Instruction *I, + UsesTy::iterator Start, + UsesTy::iterator End) { + for (auto *U : I->users()) { + for (auto It = Start; It != End; ++It) + if (U == It->first) + return true; + } + return false; +} + +bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) { // We now need to check for equivalence of the use graph of each root with // that of the primary induction variable (excluding the roots). Our goal // here is not to solve the full graph isomorphism problem, but rather to @@ -815,121 +1009,167 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, // is the same (although we will not make an assumption about how the // different iterations are intermixed). Note that while the order must be // the same, the instructions may not be in the same basic block. - SmallInstructionSet Exclude(AllRoots); - Exclude.insert(LoopIncs.begin(), LoopIncs.end()); - DenseSet<Instruction *> BaseUseSet; - collectInLoopUserSet(L, IV, Exclude, PossibleRedSet, BaseUseSet); + // An array of just the possible reductions for this scale factor. When we + // collect the set of all users of some root instructions, these reduction + // instructions are treated as 'final' (their uses are not considered). 
+ // This is important because we don't want the root use set to search down + // the reduction chain. + SmallInstructionSet PossibleRedSet; + SmallInstructionSet PossibleRedLastSet; + SmallInstructionSet PossibleRedPHISet; + Reductions.restrictToScale(Scale, PossibleRedSet, + PossibleRedPHISet, PossibleRedLastSet); - DenseSet<Instruction *> AllRootUses; - std::vector<DenseSet<Instruction *> > RootUseSets(Scale-1); + // Populate "Uses" with where each instruction is used. + if (!collectUsedInstructions(PossibleRedSet)) + return false; - bool MatchFailed = false; - for (unsigned i = 0; i < Scale-1 && !MatchFailed; ++i) { - DenseSet<Instruction *> &RootUseSet = RootUseSets[i]; - collectInLoopUserSet(L, Roots[i], SmallInstructionSet(), - PossibleRedSet, RootUseSet); + // Make sure we mark the reduction PHIs as used in all iterations. + for (auto *I : PossibleRedPHISet) { + Uses[I].set(IL_All); + } - DEBUG(dbgs() << "LRR: base use set size: " << BaseUseSet.size() << - " vs. iteration increment " << (i+1) << - " use set size: " << RootUseSet.size() << "\n"); + // Make sure all instructions in the loop are in one and only one + // set. + for (auto &KV : Uses) { + if (KV.second.count() != 1) { + DEBUG(dbgs() << "LRR: Aborting - instruction is not used in 1 iteration: " + << *KV.first << " (#uses=" << KV.second.count() << ")\n"); + return false; + } + } - if (BaseUseSet.size() != RootUseSet.size()) { - MatchFailed = true; - break; + DEBUG( + for (auto &KV : Uses) { + dbgs() << "LRR: " << KV.second.find_first() << "\t" << *KV.first << "\n"; } + ); + for (unsigned Iter = 1; Iter < Scale; ++Iter) { // In addition to regular aliasing information, we need to look for // instructions from later (future) iterations that have side effects // preventing us from reordering them past other instructions with side // effects. bool FutureSideEffects = false; AliasSetTracker AST(*AA); - // The map between instructions in f(%iv.(i+1)) and f(%iv). DenseMap<Value *, Value *> BaseMap; - assert(L->getNumBlocks() == 1 && "Cannot handle multi-block loops"); - for (BasicBlock::iterator J1 = Header->begin(), J2 = Header->begin(), - JE = Header->end(); J1 != JE && !MatchFailed; ++J1) { - if (cast<Instruction>(J1) == RealIV) - continue; - if (cast<Instruction>(J1) == IV) - continue; - if (!BaseUseSet.count(J1)) - continue; - if (PossibleRedPHISet.count(J1)) // Skip reduction PHIs. - continue; - - while (J2 != JE && (!RootUseSet.count(J2) || - std::find(Roots[i].begin(), Roots[i].end(), J2) != - Roots[i].end())) { - // As we iterate through the instructions, instructions that don't - // belong to previous iterations (or the base case), must belong to - // future iterations. We want to track the alias set of writes from - // previous iterations. - if (!isa<PHINode>(J2) && !BaseUseSet.count(J2) && - !AllRootUses.count(J2)) { - if (J2->mayWriteToMemory()) - AST.add(J2); - - // Note: This is specifically guarded by a check on isa<PHINode>, - // which while a valid (somewhat arbitrary) micro-optimization, is - // needed because otherwise isSafeToSpeculativelyExecute returns - // false on PHI nodes. - if (!isSimpleLoadStore(J2) && !isSafeToSpeculativelyExecute(J2, DL)) - FutureSideEffects = true; + // Compare iteration Iter to the base. 
+ SmallInstructionSet Visited; + auto BaseIt = nextInstr(0, Uses, Visited); + auto RootIt = nextInstr(Iter, Uses, Visited); + auto LastRootIt = Uses.begin(); + + while (BaseIt != Uses.end() && RootIt != Uses.end()) { + Instruction *BaseInst = BaseIt->first; + Instruction *RootInst = RootIt->first; + + // Skip over the IV or root instructions; only match their users. + bool Continue = false; + if (isBaseInst(BaseInst)) { + Visited.insert(BaseInst); + BaseIt = nextInstr(0, Uses, Visited); + Continue = true; + } + if (isRootInst(RootInst)) { + LastRootIt = RootIt; + Visited.insert(RootInst); + RootIt = nextInstr(Iter, Uses, Visited); + Continue = true; + } + if (Continue) continue; + + if (!BaseInst->isSameOperationAs(RootInst)) { + // Last chance saloon. We don't try and solve the full isomorphism + // problem, but try and at least catch the case where two instructions + // *of different types* are round the wrong way. We won't be able to + // efficiently tell, given two ADD instructions, which way around we + // should match them, but given an ADD and a SUB, we can at least infer + // which one is which. + // + // This should allow us to deal with a greater subset of the isomorphism + // problem. It does however change a linear algorithm into a quadratic + // one, so limit the number of probes we do. + auto TryIt = RootIt; + unsigned N = NumToleratedFailedMatches; + while (TryIt != Uses.end() && + !BaseInst->isSameOperationAs(TryIt->first) && + N--) { + ++TryIt; + TryIt = nextInstr(Iter, Uses, Visited, &TryIt); } - ++J2; + if (TryIt == Uses.end() || TryIt == RootIt || + instrDependsOn(TryIt->first, RootIt, TryIt)) { + DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst << + " vs. " << *RootInst << "\n"); + return false; + } + + RootIt = TryIt; + RootInst = TryIt->first; } - if (!J1->isSameOperationAs(J2)) { - DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << - " vs. " << *J2 << "\n"); - MatchFailed = true; - break; + // All instructions between the last root and this root + // may belong to some other iteration. If they belong to a + // future iteration, then they're dangerous to alias with. + // + // Note that because we allow a limited amount of flexibility in the order + // that we visit nodes, LastRootIt might be *before* RootIt, in which + // case we've already checked this set of instructions so we shouldn't + // do anything. + for (; LastRootIt < RootIt; ++LastRootIt) { + Instruction *I = LastRootIt->first; + if (LastRootIt->second.find_first() < (int)Iter) + continue; + if (I->mayWriteToMemory()) + AST.add(I); + // Note: This is specifically guarded by a check on isa<PHINode>, + // which while a valid (somewhat arbitrary) micro-optimization, is + // needed because otherwise isSafeToSpeculativelyExecute returns + // false on PHI nodes. + if (!isa<PHINode>(I) && !isSimpleLoadStore(I) && + !isSafeToSpeculativelyExecute(I, DL)) + // Intervening instructions cause side effects. + FutureSideEffects = true; } // Make sure that this instruction, which is in the use set of this // root instruction, does not also belong to the base set or the set of - // some previous root instruction. - if (BaseUseSet.count(J2) || AllRootUses.count(J2)) { - DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << - " vs. " << *J2 << " (prev. case overlap)\n"); - MatchFailed = true; - break; + // some other root instruction. + if (RootIt->second.count() > 1) { + DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst << + " vs. " << *RootInst << " (prev. 
case overlap)\n"); + return false; } // Make sure that we don't alias with any instruction in the alias set // tracker. If we do, then we depend on a future iteration, and we // can't reroll. - if (J2->mayReadFromMemory()) { - for (AliasSetTracker::iterator K = AST.begin(), KE = AST.end(); - K != KE && !MatchFailed; ++K) { - if (K->aliasesUnknownInst(J2, *AA)) { - DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << - " vs. " << *J2 << " (depends on future store)\n"); - MatchFailed = true; - break; + if (RootInst->mayReadFromMemory()) + for (auto &K : AST) { + if (K.aliasesUnknownInst(RootInst, *AA)) { + DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst << + " vs. " << *RootInst << " (depends on future store)\n"); + return false; } } - } // If we've past an instruction from a future iteration that may have // side effects, and this instruction might also, then we can't reorder // them, and this matching fails. As an exception, we allow the alias // set tracker to handle regular (simple) load/store dependencies. if (FutureSideEffects && - ((!isSimpleLoadStore(J1) && - !isSafeToSpeculativelyExecute(J1, DL)) || - (!isSimpleLoadStore(J2) && - !isSafeToSpeculativelyExecute(J2, DL)))) { - DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << - " vs. " << *J2 << + ((!isSimpleLoadStore(BaseInst) && + !isSafeToSpeculativelyExecute(BaseInst, DL)) || + (!isSimpleLoadStore(RootInst) && + !isSafeToSpeculativelyExecute(RootInst, DL)))) { + DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst << + " vs. " << *RootInst << " (side effects prevent reordering)\n"); - MatchFailed = true; - break; + return false; } // For instructions that are part of a reduction, if the operation is @@ -942,42 +1182,46 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, // x += a[i]; x += b[i]; // x += a[i+1]; x += b[i+1]; // x += b[i+2]; x += a[i+2]; - bool InReduction = Reductions.isPairInSame(J1, J2); + bool InReduction = Reductions.isPairInSame(BaseInst, RootInst); - if (!(InReduction && J1->isAssociative())) { + if (!(InReduction && BaseInst->isAssociative())) { bool Swapped = false, SomeOpMatched = false; - for (unsigned j = 0; j < J1->getNumOperands() && !MatchFailed; ++j) { - Value *Op2 = J2->getOperand(j); + for (unsigned j = 0; j < BaseInst->getNumOperands(); ++j) { + Value *Op2 = RootInst->getOperand(j); // If this is part of a reduction (and the operation is not // associatve), then we match all operands, but not those that are // part of the reduction. if (InReduction) if (Instruction *Op2I = dyn_cast<Instruction>(Op2)) - if (Reductions.isPairInSame(J2, Op2I)) + if (Reductions.isPairInSame(RootInst, Op2I)) continue; DenseMap<Value *, Value *>::iterator BMI = BaseMap.find(Op2); - if (BMI != BaseMap.end()) + if (BMI != BaseMap.end()) { Op2 = BMI->second; - else if (std::find(Roots[i].begin(), Roots[i].end(), - (Instruction*) Op2) != Roots[i].end()) - Op2 = IV; + } else { + for (auto &DRS : RootSets) { + if (DRS.Roots[Iter-1] == (Instruction*) Op2) { + Op2 = DRS.BaseInst; + break; + } + } + } - if (J1->getOperand(Swapped ? unsigned(!j) : j) != Op2) { + if (BaseInst->getOperand(Swapped ? unsigned(!j) : j) != Op2) { // If we've not already decided to swap the matched operands, and // we've not already matched our first operand (note that we could // have skipped matching the first operand because it is part of a // reduction above), and the instruction is commutative, then try // the swapped match. 
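For illustration only, not part of the patch: the source-level situation the Swapped logic above tolerates, with invented names. The base iteration and the next one compute the same commutative operation with the operands written in the opposite order, so the operand match is retried swapped instead of failing outright.

void sums(long *out, const long *a, const long *b, long n) {
  for (long i = 0; i < n; i += 2) {
    out[i]     = a[i]     + b[i];        // base iteration: a + b
    out[i + 1] = b[i + 1] + a[i + 1];    // next iteration: b + a (swapped)
  }
}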
- if (!Swapped && J1->isCommutative() && !SomeOpMatched && - J1->getOperand(!j) == Op2) { + if (!Swapped && BaseInst->isCommutative() && !SomeOpMatched && + BaseInst->getOperand(!j) == Op2) { Swapped = true; } else { - DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << - " vs. " << *J2 << " (operand " << j << ")\n"); - MatchFailed = true; - break; + DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst + << " vs. " << *RootInst << " (operand " << j << ")\n"); + return false; } } @@ -985,81 +1229,41 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, } } - if ((!PossibleRedLastSet.count(J1) && hasUsesOutsideLoop(J1, L)) || - (!PossibleRedLastSet.count(J2) && hasUsesOutsideLoop(J2, L))) { - DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << - " vs. " << *J2 << " (uses outside loop)\n"); - MatchFailed = true; - break; + if ((!PossibleRedLastSet.count(BaseInst) && + hasUsesOutsideLoop(BaseInst, L)) || + (!PossibleRedLastSet.count(RootInst) && + hasUsesOutsideLoop(RootInst, L))) { + DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst << + " vs. " << *RootInst << " (uses outside loop)\n"); + return false; } - if (!MatchFailed) - BaseMap.insert(std::pair<Value *, Value *>(J2, J1)); - - AllRootUses.insert(J2); - Reductions.recordPair(J1, J2, i+1); + Reductions.recordPair(BaseInst, RootInst, Iter); + BaseMap.insert(std::make_pair(RootInst, BaseInst)); - ++J2; + LastRootIt = RootIt; + Visited.insert(BaseInst); + Visited.insert(RootInst); + BaseIt = nextInstr(0, Uses, Visited); + RootIt = nextInstr(Iter, Uses, Visited); } + assert (BaseIt == Uses.end() && RootIt == Uses.end() && + "Mismatched set sizes!"); } - if (MatchFailed) - return false; - DEBUG(dbgs() << "LRR: Matched all iteration increments for " << - *RealIV << "\n"); - - DenseSet<Instruction *> LoopIncUseSet; - collectInLoopUserSet(L, LoopIncs, SmallInstructionSet(), - SmallInstructionSet(), LoopIncUseSet); - DEBUG(dbgs() << "LRR: Loop increment set size: " << - LoopIncUseSet.size() << "\n"); - - // Make sure that all instructions in the loop have been included in some - // use set. - for (BasicBlock::iterator J = Header->begin(), JE = Header->end(); - J != JE; ++J) { - if (isa<DbgInfoIntrinsic>(J)) - continue; - if (cast<Instruction>(J) == RealIV) - continue; - if (cast<Instruction>(J) == IV) - continue; - if (BaseUseSet.count(J) || AllRootUses.count(J) || - (LoopIncUseSet.count(J) && (J->isTerminator() || - isSafeToSpeculativelyExecute(J, DL)))) - continue; - - if (AllRoots.count(J)) - continue; - - if (Reductions.isSelectedPHI(J)) - continue; + *IV << "\n"); - DEBUG(dbgs() << "LRR: aborting reroll based on " << *RealIV << - " unprocessed instruction found: " << *J << "\n"); - MatchFailed = true; - break; - } - - if (MatchFailed) - return false; - - DEBUG(dbgs() << "LRR: all instructions processed from " << - *RealIV << "\n"); - - if (!Reductions.validateSelected()) - return false; - - // At this point, we've validated the rerolling, and we're committed to - // making changes! - - Reductions.replaceSelected(); + return true; +} +void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) { + BasicBlock *Header = L->getHeader(); // Remove instructions associated with non-base iterations. 
for (BasicBlock::reverse_iterator J = Header->rbegin(); J != Header->rend();) { - if (AllRootUses.count(&*J)) { + unsigned I = Uses[&*J].find_first(); + if (I > 0 && I < IL_All) { Instruction *D = &*J; DEBUG(dbgs() << "LRR: removing: " << *D << "\n"); D->eraseFromParent(); @@ -1069,57 +1273,198 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, ++J; } - // Insert the new induction variable. - const SCEV *Start = RealIVSCEV->getStart(); - if (Inc == 1) - Start = SE->getMulExpr(Start, - SE->getConstant(Start->getType(), Scale)); - const SCEVAddRecExpr *H = - cast<SCEVAddRecExpr>(SE->getAddRecExpr(Start, - SE->getConstant(RealIVSCEV->getType(), 1), - L, SCEV::FlagAnyWrap)); - { // Limit the lifetime of SCEVExpander. - SCEVExpander Expander(*SE, "reroll"); - Value *NewIV = Expander.expandCodeFor(H, IV->getType(), Header->begin()); - - for (DenseSet<Instruction *>::iterator J = BaseUseSet.begin(), - JE = BaseUseSet.end(); J != JE; ++J) - (*J)->replaceUsesOfWith(IV, NewIV); - - if (BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator())) { - if (LoopIncUseSet.count(BI)) { - const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE); - if (Inc == 1) - ICSCEV = - SE->getMulExpr(ICSCEV, SE->getConstant(ICSCEV->getType(), Scale)); - // Iteration count SCEV minus 1 - const SCEV *ICMinus1SCEV = - SE->getMinusSCEV(ICSCEV, SE->getConstant(ICSCEV->getType(), 1)); - - Value *ICMinus1; // Iteration count minus 1 - if (isa<SCEVConstant>(ICMinus1SCEV)) { - ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(), BI); - } else { - BasicBlock *Preheader = L->getLoopPreheader(); - if (!Preheader) - Preheader = InsertPreheaderForLoop(L, this); - - ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(), - Preheader->getTerminator()); - } + // We need to create a new induction variable for each different BaseInst. + for (auto &DRS : RootSets) { + // Insert the new induction variable. + const SCEVAddRecExpr *RealIVSCEV = + cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst)); + const SCEV *Start = RealIVSCEV->getStart(); + const SCEVAddRecExpr *H = cast<SCEVAddRecExpr> + (SE->getAddRecExpr(Start, + SE->getConstant(RealIVSCEV->getType(), 1), + L, SCEV::FlagAnyWrap)); + { // Limit the lifetime of SCEVExpander. + SCEVExpander Expander(*SE, "reroll"); + Value *NewIV = Expander.expandCodeFor(H, IV->getType(), Header->begin()); + + for (auto &KV : Uses) { + if (KV.second.find_first() == 0) + KV.first->replaceUsesOfWith(DRS.BaseInst, NewIV); + } - Value *Cond = + if (BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator())) { + // FIXME: Why do we need this check? 
+ if (Uses[BI].find_first() == IL_All) { + const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE); + + // Iteration count SCEV minus 1 + const SCEV *ICMinus1SCEV = + SE->getMinusSCEV(ICSCEV, SE->getConstant(ICSCEV->getType(), 1)); + + Value *ICMinus1; // Iteration count minus 1 + if (isa<SCEVConstant>(ICMinus1SCEV)) { + ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(), BI); + } else { + BasicBlock *Preheader = L->getLoopPreheader(); + if (!Preheader) + Preheader = InsertPreheaderForLoop(L, Parent); + + ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(), + Preheader->getTerminator()); + } + + Value *Cond = new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, ICMinus1, "exitcond"); - BI->setCondition(Cond); + BI->setCondition(Cond); - if (BI->getSuccessor(1) != Header) - BI->swapSuccessors(); + if (BI->getSuccessor(1) != Header) + BI->swapSuccessors(); + } } } } SimplifyInstructionsInBlock(Header, DL, TLI); DeleteDeadPHIs(Header, TLI); +} + +// Validate the selected reductions. All iterations must have an isomorphic +// part of the reduction chain and, for non-associative reductions, the chain +// entries must appear in order. +bool LoopReroll::ReductionTracker::validateSelected() { + // For a non-associative reduction, the chain entries must appear in order. + for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end(); + RI != RIE; ++RI) { + int i = *RI; + int PrevIter = 0, BaseCount = 0, Count = 0; + for (Instruction *J : PossibleReds[i]) { + // Note that all instructions in the chain must have been found because + // all instructions in the function must have been assigned to some + // iteration. + int Iter = PossibleRedIter[J]; + if (Iter != PrevIter && Iter != PrevIter + 1 && + !PossibleReds[i].getReducedValue()->isAssociative()) { + DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " << + J << "\n"); + return false; + } + + if (Iter != PrevIter) { + if (Count != BaseCount) { + DEBUG(dbgs() << "LRR: Iteration " << PrevIter << + " reduction use count " << Count << + " is not equal to the base use count " << + BaseCount << "\n"); + return false; + } + + Count = 0; + } + + ++Count; + if (Iter == 0) + ++BaseCount; + + PrevIter = Iter; + } + } + + return true; +} + +// For all selected reductions, remove all parts except those in the first +// iteration (and the PHI). Replace outside uses of the reduced value with uses +// of the first-iteration reduced value (in other words, reroll the selected +// reductions). +void LoopReroll::ReductionTracker::replaceSelected() { + // Fixup reductions to refer to the last instruction associated with the + // first iteration (not the last). + for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end(); + RI != RIE; ++RI) { + int i = *RI; + int j = 0; + for (int e = PossibleReds[i].size(); j != e; ++j) + if (PossibleRedIter[PossibleReds[i][j]] != 0) { + --j; + break; + } + + // Replace users with the new end-of-chain value. + SmallInstructionVector Users; + for (User *U : PossibleReds[i].getReducedValue()->users()) { + Users.push_back(cast<Instruction>(U)); + } + + for (SmallInstructionVector::iterator J = Users.begin(), + JE = Users.end(); J != JE; ++J) + (*J)->replaceUsesOfWith(PossibleReds[i].getReducedValue(), + PossibleReds[i][j]); + } +} + +// Reroll the provided loop with respect to the provided induction variable. 
+// Generally, we're looking for a loop like this: +// +// %iv = phi [ (preheader, ...), (body, %iv.next) ] +// f(%iv) +// %iv.1 = add %iv, 1 <-- a root increment +// f(%iv.1) +// %iv.2 = add %iv, 2 <-- a root increment +// f(%iv.2) +// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment +// f(%iv.scale_m_1) +// ... +// %iv.next = add %iv, scale +// %cmp = icmp(%iv, ...) +// br %cmp, header, exit +// +// Notably, we do not require that f(%iv), f(%iv.1), etc. be isolated groups of +// instructions. In other words, the instructions in f(%iv), f(%iv.1), etc. can +// be intermixed with eachother. The restriction imposed by this algorithm is +// that the relative order of the isomorphic instructions in f(%iv), f(%iv.1), +// etc. be the same. +// +// First, we collect the use set of %iv, excluding the other increment roots. +// This gives us f(%iv). Then we iterate over the loop instructions (scale-1) +// times, having collected the use set of f(%iv.(i+1)), during which we: +// - Ensure that the next unmatched instruction in f(%iv) is isomorphic to +// the next unmatched instruction in f(%iv.(i+1)). +// - Ensure that both matched instructions don't have any external users +// (with the exception of last-in-chain reduction instructions). +// - Track the (aliasing) write set, and other side effects, of all +// instructions that belong to future iterations that come before the matched +// instructions. If the matched instructions read from that write set, then +// f(%iv) or f(%iv.(i+1)) has some dependency on instructions in +// f(%iv.(j+1)) for some j > i, and we cannot reroll the loop. Similarly, +// if any of these future instructions had side effects (could not be +// speculatively executed), and so do the matched instructions, when we +// cannot reorder those side-effect-producing instructions, and rerolling +// fails. +// +// Finally, we make sure that all loop instructions are either loop increment +// roots, belong to simple latch code, parts of validated reductions, part of +// f(%iv) or part of some f(%iv.i). If all of that is true (and all reductions +// have been validated), then we reroll the loop. +bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, + const SCEV *IterCount, + ReductionTracker &Reductions) { + DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI, DL); + + if (!DAGRoots.findRoots()) + return false; + DEBUG(dbgs() << "LRR: Found all root induction increments for: " << + *IV << "\n"); + + if (!DAGRoots.validate(Reductions)) + return false; + if (!Reductions.validateSelected()) + return false; + // At this point, we've validated the rerolling, and we're committed to + // making changes! + + Reductions.replaceSelected(); + DAGRoots.replace(IterCount); + ++NumRerolledLoops; return true; } @@ -1129,9 +1474,9 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) { return false; AA = &getAnalysis<AliasAnalysis>(); - LI = &getAnalysis<LoopInfo>(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); SE = &getAnalysis<ScalarEvolution>(); - TLI = &getAnalysis<TargetLibraryInfo>(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); DL = DLP ? 
&DLP->getDataLayout() : nullptr; DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp index afd2eca..4d12349 100644 --- a/lib/Transforms/Scalar/LoopRotation.cpp +++ b/lib/Transforms/Scalar/LoopRotation.cpp @@ -13,7 +13,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AssumptionTracker.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopPass.h" @@ -54,16 +54,16 @@ namespace { // LCSSA form makes instruction renaming easier. void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionTracker>(); + AU.addRequired<AssumptionCacheTracker>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfo>(); - AU.addPreserved<LoopInfo>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); AU.addRequiredID(LoopSimplifyID); AU.addPreservedID(LoopSimplifyID); AU.addRequiredID(LCSSAID); AU.addPreservedID(LCSSAID); AU.addPreserved<ScalarEvolution>(); - AU.addRequired<TargetTransformInfo>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); } bool runOnLoop(Loop *L, LPPassManager &LPM) override; @@ -74,15 +74,16 @@ namespace { unsigned MaxHeaderSize; LoopInfo *LI; const TargetTransformInfo *TTI; - AssumptionTracker *AT; + AssumptionCache *AC; + DominatorTree *DT; }; } char LoopRotate::ID = 0; INITIALIZE_PASS_BEGIN(LoopRotate, "loop-rotate", "Rotate Loops", false, false) -INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) -INITIALIZE_PASS_DEPENDENCY(AssumptionTracker) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_END(LoopRotate, "loop-rotate", "Rotate Loops", false, false) @@ -100,9 +101,13 @@ bool LoopRotate::runOnLoop(Loop *L, LPPassManager &LPM) { // Save the loop metadata. MDNode *LoopMD = L->getLoopID(); - LI = &getAnalysis<LoopInfo>(); - TTI = &getAnalysis<TargetTransformInfo>(); - AT = &getAnalysis<AssumptionTracker>(); + Function &F = *L->getHeader()->getParent(); + + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + DT = DTWP ? &DTWP->getDomTree() : nullptr; // Simplify the loop latch before attempting to rotate the header // upward. Rotation may not be needed if the loop tail can be folded into the @@ -225,20 +230,17 @@ static bool shouldSpeculateInstrs(BasicBlock::iterator Begin, case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: { - Value *IVOpnd = nullptr; - if (isa<ConstantInt>(I->getOperand(0))) - IVOpnd = I->getOperand(1); - - if (isa<ConstantInt>(I->getOperand(1))) { - if (IVOpnd) - return false; - - IVOpnd = I->getOperand(0); - } + Value *IVOpnd = !isa<Constant>(I->getOperand(0)) + ? I->getOperand(0) + : !isa<Constant>(I->getOperand(1)) + ? I->getOperand(1) + : nullptr; + if (!IVOpnd) + return false; // If increment operand is used outside of the loop, this speculation // could cause extra live range interference. 
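As background for the rotation logic being updated in this file, a standalone before/after sketch (plain C++, not the pass itself) of what loop rotation produces: a top-tested while loop becomes a guard plus a bottom-tested loop, so the duplicated header can later be merged into the preheader.

#include <cassert>

// Top-tested form: the original loop header runs the exit test first.
static int countDownWhile(int N) {
  int Trips = 0;
  while (N > 0) {
    --N;
    ++Trips;
  }
  return Trips;
}

// Rotated form: the test is duplicated into a guard and the loop becomes
// bottom-tested (do/while), which is the shape LoopRotate emits.
static int countDownRotated(int N) {
  int Trips = 0;
  if (N > 0) {
    do {
      --N;
      ++Trips;
    } while (N > 0);
  }
  return Trips;
}

int main() {
  for (int N = 0; N < 5; ++N)
    assert(countDownWhile(N) == countDownRotated(N));
  return 0;
}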
- if (MultiExitLoop && IVOpnd) { + if (MultiExitLoop) { for (User *UseI : IVOpnd->users()) { auto *UserInst = cast<Instruction>(UseI); if (!L->contains(UserInst)) @@ -307,9 +309,8 @@ bool LoopRotate::simplifyLoopLatch(Loop *L) { // Nuke the Latch block. assert(Latch->empty() && "unable to evacuate Latch"); LI->removeBlock(Latch); - if (DominatorTreeWrapperPass *DTWP = - getAnalysisIfAvailable<DominatorTreeWrapperPass>()) - DTWP->getDomTree().eraseNode(Latch); + if (DT) + DT->eraseNode(Latch); Latch->eraseFromParent(); return true; } @@ -356,7 +357,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // duplicate blocks inside it. { SmallPtrSet<const Value *, 32> EphValues; - CodeMetrics::collectEphemeralValues(L, AT, EphValues); + CodeMetrics::collectEphemeralValues(L, AC, EphValues); CodeMetrics Metrics; Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues); @@ -441,7 +442,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // With the operands remapped, see if the instruction constant folds or is // otherwise simplifyable. This commonly occurs because the entry from PHI // nodes allows icmps and other instructions to fold. - // FIXME: Provide DL, TLI, DT, AT to SimplifyInstruction. + // FIXME: Provide DL, TLI, DT, AC to SimplifyInstruction. Value *V = SimplifyInstruction(C); if (V && LI->replacementPreservesLCSSAForm(C, V)) { // If so, then delete the temporary instruction and stick the folded value @@ -494,31 +495,31 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // The conditional branch can't be folded, handle the general case. // Update DominatorTree to reflect the CFG change we just made. Then split // edges as necessary to preserve LoopSimplify form. - if (DominatorTreeWrapperPass *DTWP = - getAnalysisIfAvailable<DominatorTreeWrapperPass>()) { - DominatorTree &DT = DTWP->getDomTree(); + if (DT) { // Everything that was dominated by the old loop header is now dominated // by the original loop preheader. Conceptually the header was merged // into the preheader, even though we reuse the actual block as a new // loop latch. - DomTreeNode *OrigHeaderNode = DT.getNode(OrigHeader); + DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader); SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(), OrigHeaderNode->end()); - DomTreeNode *OrigPreheaderNode = DT.getNode(OrigPreheader); + DomTreeNode *OrigPreheaderNode = DT->getNode(OrigPreheader); for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I) - DT.changeImmediateDominator(HeaderChildren[I], OrigPreheaderNode); + DT->changeImmediateDominator(HeaderChildren[I], OrigPreheaderNode); - assert(DT.getNode(Exit)->getIDom() == OrigPreheaderNode); - assert(DT.getNode(NewHeader)->getIDom() == OrigPreheaderNode); + assert(DT->getNode(Exit)->getIDom() == OrigPreheaderNode); + assert(DT->getNode(NewHeader)->getIDom() == OrigPreheaderNode); // Update OrigHeader to be dominated by the new header block. - DT.changeImmediateDominator(OrigHeader, OrigLatch); + DT->changeImmediateDominator(OrigHeader, OrigLatch); } // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and // thus is not a preheader anymore. // Split the edge to form a real preheader. 
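The SplitCriticalEdge calls below now take a CriticalEdgeSplittingOptions bundle rather than a Pass pointer. As a refresher on the underlying notion, a small standalone sketch that flags critical edges in a toy CFG; the graph representation and block names are invented for illustration only.

#include <cassert>
#include <map>
#include <string>
#include <vector>

using CFG = std::map<std::string, std::vector<std::string>>;

// An edge A->B is critical when A has several successors and B has several
// predecessors; it must be split before code can be placed "on" the edge.
static bool isCriticalEdge(const CFG &G, const std::string &From,
                           const std::string &To) {
  if (G.at(From).size() < 2)
    return false;
  unsigned Preds = 0;
  for (const auto &KV : G)
    for (const auto &Succ : KV.second)
      if (Succ == To)
        ++Preds;
  return Preds > 1;
}

int main() {
  CFG G;
  G["preheader"] = {"header"};
  G["header"] = {"body", "exit"};
  G["body"] = {"header", "exit"};
  G["exit"] = {};
  // header->exit and body->exit are critical: both sources branch two ways
  // and exit has two predecessors.
  assert(isCriticalEdge(G, "header", "exit"));
  assert(isCriticalEdge(G, "body", "exit"));
  assert(!isCriticalEdge(G, "preheader", "header"));
  return 0;
}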
- BasicBlock *NewPH = SplitCriticalEdge(OrigPreheader, NewHeader, this); + BasicBlock *NewPH = SplitCriticalEdge( + OrigPreheader, NewHeader, + CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA()); NewPH->setName(NewHeader->getName() + ".lr.ph"); // Preserve canonical loop form, which means that 'Exit' should have only @@ -534,8 +535,11 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { Loop *PredLoop = LI->getLoopFor(*PI); if (!PredLoop || PredLoop->contains(Exit)) continue; + if (isa<IndirectBrInst>((*PI)->getTerminator())) + continue; SplitLatchEdge |= L->getLoopLatch() == *PI; - BasicBlock *ExitSplit = SplitCriticalEdge(*PI, Exit, this); + BasicBlock *ExitSplit = SplitCriticalEdge( + *PI, Exit, CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA()); ExitSplit->moveBefore(Exit); } assert(SplitLatchEdge && @@ -549,17 +553,15 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { PHBI->eraseFromParent(); // With our CFG finalized, update DomTree if it is available. - if (DominatorTreeWrapperPass *DTWP = - getAnalysisIfAvailable<DominatorTreeWrapperPass>()) { - DominatorTree &DT = DTWP->getDomTree(); + if (DT) { // Update OrigHeader to be dominated by the new header block. - DT.changeImmediateDominator(NewHeader, OrigPreheader); - DT.changeImmediateDominator(OrigHeader, OrigLatch); + DT->changeImmediateDominator(NewHeader, OrigPreheader); + DT->changeImmediateDominator(OrigHeader, OrigLatch); // Brute force incremental dominator tree update. Call // findNearestCommonDominator on all CFG predecessors of each child of the // original header. - DomTreeNode *OrigHeaderNode = DT.getNode(OrigHeader); + DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader); SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(), OrigHeaderNode->end()); bool Changed; @@ -572,11 +574,11 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { pred_iterator PI = pred_begin(BB); BasicBlock *NearestDom = *PI; for (pred_iterator PE = pred_end(BB); PI != PE; ++PI) - NearestDom = DT.findNearestCommonDominator(NearestDom, *PI); + NearestDom = DT->findNearestCommonDominator(NearestDom, *PI); // Remember if this changes the DomTree. if (Node->getIDom()->getBlock() != NearestDom) { - DT.changeImmediateDominator(BB, NearestDom); + DT->changeImmediateDominator(BB, NearestDom); Changed = true; } } @@ -594,7 +596,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // the OrigHeader block into OrigLatch. This will succeed if they are // connected by an unconditional branch. This is just a cleanup so the // emitted code isn't too gross in this common case. - MergeBlockIntoPredecessor(OrigHeader, this); + MergeBlockIntoPredecessor(OrigHeader, DT, LI); DEBUG(dbgs() << "LoopRotation: into "; L->dump()); diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 7b60373..318065e 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -1327,11 +1327,9 @@ void LSRUse::DeleteFormula(Formula &F) { /// RecomputeRegs - Recompute the Regs field, and update RegUses. void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) { // Now that we've filtered out some formulae, recompute the Regs set. 
- SmallPtrSet<const SCEV *, 4> OldRegs = Regs; + SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs); Regs.clear(); - for (SmallVectorImpl<Formula>::const_iterator I = Formulae.begin(), - E = Formulae.end(); I != E; ++I) { - const Formula &F = *I; + for (const Formula &F : Formulae) { if (F.ScaledReg) Regs.insert(F.ScaledReg); Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end()); } @@ -4728,12 +4726,14 @@ void LSRInstance::RewriteForPHI(PHINode *PN, // Split the critical edge. BasicBlock *NewBB = nullptr; if (!Parent->isLandingPad()) { - NewBB = SplitCriticalEdge(BB, Parent, P, - /*MergeIdenticalEdges=*/true, - /*DontDeleteUselessPhis=*/true); + NewBB = SplitCriticalEdge(BB, Parent, + CriticalEdgeSplittingOptions(&DT, &LI) + .setMergeIdenticalEdges() + .setDontDeleteUselessPHIs()); } else { SmallVector<BasicBlock*, 2> NewBBs; - SplitLandingPadPredecessors(Parent, BB, "", "", P, NewBBs); + SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, + /*AliasAnalysis*/ nullptr, &DT, &LI); NewBB = NewBBs[0]; } // If NewBB==NULL, then SplitCriticalEdge refused to split because all @@ -4863,9 +4863,10 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, LSRInstance::LSRInstance(Loop *L, Pass *P) : IU(P->getAnalysis<IVUsers>()), SE(P->getAnalysis<ScalarEvolution>()), DT(P->getAnalysis<DominatorTreeWrapperPass>().getDomTree()), - LI(P->getAnalysis<LoopInfo>()), - TTI(P->getAnalysis<TargetTransformInfo>()), L(L), Changed(false), - IVIncInsertPos(nullptr) { + LI(P->getAnalysis<LoopInfoWrapperPass>().getLoopInfo()), + TTI(P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI( + *L->getHeader()->getParent())), + L(L), Changed(false), IVIncInsertPos(nullptr) { // If LoopSimplify form is not available, stay out of trouble. if (!L->isLoopSimplifyForm()) return; @@ -5041,11 +5042,11 @@ private: char LoopStrengthReduce::ID = 0; INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce", "Loop Strength Reduction", false, false) -INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(IVUsers) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce", "Loop Strength Reduction", false, false) @@ -5064,8 +5065,8 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const { // many analyses if they are around. 
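For context on what LoopStrengthReduce is optimizing, a standalone before/after sketch of strength reduction (plain C++, nothing from the pass itself): the multiply in the addressing computation is replaced by a running pointer that is simply incremented each iteration.

#include <cassert>

// Before: each iteration recomputes the address with a multiply.
static int sumStrided(const int *A, int N, int Stride) {
  int S = 0;
  for (int I = 0; I < N; ++I)
    S += A[I * Stride];
  return S;
}

// After strength reduction: the induction variable is a pointer advanced by
// Stride elements per iteration; the multiply is gone.
static int sumStridedReduced(const int *A, int N, int Stride) {
  int S = 0;
  const int *P = A;
  for (int I = 0; I < N; ++I, P += Stride)
    S += *P;
  return S;
}

int main() {
  const int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
  assert(sumStrided(A, 3, 3) == sumStridedReduced(A, 3, 3));
  return 0;
}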
AU.addPreservedID(LoopSimplifyID); - AU.addRequired<LoopInfo>(); - AU.addPreserved<LoopInfo>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); AU.addRequiredID(LoopSimplifyID); AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); @@ -5076,7 +5077,7 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequiredID(LoopSimplifyID); AU.addRequired<IVUsers>(); AU.addPreserved<IVUsers>(); - AU.addRequired<TargetTransformInfo>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); } bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) { @@ -5098,7 +5099,8 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) { #endif unsigned numFolded = Rewriter.replaceCongruentIVs( L, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(), DeadInsts, - &getAnalysis<TargetTransformInfo>()); + &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( + *L->getHeader()->getParent())); if (numFolded) { Changed = true; DeleteTriviallyDeadInstructions(DeadInsts); diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index f60d990..924be16 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -13,11 +13,12 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar.h" -#include "llvm/Analysis/AssumptionTracker.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" -#include "llvm/Analysis/FunctionTargetTransformInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DiagnosticInfo.h" @@ -28,6 +29,8 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/UnrollLoop.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/Analysis/InstructionSimplify.h" #include <climits> using namespace llvm; @@ -38,6 +41,22 @@ static cl::opt<unsigned> UnrollThreshold("unroll-threshold", cl::init(150), cl::Hidden, cl::desc("The cut-off point for automatic loop unrolling")); +static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze( + "unroll-max-iteration-count-to-analyze", cl::init(0), cl::Hidden, + cl::desc("Don't allow loop unrolling to simulate more than this number of" + "iterations when checking full unroll profitability")); + +static cl::opt<unsigned> UnrollMinPercentOfOptimized( + "unroll-percent-of-optimized-for-complete-unroll", cl::init(20), cl::Hidden, + cl::desc("If complete unrolling could trigger further optimizations, and, " + "by that, remove the given percent of instructions, perform the " + "complete unroll even if it's beyond the threshold")); + +static cl::opt<unsigned> UnrollAbsoluteThreshold( + "unroll-absolute-threshold", cl::init(2000), cl::Hidden, + cl::desc("Don't unroll if the unrolled size is bigger than this threshold," + " even if we can remove big portion of instructions later.")); + static cl::opt<unsigned> UnrollCount("unroll-count", cl::init(0), cl::Hidden, cl::desc("Use this unroll count for all loops including those with " @@ -63,11 +82,16 @@ namespace { static char ID; // Pass ID, replacement for typeid LoopUnroll(int T = -1, int C = -1, int P = -1, int R = -1) : LoopPass(ID) { CurrentThreshold = (T == -1) ? 
UnrollThreshold : unsigned(T); + CurrentAbsoluteThreshold = UnrollAbsoluteThreshold; + CurrentMinPercentOfOptimized = UnrollMinPercentOfOptimized; CurrentCount = (C == -1) ? UnrollCount : unsigned(C); CurrentAllowPartial = (P == -1) ? UnrollAllowPartial : (bool)P; CurrentRuntime = (R == -1) ? UnrollRuntime : (bool)R; UserThreshold = (T != -1) || (UnrollThreshold.getNumOccurrences() > 0); + UserAbsoluteThreshold = (UnrollAbsoluteThreshold.getNumOccurrences() > 0); + UserPercentOfOptimized = + (UnrollMinPercentOfOptimized.getNumOccurrences() > 0); UserAllowPartial = (P != -1) || (UnrollAllowPartial.getNumOccurrences() > 0); UserRuntime = (R != -1) || (UnrollRuntime.getNumOccurrences() > 0); @@ -91,10 +115,16 @@ namespace { unsigned CurrentCount; unsigned CurrentThreshold; + unsigned CurrentAbsoluteThreshold; + unsigned CurrentMinPercentOfOptimized; bool CurrentAllowPartial; bool CurrentRuntime; bool UserCount; // CurrentCount is user-specified. bool UserThreshold; // CurrentThreshold is user-specified. + bool UserAbsoluteThreshold; // CurrentAbsoluteThreshold is + // user-specified. + bool UserPercentOfOptimized; // CurrentMinPercentOfOptimized is + // user-specified. bool UserAllowPartial; // CurrentAllowPartial is user-specified. bool UserRuntime; // CurrentRuntime is user-specified. @@ -104,17 +134,16 @@ namespace { /// loop preheaders be inserted into the CFG... /// void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionTracker>(); - AU.addRequired<LoopInfo>(); - AU.addPreserved<LoopInfo>(); + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); AU.addRequiredID(LoopSimplifyID); AU.addPreservedID(LoopSimplifyID); AU.addRequiredID(LCSSAID); AU.addPreservedID(LCSSAID); AU.addRequired<ScalarEvolution>(); AU.addPreserved<ScalarEvolution>(); - AU.addRequired<TargetTransformInfo>(); - AU.addRequired<FunctionTargetTransformInfo>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); // FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info. // If loop unroll does not preserve dom info then LCSSA pass on next // loop will receive invalid dom info. @@ -124,9 +153,11 @@ namespace { // Fill in the UnrollingPreferences parameter with values from the // TargetTransformationInfo. - void getUnrollingPreferences(Loop *L, const FunctionTargetTransformInfo &FTTI, + void getUnrollingPreferences(Loop *L, const TargetTransformInfo &TTI, TargetTransformInfo::UnrollingPreferences &UP) { UP.Threshold = CurrentThreshold; + UP.AbsoluteThreshold = CurrentAbsoluteThreshold; + UP.MinPercentOfOptimized = CurrentMinPercentOfOptimized; UP.OptSizeThreshold = OptSizeUnrollThreshold; UP.PartialThreshold = CurrentThreshold; UP.PartialOptSizeThreshold = OptSizeUnrollThreshold; @@ -134,7 +165,7 @@ namespace { UP.MaxCount = UINT_MAX; UP.Partial = CurrentAllowPartial; UP.Runtime = CurrentRuntime; - FTTI.getUnrollingPreferences(L, UP); + TTI.getUnrollingPreferences(L, UP); } // Select and return an unroll count based on parameters from @@ -153,18 +184,37 @@ namespace { // unrolled loops respectively. void selectThresholds(const Loop *L, bool HasPragma, const TargetTransformInfo::UnrollingPreferences &UP, - unsigned &Threshold, unsigned &PartialThreshold) { + unsigned &Threshold, unsigned &PartialThreshold, + unsigned NumberOfOptimizedInstructions) { // Determine the current unrolling threshold. 
While this is // normally set from UnrollThreshold, it is overridden to a // smaller value if the current function is marked as // optimize-for-size, and the unroll threshold was not user // specified. Threshold = UserThreshold ? CurrentThreshold : UP.Threshold; + + // If we are allowed to completely unroll if we can remove M% of + // instructions, and we know that with complete unrolling we'll be able + // to kill N instructions, then we can afford to completely unroll loops + // with unrolled size up to N*100/M. + // Adjust the threshold according to that: + unsigned PercentOfOptimizedForCompleteUnroll = + UserPercentOfOptimized ? CurrentMinPercentOfOptimized + : UP.MinPercentOfOptimized; + unsigned AbsoluteThreshold = UserAbsoluteThreshold + ? CurrentAbsoluteThreshold + : UP.AbsoluteThreshold; + if (PercentOfOptimizedForCompleteUnroll) + Threshold = std::max<unsigned>(Threshold, + NumberOfOptimizedInstructions * 100 / + PercentOfOptimizedForCompleteUnroll); + // But don't allow unrolling loops bigger than absolute threshold. + Threshold = std::min<unsigned>(Threshold, AbsoluteThreshold); + PartialThreshold = UserThreshold ? CurrentThreshold : UP.PartialThreshold; if (!UserThreshold && - L->getHeader()->getParent()->getAttributes(). - hasAttribute(AttributeSet::FunctionIndex, - Attribute::OptimizeForSize)) { + L->getHeader()->getParent()->hasFnAttribute( + Attribute::OptimizeForSize)) { Threshold = UP.OptSizeThreshold; PartialThreshold = UP.PartialOptSizeThreshold; } @@ -185,10 +235,9 @@ namespace { char LoopUnroll::ID = 0; INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false) -INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) -INITIALIZE_PASS_DEPENDENCY(AssumptionTracker) -INITIALIZE_PASS_DEPENDENCY(FunctionTargetTransformInfo) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) @@ -203,13 +252,333 @@ Pass *llvm::createSimpleLoopUnrollPass() { return llvm::createLoopUnrollPass(-1, -1, 0, 0); } +static bool isLoadFromConstantInitializer(Value *V) { + if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) + if (GV->isConstant() && GV->hasDefinitiveInitializer()) + return GV->getInitializer(); + return false; +} + +struct FindConstantPointers { + bool LoadCanBeConstantFolded; + bool IndexIsConstant; + APInt Step; + APInt StartValue; + Value *BaseAddress; + const Loop *L; + ScalarEvolution &SE; + FindConstantPointers(const Loop *loop, ScalarEvolution &SE) + : LoadCanBeConstantFolded(true), IndexIsConstant(true), L(loop), SE(SE) {} + + bool follow(const SCEV *S) { + if (const SCEVUnknown *SC = dyn_cast<SCEVUnknown>(S)) { + // We've reached the leaf node of SCEV, it's most probably just a + // variable. Now it's time to see if it corresponds to a global constant + // global (in which case we can eliminate the load), or not. + BaseAddress = SC->getValue(); + LoadCanBeConstantFolded = + IndexIsConstant && isLoadFromConstantInitializer(BaseAddress); + return false; + } + if (isa<SCEVConstant>(S)) + return true; + if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { + // If the current SCEV expression is AddRec, and its loop isn't the loop + // we are about to unroll, then we won't get a constant address after + // unrolling, and thus, won't be able to eliminate the load. 
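Returning to the threshold selection earlier in this hunk, a minimal sketch of the arithmetic with concrete numbers; the helper name is invented, but the formula and the default values (threshold 150, 20% requirement, absolute cap 2000) come straight from the options defined above.

#include <algorithm>
#include <cassert>

// If unrolling is expected to let NumOptimized instructions be removed, and
// we are willing to unroll whenever at least Percent% of the unrolled body
// would disappear, the threshold is raised to NumOptimized * 100 / Percent,
// but never above the absolute cap.
static unsigned adjustedThreshold(unsigned Threshold, unsigned NumOptimized,
                                  unsigned Percent, unsigned AbsoluteCap) {
  if (Percent)
    Threshold = std::max(Threshold, NumOptimized * 100 / Percent);
  return std::min(Threshold, AbsoluteCap);
}

int main() {
  // 100 removable instructions at the default 20% requirement allow an
  // unrolled size of up to 500, still under the 2000 absolute cap.
  assert(adjustedThreshold(150, 100, 20, 2000) == 500);
  // A huge estimate is clamped by the absolute threshold.
  assert(adjustedThreshold(150, 1000, 20, 2000) == 2000);
  return 0;
}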
+ if (AR->getLoop() != L) + return IndexIsConstant = false; + // If the step isn't constant, we won't get constant addresses in unrolled + // version. Bail out. + if (const SCEVConstant *StepSE = + dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE))) + Step = StepSE->getValue()->getValue(); + else + return IndexIsConstant = false; + + return IndexIsConstant; + } + // If Result is true, continue traversal. + // Otherwise, we have found something that prevents us from (possible) load + // elimination. + return IndexIsConstant; + } + bool isDone() const { return !IndexIsConstant; } +}; + +// This class is used to get an estimate of the optimization effects that we +// could get from complete loop unrolling. It comes from the fact that some +// loads might be replaced with concrete constant values and that could trigger +// a chain of instruction simplifications. +// +// E.g. we might have: +// int a[] = {0, 1, 0}; +// v = 0; +// for (i = 0; i < 3; i ++) +// v += b[i]*a[i]; +// If we completely unroll the loop, we would get: +// v = b[0]*a[0] + b[1]*a[1] + b[2]*a[2] +// Which then will be simplified to: +// v = b[0]* 0 + b[1]* 1 + b[2]* 0 +// And finally: +// v = b[1] +class UnrollAnalyzer : public InstVisitor<UnrollAnalyzer, bool> { + typedef InstVisitor<UnrollAnalyzer, bool> Base; + friend class InstVisitor<UnrollAnalyzer, bool>; + + const Loop *L; + unsigned TripCount; + ScalarEvolution &SE; + const TargetTransformInfo &TTI; + + DenseMap<Value *, Constant *> SimplifiedValues; + DenseMap<LoadInst *, Value *> LoadBaseAddresses; + SmallPtrSet<Instruction *, 32> CountedInstructions; + + /// \brief Count the number of optimized instructions. + unsigned NumberOfOptimizedInstructions; + + // Provide base case for our instruction visit. + bool visitInstruction(Instruction &I) { return false; }; + // TODO: We should also visit ICmp, FCmp, GetElementPtr, Trunc, ZExt, SExt, + // FPTrunc, FPExt, FPToUI, FPToSI, UIToFP, SIToFP, BitCast, Select, + // ExtractElement, InsertElement, ShuffleVector, ExtractValue, InsertValue. + // + // Probaly it's worth to hoist the code for estimating the simplifications + // effects to a separate class, since we have a very similar code in + // InlineCost already. 
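The class comment's a[]/b[] example can be checked end to end with a small standalone program: after complete unrolling and constant folding of the loads from the constant array, only b[1] survives.

#include <cassert>

static const int a[] = {0, 1, 0};

// The loop as written: three iterations, three loads from the constant array.
static int loopForm(const int *b) {
  int v = 0;
  for (int i = 0; i < 3; ++i)
    v += b[i] * a[i];
  return v;
}

// What complete unrolling plus constant folding of a[0], a[1], a[2] leaves:
// b[0]*0 + b[1]*1 + b[2]*0 == b[1].
static int foldedForm(const int *b) { return b[1]; }

int main() {
  const int b[] = {7, 11, 13};
  assert(loopForm(b) == foldedForm(b));
  return 0;
}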
+ bool visitBinaryOperator(BinaryOperator &I) { + Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); + if (!isa<Constant>(LHS)) + if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS)) + LHS = SimpleLHS; + if (!isa<Constant>(RHS)) + if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS)) + RHS = SimpleRHS; + Value *SimpleV = nullptr; + if (auto FI = dyn_cast<FPMathOperator>(&I)) + SimpleV = + SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags()); + else + SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS); + + if (SimpleV && CountedInstructions.insert(&I).second) + NumberOfOptimizedInstructions += TTI.getUserCost(&I); + + if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) { + SimplifiedValues[&I] = C; + return true; + } + return false; + } + + Constant *computeLoadValue(LoadInst *LI, unsigned Iteration) { + if (!LI) + return nullptr; + Value *BaseAddr = LoadBaseAddresses[LI]; + if (!BaseAddr) + return nullptr; + + auto GV = dyn_cast<GlobalVariable>(BaseAddr); + if (!GV) + return nullptr; + + ConstantDataSequential *CDS = + dyn_cast<ConstantDataSequential>(GV->getInitializer()); + if (!CDS) + return nullptr; + + const SCEV *BaseAddrSE = SE.getSCEV(BaseAddr); + const SCEV *S = SE.getSCEV(LI->getPointerOperand()); + const SCEV *OffSE = SE.getMinusSCEV(S, BaseAddrSE); + + APInt StepC, StartC; + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OffSE); + if (!AR) + return nullptr; + + if (const SCEVConstant *StepSE = + dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE))) + StepC = StepSE->getValue()->getValue(); + else + return nullptr; + + if (const SCEVConstant *StartSE = dyn_cast<SCEVConstant>(AR->getStart())) + StartC = StartSE->getValue()->getValue(); + else + return nullptr; + + unsigned ElemSize = CDS->getElementType()->getPrimitiveSizeInBits() / 8U; + unsigned Start = StartC.getLimitedValue(); + unsigned Step = StepC.getLimitedValue(); + + unsigned Index = (Start + Step * Iteration) / ElemSize; + if (Index >= CDS->getNumElements()) + return nullptr; + + Constant *CV = CDS->getElementAsConstant(Index); + + return CV; + } + +public: + UnrollAnalyzer(const Loop *L, unsigned TripCount, ScalarEvolution &SE, + const TargetTransformInfo &TTI) + : L(L), TripCount(TripCount), SE(SE), TTI(TTI), + NumberOfOptimizedInstructions(0) {} + + // Visit all loads the loop L, and for those that, after complete loop + // unrolling, would have a constant address and it will point to a known + // constant initializer, record its base address for future use. It is used + // when we estimate number of potentially simplified instructions. + void findConstFoldableLoads() { + for (auto BB : L->getBlocks()) { + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + if (LoadInst *LI = dyn_cast<LoadInst>(I)) { + if (!LI->isSimple()) + continue; + Value *AddrOp = LI->getPointerOperand(); + const SCEV *S = SE.getSCEV(AddrOp); + FindConstantPointers Visitor(L, SE); + SCEVTraversal<FindConstantPointers> T(Visitor); + T.visitAll(S); + if (Visitor.IndexIsConstant && Visitor.LoadCanBeConstantFolded) { + LoadBaseAddresses[LI] = Visitor.BaseAddress; + } + } + } + } + } + + // Given a list of loads that could be constant-folded (LoadBaseAddresses), + // estimate number of optimized instructions after substituting the concrete + // values for the given Iteration. Also track how many instructions become + // dead through this process. 
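A minimal model of the index computation in computeLoadValue above, using plain integers in place of SCEVs; the helper name is invented and only the arithmetic is taken from the pass.

#include <cassert>
#include <vector>

// Given a byte offset that evolves as Start + Step * Iteration, return the
// element loaded on a particular iteration, or -1 if the access falls outside
// the constant initializer (mirroring the bounds check in computeLoadValue).
static int elementAtIteration(const std::vector<int> &Init, unsigned ElemSize,
                              unsigned Start, unsigned Step,
                              unsigned Iteration) {
  unsigned Index = (Start + Step * Iteration) / ElemSize;
  if (Index >= Init.size())
    return -1;
  return Init[Index];
}

int main() {
  // A constant array of four 4-byte elements, walked one element per
  // iteration starting at byte offset 0.
  std::vector<int> Init = {10, 20, 30, 40};
  assert(elementAtIteration(Init, 4, 0, 4, 2) == 30);
  // Iteration 4 would read past the initializer, so no constant is produced.
  assert(elementAtIteration(Init, 4, 0, 4, 4) == -1);
  return 0;
}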
+ unsigned estimateNumberOfOptimizedInstructions(unsigned Iteration) { + // We keep a set vector for the worklist so that we don't wast space in the + // worklist queuing up the same instruction repeatedly. This can happen due + // to multiple operands being the same instruction or due to the same + // instruction being an operand of lots of things that end up dead or + // simplified. + SmallSetVector<Instruction *, 8> Worklist; + + // Clear the simplified values and counts for this iteration. + SimplifiedValues.clear(); + CountedInstructions.clear(); + NumberOfOptimizedInstructions = 0; + + // We start by adding all loads to the worklist. + for (auto &LoadDescr : LoadBaseAddresses) { + LoadInst *LI = LoadDescr.first; + SimplifiedValues[LI] = computeLoadValue(LI, Iteration); + if (CountedInstructions.insert(LI).second) + NumberOfOptimizedInstructions += TTI.getUserCost(LI); + + for (User *U : LI->users()) + Worklist.insert(cast<Instruction>(U)); + } + + // And then we try to simplify every user of every instruction from the + // worklist. If we do simplify a user, add it to the worklist to process + // its users as well. + while (!Worklist.empty()) { + Instruction *I = Worklist.pop_back_val(); + if (!L->contains(I)) + continue; + if (!visit(I)) + continue; + for (User *U : I->users()) + Worklist.insert(cast<Instruction>(U)); + } + + // Now that we know the potentially simplifed instructions, estimate number + // of instructions that would become dead if we do perform the + // simplification. + + // The dead instructions are held in a separate set. This is used to + // prevent us from re-examining instructions and make sure we only count + // the benifit once. The worklist's internal set handles insertion + // deduplication. + SmallPtrSet<Instruction *, 16> DeadInstructions; + + // Lambda to enque operands onto the worklist. + auto EnqueueOperands = [&](Instruction &I) { + for (auto *Op : I.operand_values()) + if (auto *OpI = dyn_cast<Instruction>(Op)) + if (!OpI->use_empty()) + Worklist.insert(OpI); + }; + + // Start by initializing worklist with simplified instructions. + for (auto &FoldedKeyValue : SimplifiedValues) + if (auto *FoldedInst = dyn_cast<Instruction>(FoldedKeyValue.first)) { + DeadInstructions.insert(FoldedInst); + + // Add each instruction operand of this dead instruction to the + // worklist. + EnqueueOperands(*FoldedInst); + } + + // If a definition of an insn is only used by simplified or dead + // instructions, it's also dead. Check defs of all instructions from the + // worklist. + while (!Worklist.empty()) { + Instruction *I = Worklist.pop_back_val(); + if (!L->contains(I)) + continue; + if (DeadInstructions.count(I)) + continue; + + if (std::all_of(I->user_begin(), I->user_end(), [&](User *U) { + return DeadInstructions.count(cast<Instruction>(U)); + })) { + NumberOfOptimizedInstructions += TTI.getUserCost(I); + DeadInstructions.insert(I); + EnqueueOperands(*I); + } + } + return NumberOfOptimizedInstructions; + } +}; + +// Complete loop unrolling can make some loads constant, and we need to know if +// that would expose any further optimization opportunities. +// This routine estimates this optimization effect and returns the number of +// instructions, that potentially might be optimized away. 
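The dead-code estimate above propagates "deadness" from simplified instructions to their operands until nothing changes. The same fixed point can be sketched on a toy def-use graph (value names invented, strings standing in for instructions):

#include <algorithm>
#include <cassert>
#include <map>
#include <set>
#include <string>
#include <vector>

int main() {
  // Users of each value: %mul feeds %add, %add feeds %store.
  std::map<std::string, std::vector<std::string>> Users = {
      {"%load", {"%mul"}}, {"%mul", {"%add"}}, {"%add", {"%store"}}};

  // Suppose simplification already proved %add is a known constant, so the
  // instruction computing it is dead.  Anything whose every user is dead
  // becomes dead as well, which is what the worklist loop above computes.
  std::set<std::string> Dead = {"%add"};
  bool Changed = true;
  while (Changed) {
    Changed = false;
    for (const auto &KV : Users) {
      if (Dead.count(KV.first))
        continue;
      if (!KV.second.empty() &&
          std::all_of(KV.second.begin(), KV.second.end(),
                      [&](const std::string &U) { return Dead.count(U) > 0; })) {
        Dead.insert(KV.first);
        Changed = true;
      }
    }
  }
  assert(Dead.count("%mul") && Dead.count("%load"));
  assert(!Dead.count("%store"));
  return 0;
}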
+static unsigned +approximateNumberOfOptimizedInstructions(const Loop *L, ScalarEvolution &SE, + unsigned TripCount, + const TargetTransformInfo &TTI) { + if (!TripCount || !UnrollMaxIterationsCountToAnalyze) + return 0; + + UnrollAnalyzer UA(L, TripCount, SE, TTI); + UA.findConstFoldableLoads(); + + // Estimate number of instructions, that could be simplified if we replace a + // load with the corresponding constant. Since the same load will take + // different values on different iterations, we have to go through all loop's + // iterations here. To limit ourselves here, we check only first N + // iterations, and then scale the found number, if necessary. + unsigned IterationsNumberForEstimate = + std::min<unsigned>(UnrollMaxIterationsCountToAnalyze, TripCount); + unsigned NumberOfOptimizedInstructions = 0; + for (unsigned i = 0; i < IterationsNumberForEstimate; ++i) + NumberOfOptimizedInstructions += + UA.estimateNumberOfOptimizedInstructions(i); + + NumberOfOptimizedInstructions *= TripCount / IterationsNumberForEstimate; + + return NumberOfOptimizedInstructions; +} + /// ApproximateLoopSize - Approximate the size of the loop. static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, const TargetTransformInfo &TTI, - AssumptionTracker *AT) { + AssumptionCache *AC) { SmallPtrSet<const Value *, 32> EphValues; - CodeMetrics::collectEphemeralValues(L, AT, EphValues); + CodeMetrics::collectEphemeralValues(L, AC, EphValues); CodeMetrics Metrics; for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); @@ -222,8 +591,11 @@ static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls, // Don't allow an estimate of size zero. This would allows unrolling of loops // with huge iteration counts, which is a compile time problem even if it's - // not a problem for code quality. - if (LoopSize == 0) LoopSize = 1; + // not a problem for code quality. Also, the code using this size may assume + // that each loop has at least three instructions (likely a conditional + // branch, a comparison feeding that branch, and some kind of loop increment + // feeding that comparison instruction). + LoopSize = std::max(LoopSize, 3u); return LoopSize; } @@ -231,48 +603,31 @@ static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls, // Returns the loop hint metadata node with the given name (for example, // "llvm.loop.unroll.count"). If no such metadata node exists, then nullptr is // returned. -static const MDNode *GetUnrollMetadata(const Loop *L, StringRef Name) { - MDNode *LoopID = L->getLoopID(); - if (!LoopID) - return nullptr; - - // First operand should refer to the loop id itself. - assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); - assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); - - for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) { - const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); - if (!MD) - continue; - - const MDString *S = dyn_cast<MDString>(MD->getOperand(0)); - if (!S) - continue; - - if (Name.equals(S->getString())) - return MD; - } +static MDNode *GetUnrollMetadataForLoop(const Loop *L, StringRef Name) { + if (MDNode *LoopID = L->getLoopID()) + return GetUnrollMetadata(LoopID, Name); return nullptr; } // Returns true if the loop has an unroll(full) pragma. 
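A small numeric sketch of the sampling and scaling done in approximateNumberOfOptimizedInstructions above; the function name here is invented, and the division is integer division exactly as in the pass.

#include <algorithm>
#include <cassert>

// Only the first few iterations are simulated; the observed savings are then
// scaled up by the integer ratio TripCount / IterationsAnalyzed.
static unsigned scaledSavings(unsigned TripCount, unsigned MaxToAnalyze,
                              unsigned SavedPerIteration) {
  if (!TripCount || !MaxToAnalyze)
    return 0;
  unsigned Analyzed = std::min(MaxToAnalyze, TripCount);
  unsigned Saved = Analyzed * SavedPerIteration; // sum over simulated iterations
  return Saved * (TripCount / Analyzed);
}

int main() {
  // 100-trip loop, 10 iterations simulated, 3 instructions saved per
  // iteration: 30 observed, scaled by 100/10 to an estimate of 300.
  assert(scaledSavings(100, 10, 3) == 300);
  // With analysis disabled (the option's default of 0), the estimate is zero.
  assert(scaledSavings(100, 0, 3) == 0);
  return 0;
}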
static bool HasUnrollFullPragma(const Loop *L) { - return GetUnrollMetadata(L, "llvm.loop.unroll.full"); + return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.full"); } // Returns true if the loop has an unroll(disable) pragma. static bool HasUnrollDisablePragma(const Loop *L) { - return GetUnrollMetadata(L, "llvm.loop.unroll.disable"); + return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.disable"); } // If loop has an unroll_count pragma return the (necessarily // positive) value from the pragma. Otherwise return 0. static unsigned UnrollCountPragmaValue(const Loop *L) { - const MDNode *MD = GetUnrollMetadata(L, "llvm.loop.unroll.count"); + MDNode *MD = GetUnrollMetadataForLoop(L, "llvm.loop.unroll.count"); if (MD) { assert(MD->getNumOperands() == 2 && "Unroll count hint metadata should have two operands."); - unsigned Count = cast<ConstantInt>(MD->getOperand(1))->getZExtValue(); + unsigned Count = + mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue(); assert(Count >= 1 && "Unroll count must be positive."); return Count; } @@ -288,9 +643,9 @@ static void SetLoopAlreadyUnrolled(Loop *L) { if (!LoopID) return; // First remove any existing loop unrolling metadata. - SmallVector<Value *, 4> Vals; + SmallVector<Metadata *, 4> MDs; // Reserve first location for self reference to the LoopID metadata node. - Vals.push_back(nullptr); + MDs.push_back(nullptr); for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { bool IsUnrollMetadata = false; MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); @@ -298,17 +653,18 @@ static void SetLoopAlreadyUnrolled(Loop *L) { const MDString *S = dyn_cast<MDString>(MD->getOperand(0)); IsUnrollMetadata = S && S->getString().startswith("llvm.loop.unroll."); } - if (!IsUnrollMetadata) Vals.push_back(LoopID->getOperand(i)); + if (!IsUnrollMetadata) + MDs.push_back(LoopID->getOperand(i)); } // Add unroll(disable) metadata to disable future unrolling. LLVMContext &Context = L->getHeader()->getContext(); - SmallVector<Value *, 1> DisableOperands; + SmallVector<Metadata *, 1> DisableOperands; DisableOperands.push_back(MDString::get(Context, "llvm.loop.unroll.disable")); MDNode *DisableNode = MDNode::get(Context, DisableOperands); - Vals.push_back(DisableNode); + MDs.push_back(DisableNode); - MDNode *NewLoopID = MDNode::get(Context, Vals); + MDNode *NewLoopID = MDNode::get(Context, MDs); // Set operand 0 to refer to the loop id itself. 
NewLoopID->replaceOperandWith(0, NewLoopID); L->setLoopID(NewLoopID); @@ -358,12 +714,13 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { if (skipOptnoneFunction(L)) return false; - LoopInfo *LI = &getAnalysis<LoopInfo>(); + Function &F = *L->getHeader()->getParent(); + + LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); ScalarEvolution *SE = &getAnalysis<ScalarEvolution>(); - const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>(); - const FunctionTargetTransformInfo &FTTI = - getAnalysis<FunctionTargetTransformInfo>(); - AssumptionTracker *AT = &getAnalysis<AssumptionTracker>(); + const TargetTransformInfo &TTI = + getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); BasicBlock *Header = L->getHeader(); DEBUG(dbgs() << "Loop Unroll: F[" << Header->getParent()->getName() @@ -377,7 +734,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { bool HasPragma = PragmaFullUnroll || PragmaCount > 0; TargetTransformInfo::UnrollingPreferences UP; - getUnrollingPreferences(L, FTTI, UP); + getUnrollingPreferences(L, TTI, UP); // Find trip count and trip multiple if count is not available unsigned TripCount = 0; @@ -402,9 +759,13 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { unsigned NumInlineCandidates; bool notDuplicatable; unsigned LoopSize = - ApproximateLoopSize(L, NumInlineCandidates, notDuplicatable, TTI, AT); + ApproximateLoopSize(L, NumInlineCandidates, notDuplicatable, TTI, &AC); DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); - uint64_t UnrolledSize = (uint64_t)LoopSize * Count; + + // When computing the unrolled size, note that the conditional branch on the + // backedge and the comparison feeding it are not replicated like the rest of + // the loop body (which is why 2 is subtracted). + uint64_t UnrolledSize = (uint64_t)(LoopSize-2) * Count + 2; if (notDuplicatable) { DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable" << " instructions.\n"); @@ -415,8 +776,14 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { return false; } + unsigned NumberOfOptimizedInstructions = + approximateNumberOfOptimizedInstructions(L, *SE, TripCount, TTI); + DEBUG(dbgs() << " Complete unrolling could save: " + << NumberOfOptimizedInstructions << "\n"); + unsigned Threshold, PartialThreshold; - selectThresholds(L, HasPragma, UP, Threshold, PartialThreshold); + selectThresholds(L, HasPragma, UP, Threshold, PartialThreshold, + NumberOfOptimizedInstructions); // Given Count, TripCount and thresholds determine the type of // unrolling which is to be performed. @@ -449,7 +816,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { } if (PartialThreshold != NoThreshold && UnrolledSize > PartialThreshold) { // Reduce unroll count to be modulo of TripCount for partial unrolling. - Count = PartialThreshold / LoopSize; + Count = (std::max(PartialThreshold, 3u)-2) / (LoopSize-2); while (Count != 0 && TripCount % Count != 0) Count--; } @@ -463,7 +830,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { // the original count which satisfies the threshold limit. while (Count != 0 && UnrolledSize > PartialThreshold) { Count >>= 1; - UnrolledSize = LoopSize * Count; + UnrolledSize = (LoopSize-2) * Count + 2; } if (Count > UP.MaxCount) Count = UP.MaxCount; @@ -509,7 +876,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { // Unroll the loop. 
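The size accounting in this hunk subtracts the backedge branch and its comparison, which are not replicated when the body is duplicated. A quick check of the formula with concrete numbers (helper name invented):

#include <cassert>

// The unrolled body repeats everything except the backedge compare-and-branch
// pair, which appears once, hence (LoopSize - 2) * Count + 2.
static unsigned unrolledSize(unsigned LoopSize, unsigned Count) {
  return (LoopSize - 2) * Count + 2;
}

int main() {
  // A 10-instruction loop unrolled 4 times: 8 replicated instructions per
  // copy plus the single compare-and-branch pair.
  assert(unrolledSize(10, 4) == 34);
  // Count == 1 leaves the size unchanged, as expected.
  assert(unrolledSize(10, 1) == 10);
  return 0;
}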
if (!UnrollLoop(L, Count, TripCount, AllowRuntime, TripMultiple, LI, this, - &LPM, AT)) + &LPM, &AC)) return false; return true; diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index ef43483..987dc96 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -30,7 +30,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AssumptionTracker.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" @@ -105,7 +105,7 @@ namespace { // Analyze loop. Check its size, calculate is it possible to unswitch // it. Returns true if we can unswitch this loop. bool countLoop(const Loop *L, const TargetTransformInfo &TTI, - AssumptionTracker *AT); + AssumptionCache *AC); // Clean all data related to given loop. void forgetLoop(const Loop *L); @@ -128,7 +128,7 @@ namespace { class LoopUnswitch : public LoopPass { LoopInfo *LI; // Loop information LPPassManager *LPM; - AssumptionTracker *AT; + AssumptionCache *AC; // LoopProcessWorklist - Used to check if second loop needs processing // after RewriteLoopBodyWithConditionConstant rewrites first loop. @@ -167,16 +167,16 @@ namespace { /// loop preheaders be inserted into the CFG. /// void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionTracker>(); + AU.addRequired<AssumptionCacheTracker>(); AU.addRequiredID(LoopSimplifyID); AU.addPreservedID(LoopSimplifyID); - AU.addRequired<LoopInfo>(); - AU.addPreserved<LoopInfo>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); AU.addRequiredID(LCSSAID); AU.addPreservedID(LCSSAID); AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<ScalarEvolution>(); - AU.addRequired<TargetTransformInfo>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); } private: @@ -217,7 +217,7 @@ namespace { // Analyze loop. Check its size, calculate is it possible to unswitch // it. Returns true if we can unswitch this loop. bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI, - AssumptionTracker *AT) { + AssumptionCache *AC) { LoopPropsMapIt PropsIt; bool Inserted; @@ -235,7 +235,7 @@ bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI, // This is a very ad-hoc heuristic. 
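As with rotation above, a standalone before/after sketch of what unswitching does (plain C++, not the pass itself): a loop-invariant test is hoisted out of the loop, and the loop is duplicated with one specialized copy per outcome.

#include <cassert>

// Before: the invariant flag is re-tested on every iteration.
static int sumBefore(const int *A, int N, bool Scale) {
  int S = 0;
  for (int I = 0; I < N; ++I) {
    if (Scale)
      S += 2 * A[I];
    else
      S += A[I];
  }
  return S;
}

// After unswitching: the test runs once and selects a specialized loop.
static int sumUnswitched(const int *A, int N, bool Scale) {
  int S = 0;
  if (Scale) {
    for (int I = 0; I < N; ++I)
      S += 2 * A[I];
  } else {
    for (int I = 0; I < N; ++I)
      S += A[I];
  }
  return S;
}

int main() {
  const int A[] = {1, 2, 3, 4};
  assert(sumBefore(A, 4, true) == sumUnswitched(A, 4, true));
  assert(sumBefore(A, 4, false) == sumUnswitched(A, 4, false));
  return 0;
}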
SmallPtrSet<const Value *, 32> EphValues; - CodeMetrics::collectEphemeralValues(L, AT, EphValues); + CodeMetrics::collectEphemeralValues(L, AC, EphValues); // FIXME: This is overly conservative because it does not take into // consideration code simplification opportunities and code that can @@ -333,10 +333,10 @@ void LUAnalysisCache::cloneData(const Loop *NewLoop, const Loop *OldLoop, char LoopUnswitch::ID = 0; INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops", false, false) -INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) -INITIALIZE_PASS_DEPENDENCY(AssumptionTracker) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_END(LoopUnswitch, "loop-unswitch", "Unswitch loops", false, false) @@ -385,8 +385,9 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) { if (skipOptnoneFunction(L)) return false; - AT = &getAnalysis<AssumptionTracker>(); - LI = &getAnalysis<LoopInfo>(); + AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache( + *L->getHeader()->getParent()); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); LPM = &LPM_Ref; DominatorTreeWrapperPass *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); @@ -431,8 +432,10 @@ bool LoopUnswitch::processCurrentLoop() { // Probably we reach the quota of branches for this loop. If so // stop unswitching. - if (!BranchesInfo.countLoop(currentLoop, getAnalysis<TargetTransformInfo>(), - AT)) + if (!BranchesInfo.countLoop( + currentLoop, getAnalysis<TargetTransformInfoWrapperPass>().getTTI( + *currentLoop->getHeader()->getParent()), + AC)) return false; // Loop over all of the basic blocks in the loop. If we find an interior @@ -654,9 +657,7 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val) { // Check to see if it would be profitable to unswitch current loop. // Do not do non-trivial unswitch while optimizing for size. - if (OptimizeForSize || - F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::OptimizeForSize)) + if (OptimizeForSize || F->hasFnAttribute(Attribute::OptimizeForSize)) return false; UnswitchNontrivialCondition(LoopCond, Val, currentLoop); @@ -674,7 +675,7 @@ static Loop *CloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM, for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E; ++I) if (LI->getLoopFor(*I) == L) - New->addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), LI->getBase()); + New->addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), *LI); // Add all of the subloops to the new loop. for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) @@ -705,8 +706,9 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val, // If either edge is critical, split it. This helps preserve LoopSimplify // form for enclosing loops. - SplitCriticalEdge(BI, 0, this, false, false, true); - SplitCriticalEdge(BI, 1, this, false, false, true); + auto Options = CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA(); + SplitCriticalEdge(BI, 0, Options); + SplitCriticalEdge(BI, 1, Options); } /// UnswitchTrivialCondition - Given a loop that has a trivial unswitchable @@ -725,7 +727,7 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, // First step, split the preheader, so that we know that there is a safe place // to insert the conditional branch. 
We will change loopPreheader to have a // conditional branch on Cond. - BasicBlock *NewPH = SplitEdge(loopPreheader, loopHeader, this); + BasicBlock *NewPH = SplitEdge(loopPreheader, loopHeader, DT, LI); // Now that we have a place to insert the conditional branch, create a place // to branch to: this is the exit block out of the loop that we should @@ -736,7 +738,7 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, // without actually branching to it (the exit block should be dominated by the // loop header, not the preheader). assert(!L->contains(ExitBlock) && "Exit block is in the loop?"); - BasicBlock *NewExit = SplitBlock(ExitBlock, ExitBlock->begin(), this); + BasicBlock *NewExit = SplitBlock(ExitBlock, ExitBlock->begin(), DT, LI); // Okay, now we have a position to branch from and a position to branch to, // insert the new conditional branch. @@ -767,13 +769,9 @@ void LoopUnswitch::SplitExitEdges(Loop *L, // Although SplitBlockPredecessors doesn't preserve loop-simplify in // general, if we call it on all predecessors of all exits then it does. - if (!ExitBlock->isLandingPad()) { - SplitBlockPredecessors(ExitBlock, Preds, ".us-lcssa", this); - } else { - SmallVector<BasicBlock*, 2> NewBBs; - SplitLandingPadPredecessors(ExitBlock, Preds, ".us-lcssa", ".us-lcssa", - this, NewBBs); - } + SplitBlockPredecessors(ExitBlock, Preds, ".us-lcssa", + /*AliasAnalysis*/ nullptr, DT, LI, + /*PreserveLCSSA*/ true); } } @@ -796,7 +794,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, // First step, split the preheader and exit blocks, and add these blocks to // the LoopBlocks list. - BasicBlock *NewPreheader = SplitEdge(loopPreheader, loopHeader, this); + BasicBlock *NewPreheader = SplitEdge(loopPreheader, loopHeader, DT, LI); LoopBlocks.push_back(NewPreheader); // We want the loop to come after the preheader, but before the exit blocks. @@ -836,7 +834,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, // FIXME: We could register any cloned assumptions instead of clearing the // whole function's cache. - AT->forgetCachedAssumptions(F); + AC->clear(); // Now we create the new Loop object for the versioned loop. Loop *NewLoop = CloneLoop(L, L->getParentLoop(), VMap, LI, LPM); @@ -849,14 +847,14 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, if (ParentLoop) { // Make sure to add the cloned preheader and exit blocks to the parent loop // as well. - ParentLoop->addBasicBlockToLoop(NewBlocks[0], LI->getBase()); + ParentLoop->addBasicBlockToLoop(NewBlocks[0], *LI); } for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) { BasicBlock *NewExit = cast<BasicBlock>(VMap[ExitBlocks[i]]); // The new exit block should be in the same loop as the old one. if (Loop *ExitBBLoop = LI->getLoopFor(ExitBlocks[i])) - ExitBBLoop->addBasicBlockToLoop(NewExit, LI->getBase()); + ExitBBLoop->addBasicBlockToLoop(NewExit, *LI); assert(NewExit->getTerminator()->getNumSuccessors() == 1 && "Exit block should have been split to have one successor!"); @@ -1042,7 +1040,7 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, // and hooked up so as to preserve the loop structure, because // trying to update it is complicated. So instead we preserve the // loop structure and put the block on a dead code path. 
- SplitEdge(Switch, SISucc, this); + SplitEdge(Switch, SISucc, DT, LI); // Compute the successors instead of relying on the return value // of SplitEdge, since it may have split the switch successor // after PHI nodes. diff --git a/lib/Transforms/Utils/LowerExpectIntrinsic.cpp b/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp index ff89e74..0c47cbd 100644 --- a/lib/Transforms/Utils/LowerExpectIntrinsic.cpp +++ b/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp @@ -11,7 +11,8 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" @@ -24,13 +25,14 @@ #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include <vector> +#include "llvm/Transforms/Scalar.h" using namespace llvm; #define DEBUG_TYPE "lower-expect-intrinsic" -STATISTIC(IfHandled, "Number of 'expect' intrinsic instructions handled"); +STATISTIC(ExpectIntrinsicsHandled, + "Number of 'expect' intrinsic instructions handled"); static cl::opt<uint32_t> LikelyBranchWeight("likely-branch-weight", cl::Hidden, cl::init(64), @@ -39,27 +41,8 @@ static cl::opt<uint32_t> UnlikelyBranchWeight("unlikely-branch-weight", cl::Hidden, cl::init(4), cl::desc("Weight of the branch unlikely to be taken (default = 4)")); -namespace { - - class LowerExpectIntrinsic : public FunctionPass { - - bool HandleSwitchExpect(SwitchInst *SI); - - bool HandleIfExpect(BranchInst *BI); - - public: - static char ID; - LowerExpectIntrinsic() : FunctionPass(ID) { - initializeLowerExpectIntrinsicPass(*PassRegistry::getPassRegistry()); - } - - bool runOnFunction(Function &F) override; - }; -} - - -bool LowerExpectIntrinsic::HandleSwitchExpect(SwitchInst *SI) { - CallInst *CI = dyn_cast<CallInst>(SI->getCondition()); +static bool handleSwitchExpect(SwitchInst &SI) { + CallInst *CI = dyn_cast<CallInst>(SI.getCondition()); if (!CI) return false; @@ -72,26 +55,24 @@ bool LowerExpectIntrinsic::HandleSwitchExpect(SwitchInst *SI) { if (!ExpectedValue) return false; - SwitchInst::CaseIt Case = SI->findCaseValue(ExpectedValue); - unsigned n = SI->getNumCases(); // +1 for default case. - std::vector<uint32_t> Weights(n + 1); + SwitchInst::CaseIt Case = SI.findCaseValue(ExpectedValue); + unsigned n = SI.getNumCases(); // +1 for default case. + SmallVector<uint32_t, 16> Weights(n + 1, UnlikelyBranchWeight); - Weights[0] = Case == SI->case_default() ? LikelyBranchWeight - : UnlikelyBranchWeight; - for (unsigned i = 0; i != n; ++i) - Weights[i + 1] = i == Case.getCaseIndex() ? 
LikelyBranchWeight - : UnlikelyBranchWeight; + if (Case == SI.case_default()) + Weights[0] = LikelyBranchWeight; + else + Weights[Case.getCaseIndex() + 1] = LikelyBranchWeight; - SI->setMetadata(LLVMContext::MD_prof, - MDBuilder(CI->getContext()).createBranchWeights(Weights)); + SI.setMetadata(LLVMContext::MD_prof, + MDBuilder(CI->getContext()).createBranchWeights(Weights)); - SI->setCondition(ArgValue); + SI.setCondition(ArgValue); return true; } - -bool LowerExpectIntrinsic::HandleIfExpect(BranchInst *BI) { - if (BI->isUnconditional()) +static bool handleBranchExpect(BranchInst &BI) { + if (BI.isUnconditional()) return false; // Handle non-optimized IR code like: @@ -105,9 +86,9 @@ bool LowerExpectIntrinsic::HandleIfExpect(BranchInst *BI) { CallInst *CI; - ICmpInst *CmpI = dyn_cast<ICmpInst>(BI->getCondition()); + ICmpInst *CmpI = dyn_cast<ICmpInst>(BI.getCondition()); if (!CmpI) { - CI = dyn_cast<CallInst>(BI->getCondition()); + CI = dyn_cast<CallInst>(BI.getCondition()); } else { if (CmpI->getPredicate() != CmpInst::ICMP_NE) return false; @@ -136,32 +117,30 @@ bool LowerExpectIntrinsic::HandleIfExpect(BranchInst *BI) { else Node = MDB.createBranchWeights(UnlikelyBranchWeight, LikelyBranchWeight); - BI->setMetadata(LLVMContext::MD_prof, Node); + BI.setMetadata(LLVMContext::MD_prof, Node); if (CmpI) CmpI->setOperand(0, ArgValue); else - BI->setCondition(ArgValue); + BI.setCondition(ArgValue); return true; } +static bool lowerExpectIntrinsic(Function &F) { + bool Changed = false; -bool LowerExpectIntrinsic::runOnFunction(Function &F) { - for (Function::iterator I = F.begin(), E = F.end(); I != E;) { - BasicBlock *BB = I++; - + for (BasicBlock &BB : F) { // Create "block_weights" metadata. - if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) { - if (HandleIfExpect(BI)) - IfHandled++; - } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) { - if (HandleSwitchExpect(SI)) - IfHandled++; + if (BranchInst *BI = dyn_cast<BranchInst>(BB.getTerminator())) { + if (handleBranchExpect(*BI)) + ExpectIntrinsicsHandled++; + } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB.getTerminator())) { + if (handleSwitchExpect(*SI)) + ExpectIntrinsicsHandled++; } // remove llvm.expect intrinsics. - for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); - BI != BE; ) { + for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) { CallInst *CI = dyn_cast<CallInst>(BI++); if (!CI) continue; @@ -171,17 +150,42 @@ bool LowerExpectIntrinsic::runOnFunction(Function &F) { Value *Exp = CI->getArgOperand(0); CI->replaceAllUsesWith(Exp); CI->eraseFromParent(); + Changed = true; } } } - return false; + return Changed; } +PreservedAnalyses LowerExpectIntrinsicPass::run(Function &F) { + if (lowerExpectIntrinsic(F)) + return PreservedAnalyses::none(); + + return PreservedAnalyses::all(); +} + +namespace { +/// \brief Legacy pass for lowering expect intrinsics out of the IR. +/// +/// When this pass is run over a function it uses expect intrinsics which feed +/// branches and switches to provide branch weight metadata for those +/// terminators. It then removes the expect intrinsics from the IR so the rest +/// of the optimizer can ignore them. 
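// Editorial sketch, not part of this patch: once the llvm.expect hint has been
// decoded, lowering amounts to attaching "branch_weights" profile metadata to
// the terminator, exactly as the helpers above do. The helper name is
// hypothetical; the 64/4 weights mirror the cl::opt defaults above.
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"

static void markTrueEdgeLikely(llvm::BranchInst &BI) {
  llvm::MDBuilder MDB(BI.getContext());
  // The first weight belongs to the true successor, the second to the false
  // successor.
  llvm::MDNode *Weights =
      MDB.createBranchWeights(/*TrueWeight=*/64, /*FalseWeight=*/4);
  BI.setMetadata(llvm::LLVMContext::MD_prof, Weights);
}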
+class LowerExpectIntrinsic : public FunctionPass { +public: + static char ID; + LowerExpectIntrinsic() : FunctionPass(ID) { + initializeLowerExpectIntrinsicPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { return lowerExpectIntrinsic(F); } +}; +} char LowerExpectIntrinsic::ID = 0; -INITIALIZE_PASS(LowerExpectIntrinsic, "lower-expect", "Lower 'expect' " - "Intrinsics", false, false) +INITIALIZE_PASS(LowerExpectIntrinsic, "lower-expect", + "Lower 'expect' Intrinsics", false, false) FunctionPass *llvm::createLowerExpectIntrinsicPass() { return new LowerExpectIntrinsic(); diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index be524be..006b885 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -16,7 +16,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/AssumptionTracker.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" @@ -28,7 +28,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" #include <list> using namespace llvm; @@ -330,11 +330,11 @@ namespace { // This transformation requires dominator postdominator info void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<AssumptionTracker>(); + AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<MemoryDependenceAnalysis>(); AU.addRequired<AliasAnalysis>(); - AU.addRequired<TargetLibraryInfo>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addPreserved<AliasAnalysis>(); AU.addPreserved<MemoryDependenceAnalysis>(); } @@ -363,10 +363,10 @@ FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOpt(); } INITIALIZE_PASS_BEGIN(MemCpyOpt, "memcpyopt", "MemCpy Optimization", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionTracker) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_END(MemCpyOpt, "memcpyopt", "MemCpy Optimization", false, false) @@ -750,6 +750,16 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, // its dependence information by changing its parameter. MD->removeInstruction(C); + // Update AA metadata + // FIXME: MD_tbaa_struct and MD_mem_parallel_loop_access should also be + // handled here, but combineMetadata doesn't support them yet + unsigned KnownIDs[] = { + LLVMContext::MD_tbaa, + LLVMContext::MD_alias_scope, + LLVMContext::MD_noalias, + }; + combineMetadata(C, cpy, KnownIDs); + // Remove the memcpy. MD->removeInstruction(cpy); ++NumMemCpyInstr; @@ -982,11 +992,13 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { // If it is greater than the memcpy, then we check to see if we can force the // source of the memcpy to the alignment we need. If we fail, we bail out. 
- AssumptionTracker *AT = &getAnalysis<AssumptionTracker>(); + AssumptionCache &AC = + getAnalysis<AssumptionCacheTracker>().getAssumptionCache( + *CS->getParent()->getParent()); DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); if (MDep->getAlignment() < ByValAlign && - getOrEnforceKnownAlignment(MDep->getSource(),ByValAlign, - DL, AT, CS.getInstruction(), &DT) < ByValAlign) + getOrEnforceKnownAlignment(MDep->getSource(), ByValAlign, DL, &AC, + CS.getInstruction(), &DT) < ByValAlign) return false; // Verify that the copied-from memory doesn't change in between the memcpy and @@ -1067,7 +1079,7 @@ bool MemCpyOpt::runOnFunction(Function &F) { MD = &getAnalysis<MemoryDependenceAnalysis>(); DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); DL = DLP ? &DLP->getDataLayout() : nullptr; - TLI = &getAnalysis<TargetLibraryInfo>(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); // If we don't have at least memset and memcpy, there is little point of doing // anything here. These are required by a freestanding implementation, so if diff --git a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp index 8281c59..8fad63f 100644 --- a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp +++ b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp @@ -86,7 +86,7 @@ #include "llvm/Support/Allocator.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include <vector> @@ -115,7 +115,7 @@ public: private: // This transformation requires dominator postdominator info void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<TargetLibraryInfo>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addRequired<MemoryDependenceAnalysis>(); AU.addRequired<AliasAnalysis>(); AU.addPreserved<AliasAnalysis>(); @@ -143,7 +143,9 @@ private: // Routines for sinking stores StoreInst *canSinkFromBlock(BasicBlock *BB, StoreInst *SI); PHINode *getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1); - bool isStoreSinkBarrier(Instruction *Inst); + bool isStoreSinkBarrierInRange(const Instruction& Start, + const Instruction& End, + AliasAnalysis::Location Loc); bool sinkStore(BasicBlock *BB, StoreInst *SinkCand, StoreInst *ElseInst); bool mergeStores(BasicBlock *BB); // The mergeLoad/Store algorithms could have Size0 * Size1 complexity, @@ -166,7 +168,7 @@ FunctionPass *llvm::createMergedLoadStoreMotionPass() { INITIALIZE_PASS_BEGIN(MergedLoadStoreMotion, "mldst-motion", "MergedLoadStoreMotion", false, false) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_END(MergedLoadStoreMotion, "mldst-motion", "MergedLoadStoreMotion", false, false) @@ -239,7 +241,7 @@ bool MergedLoadStoreMotion::isLoadHoistBarrierInRange(const Instruction& Start, const Instruction& End, LoadInst* LI) { AliasAnalysis::Location Loc = AA->getLocation(LI); - return AA->canInstructionRangeModify(Start, End, Loc); + return AA->canInstructionRangeModRef(Start, End, Loc, AliasAnalysis::Mod); } /// @@ -389,26 +391,19 @@ bool MergedLoadStoreMotion::mergeLoads(BasicBlock *BB) { } /// -/// \brief True when instruction is sink barrier for a store -/// -bool 
MergedLoadStoreMotion::isStoreSinkBarrier(Instruction *Inst) { - // FIXME: Conservatively let a load instruction block the store. - // Use alias analysis instead. - if (isa<LoadInst>(Inst)) - return true; - if (isa<CallInst>(Inst)) - return true; - if (isa<TerminatorInst>(Inst) && !isa<BranchInst>(Inst)) - return true; - // Note: mayHaveSideEffects covers all instructions that could - // trigger a change to state. Eg. in-flight stores have to be executed - // before ordered loads or fences, calls could invoke functions that store - // data to memory etc. - if (!isa<StoreInst>(Inst) && Inst->mayHaveSideEffects()) { - return true; - } - DEBUG(dbgs() << "No Sink Barrier\n"); - return false; +/// \brief True when the instruction is a sink barrier for a store +/// located at Loc +/// +/// Whenever an instruction could possibly read or modify the +/// value being stored, or otherwise prevent the store from +/// happening, it is considered a sink barrier. +/// + +bool MergedLoadStoreMotion::isStoreSinkBarrierInRange(const Instruction& Start, + const Instruction& End, + AliasAnalysis::Location + Loc) { + return AA->canInstructionRangeModRef(Start, End, Loc, AliasAnalysis::ModRef); } /// @@ -416,27 +411,30 @@ bool MergedLoadStoreMotion::isStoreSinkBarrier(Instruction *Inst) { /// /// \return The store in \p when it is safe to sink. Otherwise return Null. /// -StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB, - StoreInst *SI) { - StoreInst *I = 0; - DEBUG(dbgs() << "can Sink? : "; SI->dump(); dbgs() << "\n"); - for (BasicBlock::reverse_iterator RBI = BB->rbegin(), RBE = BB->rend(); +StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1, + StoreInst *Store0) { + DEBUG(dbgs() << "can Sink? : "; Store0->dump(); dbgs() << "\n"); + BasicBlock *BB0 = Store0->getParent(); + for (BasicBlock::reverse_iterator RBI = BB1->rbegin(), RBE = BB1->rend(); RBI != RBE; ++RBI) { Instruction *Inst = &*RBI; - // Only move loads if they are used in the block.
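// Editorial sketch, not part of this patch: both barrier checks in this file
// now funnel into the canInstructionRangeModRef query, differing only in the
// access kind they test for. A minimal illustration with hypothetical helper
// names.
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/IR/Instruction.h"

// Hoisting a load only has to worry about writes to the loaded location.
static bool rangeMayWriteLoc(llvm::AliasAnalysis &AA,
                             const llvm::Instruction &Start,
                             const llvm::Instruction &End,
                             const llvm::AliasAnalysis::Location &Loc) {
  return AA.canInstructionRangeModRef(Start, End, Loc,
                                      llvm::AliasAnalysis::Mod);
}

// Sinking a store must also account for reads, hence the wider ModRef mask.
static bool rangeMayReadOrWriteLoc(llvm::AliasAnalysis &AA,
                                   const llvm::Instruction &Start,
                                   const llvm::Instruction &End,
                                   const llvm::AliasAnalysis::Location &Loc) {
  return AA.canInstructionRangeModRef(Start, End, Loc,
                                      llvm::AliasAnalysis::ModRef);
}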
- if (isStoreSinkBarrier(Inst)) - break; - if (isa<StoreInst>(Inst)) { - AliasAnalysis::Location LocSI = AA->getLocation(SI); - AliasAnalysis::Location LocInst = AA->getLocation((StoreInst *)Inst); - if (AA->isMustAlias(LocSI, LocInst)) { - I = (StoreInst *)Inst; - break; - } + if (!isa<StoreInst>(Inst)) + continue; + + StoreInst *Store1 = cast<StoreInst>(Inst); + + AliasAnalysis::Location Loc0 = AA->getLocation(Store0); + AliasAnalysis::Location Loc1 = AA->getLocation(Store1); + if (AA->isMustAlias(Loc0, Loc1) && Store0->isSameOperationAs(Store1) && + !isStoreSinkBarrierInRange(*(std::next(BasicBlock::iterator(Store1))), + BB1->back(), Loc1) && + !isStoreSinkBarrierInRange(*(std::next(BasicBlock::iterator(Store0))), + BB0->back(), Loc0)) { + return Store1; } } - return I; + return nullptr; } /// @@ -548,8 +546,7 @@ bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) { Instruction *I = &*RBI; ++RBI; - if (isStoreSinkBarrier(I)) - break; + // Sink move non-simple (atomic, volatile) stores if (!isa<StoreInst>(I)) continue; diff --git a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp index 5c8bed5..31d7df3 100644 --- a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp +++ b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp @@ -18,7 +18,7 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -52,16 +52,18 @@ INITIALIZE_PASS(PartiallyInlineLibCalls, "partially-inline-libcalls", "Partially inline calls to library functions", false, false) void PartiallyInlineLibCalls::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<TargetLibraryInfo>(); - AU.addRequired<TargetTransformInfo>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); FunctionPass::getAnalysisUsage(AU); } bool PartiallyInlineLibCalls::runOnFunction(Function &F) { bool Changed = false; Function::iterator CurrBB; - TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); - const TargetTransformInfo *TTI = &getAnalysis<TargetTransformInfo>(); + TargetLibraryInfo *TLI = + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + const TargetTransformInfo *TTI = + &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE;) { CurrBB = BB++; @@ -126,7 +128,7 @@ bool PartiallyInlineLibCalls::optimizeSQRT(CallInst *Call, // Move all instructions following Call to newly created block JoinBB. // Create phi and replace all uses. - BasicBlock *JoinBB = llvm::SplitBlock(&CurrBB, Call->getNextNode(), this); + BasicBlock *JoinBB = llvm::SplitBlock(&CurrBB, Call->getNextNode()); IRBuilder<> Builder(JoinBB, JoinBB->begin()); PHINode *Phi = Builder.CreatePHI(Call->getType(), 2); Call->replaceAllUsesWith(Phi); diff --git a/lib/Transforms/Scalar/PlaceSafepoints.cpp b/lib/Transforms/Scalar/PlaceSafepoints.cpp new file mode 100644 index 0000000..944725a --- /dev/null +++ b/lib/Transforms/Scalar/PlaceSafepoints.cpp @@ -0,0 +1,989 @@ +//===- PlaceSafepoints.cpp - Place GC Safepoints --------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// Place garbage collection safepoints at appropriate locations in the IR. This +// does not make relocation semantics or variable liveness explicit. That's +// done by RewriteStatepointsForGC. +// +// Terminology: +// - A call is said to be "parseable" if there is a stack map generated for the +// return PC of the call. A runtime can determine where values listed in the +// deopt arguments and (after RewriteStatepointsForGC) gc arguments are located +// on the stack when the code is suspended inside such a call. Every parse +// point is represented by a call wrapped in a gc.statepoint intrinsic. +// - A "poll" is an explicit check in the generated code to determine if the +// runtime needs the generated code to cooperate by calling a helper routine +// and thus suspending its execution at a known state. The call to the helper +// routine will be parseable. The (gc & runtime specific) logic of a poll is +// assumed to be provided in a function of the name "gc.safepoint_poll". +// +// We aim to insert polls such that running code can quickly be brought to a +// well-defined state for inspection by the collector. In the current +// implementation, this is done via the insertion of poll sites at method entry +// and the backedge of most loops. We try to avoid inserting more polls than +// are necessary to ensure a finite period between poll sites. This is not +// because the poll itself is expensive in the generated code; it's not. Polls +// do tend to impact the optimizer itself in negative ways; we'd like to avoid +// perturbing the optimization of the method as much as we can. +// +// We also need to make most call sites parseable. The callee might execute a +// poll (or otherwise be inspected by the GC). If so, the entire stack +// (including the suspended frame of the current method) must be parseable. +// +// This pass will insert: +// - Call parse points ("call safepoints") for any call which may need to +// reach a safepoint during the execution of the callee function. +// - Backedge safepoint polls and entry safepoint polls to ensure that +// executing code reaches a safepoint poll in a finite amount of time. +// +// We do not currently support return statepoints, but adding them would not +// be hard. They are not required for correctness - entry safepoints are an +// alternative - but some GCs may prefer them. Patches welcome.
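// Editorial sketch, not part of this patch: a minimal example of how a client
// whose functions use the "statepoint-example" GC might schedule this pass,
// assuming createPlaceSafepointsPass() is declared in llvm/Transforms/Scalar.h
// as the include list below suggests. The wrapper function is hypothetical.
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Scalar.h"

static bool insertGCSafepoints(llvm::Module &M) {
  llvm::legacy::PassManager PM;
  // Inserts entry and backedge polls and records which calls must be
  // parseable; RewriteStatepointsForGC is expected to run afterwards to make
  // relocation explicit.
  PM.add(llvm::createPlaceSafepointsPass());
  return PM.run(M);
}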
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Pass.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/ADT/SetOperations.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Statepoint.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" + +#define DEBUG_TYPE "safepoint-placement" +STATISTIC(NumEntrySafepoints, "Number of entry safepoints inserted"); +STATISTIC(NumCallSafepoints, "Number of call safepoints inserted"); +STATISTIC(NumBackedgeSafepoints, "Number of backedge safepoints inserted"); + +STATISTIC(CallInLoop, "Number of loops w/o safepoints due to calls in loop"); +STATISTIC(FiniteExecution, "Number of loops w/o safepoints finite execution"); + +using namespace llvm; + +// Ignore opportunities to avoid placing safepoints on backedges, useful for +// validation +static cl::opt<bool> AllBackedges("spp-all-backedges", cl::Hidden, + cl::init(false)); + +/// If true, do not place backedge safepoints in counted loops. +static cl::opt<bool> SkipCounted("spp-counted", cl::Hidden, cl::init(true)); + +// If true, split the backedge of a loop when placing the safepoint, otherwise +// split the latch block itself. Both are useful to support for +// experimentation, but in practice, it looks like splitting the backedge +// optimizes better. +static cl::opt<bool> SplitBackedge("spp-split-backedge", cl::Hidden, + cl::init(false)); + +// Print tracing output +static cl::opt<bool> TraceLSP("spp-trace", cl::Hidden, cl::init(false)); + +namespace { + +/** An analysis pass whose purpose is to identify each of the backedges in + the function which require a safepoint poll to be inserted. */ +struct PlaceBackedgeSafepointsImpl : public LoopPass { + static char ID; + + /// The output of the pass - gives a list of each backedge (described by + /// pointing at the branch) which needs a poll inserted. + std::vector<TerminatorInst *> PollLocations; + + /// True unless we're running spp-no-calls in which case we need to disable + /// the call-dependent placement opts. + bool CallSafepointsEnabled; + PlaceBackedgeSafepointsImpl(bool CallSafepoints = false) + : LoopPass(ID), CallSafepointsEnabled(CallSafepoints) { + initializePlaceBackedgeSafepointsImplPass(*PassRegistry::getPassRegistry()); + } + + bool runOnLoop(Loop *, LPPassManager &LPM) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + // needed for determining if the loop is finite + AU.addRequired<ScalarEvolution>(); + // to ensure each loop has a single backedge + // TODO: is this still required? + AU.addRequiredID(LoopSimplifyID); + + // We no longer modify the IR at all in this pass.
Thus all + // analyses are preserved. + AU.setPreservesAll(); + } +}; +} + +static cl::opt<bool> NoEntry("spp-no-entry", cl::Hidden, cl::init(false)); +static cl::opt<bool> NoCall("spp-no-call", cl::Hidden, cl::init(false)); +static cl::opt<bool> NoBackedge("spp-no-backedge", cl::Hidden, cl::init(false)); + +namespace { +struct PlaceSafepoints : public ModulePass { + static char ID; // Pass identification, replacement for typeid + + PlaceSafepoints() : ModulePass(ID) { + initializePlaceSafepointsPass(*PassRegistry::getPassRegistry()); + } + bool runOnModule(Module &M) override { + bool modified = false; + for (Function &F : M) { + modified |= runOnFunction(F); + } + return modified; + } + bool runOnFunction(Function &F); + + void getAnalysisUsage(AnalysisUsage &AU) const override { + // We modify the graph wholesale (inlining, block insertion, etc). We + // preserve nothing at the moment. We could potentially preserve dom tree + // if that were worth doing. + } +}; +} + +// Insert a safepoint poll immediately before the given instruction. Does +// not handle the parsability of state at the runtime call; that's the +// caller's job. +static void +InsertSafepointPoll(DominatorTree &DT, Instruction *after, + std::vector<CallSite> &ParsePointsNeeded /*rval*/); + +static bool isGCLeafFunction(const CallSite &CS); + +static bool needsStatepoint(const CallSite &CS) { + if (isGCLeafFunction(CS)) + return false; + if (CS.isCall()) { + CallInst *call = cast<CallInst>(CS.getInstruction()); + if (call->isInlineAsm()) + return false; + } + if (isStatepoint(CS) || isGCRelocate(CS) || isGCResult(CS)) { + return false; + } + return true; +} + +static Value *ReplaceWithStatepoint(const CallSite &CS, Pass *P); + +/// Returns true if this loop is known to contain a call safepoint which +/// must unconditionally execute on any iteration of the loop which returns +/// to the loop header via an edge from Pred. Returns a conservatively correct +/// answer; i.e. false is always valid. +static bool containsUnconditionalCallSafepoint(Loop *L, BasicBlock *Header, + BasicBlock *Pred, + DominatorTree &DT) { + // In general, we're looking for any cut of the graph which ensures + // there's a call safepoint along every edge between Header and Pred. + // For the moment, we look only for the 'cuts' that consist of a single call + // instruction in a block which is dominated by the Header and dominates the + // loop latch (Pred) block. Somewhat surprisingly, walking the entire chain + // of such dominating blocks gets substantially more occurrences than just + // checking the Pred and Header blocks themselves. This may be due to the + // density of loop exit conditions caused by range and null checks. + // TODO: structure this as an analysis pass, cache the result for subloops, + // avoid dom tree recalculations + assert(DT.dominates(Header, Pred) && "loop latch not dominated by header?"); + + BasicBlock *Current = Pred; + while (true) { + for (Instruction &I : *Current) { + if (CallSite CS = &I) + // Note: Technically, needing a safepoint isn't quite the right + // condition here. We should instead be checking if the target method + // has an + // unconditional poll. In practice, this is only a theoretical concern + // since we don't have any methods with conditional-only safepoint + // polls.
+ if (needsStatepoint(CS)) + return true; + } + + if (Current == Header) + break; + Current = DT.getNode(Current)->getIDom()->getBlock(); + } + + return false; +} + +/// Returns true if this loop is known to terminate in a finite number of +/// iterations. Note that this function may return false for a loop which +/// does actual terminate in a finite constant number of iterations due to +/// conservatism in the analysis. +static bool mustBeFiniteCountedLoop(Loop *L, ScalarEvolution *SE, + BasicBlock *Pred) { + // Only used when SkipCounted is off + const unsigned upperTripBound = 8192; + + // A conservative bound on the loop as a whole. + const SCEV *MaxTrips = SE->getMaxBackedgeTakenCount(L); + if (MaxTrips != SE->getCouldNotCompute()) { + if (SE->getUnsignedRange(MaxTrips).getUnsignedMax().ult(upperTripBound)) + return true; + if (SkipCounted && + SE->getUnsignedRange(MaxTrips).getUnsignedMax().isIntN(32)) + return true; + } + + // If this is a conditional branch to the header with the alternate path + // being outside the loop, we can ask questions about the execution frequency + // of the exit block. + if (L->isLoopExiting(Pred)) { + // This returns an exact expression only. TODO: We really only need an + // upper bound here, but SE doesn't expose that. + const SCEV *MaxExec = SE->getExitCount(L, Pred); + if (MaxExec != SE->getCouldNotCompute()) { + if (SE->getUnsignedRange(MaxExec).getUnsignedMax().ult(upperTripBound)) + return true; + if (SkipCounted && + SE->getUnsignedRange(MaxExec).getUnsignedMax().isIntN(32)) + return true; + } + } + + return /* not finite */ false; +} + +static void scanOneBB(Instruction *start, Instruction *end, + std::vector<CallInst *> &calls, + std::set<BasicBlock *> &seen, + std::vector<BasicBlock *> &worklist) { + for (BasicBlock::iterator itr(start); + itr != start->getParent()->end() && itr != BasicBlock::iterator(end); + itr++) { + if (CallInst *CI = dyn_cast<CallInst>(&*itr)) { + calls.push_back(CI); + } + // FIXME: This code does not handle invokes + assert(!dyn_cast<InvokeInst>(&*itr) && + "support for invokes in poll code needed"); + // Only add the successor blocks if we reach the terminator instruction + // without encountering end first + if (itr->isTerminator()) { + BasicBlock *BB = itr->getParent(); + for (BasicBlock *Succ : successors(BB)) { + if (seen.count(Succ) == 0) { + worklist.push_back(Succ); + seen.insert(Succ); + } + } + } + } +} +static void scanInlinedCode(Instruction *start, Instruction *end, + std::vector<CallInst *> &calls, + std::set<BasicBlock *> &seen) { + calls.clear(); + std::vector<BasicBlock *> worklist; + seen.insert(start->getParent()); + scanOneBB(start, end, calls, seen, worklist); + while (!worklist.empty()) { + BasicBlock *BB = worklist.back(); + worklist.pop_back(); + scanOneBB(&*BB->begin(), end, calls, seen, worklist); + } +} + +bool PlaceBackedgeSafepointsImpl::runOnLoop(Loop *L, LPPassManager &LPM) { + ScalarEvolution *SE = &getAnalysis<ScalarEvolution>(); + + // Loop through all predecessors of the loop header and identify all + // backedges. We need to place a safepoint on every backedge (potentially). + // Note: Due to LoopSimplify there should only be one. Assert? Or can we + // relax this? + BasicBlock *header = L->getHeader(); + + // TODO: Use the analysis pass infrastructure for this. There is no reason + // to recalculate this here. 
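// Editorial sketch, not part of this patch: mustBeFiniteCountedLoop above is
// essentially a bounded trip-count query against ScalarEvolution. A minimal
// illustration of the core check; the helper name and Bound parameter are
// hypothetical.
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include <cstdint>

static bool tripCountProvablyBelow(llvm::ScalarEvolution &SE, llvm::Loop *L,
                                   uint64_t Bound) {
  const llvm::SCEV *MaxTrips = SE.getMaxBackedgeTakenCount(L);
  if (MaxTrips == SE.getCouldNotCompute())
    return false; // no usable bound; treat the loop as potentially unbounded
  // getUnsignedRange() gives a conservative ConstantRange for the expression.
  return SE.getUnsignedRange(MaxTrips).getUnsignedMax().ult(Bound);
}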
+ DominatorTree DT; + DT.recalculate(*header->getParent()); + + bool modified = false; + for (BasicBlock *pred : predecessors(header)) { + if (!L->contains(pred)) { + // This is not a backedge, it's coming from outside the loop + continue; + } + + // Make a policy decision about whether this loop needs a safepoint or + // not. Note that this is about unburdening the optimizer in loops, not + // avoiding the runtime cost of the actual safepoint. + if (!AllBackedges) { + if (mustBeFiniteCountedLoop(L, SE, pred)) { + if (TraceLSP) + errs() << "skipping safepoint placement in finite loop\n"; + FiniteExecution++; + continue; + } + if (CallSafepointsEnabled && + containsUnconditionalCallSafepoint(L, header, pred, DT)) { + // Note: This is only semantically legal since we won't do any further + // IPO or inlining before the actual call insertion. If we did, we + // might later lose this call safepoint. + if (TraceLSP) + errs() << "skipping safepoint placement due to unconditional call\n"; + CallInLoop++; + continue; + } + } + + // TODO: We can create an inner loop which runs a finite number of + // iterations with an outer loop which contains a safepoint. This would + // not help runtime performance that much, but it might help our ability to + // optimize the inner loop. + + // We're unconditionally going to modify this loop. + modified = true; + + // Safepoint insertion would involve creating a new basic block (as the + // target of the current backedge) which does the safepoint (of all live + // variables) and branches to the true header + TerminatorInst *term = pred->getTerminator(); + + if (TraceLSP) { + errs() << "[LSP] terminator instruction: "; + term->dump(); + } + + PollLocations.push_back(term); + } + + return modified; +} + +static Instruction *findLocationForEntrySafepoint(Function &F, + DominatorTree &DT) { + + // Conceptually, this poll needs to be on method entry, but in + // practice, we place it as late in the entry block as possible. We + // can place it as late as we want as long as it dominates all calls + // that can grow the stack. This, combined with backedge polls, + // gives us all the progress guarantees we need. + + // Due to the way the frontend generates IR, we may have a couple of initial + // basic blocks before the first bytecode. These will be single-entry + // single-exit blocks which conceptually are just part of the first 'real + // basic block'. Since we don't have deopt state until the first bytecode, + // walk forward until we've found the first unconditional branch or merge. + + // hasNextInstruction and nextInstruction are used to iterate + // through a "straight line" execution sequence. + + auto hasNextInstruction = [](Instruction *I) { + if (!I->isTerminator()) { + return true; + } + BasicBlock *nextBB = I->getParent()->getUniqueSuccessor(); + return nextBB && (nextBB->getUniquePredecessor() != nullptr); + }; + + auto nextInstruction = [&hasNextInstruction](Instruction *I) { + assert(hasNextInstruction(I) && + "first check if there is a next instruction!"); + if (I->isTerminator()) { + return I->getParent()->getUniqueSuccessor()->begin(); + } else { + return std::next(BasicBlock::iterator(I)); + } + }; + + Instruction *cursor = nullptr; + for (cursor = F.getEntryBlock().begin(); hasNextInstruction(cursor); + cursor = nextInstruction(cursor)) { + + // We need to stop going forward as soon as we see a call that can + // grow the stack (i.e. the call target has a non-zero frame + // size).
+ if (CallSite CS = cursor) { + (void)CS; // Silence an unused variable warning by gcc 4.8.2 + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(cursor)) { + // llvm.assume(...) are not really calls. + if (II->getIntrinsicID() == Intrinsic::assume) { + continue; + } + } + break; + } + } + + assert((hasNextInstruction(cursor) || cursor->isTerminator()) && + "either we stopped because of a call, or because of terminator"); + + if (cursor->isTerminator()) { + return cursor; + } + + BasicBlock *BB = cursor->getParent(); + SplitBlock(BB, cursor, nullptr); + + // Note: SplitBlock modifies the DT. Simply passing a Pass (which is a + // module pass) is not enough. + DT.recalculate(F); +#ifndef NDEBUG + // SplitBlock updates the DT + DT.verifyDomTree(); +#endif + + return BB->getTerminator(); +} + +/// Identify the list of call sites which need to have parseable state +static void findCallSafepoints(Function &F, + std::vector<CallSite> &Found /*rval*/) { + assert(Found.empty() && "must be empty!"); + for (Instruction &I : inst_range(F)) { + Instruction *inst = &I; + if (isa<CallInst>(inst) || isa<InvokeInst>(inst)) { + CallSite CS(inst); + + // No safepoint needed or wanted + if (!needsStatepoint(CS)) { + continue; + } + + Found.push_back(CS); + } + } +} + +/// Implement a unique function which doesn't require we sort the input +/// vector. Doing so has the effect of changing the output of a couple of +/// tests in ways which make them less useful in testing fused safepoints. +template <typename T> static void unique_unsorted(std::vector<T> &vec) { + std::set<T> seen; + std::vector<T> tmp; + vec.reserve(vec.size()); + std::swap(tmp, vec); + for (auto V : tmp) { + if (seen.insert(V).second) { + vec.push_back(V); + } + } +} + +static std::string GCSafepointPollName("gc.safepoint_poll"); + +static bool isGCSafepointPoll(Function &F) { + return F.getName().equals(GCSafepointPollName); +} + +/// Returns true if this function should be rewritten to include safepoint +/// polls and parseable call sites. The main point of this function is to be +/// an extension point for custom logic. +static bool shouldRewriteFunction(Function &F) { + // TODO: This should check the GCStrategy + if (F.hasGC()) { + const std::string StatepointExampleName("statepoint-example"); + return StatepointExampleName == F.getGC(); + } else + return false; +} + +// TODO: These should become properties of the GCStrategy, possibly with +// command line overrides. +static bool enableEntrySafepoints(Function &F) { return !NoEntry; } +static bool enableBackedgeSafepoints(Function &F) { return !NoBackedge; } +static bool enableCallSafepoints(Function &F) { return !NoCall; } + + +bool PlaceSafepoints::runOnFunction(Function &F) { + if (F.isDeclaration() || F.empty()) { + // This is a declaration, nothing to do. Must exit early to avoid crash in + // dom tree calculation + return false; + } + + if (isGCSafepointPoll(F)) { + // Given we're inlining this inside of safepoint poll insertion, this + // doesn't make any sense. Note that we do make any contained calls + // parseable after we inline a poll. + return false; + } + + if (!shouldRewriteFunction(F)) + return false; + + bool modified = false; + + // In various bits below, we rely on the fact that uses are reachable from + // defs. When there are basic blocks unreachable from the entry, dominance + // and reachability queries return nonsensical results. Thus, we preprocess + // the function to ensure these properties hold.
+ modified |= removeUnreachableBlocks(F); + + // STEP 1 - Insert the safepoint polling locations. We do not need to + // actually insert parse points yet. That will be done for all polls and + // calls in a single pass. + + // Note: With the migration, we need to recompute this for each 'pass'. Once + // we merge these, we'll do it once before the analysis + DominatorTree DT; + + std::vector<CallSite> ParsePointNeeded; + + if (enableBackedgeSafepoints(F)) { + // Construct a pass manager to run the LoopPass backedge logic. We + // need the pass manager to handle scheduling all the loop passes + // appropriately. Doing this by hand is painful and just not worth messing + // with for the moment. + legacy::FunctionPassManager FPM(F.getParent()); + bool CanAssumeCallSafepoints = enableCallSafepoints(F); + PlaceBackedgeSafepointsImpl *PBS = + new PlaceBackedgeSafepointsImpl(CanAssumeCallSafepoints); + FPM.add(PBS); + // Note: While the analysis pass itself won't modify the IR, LoopSimplify + // (which it depends on) may. i.e. analysis must be recalculated after run + FPM.run(F); + + // We preserve dominance information when inserting the poll, otherwise + // we'd have to recalculate this on every insert + DT.recalculate(F); + + // Insert a poll at each point the analysis pass identified + for (size_t i = 0; i < PBS->PollLocations.size(); i++) { + // We are inserting a poll, the function is modified + modified = true; + + // The poll location must be the terminator of a loop latch block. + TerminatorInst *Term = PBS->PollLocations[i]; + + std::vector<CallSite> ParsePoints; + if (SplitBackedge) { + // Split the backedge of the loop and insert the poll within that new + // basic block. This creates a loop with two latches per original + // latch (which is non-ideal), but this appears to be easier to + // optimize in practice than inserting the poll immediately before the + // latch test. + + // Since this is a latch, at least one of the successors must dominate + // it. It's possible that we have a) duplicate edges to the same header + // and b) edges to distinct loop headers. We need to insert polls on + // each. (Note: This still relies on LoopSimplify.) + DenseSet<BasicBlock *> Headers; + for (unsigned i = 0; i < Term->getNumSuccessors(); i++) { + BasicBlock *Succ = Term->getSuccessor(i); + if (DT.dominates(Succ, Term->getParent())) { + Headers.insert(Succ); + } + } + assert(!Headers.empty() && "poll location is not a loop latch?"); + + // The split loop structure here is so that we only need to recalculate + // the dominator tree once. Alternatively, we could just keep it up to + // date and use a more natural merged loop. + DenseSet<BasicBlock *> SplitBackedges; + for (BasicBlock *Header : Headers) { + BasicBlock *NewBB = SplitEdge(Term->getParent(), Header, nullptr); + SplitBackedges.insert(NewBB); + } + DT.recalculate(F); + for (BasicBlock *NewBB : SplitBackedges) { + InsertSafepointPoll(DT, NewBB->getTerminator(), ParsePoints); + NumBackedgeSafepoints++; + } + + } else { + // Split the latch block itself, right before the terminator. + InsertSafepointPoll(DT, Term, ParsePoints); + NumBackedgeSafepoints++; + } + + // Record the parse points for later use + ParsePointNeeded.insert(ParsePointNeeded.end(), ParsePoints.begin(), + ParsePoints.end()); + } + } + + if (enableEntrySafepoints(F)) { + DT.recalculate(F); + Instruction *term = findLocationForEntrySafepoint(F, DT); + if (!term) { + // policy choice not to insert?
+ } else { + std::vector<CallSite> RuntimeCalls; + InsertSafepointPoll(DT, term, RuntimeCalls); + modified = true; + NumEntrySafepoints++; + ParsePointNeeded.insert(ParsePointNeeded.end(), RuntimeCalls.begin(), + RuntimeCalls.end()); + } + } + + if (enableCallSafepoints(F)) { + DT.recalculate(F); + std::vector<CallSite> Calls; + findCallSafepoints(F, Calls); + NumCallSafepoints += Calls.size(); + ParsePointNeeded.insert(ParsePointNeeded.end(), Calls.begin(), Calls.end()); + } + + // Unique the vectors since we can end up with duplicates if we scan the call + // site for call safepoints after we add it for entry or backedge. The + // only reason we need tracking at all is that some functions might have + // polls but not call safepoints and thus we might miss marking the runtime + // calls for the polls. (This is useful in test cases!) + unique_unsorted(ParsePointNeeded); + + // Any parse point (no matter what source) will be handled here + DT.recalculate(F); // Needed? + + // We're about to start modifying the function + if (!ParsePointNeeded.empty()) + modified = true; + + // Now run through and insert the safepoints, but do _NOT_ update or remove + // any existing uses. We have references to live variables that need to + // survive to the last iteration of this loop. + std::vector<Value *> Results; + Results.reserve(ParsePointNeeded.size()); + for (size_t i = 0; i < ParsePointNeeded.size(); i++) { + CallSite &CS = ParsePointNeeded[i]; + Value *GCResult = ReplaceWithStatepoint(CS, nullptr); + Results.push_back(GCResult); + } + assert(Results.size() == ParsePointNeeded.size()); + + // Adjust all users of the old call sites to use the new ones instead + for (size_t i = 0; i < ParsePointNeeded.size(); i++) { + CallSite &CS = ParsePointNeeded[i]; + Value *GCResult = Results[i]; + if (GCResult) { + // In case if we inserted result in a different basic block than the + // original safepoint (this can happen for invokes). We need to be sure + // that + // original result value was not used in any of the phi nodes at the + // beginning of basic block with gc result. Because we know that all such + // blocks will have single predecessor we can safely assume that all phi + // nodes have single entry (because of normalizeBBForInvokeSafepoint). + // Just remove them all here. + if (CS.isInvoke()) { + FoldSingleEntryPHINodes(cast<Instruction>(GCResult)->getParent(), + nullptr); + assert( + !isa<PHINode>(cast<Instruction>(GCResult)->getParent()->begin())); + } + + // Replace all uses with the new call + CS.getInstruction()->replaceAllUsesWith(GCResult); + } + + // Now that we've handled all uses, remove the original call itself + // Note: The insert point can't be the deleted instruction! 
+ CS.getInstruction()->eraseFromParent(); + } + return modified; +} + +char PlaceBackedgeSafepointsImpl::ID = 0; +char PlaceSafepoints::ID = 0; + +ModulePass *llvm::createPlaceSafepointsPass() { return new PlaceSafepoints(); } + +INITIALIZE_PASS_BEGIN(PlaceBackedgeSafepointsImpl, + "place-backedge-safepoints-impl", + "Place Backedge Safepoints", false, false) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_END(PlaceBackedgeSafepointsImpl, + "place-backedge-safepoints-impl", + "Place Backedge Safepoints", false, false) + +INITIALIZE_PASS_BEGIN(PlaceSafepoints, "place-safepoints", "Place Safepoints", + false, false) +INITIALIZE_PASS_END(PlaceSafepoints, "place-safepoints", "Place Safepoints", + false, false) + +static bool isGCLeafFunction(const CallSite &CS) { + Instruction *inst = CS.getInstruction(); + if (isa<IntrinsicInst>(inst)) { + // Most LLVM intrinsics are things which can never take a safepoint. + // As a result, we don't need to have the stack parsable at the + // callsite. This is a highly useful optimization since intrinsic + // calls are fairly prevalent, particularly in debug builds. + return true; + } + + // If this function is marked explicitly as a leaf call, we don't need to + // place a safepoint for it. In fact, for correctness we *can't* in many + // cases. Note: Indirect calls return null for the called function; + // these obviously aren't runtime functions with attributes + // TODO: Support attributes on the call site as well. + const Function *F = CS.getCalledFunction(); + bool isLeaf = + F && + F->getFnAttribute("gc-leaf-function").getValueAsString().equals("true"); + if (isLeaf) { + return true; + } + return false; +} + +static void +InsertSafepointPoll(DominatorTree &DT, Instruction *term, + std::vector<CallSite> &ParsePointsNeeded /*rval*/) { + Module *M = term->getParent()->getParent()->getParent(); + assert(M); + + // Inline the safepoint poll implementation - this will get all the branch, + // control flow, etc. Most importantly, it will introduce the actual slow + // path call - where we need to insert a safepoint (parsepoint).
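// Editorial sketch, not part of this patch: the two opt-in hooks this pass
// keys off of, as a frontend might set them. The function below is
// hypothetical, but the GC name and attribute string match the checks in
// shouldRewriteFunction() and isGCLeafFunction() above.
#include "llvm/IR/Function.h"

static void optIntoSafepoints(llvm::Function &MutatorFn,
                              llvm::Function &LeafRuntimeFn) {
  // Only functions using this GC strategy are rewritten by the pass.
  MutatorFn.setGC("statepoint-example");
  // Calls to functions carrying this attribute are never given parse points.
  LeafRuntimeFn.addFnAttr("gc-leaf-function", "true");
}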
+ FunctionType *ftype = + FunctionType::get(Type::getVoidTy(M->getContext()), false); + assert(ftype && "null?"); + // Note: This cast can fail if there's a function of the same name with a + // different type inserted previously + Function *F = + dyn_cast<Function>(M->getOrInsertFunction("gc.safepoint_poll", ftype)); + assert(F && "void @gc.safepoint_poll() must be defined"); + assert(!F->empty() && "gc.safepoint_poll must be a non-empty function"); + CallInst *poll = CallInst::Create(F, "", term); + + // Record some information about the call site we're replacing + BasicBlock *OrigBB = term->getParent(); + BasicBlock::iterator before(poll), after(poll); + bool isBegin(false); + if (before == term->getParent()->begin()) { + isBegin = true; + } else { + before--; + } + after++; + assert(after != poll->getParent()->end() && "must have successor"); + assert(DT.dominates(before, after) && "trivially true"); + + // do the actual inlining + InlineFunctionInfo IFI; + bool inlineStatus = InlineFunction(poll, IFI); + assert(inlineStatus && "inline must succeed"); + (void)inlineStatus; // suppress warning in release-asserts + + // Check post conditions + assert(IFI.StaticAllocas.empty() && "can't have allocs"); + + std::vector<CallInst *> calls; // new calls + std::set<BasicBlock *> BBs; // new BBs + insertee + // Include only the newly inserted instructions. Note: begin may not be valid + // if we inserted to the beginning of the basic block + BasicBlock::iterator start; + if (isBegin) { + start = OrigBB->begin(); + } else { + start = before; + start++; + } + + // If your poll function includes an unreachable at the end, that's not + // valid. Bugpoint likes to create this, so check for it. + assert(isPotentiallyReachable(&*start, &*after, nullptr, nullptr) && + "malformed poll function"); + + scanInlinedCode(&*(start), &*(after), calls, BBs); + + // Recompute since we've invalidated cached data. Conceptually we + // shouldn't need to do this, but implementation-wise we appear to. Needed + // so we can insert safepoints correctly. + // TODO: update more cheaply + DT.recalculate(*after->getParent()->getParent()); + + assert(!calls.empty() && "slow path not found for safepoint poll"); + + // Record the fact we need a parsable state at the runtime call contained in + // the poll function. This is required so that the runtime knows how to + // parse the last frame when we actually take the safepoint (i.e. execute + // the slow path) + assert(ParsePointsNeeded.empty()); + for (size_t i = 0; i < calls.size(); i++) { + + // No safepoint needed or wanted + if (!needsStatepoint(calls[i])) { + continue; + } + + // These are likely runtime calls. Should we assert that via calling + // convention or something? + ParsePointsNeeded.push_back(CallSite(calls[i])); + } + assert(ParsePointsNeeded.size() <= calls.size()); +} + +// Normalize basic block to make it ready to be target of invoke statepoint. +// This means splitting it so that it has a single predecessor. Returns the +// newly created BB, ready to be the successor of the invoke statepoint. +static BasicBlock *normalizeBBForInvokeSafepoint(BasicBlock *BB, + BasicBlock *InvokeParent) { + BasicBlock *ret = BB; + + if (!BB->getUniquePredecessor()) { + ret = SplitBlockPredecessors(BB, InvokeParent, ""); + } + + // Another requirement for such basic blocks is to not have any phi nodes. + // Since we just ensured that the new BB will have a single predecessor, + // all phi nodes in it will have one value. Here would be a natural place + // to + // remove them all.
But we cannot do this because we risk removing + // one of the values stored in the liveset of another statepoint. We will do it + // later, after placing all safepoints. + + return ret; +} + +/// Replaces the given call site (Call or Invoke) with a gc.statepoint +/// intrinsic with an empty deoptimization arguments list. This does +/// NOT do explicit relocation for GC support. +static Value *ReplaceWithStatepoint(const CallSite &CS, /* to replace */ + Pass *P) { + BasicBlock *BB = CS.getInstruction()->getParent(); + Function *F = BB->getParent(); + Module *M = F->getParent(); + assert(M && "must be set"); + + // TODO: technically, a pass is not allowed to get functions from within a + // function pass since it might trigger a new function addition. Refactor + // this logic out to the initialization of the pass. Doesn't appear to + // matter in practice. + + // Then go ahead and use the builder to actually do the inserts. We insert + // immediately before the previous instruction under the assumption that all + // arguments will be available here. We can't insert afterwards since we may + // be replacing a terminator. + Instruction *insertBefore = CS.getInstruction(); + IRBuilder<> Builder(insertBefore); + + // Note: The gc args are not filled in at this time, that's handled by + // RewriteStatepointsForGC (which is currently under review). + + // Create the statepoint given all the arguments + Instruction *token = nullptr; + AttributeSet return_attributes; + if (CS.isCall()) { + CallInst *toReplace = cast<CallInst>(CS.getInstruction()); + CallInst *Call = Builder.CreateGCStatepoint( + CS.getCalledValue(), makeArrayRef(CS.arg_begin(), CS.arg_end()), None, + None, "safepoint_token"); + Call->setTailCall(toReplace->isTailCall()); + Call->setCallingConv(toReplace->getCallingConv()); + + // Before we have to worry about GC semantics, all attributes are legal + AttributeSet new_attrs = toReplace->getAttributes(); + // If we can handle this set of attributes, set up function attrs + // directly on the statepoint and return attrs later for the gc_result intrinsic. + Call->setAttributes(new_attrs.getFnAttributes()); + return_attributes = new_attrs.getRetAttributes(); + // TODO: handle param attributes + + token = Call; + + // Put the following gc_result and gc_relocate calls immediately after + // the old call (which we're about to delete) + BasicBlock::iterator next(toReplace); + assert(BB->end() != next && "not a terminator, must have next"); + next++; + Instruction *IP = &*(next); + Builder.SetInsertPoint(IP); + Builder.SetCurrentDebugLocation(IP->getDebugLoc()); + + } else if (CS.isInvoke()) { + // TODO: make CreateGCStatepoint return an Instruction that we can cast to a + // Call or Invoke, instead of doing this junk here. + + // Fill in the one generic type'd argument (the function is also + // vararg) + std::vector<Type *> argTypes; + argTypes.push_back(CS.getCalledValue()->getType()); + + Function *gc_statepoint_decl = Intrinsic::getDeclaration( + M, Intrinsic::experimental_gc_statepoint, argTypes); + + // First, create the statepoint (with all live ptrs as arguments). + std::vector<llvm::Value *> args; + // target, #call args, unused, ... call parameters, #deopt args, ... deopt + // parameters, ...
gc parameters + Value *Target = CS.getCalledValue(); + args.push_back(Target); + int callArgSize = CS.arg_size(); + // #call args + args.push_back(Builder.getInt32(callArgSize)); + // unused + args.push_back(Builder.getInt32(0)); + // call parameters + args.insert(args.end(), CS.arg_begin(), CS.arg_end()); + // #deopt args: 0 + args.push_back(Builder.getInt32(0)); + + InvokeInst *toReplace = cast<InvokeInst>(CS.getInstruction()); + + // Insert the new invoke into the old block. We'll remove the old one in a + // moment at which point this will become the new terminator for the + // original block. + InvokeInst *invoke = InvokeInst::Create( + gc_statepoint_decl, toReplace->getNormalDest(), + toReplace->getUnwindDest(), args, "", toReplace->getParent()); + invoke->setCallingConv(toReplace->getCallingConv()); + + // Currently we will fail on parameter attributes and on certain + // function attributes. + AttributeSet new_attrs = toReplace->getAttributes(); + // If we can handle this set of attributes, set up function attrs + // directly on the statepoint and return attrs later for the gc_result intrinsic. + invoke->setAttributes(new_attrs.getFnAttributes()); + return_attributes = new_attrs.getRetAttributes(); + + token = invoke; + + // We'll insert the gc.result into the normal block + BasicBlock *normalDest = normalizeBBForInvokeSafepoint( + toReplace->getNormalDest(), invoke->getParent()); + Instruction *IP = &*(normalDest->getFirstInsertionPt()); + Builder.SetInsertPoint(IP); + } else { + llvm_unreachable("unexpected type of CallSite"); + } + assert(token); + + // Handle the return value of the original call - update all uses to use a + // gc_result hanging off the statepoint node we just inserted + + // Only add the gc_result iff there is actually a used result + if (!CS.getType()->isVoidTy() && !CS.getInstruction()->use_empty()) { + std::string takenName = + CS.getInstruction()->hasName() ? CS.getInstruction()->getName() : ""; + CallInst *gc_result = + Builder.CreateGCResult(token, CS.getType(), takenName); + gc_result->setAttributes(return_attributes); + return gc_result; + } else { + // No return value for the call. + return nullptr; + } +} diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp index 1bbaaf3..98016b4 100644 --- a/lib/Transforms/Scalar/Reassociate.cpp +++ b/lib/Transforms/Scalar/Reassociate.cpp @@ -917,10 +917,13 @@ void Reassociate::RewriteExprTree(BinaryOperator *I, /// version of the value is returned, and BI is left pointing at the instruction /// that should be processed next by the reassociation pass. static Value *NegateValue(Value *V, Instruction *BI) { - if (ConstantFP *C = dyn_cast<ConstantFP>(V)) - return ConstantExpr::getFNeg(C); - if (Constant *C = dyn_cast<Constant>(V)) + if (Constant *C = dyn_cast<Constant>(V)) { + if (C->getType()->isFPOrFPVectorTy()) { + return ConstantExpr::getFNeg(C); + } return ConstantExpr::getNeg(C); + } + // We are trying to expose opportunity for reassociation. One of the things // that we want to do to achieve this is to push a negation as deep into an @@ -1512,7 +1515,7 @@ Value *Reassociate::OptimizeAdd(Instruction *I, ++NumFound; } while (i != Ops.size() && Ops[i].Op == TheOp); - DEBUG(errs() << "\nFACTORING [" << NumFound << "]: " << *TheOp << '\n'); + DEBUG(dbgs() << "\nFACTORING [" << NumFound << "]: " << *TheOp << '\n'); ++NumFactor; // Insert a new multiply.
@@ -1650,7 +1653,7 @@ Value *Reassociate::OptimizeAdd(Instruction *I, // If any factor occurred more than one time, we can pull it out. if (MaxOcc > 1) { - DEBUG(errs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal << '\n'); + DEBUG(dbgs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal << '\n'); ++NumFactor; // Create a new instruction that uses the MaxOccVal twice. If we don't do @@ -1988,7 +1991,7 @@ Instruction *Reassociate::canonicalizeNegConstExpr(Instruction *I) { Constant *C = C0 ? C0 : C1; unsigned ConstIdx = C0 ? 0 : 1; if (auto *CI = dyn_cast<ConstantInt>(C)) { - if (!CI->isNegative()) + if (!CI->isNegative() || CI->isMinValue(true)) return nullptr; } else if (auto *CF = dyn_cast<ConstantFP>(C)) { if (!CF->isNegative()) diff --git a/lib/Transforms/Scalar/Reg2Mem.cpp b/lib/Transforms/Scalar/Reg2Mem.cpp index b6023e2..1b46727 100644 --- a/lib/Transforms/Scalar/Reg2Mem.cpp +++ b/lib/Transforms/Scalar/Reg2Mem.cpp @@ -73,7 +73,7 @@ bool RegToMem::runOnFunction(Function &F) { // Insert all new allocas into entry block. BasicBlock *BBEntry = &F.getEntryBlock(); - assert(pred_begin(BBEntry) == pred_end(BBEntry) && + assert(pred_empty(BBEntry) && "Entry block to function must not have predecessors!"); // Find first non-alloca instruction and create insertion point. This is diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp new file mode 100644 index 0000000..ca9ab54 --- /dev/null +++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -0,0 +1,1897 @@ +//===- RewriteStatepointsForGC.cpp - Make GC relocations explicit ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Rewrite an existing set of gc.statepoints such that they make potential +// relocations performed by the garbage collector explicit in the IR. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Pass.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/ADT/SetOperations.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Statepoint.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/PromoteMemToReg.h" + +#define DEBUG_TYPE "rewrite-statepoints-for-gc" + +using namespace llvm; + +// Print tracing output +static cl::opt<bool> TraceLSP("trace-rewrite-statepoints", cl::Hidden, + cl::init(false)); + +// Print the liveset found at the insert location +static cl::opt<bool> PrintLiveSet("spp-print-liveset", cl::Hidden, + cl::init(false)); +static cl::opt<bool> PrintLiveSetSize("spp-print-liveset-size", + cl::Hidden, cl::init(false)); +// Print out the base pointers for debugging +static cl::opt<bool> PrintBasePointers("spp-print-base-pointers", + cl::Hidden, cl::init(false)); + +namespace { +struct RewriteStatepointsForGC : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + + RewriteStatepointsForGC() : FunctionPass(ID) { + initializeRewriteStatepointsForGCPass(*PassRegistry::getPassRegistry()); + } + bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + // We add and rewrite a bunch of instructions, but don't really do much + // else. We could in theory preserve a lot more analyses here. + AU.addRequired<DominatorTreeWrapperPass>(); + } +}; +} // namespace + +char RewriteStatepointsForGC::ID = 0; + +FunctionPass *llvm::createRewriteStatepointsForGCPass() { + return new RewriteStatepointsForGC(); +} + +INITIALIZE_PASS_BEGIN(RewriteStatepointsForGC, "rewrite-statepoints-for-gc", + "Make relocations explicit at statepoints", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(RewriteStatepointsForGC, "rewrite-statepoints-for-gc", + "Make relocations explicit at statepoints", false, false) + +namespace { +// The type of the internal cache used inside the findBasePointers family +// of functions. From the callers perspective, this is an opaque type and +// should not be inspected. +// +// In the actual implementation this caches two relations: +// - The base relation itself (i.e. this pointer is based on that one) +// - The base defining value relation (i.e. before base_phi insertion) +// Generally, after the execution of a full findBasePointer call, only the +// base relation will remain. 
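Editor's note: given the INITIALIZE_PASS registration above, the new pass is reachable as -rewrite-statepoints-for-gc from opt, or programmatically through the legacy pass manager. A minimal sketch, assuming the 3.6-era legacy::PassManager and that createRewriteStatepointsForGCPass is declared in llvm/Transforms/Scalar.h alongside the other scalar passes:

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Scalar.h"
using namespace llvm;

static void rewriteStatepoints(Module &M) {
  legacy::PassManager PM;
  // DominatorTreeWrapperPass is scheduled automatically via getAnalysisUsage.
  PM.add(createRewriteStatepointsForGCPass());
  PM.run(M);
}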
Internally, we add a mixture of the two +// types, then update all the second type to the first type +typedef DenseMap<Value *, Value *> DefiningValueMapTy; +typedef DenseSet<llvm::Value *> StatepointLiveSetTy; + +struct PartiallyConstructedSafepointRecord { + /// The set of values known to be live accross this safepoint + StatepointLiveSetTy liveset; + + /// Mapping from live pointers to a base-defining-value + DenseMap<llvm::Value *, llvm::Value *> PointerToBase; + + /// Any new values which were added to the IR during base pointer analysis + /// for this safepoint + DenseSet<llvm::Value *> NewInsertedDefs; + + /// The *new* gc.statepoint instruction itself. This produces the token + /// that normal path gc.relocates and the gc.result are tied to. + Instruction *StatepointToken; + + /// Instruction to which exceptional gc relocates are attached + /// Makes it easier to iterate through them during relocationViaAlloca. + Instruction *UnwindToken; +}; +} + +// TODO: Once we can get to the GCStrategy, this becomes +// Optional<bool> isGCManagedPointer(const Value *V) const override { + +static bool isGCPointerType(const Type *T) { + if (const PointerType *PT = dyn_cast<PointerType>(T)) + // For the sake of this example GC, we arbitrarily pick addrspace(1) as our + // GC managed heap. We know that a pointer into this heap needs to be + // updated and that no other pointer does. + return (1 == PT->getAddressSpace()); + return false; +} + +/// Return true if the Value is a gc reference type which is potentially used +/// after the instruction 'loc'. This is only used with the edge reachability +/// liveness code. Note: It is assumed the V dominates loc. +static bool isLiveGCReferenceAt(Value &V, Instruction *loc, DominatorTree &DT, + LoopInfo *LI) { + if (!isGCPointerType(V.getType())) + return false; + + if (V.use_empty()) + return false; + + // Given assumption that V dominates loc, this may be live + return true; +} + +#ifndef NDEBUG +static bool isAggWhichContainsGCPtrType(Type *Ty) { + if (VectorType *VT = dyn_cast<VectorType>(Ty)) + return isGCPointerType(VT->getScalarType()); + if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) + return isGCPointerType(AT->getElementType()) || + isAggWhichContainsGCPtrType(AT->getElementType()); + if (StructType *ST = dyn_cast<StructType>(Ty)) + return std::any_of(ST->subtypes().begin(), ST->subtypes().end(), + [](Type *SubType) { + return isGCPointerType(SubType) || + isAggWhichContainsGCPtrType(SubType); + }); + return false; +} +#endif + +// Conservatively identifies any definitions which might be live at the +// given instruction. The analysis is performed immediately before the +// given instruction. Values defined by that instruction are not considered +// live. Values used by that instruction are considered live. +// +// preconditions: valid IR graph, term is either a terminator instruction or +// a call instruction, pred is the basic block of term, DT, LI are valid +// +// side effects: none, does not mutate IR +// +// postconditions: populates liveValues as discussed above +static void findLiveGCValuesAtInst(Instruction *term, BasicBlock *pred, + DominatorTree &DT, LoopInfo *LI, + StatepointLiveSetTy &liveValues) { + liveValues.clear(); + + assert(isa<CallInst>(term) || isa<InvokeInst>(term) || term->isTerminator()); + + Function *F = pred->getParent(); + + auto is_live_gc_reference = + [&](Value &V) { return isLiveGCReferenceAt(V, term, DT, LI); }; + + // Are there any gc pointer arguments live over this point? 
This needs to be + // special cased since arguments aren't defined in basic blocks. + for (Argument &arg : F->args()) { + assert(!isAggWhichContainsGCPtrType(arg.getType()) && + "support for FCA unimplemented"); + + if (is_live_gc_reference(arg)) { + liveValues.insert(&arg); + } + } + + // Walk through all dominating blocks - the ones which can contain + // definitions used in this block - and check to see if any of the values + // they define are used in locations potentially reachable from the + // interesting instruction. + BasicBlock *BBI = pred; + while (true) { + if (TraceLSP) { + errs() << "[LSP] Looking at dominating block " << pred->getName() << "\n"; + } + assert(DT.dominates(BBI, pred)); + assert(isPotentiallyReachable(BBI, pred, &DT) && + "dominated block must be reachable"); + + // Walk through the instructions in dominating blocks and keep any + // that have a use potentially reachable from the block we're + // considering putting the safepoint in + for (Instruction &inst : *BBI) { + if (TraceLSP) { + errs() << "[LSP] Looking at instruction "; + inst.dump(); + } + + if (pred == BBI && (&inst) == term) { + if (TraceLSP) { + errs() << "[LSP] stopped because we encountered the safepoint " + "instruction.\n"; + } + + // If we're in the block which defines the interesting instruction, + // we don't want to include any values as live which are defined + // _after_ the interesting line or as part of the line itself + // i.e. "term" is the call instruction for a call safepoint, the + // results of the call should not be considered live in that stackmap + break; + } + + assert(!isAggWhichContainsGCPtrType(inst.getType()) && + "support for FCA unimplemented"); + + if (is_live_gc_reference(inst)) { + if (TraceLSP) { + errs() << "[LSP] found live value for this safepoint "; + inst.dump(); + term->dump(); + } + liveValues.insert(&inst); + } + } + if (!DT.getNode(BBI)->getIDom()) { + assert(BBI == &F->getEntryBlock() && + "failed to find a dominator for something other than " + "the entry block"); + break; + } + BBI = DT.getNode(BBI)->getIDom()->getBlock(); + } +} + +static bool order_by_name(llvm::Value *a, llvm::Value *b) { + if (a->hasName() && b->hasName()) { + return -1 == a->getName().compare(b->getName()); + } else if (a->hasName() && !b->hasName()) { + return true; + } else if (!a->hasName() && b->hasName()) { + return false; + } else { + // Better than nothing, but not stable + return a < b; + } +} + +/// Find the initial live set. Note that due to base pointer +/// insertion, the live set may be incomplete. 
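Editor's note: the loop in findLiveGCValuesAtInst above visits exactly the blocks whose definitions can reach the safepoint, namely the safepoint's own block plus its chain of immediate dominators up to the entry block. A standalone sketch of that traversal shape; ToyBlock is a hypothetical stand-in for a BasicBlock plus its dominator-tree node, and the per-value liveness filter is elided:

#include <vector>

struct ToyBlock {
  ToyBlock *IDom;               // immediate dominator; null for the entry block
  std::vector<int> Definitions; // stand-ins for the instructions defined here
};

// Collect every candidate definition in the safepoint's block and in all
// blocks dominating it, mirroring the IDom walk above.
static std::vector<int> collectDominatingDefs(ToyBlock *SafepointBlock) {
  std::vector<int> Candidates;
  for (ToyBlock *BB = SafepointBlock; BB; BB = BB->IDom)
    Candidates.insert(Candidates.end(), BB->Definitions.begin(),
                      BB->Definitions.end());
  return Candidates;
}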
+static void +analyzeParsePointLiveness(DominatorTree &DT, const CallSite &CS, + PartiallyConstructedSafepointRecord &result) { + Instruction *inst = CS.getInstruction(); + + BasicBlock *BB = inst->getParent(); + StatepointLiveSetTy liveset; + findLiveGCValuesAtInst(inst, BB, DT, nullptr, liveset); + + if (PrintLiveSet) { + // Note: This output is used by several of the test cases + // The order of elemtns in a set is not stable, put them in a vec and sort + // by name + SmallVector<Value *, 64> temp; + temp.insert(temp.end(), liveset.begin(), liveset.end()); + std::sort(temp.begin(), temp.end(), order_by_name); + errs() << "Live Variables:\n"; + for (Value *V : temp) { + errs() << " " << V->getName(); // no newline + V->dump(); + } + } + if (PrintLiveSetSize) { + errs() << "Safepoint For: " << CS.getCalledValue()->getName() << "\n"; + errs() << "Number live values: " << liveset.size() << "\n"; + } + result.liveset = liveset; +} + +/// True iff this value is the null pointer constant (of any pointer type) +static bool LLVM_ATTRIBUTE_UNUSED isNullConstant(Value *V) { + return isa<Constant>(V) && isa<PointerType>(V->getType()) && + cast<Constant>(V)->isNullValue(); +} + +/// Helper function for findBasePointer - Will return a value which either a) +/// defines the base pointer for the input or b) blocks the simple search +/// (i.e. a PHI or Select of two derived pointers) +static Value *findBaseDefiningValue(Value *I) { + assert(I->getType()->isPointerTy() && + "Illegal to ask for the base pointer of a non-pointer type"); + + // There are instructions which can never return gc pointer values. Sanity + // check + // that this is actually true. + assert(!isa<InsertElementInst>(I) && !isa<ExtractElementInst>(I) && + !isa<ShuffleVectorInst>(I) && "Vector types are not gc pointers"); + assert((!isa<Instruction>(I) || isa<InvokeInst>(I) || + !cast<Instruction>(I)->isTerminator()) && + "With the exception of invoke terminators don't define values"); + assert(!isa<StoreInst>(I) && !isa<FenceInst>(I) && + "Can't be definitions to start with"); + assert(!isa<ICmpInst>(I) && !isa<FCmpInst>(I) && + "Comparisons don't give ops"); + // There's a bunch of instructions which just don't make sense to apply to + // a pointer. The only valid reason for this would be pointer bit + // twiddling which we're just not going to support. + assert((!isa<Instruction>(I) || !cast<Instruction>(I)->isBinaryOp()) && + "Binary ops on pointer values are meaningless. Unless your " + "bit-twiddling which we don't support"); + + if (Argument *Arg = dyn_cast<Argument>(I)) { + // An incoming argument to the function is a base pointer + // We should have never reached here if this argument isn't an gc value + assert(Arg->getType()->isPointerTy() && + "Base for pointer must be another pointer"); + return Arg; + } + + if (GlobalVariable *global = dyn_cast<GlobalVariable>(I)) { + // base case + assert(global->getType()->isPointerTy() && + "Base for pointer must be another pointer"); + return global; + } + + // inlining could possibly introduce phi node that contains + // undef if callee has multiple returns + if (UndefValue *undef = dyn_cast<UndefValue>(I)) { + assert(undef->getType()->isPointerTy() && + "Base for pointer must be another pointer"); + return undef; // utterly meaningless, but useful for dealing with + // partially optimized code. 
+ } + + // Due to inheritance, this must be _after_ the global variable and undef + // checks + if (Constant *con = dyn_cast<Constant>(I)) { + assert(!isa<GlobalVariable>(I) && !isa<UndefValue>(I) && + "order of checks wrong!"); + // Note: Finding a constant base for something marked for relocation + // doesn't really make sense. The most likely case is either a) some + // screwed up the address space usage or b) your validating against + // compiled C++ code w/o the proper separation. The only real exception + // is a null pointer. You could have generic code written to index of + // off a potentially null value and have proven it null. We also use + // null pointers in dead paths of relocation phis (which we might later + // want to find a base pointer for). + assert(con->getType()->isPointerTy() && + "Base for pointer must be another pointer"); + assert(con->isNullValue() && "null is the only case which makes sense"); + return con; + } + + if (CastInst *CI = dyn_cast<CastInst>(I)) { + Value *def = CI->stripPointerCasts(); + assert(def->getType()->isPointerTy() && + "Base for pointer must be another pointer"); + // If we find a cast instruction here, it means we've found a cast which is + // not simply a pointer cast (i.e. an inttoptr). We don't know how to + // handle int->ptr conversion. + assert(!isa<CastInst>(def) && "shouldn't find another cast here"); + return findBaseDefiningValue(def); + } + + if (LoadInst *LI = dyn_cast<LoadInst>(I)) { + if (LI->getType()->isPointerTy()) { + Value *Op = LI->getOperand(0); + (void)Op; + // Has to be a pointer to an gc object, or possibly an array of such? + assert(Op->getType()->isPointerTy()); + return LI; // The value loaded is an gc base itself + } + } + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) { + Value *Op = GEP->getOperand(0); + if (Op->getType()->isPointerTy()) { + return findBaseDefiningValue(Op); // The base of this GEP is the base + } + } + + if (AllocaInst *alloc = dyn_cast<AllocaInst>(I)) { + // An alloca represents a conceptual stack slot. It's the slot itself + // that the GC needs to know about, not the value in the slot. + assert(alloc->getType()->isPointerTy() && + "Base for pointer must be another pointer"); + return alloc; + } + + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + switch (II->getIntrinsicID()) { + default: + // fall through to general call handling + break; + case Intrinsic::experimental_gc_statepoint: + case Intrinsic::experimental_gc_result_float: + case Intrinsic::experimental_gc_result_int: + llvm_unreachable("these don't produce pointers"); + case Intrinsic::experimental_gc_result_ptr: + // This is just a special case of the CallInst check below to handle a + // statepoint with deopt args which hasn't been rewritten for GC yet. + // TODO: Assert that the statepoint isn't rewritten yet. + return II; + case Intrinsic::experimental_gc_relocate: { + // Rerunning safepoint insertion after safepoints are already + // inserted is not supported. It could probably be made to work, + // but why are you doing this? There's no good reason. + llvm_unreachable("repeat safepoint insertion is not supported"); + } + case Intrinsic::gcroot: + // Currently, this mechanism hasn't been extended to work with gcroot. + // There's no reason it couldn't be, but I haven't thought about the + // implications much. + llvm_unreachable( + "interaction with the gcroot mechanism is not supported"); + } + } + // We assume that functions in the source language only return base + // pointers. 
This should probably be generalized via attributes to support + // both source language and internal functions. + if (CallInst *call = dyn_cast<CallInst>(I)) { + assert(call->getType()->isPointerTy() && + "Base for pointer must be another pointer"); + return call; + } + if (InvokeInst *invoke = dyn_cast<InvokeInst>(I)) { + assert(invoke->getType()->isPointerTy() && + "Base for pointer must be another pointer"); + return invoke; + } + + // I have absolutely no idea how to implement this part yet. It's not + // neccessarily hard, I just haven't really looked at it yet. + assert(!isa<LandingPadInst>(I) && "Landing Pad is unimplemented"); + + if (AtomicCmpXchgInst *cas = dyn_cast<AtomicCmpXchgInst>(I)) { + // A CAS is effectively a atomic store and load combined under a + // predicate. From the perspective of base pointers, we just treat it + // like a load. We loaded a pointer from a address in memory, that value + // had better be a valid base pointer. + return cas->getPointerOperand(); + } + if (AtomicRMWInst *atomic = dyn_cast<AtomicRMWInst>(I)) { + assert(AtomicRMWInst::Xchg == atomic->getOperation() && + "All others are binary ops which don't apply to base pointers"); + // semantically, a load, store pair. Treat it the same as a standard load + return atomic->getPointerOperand(); + } + + // The aggregate ops. Aggregates can either be in the heap or on the + // stack, but in either case, this is simply a field load. As a result, + // this is a defining definition of the base just like a load is. + if (ExtractValueInst *ev = dyn_cast<ExtractValueInst>(I)) { + return ev; + } + + // We should never see an insert vector since that would require we be + // tracing back a struct value not a pointer value. + assert(!isa<InsertValueInst>(I) && + "Base pointer for a struct is meaningless"); + + // The last two cases here don't return a base pointer. Instead, they + // return a value which dynamically selects from amoung several base + // derived pointers (each with it's own base potentially). It's the job of + // the caller to resolve these. + if (SelectInst *select = dyn_cast<SelectInst>(I)) { + return select; + } + + return cast<PHINode>(I); +} + +/// Returns the base defining value for this value. +static Value *findBaseDefiningValueCached(Value *I, DefiningValueMapTy &cache) { + Value *&Cached = cache[I]; + if (!Cached) { + Cached = findBaseDefiningValue(I); + } + assert(cache[I] != nullptr); + + if (TraceLSP) { + errs() << "fBDV-cached: " << I->getName() << " -> " << Cached->getName() + << "\n"; + } + return Cached; +} + +/// Return a base pointer for this value if known. Otherwise, return it's +/// base defining value. +static Value *findBaseOrBDV(Value *I, DefiningValueMapTy &cache) { + Value *def = findBaseDefiningValueCached(I, cache); + auto Found = cache.find(def); + if (Found != cache.end()) { + // Either a base-of relation, or a self reference. Caller must check. + return Found->second; + } + // Only a BDV available + return def; +} + +/// Given the result of a call to findBaseDefiningValue, or findBaseOrBDV, +/// is it known to be a base pointer? Or do we need to continue searching. +static bool isKnownBaseResult(Value *v) { + if (!isa<PHINode>(v) && !isa<SelectInst>(v)) { + // no recursion possible + return true; + } + if (cast<Instruction>(v)->getMetadata("is_base_value")) { + // This is a previously inserted base phi or select. We know + // that this is a base value. 
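Editor's note: findBaseDefiningValue above boils down to one rule, strip operations that preserve the underlying object (GEPs and pointer casts) and stop at anything that is either a base by construction (argument, load, call, alloca, ...) or hides the base behind a merge (phi/select) for the caller to resolve. A standalone sketch of that shape; ToyValue is a hypothetical stand-in for llvm::Value:

#include <cassert>

struct ToyValue {
  enum Kind { Argument, Load, Call, Alloca, GEP, Cast, Phi, Select } K;
  ToyValue *PtrOperand; // only meaningful for GEP / Cast
};

static ToyValue *baseDefiningValue(ToyValue *V) {
  switch (V->K) {
  case ToyValue::GEP:
  case ToyValue::Cast:
    // Same underlying object as the pointer operand; keep walking.
    assert(V->PtrOperand && "GEP/cast must have a pointer operand");
    return baseDefiningValue(V->PtrOperand);
  default:
    // Either a base in its own right, or a phi/select the caller resolves.
    return V;
  }
}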
+ return true; + } + + // We need to keep searching + return false; +} + +// TODO: find a better name for this +namespace { +class PhiState { +public: + enum Status { Unknown, Base, Conflict }; + + PhiState(Status s, Value *b = nullptr) : status(s), base(b) { + assert(status != Base || b); + } + PhiState(Value *b) : status(Base), base(b) {} + PhiState() : status(Unknown), base(nullptr) {} + PhiState(const PhiState &other) : status(other.status), base(other.base) { + assert(status != Base || base); + } + + Status getStatus() const { return status; } + Value *getBase() const { return base; } + + bool isBase() const { return getStatus() == Base; } + bool isUnknown() const { return getStatus() == Unknown; } + bool isConflict() const { return getStatus() == Conflict; } + + bool operator==(const PhiState &other) const { + return base == other.base && status == other.status; + } + + bool operator!=(const PhiState &other) const { return !(*this == other); } + + void dump() { + errs() << status << " (" << base << " - " + << (base ? base->getName() : "nullptr") << "): "; + } + +private: + Status status; + Value *base; // non null only if status == base +}; + +typedef DenseMap<Value *, PhiState> ConflictStateMapTy; +// Values of type PhiState form a lattice, and this is a helper +// class that implementes the meet operation. The meat of the meet +// operation is implemented in MeetPhiStates::pureMeet +class MeetPhiStates { +public: + // phiStates is a mapping from PHINodes and SelectInst's to PhiStates. + explicit MeetPhiStates(const ConflictStateMapTy &phiStates) + : phiStates(phiStates) {} + + // Destructively meet the current result with the base V. V can + // either be a merge instruction (SelectInst / PHINode), in which + // case its status is looked up in the phiStates map; or a regular + // SSA value, in which case it is assumed to be a base. + void meetWith(Value *V) { + PhiState otherState = getStateForBDV(V); + assert((MeetPhiStates::pureMeet(otherState, currentResult) == + MeetPhiStates::pureMeet(currentResult, otherState)) && + "math is wrong: meet does not commute!"); + currentResult = MeetPhiStates::pureMeet(otherState, currentResult); + } + + PhiState getResult() const { return currentResult; } + +private: + const ConflictStateMapTy &phiStates; + PhiState currentResult; + + /// Return a phi state for a base defining value. We'll generate a new + /// base state for known bases and expect to find a cached state otherwise + PhiState getStateForBDV(Value *baseValue) { + if (isKnownBaseResult(baseValue)) { + return PhiState(baseValue); + } else { + return lookupFromMap(baseValue); + } + } + + PhiState lookupFromMap(Value *V) { + auto I = phiStates.find(V); + assert(I != phiStates.end() && "lookup failed!"); + return I->second; + } + + static PhiState pureMeet(const PhiState &stateA, const PhiState &stateB) { + switch (stateA.getStatus()) { + case PhiState::Unknown: + return stateB; + + case PhiState::Base: + assert(stateA.getBase() && "can't be null"); + if (stateB.isUnknown()) + return stateA; + + if (stateB.isBase()) { + if (stateA.getBase() == stateB.getBase()) { + assert(stateA == stateB && "equality broken!"); + return stateA; + } + return PhiState(PhiState::Conflict); + } + assert(stateB.isConflict() && "only three states!"); + return PhiState(PhiState::Conflict); + + case PhiState::Conflict: + return stateA; + } + llvm_unreachable("only three states!"); + } +}; +} +/// For a given value or instruction, figure out what base ptr it's derived +/// from. 
For gc objects, this is simply itself. On success, returns a value +/// which is the base pointer. (This is reliable and can be used for +/// relocation.) On failure, returns nullptr. +static Value *findBasePointer(Value *I, DefiningValueMapTy &cache, + DenseSet<llvm::Value *> &NewInsertedDefs) { + Value *def = findBaseOrBDV(I, cache); + + if (isKnownBaseResult(def)) { + return def; + } + + // Here's the rough algorithm: + // - For every SSA value, construct a mapping to either an actual base + // pointer or a PHI which obscures the base pointer. + // - Construct a mapping from PHI to unknown TOP state. Use an + // optimistic algorithm to propagate base pointer information. Lattice + // looks like: + // UNKNOWN + // b1 b2 b3 b4 + // CONFLICT + // When algorithm terminates, all PHIs will either have a single concrete + // base or be in a conflict state. + // - For every conflict, insert a dummy PHI node without arguments. Add + // these to the base[Instruction] = BasePtr mapping. For every + // non-conflict, add the actual base. + // - For every conflict, add arguments for the base[a] of each input + // arguments. + // + // Note: A simpler form of this would be to add the conflict form of all + // PHIs without running the optimistic algorithm. This would be + // analougous to pessimistic data flow and would likely lead to an + // overall worse solution. + + ConflictStateMapTy states; + states[def] = PhiState(); + // Recursively fill in all phis & selects reachable from the initial one + // for which we don't already know a definite base value for + // PERF: Yes, this is as horribly inefficient as it looks. + bool done = false; + while (!done) { + done = true; + for (auto Pair : states) { + Value *v = Pair.first; + assert(!isKnownBaseResult(v) && "why did it get added?"); + if (PHINode *phi = dyn_cast<PHINode>(v)) { + assert(phi->getNumIncomingValues() > 0 && + "zero input phis are illegal"); + for (Value *InVal : phi->incoming_values()) { + Value *local = findBaseOrBDV(InVal, cache); + if (!isKnownBaseResult(local) && states.find(local) == states.end()) { + states[local] = PhiState(); + done = false; + } + } + } else if (SelectInst *sel = dyn_cast<SelectInst>(v)) { + Value *local = findBaseOrBDV(sel->getTrueValue(), cache); + if (!isKnownBaseResult(local) && states.find(local) == states.end()) { + states[local] = PhiState(); + done = false; + } + local = findBaseOrBDV(sel->getFalseValue(), cache); + if (!isKnownBaseResult(local) && states.find(local) == states.end()) { + states[local] = PhiState(); + done = false; + } + } + } + } + + if (TraceLSP) { + errs() << "States after initialization:\n"; + for (auto Pair : states) { + Instruction *v = cast<Instruction>(Pair.first); + PhiState state = Pair.second; + state.dump(); + v->dump(); + } + } + + // TODO: come back and revisit the state transitions around inputs which + // have reached conflict state. The current version seems too conservative. 
+ + bool progress = true; + size_t oldSize = 0; + while (progress) { + oldSize = states.size(); + progress = false; + for (auto Pair : states) { + MeetPhiStates calculateMeet(states); + Value *v = Pair.first; + assert(!isKnownBaseResult(v) && "why did it get added?"); + if (SelectInst *select = dyn_cast<SelectInst>(v)) { + calculateMeet.meetWith(findBaseOrBDV(select->getTrueValue(), cache)); + calculateMeet.meetWith(findBaseOrBDV(select->getFalseValue(), cache)); + } else + for (Value *Val : cast<PHINode>(v)->incoming_values()) + calculateMeet.meetWith(findBaseOrBDV(Val, cache)); + + PhiState oldState = states[v]; + PhiState newState = calculateMeet.getResult(); + if (oldState != newState) { + progress = true; + states[v] = newState; + } + } + + assert(oldSize <= states.size()); + assert(oldSize == states.size() || progress); + } + + if (TraceLSP) { + errs() << "States after meet iteration:\n"; + for (auto Pair : states) { + Instruction *v = cast<Instruction>(Pair.first); + PhiState state = Pair.second; + state.dump(); + v->dump(); + } + } + + // Insert Phis for all conflicts + for (auto Pair : states) { + Instruction *v = cast<Instruction>(Pair.first); + PhiState state = Pair.second; + assert(!isKnownBaseResult(v) && "why did it get added?"); + assert(!state.isUnknown() && "Optimistic algorithm didn't complete!"); + if (state.isConflict()) { + if (isa<PHINode>(v)) { + int num_preds = + std::distance(pred_begin(v->getParent()), pred_end(v->getParent())); + assert(num_preds > 0 && "how did we reach here"); + PHINode *phi = PHINode::Create(v->getType(), num_preds, "base_phi", v); + NewInsertedDefs.insert(phi); + // Add metadata marking this as a base value + auto *const_1 = ConstantInt::get( + Type::getInt32Ty( + v->getParent()->getParent()->getParent()->getContext()), + 1); + auto MDConst = ConstantAsMetadata::get(const_1); + MDNode *md = MDNode::get( + v->getParent()->getParent()->getParent()->getContext(), MDConst); + phi->setMetadata("is_base_value", md); + states[v] = PhiState(PhiState::Conflict, phi); + } else if (SelectInst *sel = dyn_cast<SelectInst>(v)) { + // The undef will be replaced later + UndefValue *undef = UndefValue::get(sel->getType()); + SelectInst *basesel = SelectInst::Create(sel->getCondition(), undef, + undef, "base_select", sel); + NewInsertedDefs.insert(basesel); + // Add metadata marking this as a base value + auto *const_1 = ConstantInt::get( + Type::getInt32Ty( + v->getParent()->getParent()->getParent()->getContext()), + 1); + auto MDConst = ConstantAsMetadata::get(const_1); + MDNode *md = MDNode::get( + v->getParent()->getParent()->getParent()->getContext(), MDConst); + basesel->setMetadata("is_base_value", md); + states[v] = PhiState(PhiState::Conflict, basesel); + } else + llvm_unreachable("unknown conflict type"); + } + } + + // Fixup all the inputs of the new PHIs + for (auto Pair : states) { + Instruction *v = cast<Instruction>(Pair.first); + PhiState state = Pair.second; + + assert(!isKnownBaseResult(v) && "why did it get added?"); + assert(!state.isUnknown() && "Optimistic algorithm didn't complete!"); + if (state.isConflict()) { + if (PHINode *basephi = dyn_cast<PHINode>(state.getBase())) { + PHINode *phi = cast<PHINode>(v); + unsigned NumPHIValues = phi->getNumIncomingValues(); + for (unsigned i = 0; i < NumPHIValues; i++) { + Value *InVal = phi->getIncomingValue(i); + BasicBlock *InBB = phi->getIncomingBlock(i); + + // If we've already seen InBB, add the same incoming value + // we added for it earlier. 
The IR verifier requires phi + // nodes with multiple entries from the same basic block + // to have the same incoming value for each of those + // entries. If we don't do this check here and basephi + // has a different type than base, we'll end up adding two + // bitcasts (and hence two distinct values) as incoming + // values for the same basic block. + + int blockIndex = basephi->getBasicBlockIndex(InBB); + if (blockIndex != -1) { + Value *oldBase = basephi->getIncomingValue(blockIndex); + basephi->addIncoming(oldBase, InBB); +#ifndef NDEBUG + Value *base = findBaseOrBDV(InVal, cache); + if (!isKnownBaseResult(base)) { + // Either conflict or base. + assert(states.count(base)); + base = states[base].getBase(); + assert(base != nullptr && "unknown PhiState!"); + assert(NewInsertedDefs.count(base) && + "should have already added this in a prev. iteration!"); + } + + // In essense this assert states: the only way two + // values incoming from the same basic block may be + // different is by being different bitcasts of the same + // value. A cleanup that remains TODO is changing + // findBaseOrBDV to return an llvm::Value of the correct + // type (and still remain pure). This will remove the + // need to add bitcasts. + assert(base->stripPointerCasts() == oldBase->stripPointerCasts() && + "sanity -- findBaseOrBDV should be pure!"); +#endif + continue; + } + + // Find either the defining value for the PHI or the normal base for + // a non-phi node + Value *base = findBaseOrBDV(InVal, cache); + if (!isKnownBaseResult(base)) { + // Either conflict or base. + assert(states.count(base)); + base = states[base].getBase(); + assert(base != nullptr && "unknown PhiState!"); + } + assert(base && "can't be null"); + // Must use original input BB since base may not be Instruction + // The cast is needed since base traversal may strip away bitcasts + if (base->getType() != basephi->getType()) { + base = new BitCastInst(base, basephi->getType(), "cast", + InBB->getTerminator()); + NewInsertedDefs.insert(base); + } + basephi->addIncoming(base, InBB); + } + assert(basephi->getNumIncomingValues() == NumPHIValues); + } else if (SelectInst *basesel = dyn_cast<SelectInst>(state.getBase())) { + SelectInst *sel = cast<SelectInst>(v); + // Operand 1 & 2 are true, false path respectively. TODO: refactor to + // something more safe and less hacky. + for (int i = 1; i <= 2; i++) { + Value *InVal = sel->getOperand(i); + // Find either the defining value for the PHI or the normal base for + // a non-phi node + Value *base = findBaseOrBDV(InVal, cache); + if (!isKnownBaseResult(base)) { + // Either conflict or base. + assert(states.count(base)); + base = states[base].getBase(); + assert(base != nullptr && "unknown PhiState!"); + } + assert(base && "can't be null"); + // Must use original input BB since base may not be Instruction + // The cast is needed since base traversal may strip away bitcasts + if (base->getType() != basesel->getType()) { + base = new BitCastInst(base, basesel->getType(), "cast", basesel); + NewInsertedDefs.insert(base); + } + basesel->setOperand(i, base); + } + } else + llvm_unreachable("unexpected conflict type"); + } + } + + // Cache all of our results so we can cheaply reuse them + // NOTE: This is actually two caches: one of the base defining value + // relation and one of the base pointer relation! 
FIXME + for (auto item : states) { + Value *v = item.first; + Value *base = item.second.getBase(); + assert(v && base); + assert(!isKnownBaseResult(v) && "why did it get added?"); + + if (TraceLSP) { + std::string fromstr = + cache.count(v) ? (cache[v]->hasName() ? cache[v]->getName() : "") + : "none"; + errs() << "Updating base value cache" + << " for: " << (v->hasName() ? v->getName() : "") + << " from: " << fromstr + << " to: " << (base->hasName() ? base->getName() : "") << "\n"; + } + + assert(isKnownBaseResult(base) && + "must be something we 'know' is a base pointer"); + if (cache.count(v)) { + // Once we transition from the BDV relation being store in the cache to + // the base relation being stored, it must be stable + assert((!isKnownBaseResult(cache[v]) || cache[v] == base) && + "base relation should be stable"); + } + cache[v] = base; + } + assert(cache.find(def) != cache.end()); + return cache[def]; +} + +// For a set of live pointers (base and/or derived), identify the base +// pointer of the object which they are derived from. This routine will +// mutate the IR graph as needed to make the 'base' pointer live at the +// definition site of 'derived'. This ensures that any use of 'derived' can +// also use 'base'. This may involve the insertion of a number of +// additional PHI nodes. +// +// preconditions: live is a set of pointer type Values +// +// side effects: may insert PHI nodes into the existing CFG, will preserve +// CFG, will not remove or mutate any existing nodes +// +// post condition: PointerToBase contains one (derived, base) pair for every +// pointer in live. Note that derived can be equal to base if the original +// pointer was a base pointer. +static void findBasePointers(const StatepointLiveSetTy &live, + DenseMap<llvm::Value *, llvm::Value *> &PointerToBase, + DominatorTree *DT, DefiningValueMapTy &DVCache, + DenseSet<llvm::Value *> &NewInsertedDefs) { + for (Value *ptr : live) { + Value *base = findBasePointer(ptr, DVCache, NewInsertedDefs); + assert(base && "failed to find base pointer"); + PointerToBase[ptr] = base; + assert((!isa<Instruction>(base) || !isa<Instruction>(ptr) || + DT->dominates(cast<Instruction>(base)->getParent(), + cast<Instruction>(ptr)->getParent())) && + "The base we found better dominate the derived pointer"); + + // If you see this trip and like to live really dangerously, the code should + // be correct, just with idioms the verifier can't handle. You can try + // disabling the verifier at your own substaintial risk. + assert(!isNullConstant(base) && "the relocation code needs adjustment to " + "handle the relocation of a null pointer " + "constant without causing false positives " + "in the safepoint ir verifier."); + } +} + +/// Find the required based pointers (and adjust the live set) for the given +/// parse point. 
+static void findBasePointers(DominatorTree &DT, DefiningValueMapTy &DVCache, + const CallSite &CS, + PartiallyConstructedSafepointRecord &result) { + DenseMap<llvm::Value *, llvm::Value *> PointerToBase; + DenseSet<llvm::Value *> NewInsertedDefs; + findBasePointers(result.liveset, PointerToBase, &DT, DVCache, NewInsertedDefs); + + if (PrintBasePointers) { + errs() << "Base Pairs (w/o Relocation):\n"; + for (auto Pair : PointerToBase) { + errs() << " derived %" << Pair.first->getName() << " base %" + << Pair.second->getName() << "\n"; + } + } + + result.PointerToBase = PointerToBase; + result.NewInsertedDefs = NewInsertedDefs; +} + +/// Check for liveness of items in the insert defs and add them to the live +/// and base pointer sets +static void fixupLiveness(DominatorTree &DT, const CallSite &CS, + const DenseSet<Value *> &allInsertedDefs, + PartiallyConstructedSafepointRecord &result) { + Instruction *inst = CS.getInstruction(); + + auto liveset = result.liveset; + auto PointerToBase = result.PointerToBase; + + auto is_live_gc_reference = + [&](Value &V) { return isLiveGCReferenceAt(V, inst, DT, nullptr); }; + + // For each new definition, check to see if a) the definition dominates the + // instruction we're interested in, and b) one of the uses of that definition + // is edge-reachable from the instruction we're interested in. This is the + // same definition of liveness we used in the intial liveness analysis + for (Value *newDef : allInsertedDefs) { + if (liveset.count(newDef)) { + // already live, no action needed + continue; + } + + // PERF: Use DT to check instruction domination might not be good for + // compilation time, and we could change to optimal solution if this + // turn to be a issue + if (!DT.dominates(cast<Instruction>(newDef), inst)) { + // can't possibly be live at inst + continue; + } + + if (is_live_gc_reference(*newDef)) { + // Add the live new defs into liveset and PointerToBase + liveset.insert(newDef); + PointerToBase[newDef] = newDef; + } + } + + result.liveset = liveset; + result.PointerToBase = PointerToBase; +} + +static void fixupLiveReferences( + Function &F, DominatorTree &DT, Pass *P, + const DenseSet<llvm::Value *> &allInsertedDefs, + ArrayRef<CallSite> toUpdate, + MutableArrayRef<struct PartiallyConstructedSafepointRecord> records) { + for (size_t i = 0; i < records.size(); i++) { + struct PartiallyConstructedSafepointRecord &info = records[i]; + const CallSite &CS = toUpdate[i]; + fixupLiveness(DT, CS, allInsertedDefs, info); + } +} + +// Normalize basic block to make it ready to be target of invoke statepoint. +// It means spliting it to have single predecessor. Return newly created BB +// ready to be successor of invoke statepoint. +static BasicBlock *normalizeBBForInvokeSafepoint(BasicBlock *BB, + BasicBlock *InvokeParent, + Pass *P) { + BasicBlock *ret = BB; + + if (!BB->getUniquePredecessor()) { + ret = SplitBlockPredecessors(BB, InvokeParent, ""); + } + + // Another requirement for such basic blocks is to not have any phi nodes. + // Since we just ensured that new BB will have single predecessor, + // all phi nodes in it will have one value. Here it would be naturall place + // to + // remove them all. But we can not do this because we are risking to remove + // one of the values stored in liveset of another statepoint. We will do it + // later after placing all safepoints. 
+ + return ret; +} + +static int find_index(ArrayRef<Value *> livevec, Value *val) { + auto itr = std::find(livevec.begin(), livevec.end(), val); + assert(livevec.end() != itr); + size_t index = std::distance(livevec.begin(), itr); + assert(index < livevec.size()); + return index; +} + +// Create new attribute set containing only attributes which can be transfered +// from original call to the safepoint. +static AttributeSet legalizeCallAttributes(AttributeSet AS) { + AttributeSet ret; + + for (unsigned Slot = 0; Slot < AS.getNumSlots(); Slot++) { + unsigned index = AS.getSlotIndex(Slot); + + if (index == AttributeSet::ReturnIndex || + index == AttributeSet::FunctionIndex) { + + for (auto it = AS.begin(Slot), it_end = AS.end(Slot); it != it_end; + ++it) { + Attribute attr = *it; + + // Do not allow certain attributes - just skip them + // Safepoint can not be read only or read none. + if (attr.hasAttribute(Attribute::ReadNone) || + attr.hasAttribute(Attribute::ReadOnly)) + continue; + + ret = ret.addAttributes( + AS.getContext(), index, + AttributeSet::get(AS.getContext(), index, AttrBuilder(attr))); + } + } + + // Just skip parameter attributes for now + } + + return ret; +} + +/// Helper function to place all gc relocates necessary for the given +/// statepoint. +/// Inputs: +/// liveVariables - list of variables to be relocated. +/// liveStart - index of the first live variable. +/// basePtrs - base pointers. +/// statepointToken - statepoint instruction to which relocates should be +/// bound. +/// Builder - Llvm IR builder to be used to construct new calls. +void CreateGCRelocates(ArrayRef<llvm::Value *> liveVariables, + const int liveStart, + ArrayRef<llvm::Value *> basePtrs, + Instruction *statepointToken, IRBuilder<> Builder) { + + SmallVector<Instruction *, 64> NewDefs; + NewDefs.reserve(liveVariables.size()); + + Module *M = statepointToken->getParent()->getParent()->getParent(); + + for (unsigned i = 0; i < liveVariables.size(); i++) { + // We generate a (potentially) unique declaration for every pointer type + // combination. This results is some blow up the function declarations in + // the IR, but removes the need for argument bitcasts which shrinks the IR + // greatly and makes it much more readable. + SmallVector<Type *, 1> types; // one per 'any' type + types.push_back(liveVariables[i]->getType()); // result type + Value *gc_relocate_decl = Intrinsic::getDeclaration( + M, Intrinsic::experimental_gc_relocate, types); + + // Generate the gc.relocate call and save the result + Value *baseIdx = + ConstantInt::get(Type::getInt32Ty(M->getContext()), + liveStart + find_index(liveVariables, basePtrs[i])); + Value *liveIdx = ConstantInt::get( + Type::getInt32Ty(M->getContext()), + liveStart + find_index(liveVariables, liveVariables[i])); + + // only specify a debug name if we can give a useful one + Value *reloc = Builder.CreateCall3( + gc_relocate_decl, statepointToken, baseIdx, liveIdx, + liveVariables[i]->hasName() ? liveVariables[i]->getName() + ".relocated" + : ""); + // Trick CodeGen into thinking there are lots of free registers at this + // fake call. 
+ cast<CallInst>(reloc)->setCallingConv(CallingConv::Cold); + + NewDefs.push_back(cast<Instruction>(reloc)); + } + assert(NewDefs.size() == liveVariables.size() && + "missing or extra redefinition at safepoint"); +} + +static void +makeStatepointExplicitImpl(const CallSite &CS, /* to replace */ + const SmallVectorImpl<llvm::Value *> &basePtrs, + const SmallVectorImpl<llvm::Value *> &liveVariables, + Pass *P, + PartiallyConstructedSafepointRecord &result) { + assert(basePtrs.size() == liveVariables.size()); + assert(isStatepoint(CS) && + "This method expects to be rewriting a statepoint"); + + BasicBlock *BB = CS.getInstruction()->getParent(); + assert(BB); + Function *F = BB->getParent(); + assert(F && "must be set"); + Module *M = F->getParent(); + (void)M; + assert(M && "must be set"); + + // We're not changing the function signature of the statepoint since the gc + // arguments go into the var args section. + Function *gc_statepoint_decl = CS.getCalledFunction(); + + // Then go ahead and use the builder do actually do the inserts. We insert + // immediately before the previous instruction under the assumption that all + // arguments will be available here. We can't insert afterwards since we may + // be replacing a terminator. + Instruction *insertBefore = CS.getInstruction(); + IRBuilder<> Builder(insertBefore); + // Copy all of the arguments from the original statepoint - this includes the + // target, call args, and deopt args + SmallVector<llvm::Value *, 64> args; + args.insert(args.end(), CS.arg_begin(), CS.arg_end()); + // TODO: Clear the 'needs rewrite' flag + + // add all the pointers to be relocated (gc arguments) + // Capture the start of the live variable list for use in the gc_relocates + const int live_start = args.size(); + args.insert(args.end(), liveVariables.begin(), liveVariables.end()); + + // Create the statepoint given all the arguments + Instruction *token = nullptr; + AttributeSet return_attributes; + if (CS.isCall()) { + CallInst *toReplace = cast<CallInst>(CS.getInstruction()); + CallInst *call = + Builder.CreateCall(gc_statepoint_decl, args, "safepoint_token"); + call->setTailCall(toReplace->isTailCall()); + call->setCallingConv(toReplace->getCallingConv()); + + // Currently we will fail on parameter attributes and on certain + // function attributes. + AttributeSet new_attrs = legalizeCallAttributes(toReplace->getAttributes()); + // In case if we can handle this set of sttributes - set up function attrs + // directly on statepoint and return attrs later for gc_result intrinsic. + call->setAttributes(new_attrs.getFnAttributes()); + return_attributes = new_attrs.getRetAttributes(); + + token = call; + + // Put the following gc_result and gc_relocate calls immediately after the + // the old call (which we're about to delete) + BasicBlock::iterator next(toReplace); + assert(BB->end() != next && "not a terminator, must have next"); + next++; + Instruction *IP = &*(next); + Builder.SetInsertPoint(IP); + Builder.SetCurrentDebugLocation(IP->getDebugLoc()); + + } else { + InvokeInst *toReplace = cast<InvokeInst>(CS.getInstruction()); + + // Insert the new invoke into the old block. We'll remove the old one in a + // moment at which point this will become the new terminator for the + // original block. 
+ InvokeInst *invoke = InvokeInst::Create( + gc_statepoint_decl, toReplace->getNormalDest(), + toReplace->getUnwindDest(), args, "", toReplace->getParent()); + invoke->setCallingConv(toReplace->getCallingConv()); + + // Currently we will fail on parameter attributes and on certain + // function attributes. + AttributeSet new_attrs = legalizeCallAttributes(toReplace->getAttributes()); + // In case if we can handle this set of sttributes - set up function attrs + // directly on statepoint and return attrs later for gc_result intrinsic. + invoke->setAttributes(new_attrs.getFnAttributes()); + return_attributes = new_attrs.getRetAttributes(); + + token = invoke; + + // Generate gc relocates in exceptional path + BasicBlock *unwindBlock = normalizeBBForInvokeSafepoint( + toReplace->getUnwindDest(), invoke->getParent(), P); + + Instruction *IP = &*(unwindBlock->getFirstInsertionPt()); + Builder.SetInsertPoint(IP); + Builder.SetCurrentDebugLocation(toReplace->getDebugLoc()); + + // Extract second element from landingpad return value. We will attach + // exceptional gc relocates to it. + const unsigned idx = 1; + Instruction *exceptional_token = + cast<Instruction>(Builder.CreateExtractValue( + unwindBlock->getLandingPadInst(), idx, "relocate_token")); + result.UnwindToken = exceptional_token; + + // Just throw away return value. We will use the one we got for normal + // block. + (void)CreateGCRelocates(liveVariables, live_start, basePtrs, + exceptional_token, Builder); + + // Generate gc relocates and returns for normal block + BasicBlock *normalDest = normalizeBBForInvokeSafepoint( + toReplace->getNormalDest(), invoke->getParent(), P); + + IP = &*(normalDest->getFirstInsertionPt()); + Builder.SetInsertPoint(IP); + + // gc relocates will be generated later as if it were regular call + // statepoint + } + assert(token); + + // Take the name of the original value call if it had one. + token->takeName(CS.getInstruction()); + + // The GCResult is already inserted, we just need to find it +#ifndef NDEBUG + Instruction *toReplace = CS.getInstruction(); + assert((toReplace->hasNUses(0) || toReplace->hasNUses(1)) && + "only valid use before rewrite is gc.result"); + assert(!toReplace->hasOneUse() || + isGCResult(cast<Instruction>(*toReplace->user_begin()))); +#endif + + // Update the gc.result of the original statepoint (if any) to use the newly + // inserted statepoint. This is safe to do here since the token can't be + // considered a live reference. 
+ CS.getInstruction()->replaceAllUsesWith(token); + + result.StatepointToken = token; + + // Second, create a gc.relocate for every live variable + CreateGCRelocates(liveVariables, live_start, basePtrs, token, Builder); + +} + +namespace { +struct name_ordering { + Value *base; + Value *derived; + bool operator()(name_ordering const &a, name_ordering const &b) { + return -1 == a.derived->getName().compare(b.derived->getName()); + } +}; +} +static void stablize_order(SmallVectorImpl<Value *> &basevec, + SmallVectorImpl<Value *> &livevec) { + assert(basevec.size() == livevec.size()); + + SmallVector<name_ordering, 64> temp; + for (size_t i = 0; i < basevec.size(); i++) { + name_ordering v; + v.base = basevec[i]; + v.derived = livevec[i]; + temp.push_back(v); + } + std::sort(temp.begin(), temp.end(), name_ordering()); + for (size_t i = 0; i < basevec.size(); i++) { + basevec[i] = temp[i].base; + livevec[i] = temp[i].derived; + } +} + +// Replace an existing gc.statepoint with a new one and a set of gc.relocates +// which make the relocations happening at this safepoint explicit. +// +// WARNING: Does not do any fixup to adjust users of the original live +// values. That's the callers responsibility. +static void +makeStatepointExplicit(DominatorTree &DT, const CallSite &CS, Pass *P, + PartiallyConstructedSafepointRecord &result) { + auto liveset = result.liveset; + auto PointerToBase = result.PointerToBase; + + // Convert to vector for efficient cross referencing. + SmallVector<Value *, 64> basevec, livevec; + livevec.reserve(liveset.size()); + basevec.reserve(liveset.size()); + for (Value *L : liveset) { + livevec.push_back(L); + + assert(PointerToBase.find(L) != PointerToBase.end()); + Value *base = PointerToBase[L]; + basevec.push_back(base); + } + assert(livevec.size() == basevec.size()); + + // To make the output IR slightly more stable (for use in diffs), ensure a + // fixed order of the values in the safepoint (by sorting the value name). + // The order is otherwise meaningless. + stablize_order(basevec, livevec); + + // Do the actual rewriting and delete the old statepoint + makeStatepointExplicitImpl(CS, basevec, livevec, P, result); + CS.getInstruction()->eraseFromParent(); +} + +// Helper function for the relocationViaAlloca. +// It receives iterator to the statepoint gc relocates and emits store to the +// assigned +// location (via allocaMap) for the each one of them. +// Add visited values into the visitedLiveValues set we will later use them +// for sanity check. 
+static void +insertRelocationStores(iterator_range<Value::user_iterator> gcRelocs, + DenseMap<Value *, Value *> &allocaMap, + DenseSet<Value *> &visitedLiveValues) { + + for (User *U : gcRelocs) { + if (!isa<IntrinsicInst>(U)) + continue; + + IntrinsicInst *relocatedValue = cast<IntrinsicInst>(U); + + // We only care about relocates + if (relocatedValue->getIntrinsicID() != + Intrinsic::experimental_gc_relocate) { + continue; + } + + GCRelocateOperands relocateOperands(relocatedValue); + Value *originalValue = const_cast<Value *>(relocateOperands.derivedPtr()); + assert(allocaMap.count(originalValue)); + Value *alloca = allocaMap[originalValue]; + + // Emit store into the related alloca + StoreInst *store = new StoreInst(relocatedValue, alloca); + store->insertAfter(relocatedValue); + +#ifndef NDEBUG + visitedLiveValues.insert(originalValue); +#endif + } +} + +/// do all the relocation update via allocas and mem2reg +static void relocationViaAlloca( + Function &F, DominatorTree &DT, ArrayRef<Value *> live, + ArrayRef<struct PartiallyConstructedSafepointRecord> records) { +#ifndef NDEBUG + int initialAllocaNum = 0; + + // record initial number of allocas + for (inst_iterator itr = inst_begin(F), end = inst_end(F); itr != end; + itr++) { + if (isa<AllocaInst>(*itr)) + initialAllocaNum++; + } +#endif + + // TODO-PERF: change data structures, reserve + DenseMap<Value *, Value *> allocaMap; + SmallVector<AllocaInst *, 200> PromotableAllocas; + PromotableAllocas.reserve(live.size()); + + // emit alloca for each live gc pointer + for (unsigned i = 0; i < live.size(); i++) { + Value *liveValue = live[i]; + AllocaInst *alloca = new AllocaInst(liveValue->getType(), "", + F.getEntryBlock().getFirstNonPHI()); + allocaMap[liveValue] = alloca; + PromotableAllocas.push_back(alloca); + } + + // The next two loops are part of the same conceptual operation. We need to + // insert a store to the alloca after the original def and at each + // redefinition. We need to insert a load before each use. These are split + // into distinct loops for performance reasons. + + // update gc pointer after each statepoint + // either store a relocated value or null (if no relocated value found for + // this gc pointer and it is not a gc_result) + // this must happen before we update the statepoint with load of alloca + // otherwise we lose the link between statepoint and old def + for (size_t i = 0; i < records.size(); i++) { + const struct PartiallyConstructedSafepointRecord &info = records[i]; + Value *Statepoint = info.StatepointToken; + + // This will be used for consistency check + DenseSet<Value *> visitedLiveValues; + + // Insert stores for normal statepoint gc relocates + insertRelocationStores(Statepoint->users(), allocaMap, visitedLiveValues); + + // In case if it was invoke statepoint + // we will insert stores for exceptional path gc relocates. + if (isa<InvokeInst>(Statepoint)) { + insertRelocationStores(info.UnwindToken->users(), + allocaMap, visitedLiveValues); + } + +#ifndef NDEBUG + // As a debuging aid, pretend that an unrelocated pointer becomes null at + // the gc.statepoint. 
This will turn some subtle GC problems into slightly + // easier to debug SEGVs + SmallVector<AllocaInst *, 64> ToClobber; + for (auto Pair : allocaMap) { + Value *Def = Pair.first; + AllocaInst *Alloca = cast<AllocaInst>(Pair.second); + + // This value was relocated + if (visitedLiveValues.count(Def)) { + continue; + } + ToClobber.push_back(Alloca); + } + + auto InsertClobbersAt = [&](Instruction *IP) { + for (auto *AI : ToClobber) { + auto AIType = cast<PointerType>(AI->getType()); + auto PT = cast<PointerType>(AIType->getElementType()); + Constant *CPN = ConstantPointerNull::get(PT); + StoreInst *store = new StoreInst(CPN, AI); + store->insertBefore(IP); + } + }; + + // Insert the clobbering stores. These may get intermixed with the + // gc.results and gc.relocates, but that's fine. + if (auto II = dyn_cast<InvokeInst>(Statepoint)) { + InsertClobbersAt(II->getNormalDest()->getFirstInsertionPt()); + InsertClobbersAt(II->getUnwindDest()->getFirstInsertionPt()); + } else { + BasicBlock::iterator Next(cast<CallInst>(Statepoint)); + Next++; + InsertClobbersAt(Next); + } +#endif + } + // update use with load allocas and add store for gc_relocated + for (auto Pair : allocaMap) { + Value *def = Pair.first; + Value *alloca = Pair.second; + + // we pre-record the uses of allocas so that we dont have to worry about + // later update + // that change the user information. + SmallVector<Instruction *, 20> uses; + // PERF: trade a linear scan for repeated reallocation + uses.reserve(std::distance(def->user_begin(), def->user_end())); + for (User *U : def->users()) { + if (!isa<ConstantExpr>(U)) { + // If the def has a ConstantExpr use, then the def is either a + // ConstantExpr use itself or null. In either case + // (recursively in the first, directly in the second), the oop + // it is ultimately dependent on is null and this particular + // use does not need to be fixed up. 
+ uses.push_back(cast<Instruction>(U)); + } + } + + std::sort(uses.begin(), uses.end()); + auto last = std::unique(uses.begin(), uses.end()); + uses.erase(last, uses.end()); + + for (Instruction *use : uses) { + if (isa<PHINode>(use)) { + PHINode *phi = cast<PHINode>(use); + for (unsigned i = 0; i < phi->getNumIncomingValues(); i++) { + if (def == phi->getIncomingValue(i)) { + LoadInst *load = new LoadInst( + alloca, "", phi->getIncomingBlock(i)->getTerminator()); + phi->setIncomingValue(i, load); + } + } + } else { + LoadInst *load = new LoadInst(alloca, "", use); + use->replaceUsesOfWith(def, load); + } + } + + // emit store for the initial gc value + // store must be inserted after load, otherwise store will be in alloca's + // use list and an extra load will be inserted before it + StoreInst *store = new StoreInst(def, alloca); + if (isa<Instruction>(def)) { + store->insertAfter(cast<Instruction>(def)); + } else { + assert((isa<Argument>(def) || isa<GlobalVariable>(def) || + (isa<Constant>(def) && cast<Constant>(def)->isNullValue())) && + "Must be argument or global"); + store->insertAfter(cast<Instruction>(alloca)); + } + } + + assert(PromotableAllocas.size() == live.size() && + "we must have the same allocas with lives"); + if (!PromotableAllocas.empty()) { + // apply mem2reg to promote alloca to SSA + PromoteMemToReg(PromotableAllocas, DT); + } + +#ifndef NDEBUG + for (inst_iterator itr = inst_begin(F), end = inst_end(F); itr != end; + itr++) { + if (isa<AllocaInst>(*itr)) + initialAllocaNum--; + } + assert(initialAllocaNum == 0 && "We must not introduce any extra allocas"); +#endif +} + +/// Implement a unique function which doesn't require we sort the input +/// vector. Doing so has the effect of changing the output of a couple of +/// tests in ways which make them less useful in testing fused safepoints. +template <typename T> static void unique_unsorted(SmallVectorImpl<T> &Vec) { + DenseSet<T> Seen; + SmallVector<T, 128> TempVec; + TempVec.reserve(Vec.size()); + for (auto Element : Vec) + TempVec.push_back(Element); + Vec.clear(); + for (auto V : TempVec) { + if (Seen.insert(V).second) { + Vec.push_back(V); + } + } +} + +static Function *getUseHolder(Module &M) { + FunctionType *ftype = + FunctionType::get(Type::getVoidTy(M.getContext()), true); + Function *Func = cast<Function>(M.getOrInsertFunction("__tmp_use", ftype)); + return Func; +} + +/// Insert holders so that each Value is obviously live through the entire +/// liftetime of the call. 
+static void insertUseHolderAfter(CallSite &CS, const ArrayRef<Value *> Values, + SmallVectorImpl<CallInst *> &holders) { + Module *M = CS.getInstruction()->getParent()->getParent()->getParent(); + Function *Func = getUseHolder(*M); + if (CS.isCall()) { + // For call safepoints insert dummy calls right after safepoint + BasicBlock::iterator next(CS.getInstruction()); + next++; + CallInst *base_holder = CallInst::Create(Func, Values, "", next); + holders.push_back(base_holder); + } else if (CS.isInvoke()) { + // For invoke safepooints insert dummy calls both in normal and + // exceptional destination blocks + InvokeInst *invoke = cast<InvokeInst>(CS.getInstruction()); + CallInst *normal_holder = CallInst::Create( + Func, Values, "", invoke->getNormalDest()->getFirstInsertionPt()); + CallInst *unwind_holder = CallInst::Create( + Func, Values, "", invoke->getUnwindDest()->getFirstInsertionPt()); + holders.push_back(normal_holder); + holders.push_back(unwind_holder); + } else + llvm_unreachable("unsupported call type"); +} + +static void findLiveReferences( + Function &F, DominatorTree &DT, Pass *P, ArrayRef<CallSite> toUpdate, + MutableArrayRef<struct PartiallyConstructedSafepointRecord> records) { + for (size_t i = 0; i < records.size(); i++) { + struct PartiallyConstructedSafepointRecord &info = records[i]; + const CallSite &CS = toUpdate[i]; + analyzeParsePointLiveness(DT, CS, info); + } +} + +static void addBasesAsLiveValues(StatepointLiveSetTy &liveset, + DenseMap<Value *, Value *> &PointerToBase) { + // Identify any base pointers which are used in this safepoint, but not + // themselves relocated. We need to relocate them so that later inserted + // safepoints can get the properly relocated base register. + DenseSet<Value *> missing; + for (Value *L : liveset) { + assert(PointerToBase.find(L) != PointerToBase.end()); + Value *base = PointerToBase[L]; + assert(base); + if (liveset.find(base) == liveset.end()) { + assert(PointerToBase.find(base) == PointerToBase.end()); + // uniqued by set insert + missing.insert(base); + } + } + + // Note that we want these at the end of the list, otherwise + // register placement gets screwed up once we lower to STATEPOINT + // instructions. This is an utter hack, but there doesn't seem to be a + // better one. + for (Value *base : missing) { + assert(base); + liveset.insert(base); + PointerToBase[base] = base; + } + assert(liveset.size() == PointerToBase.size()); +} + +static bool insertParsePoints(Function &F, DominatorTree &DT, Pass *P, + SmallVectorImpl<CallSite> &toUpdate) { +#ifndef NDEBUG + // sanity check the input + std::set<CallSite> uniqued; + uniqued.insert(toUpdate.begin(), toUpdate.end()); + assert(uniqued.size() == toUpdate.size() && "no duplicates please!"); + + for (size_t i = 0; i < toUpdate.size(); i++) { + CallSite &CS = toUpdate[i]; + assert(CS.getInstruction()->getParent()->getParent() == &F); + assert(isStatepoint(CS) && "expected to already be a deopt statepoint"); + } +#endif + + // A list of dummy calls added to the IR to keep various values obviously + // live in the IR. We'll remove all of these when done. + SmallVector<CallInst *, 64> holders; + + // Insert a dummy call with all of the arguments to the vm_state we'll need + // for the actual safepoint insertion. This ensures reference arguments in + // the deopt argument list are considered live through the safepoint (and + // thus makes sure they get relocated.) 
+ for (size_t i = 0; i < toUpdate.size(); i++) { + CallSite &CS = toUpdate[i]; + Statepoint StatepointCS(CS); + + SmallVector<Value *, 64> DeoptValues; + for (Use &U : StatepointCS.vm_state_args()) { + Value *Arg = cast<Value>(&U); + if (isGCPointerType(Arg->getType())) + DeoptValues.push_back(Arg); + } + insertUseHolderAfter(CS, DeoptValues, holders); + } + + SmallVector<struct PartiallyConstructedSafepointRecord, 64> records; + records.reserve(toUpdate.size()); + for (size_t i = 0; i < toUpdate.size(); i++) { + struct PartiallyConstructedSafepointRecord info; + records.push_back(info); + } + assert(records.size() == toUpdate.size()); + + // A) Identify all gc pointers which are staticly live at the given call + // site. + findLiveReferences(F, DT, P, toUpdate, records); + + // B) Find the base pointers for each live pointer + /* scope for caching */ { + // Cache the 'defining value' relation used in the computation and + // insertion of base phis and selects. This ensures that we don't insert + // large numbers of duplicate base_phis. + DefiningValueMapTy DVCache; + + for (size_t i = 0; i < records.size(); i++) { + struct PartiallyConstructedSafepointRecord &info = records[i]; + CallSite &CS = toUpdate[i]; + findBasePointers(DT, DVCache, CS, info); + } + } // end of cache scope + + // The base phi insertion logic (for any safepoint) may have inserted new + // instructions which are now live at some safepoint. The simplest such + // example is: + // loop: + // phi a <-- will be a new base_phi here + // safepoint 1 <-- that needs to be live here + // gep a + 1 + // safepoint 2 + // br loop + DenseSet<llvm::Value *> allInsertedDefs; + for (size_t i = 0; i < records.size(); i++) { + struct PartiallyConstructedSafepointRecord &info = records[i]; + allInsertedDefs.insert(info.NewInsertedDefs.begin(), + info.NewInsertedDefs.end()); + } + + // We insert some dummy calls after each safepoint to definitely hold live + // the base pointers which were identified for that safepoint. We'll then + // ask liveness for _every_ base inserted to see what is now live. Then we + // remove the dummy calls. + holders.reserve(holders.size() + records.size()); + for (size_t i = 0; i < records.size(); i++) { + struct PartiallyConstructedSafepointRecord &info = records[i]; + CallSite &CS = toUpdate[i]; + + SmallVector<Value *, 128> Bases; + for (auto Pair : info.PointerToBase) { + Bases.push_back(Pair.second); + } + insertUseHolderAfter(CS, Bases, holders); + } + + // Add the bases explicitly to the live vector set. This may result in a few + // extra relocations, but the base has to be available whenever a pointer + // derived from it is used. Thus, we need it to be part of the statepoint's + // gc arguments list. TODO: Introduce an explicit notion (in the following + // code) of the GC argument list as seperate from the live Values at a + // given statepoint. + for (size_t i = 0; i < records.size(); i++) { + struct PartiallyConstructedSafepointRecord &info = records[i]; + addBasesAsLiveValues(info.liveset, info.PointerToBase); + } + + // If we inserted any new values, we need to adjust our notion of what is + // live at a particular safepoint. 
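The "bases as live values" step referenced above boils down to set/map bookkeeping: every base reachable from the live set must itself be live, and a base maps to itself. A rough standalone sketch of that invariant with plain STL containers; ValueId and addBasesAsLive are stand-ins invented for the example, with llvm::Value* reduced to an integer id:

  #include <cassert>
  #include <map>
  #include <set>

  using ValueId = int; // stand-in for llvm::Value*

  // Ensure the base of every live derived pointer is also in the live set,
  // and record that a base pointer is its own base.
  void addBasesAsLive(std::set<ValueId> &LiveSet,
                      std::map<ValueId, ValueId> &PointerToBase) {
    std::set<ValueId> Missing;
    for (ValueId L : LiveSet) {
      assert(PointerToBase.count(L) && "every live pointer needs a known base");
      ValueId Base = PointerToBase[L];
      if (!LiveSet.count(Base))
        Missing.insert(Base); // the set insert dedupes repeated bases
    }
    for (ValueId Base : Missing) {
      LiveSet.insert(Base);
      PointerToBase[Base] = Base; // a base pointer is its own base
    }
  }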
+ if (!allInsertedDefs.empty()) { + fixupLiveReferences(F, DT, P, allInsertedDefs, toUpdate, records); + } + if (PrintBasePointers) { + for (size_t i = 0; i < records.size(); i++) { + struct PartiallyConstructedSafepointRecord &info = records[i]; + errs() << "Base Pairs: (w/Relocation)\n"; + for (auto Pair : info.PointerToBase) { + errs() << " derived %" << Pair.first->getName() << " base %" + << Pair.second->getName() << "\n"; + } + } + } + for (size_t i = 0; i < holders.size(); i++) { + holders[i]->eraseFromParent(); + holders[i] = nullptr; + } + holders.clear(); + + // Now run through and replace the existing statepoints with new ones with + // the live variables listed. We do not yet update uses of the values being + // relocated. We have references to live variables that need to + // survive to the last iteration of this loop. (By construction, the + // previous statepoint can not be a live variable, thus we can and remove + // the old statepoint calls as we go.) + for (size_t i = 0; i < records.size(); i++) { + struct PartiallyConstructedSafepointRecord &info = records[i]; + CallSite &CS = toUpdate[i]; + makeStatepointExplicit(DT, CS, P, info); + } + toUpdate.clear(); // prevent accident use of invalid CallSites + + // In case if we inserted relocates in a different basic block than the + // original safepoint (this can happen for invokes). We need to be sure that + // original values were not used in any of the phi nodes at the + // beginning of basic block containing them. Because we know that all such + // blocks will have single predecessor we can safely assume that all phi + // nodes have single entry (because of normalizeBBForInvokeSafepoint). + // Just remove them all here. + for (size_t i = 0; i < records.size(); i++) { + Instruction *I = records[i].StatepointToken; + + if (InvokeInst *invoke = dyn_cast<InvokeInst>(I)) { + FoldSingleEntryPHINodes(invoke->getNormalDest()); + assert(!isa<PHINode>(invoke->getNormalDest()->begin())); + + FoldSingleEntryPHINodes(invoke->getUnwindDest()); + assert(!isa<PHINode>(invoke->getUnwindDest()->begin())); + } + } + + // Do all the fixups of the original live variables to their relocated selves + SmallVector<Value *, 128> live; + for (size_t i = 0; i < records.size(); i++) { + struct PartiallyConstructedSafepointRecord &info = records[i]; + // We can't simply save the live set from the original insertion. One of + // the live values might be the result of a call which needs a safepoint. + // That Value* no longer exists and we need to use the new gc_result. + // Thankfully, the liveset is embedded in the statepoint (and updated), so + // we just grab that. + Statepoint statepoint(info.StatepointToken); + live.insert(live.end(), statepoint.gc_args_begin(), + statepoint.gc_args_end()); + } + unique_unsorted(live); + +#ifndef NDEBUG + // sanity check + for (auto ptr : live) { + assert(isGCPointerType(ptr->getType()) && "must be a gc pointer type"); + } +#endif + + relocationViaAlloca(F, DT, live, records); + return !records.empty(); +} + +/// Returns true if this function should be rewritten by this pass. The main +/// point of this function is as an extension point for custom logic. +static bool shouldRewriteStatepointsIn(Function &F) { + // TODO: This should check the GCStrategy + if (F.hasGC()) { + const std::string StatepointExampleName("statepoint-example"); + return StatepointExampleName == F.getGC(); + } else + return false; +} + +bool RewriteStatepointsForGC::runOnFunction(Function &F) { + // Nothing to do for declarations. 
+ if (F.isDeclaration() || F.empty()) + return false; + + // Policy choice says not to rewrite - the most common reason is that we're + // compiling code without a GCStrategy. + if (!shouldRewriteStatepointsIn(F)) + return false; + + // Gather all the statepoints which need rewritten. + SmallVector<CallSite, 64> ParsePointNeeded; + for (Instruction &I : inst_range(F)) { + // TODO: only the ones with the flag set! + if (isStatepoint(I)) + ParsePointNeeded.push_back(CallSite(&I)); + } + + // Return early if no work to do. + if (ParsePointNeeded.empty()) + return false; + + DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + return insertParsePoints(F, DT, this, ParsePointNeeded); +} diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index cfc9a8e..05b9608 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -35,7 +35,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Utils/Local.h" #include <algorithm> @@ -1504,7 +1504,7 @@ namespace { /// struct SCCP : public FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<TargetLibraryInfo>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } static char ID; // Pass identification, replacement for typeid SCCP() : FunctionPass(ID) { @@ -1563,7 +1563,8 @@ bool SCCP::runOnFunction(Function &F) { DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n"); const DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr; - const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); + const TargetLibraryInfo *TLI = + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); SCCPSolver Solver(DL, TLI); // Mark the first block of the function as being executable. @@ -1637,7 +1638,7 @@ namespace { /// struct IPSCCP : public ModulePass { void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<TargetLibraryInfo>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } static char ID; IPSCCP() : ModulePass(ID) { @@ -1651,7 +1652,7 @@ char IPSCCP::ID = 0; INITIALIZE_PASS_BEGIN(IPSCCP, "ipsccp", "Interprocedural Sparse Conditional Constant Propagation", false, false) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(IPSCCP, "ipsccp", "Interprocedural Sparse Conditional Constant Propagation", false, false) @@ -1692,7 +1693,8 @@ static bool AddressIsTaken(const GlobalValue *GV) { bool IPSCCP::runOnModule(Module &M) { DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); const DataLayout *DL = DLP ? 
&DLP->getDataLayout() : nullptr; - const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); + const TargetLibraryInfo *TLI = + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); SCCPSolver Solver(DL, TLI); // AddressTakenFunctions - This set keeps track of the address-taken functions diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index 6135114..f69c750 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -28,7 +28,7 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AssumptionTracker.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/PtrUseVisitor.h" #include "llvm/Analysis/ValueTracking.h" @@ -79,8 +79,8 @@ STATISTIC(NumVectorized, "Number of vectorized aggregates"); /// Hidden option to force the pass to not use DomTree and mem2reg, instead /// forming SSA values through the SSAUpdater infrastructure. -static cl::opt<bool> -ForceSSAUpdater("force-ssa-updater", cl::init(false), cl::Hidden); +static cl::opt<bool> ForceSSAUpdater("force-ssa-updater", cl::init(false), + cl::Hidden); /// Hidden option to enable randomly shuffling the slices to help uncover /// instability in their order. @@ -89,15 +89,15 @@ static cl::opt<bool> SROARandomShuffleSlices("sroa-random-shuffle-slices", /// Hidden option to experiment with completely strict handling of inbounds /// GEPs. -static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", - cl::init(false), cl::Hidden); +static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", cl::init(false), + cl::Hidden); namespace { /// \brief A custom IRBuilder inserter which prefixes all names if they are /// preserved. template <bool preserveNames = true> -class IRBuilderPrefixedInserter : - public IRBuilderDefaultInserter<preserveNames> { +class IRBuilderPrefixedInserter + : public IRBuilderDefaultInserter<preserveNames> { std::string Prefix; public: @@ -113,19 +113,19 @@ protected: // Specialization for not preserving the name is trivial. template <> -class IRBuilderPrefixedInserter<false> : - public IRBuilderDefaultInserter<false> { +class IRBuilderPrefixedInserter<false> + : public IRBuilderDefaultInserter<false> { public: void SetNamePrefix(const Twine &P) {} }; /// \brief Provide a typedef for IRBuilder that drops names in release builds. #ifndef NDEBUG -typedef llvm::IRBuilder<true, ConstantFolder, - IRBuilderPrefixedInserter<true> > IRBuilderTy; +typedef llvm::IRBuilder<true, ConstantFolder, IRBuilderPrefixedInserter<true>> + IRBuilderTy; #else -typedef llvm::IRBuilder<false, ConstantFolder, - IRBuilderPrefixedInserter<false> > IRBuilderTy; +typedef llvm::IRBuilder<false, ConstantFolder, IRBuilderPrefixedInserter<false>> + IRBuilderTy; #endif } @@ -171,10 +171,14 @@ public: /// decreasing. Thus the spanning range comes first in a cluster with the /// same start position. 
bool operator<(const Slice &RHS) const { - if (beginOffset() < RHS.beginOffset()) return true; - if (beginOffset() > RHS.beginOffset()) return false; - if (isSplittable() != RHS.isSplittable()) return !isSplittable(); - if (endOffset() > RHS.endOffset()) return true; + if (beginOffset() < RHS.beginOffset()) + return true; + if (beginOffset() > RHS.beginOffset()) + return false; + if (isSplittable() != RHS.isSplittable()) + return !isSplittable(); + if (endOffset() > RHS.endOffset()) + return true; return false; } @@ -198,9 +202,7 @@ public: namespace llvm { template <typename T> struct isPodLike; -template <> struct isPodLike<Slice> { - static const bool value = true; -}; +template <> struct isPodLike<Slice> { static const bool value = true; }; } namespace { @@ -235,6 +237,298 @@ public: const_iterator end() const { return Slices.end(); } /// @} + /// \brief Erase a range of slices. + void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); } + + /// \brief Insert new slices for this alloca. + /// + /// This moves the slices into the alloca's slices collection, and re-sorts + /// everything so that the usual ordering properties of the alloca's slices + /// hold. + void insert(ArrayRef<Slice> NewSlices) { + int OldSize = Slices.size(); + std::move(NewSlices.begin(), NewSlices.end(), std::back_inserter(Slices)); + auto SliceI = Slices.begin() + OldSize; + std::sort(SliceI, Slices.end()); + std::inplace_merge(Slices.begin(), SliceI, Slices.end()); + } + + // Forward declare an iterator to befriend it. + class partition_iterator; + + /// \brief A partition of the slices. + /// + /// An ephemeral representation for a range of slices which can be viewed as + /// a partition of the alloca. This range represents a span of the alloca's + /// memory which cannot be split, and provides access to all of the slices + /// overlapping some part of the partition. + /// + /// Objects of this type are produced by traversing the alloca's slices, but + /// are only ephemeral and not persistent. + class Partition { + private: + friend class AllocaSlices; + friend class AllocaSlices::partition_iterator; + + /// \brief The begining and ending offsets of the alloca for this partition. + uint64_t BeginOffset, EndOffset; + + /// \brief The start end end iterators of this partition. + iterator SI, SJ; + + /// \brief A collection of split slice tails overlapping the partition. + SmallVector<Slice *, 4> SplitTails; + + /// \brief Raw constructor builds an empty partition starting and ending at + /// the given iterator. + Partition(iterator SI) : SI(SI), SJ(SI) {} + + public: + /// \brief The start offset of this partition. + /// + /// All of the contained slices start at or after this offset. + uint64_t beginOffset() const { return BeginOffset; } + + /// \brief The end offset of this partition. + /// + /// All of the contained slices end at or before this offset. + uint64_t endOffset() const { return EndOffset; } + + /// \brief The size of the partition. + /// + /// Note that this can never be zero. + uint64_t size() const { + assert(BeginOffset < EndOffset && "Partitions must span some bytes!"); + return EndOffset - BeginOffset; + } + + /// \brief Test whether this partition contains no slices, and merely spans + /// a region occupied by split slices. + bool empty() const { return SI == SJ; } + + /// \name Iterate slices that start within the partition. + /// These may be splittable or unsplittable. They have a begin offset >= the + /// partition begin offset. 
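The insert() member added above restores sorted order without re-sorting the whole slice vector: only the appended tail is sorted and then merged into the already-sorted prefix. The same pattern on plain integers, as a small illustrative sketch (sortedInsert is a made-up name; it assumes the existing contents are already sorted):

  #include <algorithm>
  #include <vector>

  // Append NewElems to an already-sorted vector and restore sorted order by
  // sorting only the new tail and merging it in place.
  void sortedInsert(std::vector<int> &Sorted, const std::vector<int> &NewElems) {
    auto OldSize = Sorted.size();
    Sorted.insert(Sorted.end(), NewElems.begin(), NewElems.end());
    auto TailBegin = Sorted.begin() + OldSize;
    std::sort(TailBegin, Sorted.end());
    std::inplace_merge(Sorted.begin(), TailBegin, Sorted.end());
  }

This costs O(new log new + total) instead of O(total log total) for a full re-sort, which matters when slices are inserted repeatedly.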
+ /// @{ + // FIXME: We should probably define a "concat_iterator" helper and use that + // to stitch together pointee_iterators over the split tails and the + // contiguous iterators of the partition. That would give a much nicer + // interface here. We could then additionally expose filtered iterators for + // split, unsplit, and unsplittable splices based on the usage patterns. + iterator begin() const { return SI; } + iterator end() const { return SJ; } + /// @} + + /// \brief Get the sequence of split slice tails. + /// + /// These tails are of slices which start before this partition but are + /// split and overlap into the partition. We accumulate these while forming + /// partitions. + ArrayRef<Slice *> splitSliceTails() const { return SplitTails; } + }; + + /// \brief An iterator over partitions of the alloca's slices. + /// + /// This iterator implements the core algorithm for partitioning the alloca's + /// slices. It is a forward iterator as we don't support backtracking for + /// efficiency reasons, and re-use a single storage area to maintain the + /// current set of split slices. + /// + /// It is templated on the slice iterator type to use so that it can operate + /// with either const or non-const slice iterators. + class partition_iterator + : public iterator_facade_base<partition_iterator, + std::forward_iterator_tag, Partition> { + friend class AllocaSlices; + + /// \brief Most of the state for walking the partitions is held in a class + /// with a nice interface for examining them. + Partition P; + + /// \brief We need to keep the end of the slices to know when to stop. + AllocaSlices::iterator SE; + + /// \brief We also need to keep track of the maximum split end offset seen. + /// FIXME: Do we really? + uint64_t MaxSplitSliceEndOffset; + + /// \brief Sets the partition to be empty at given iterator, and sets the + /// end iterator. + partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE) + : P(SI), SE(SE), MaxSplitSliceEndOffset(0) { + // If not already at the end, advance our state to form the initial + // partition. + if (SI != SE) + advance(); + } + + /// \brief Advance the iterator to the next partition. + /// + /// Requires that the iterator not be at the end of the slices. + void advance() { + assert((P.SI != SE || !P.SplitTails.empty()) && + "Cannot advance past the end of the slices!"); + + // Clear out any split uses which have ended. + if (!P.SplitTails.empty()) { + if (P.EndOffset >= MaxSplitSliceEndOffset) { + // If we've finished all splits, this is easy. + P.SplitTails.clear(); + MaxSplitSliceEndOffset = 0; + } else { + // Remove the uses which have ended in the prior partition. This + // cannot change the max split slice end because we just checked that + // the prior partition ended prior to that max. + P.SplitTails.erase( + std::remove_if( + P.SplitTails.begin(), P.SplitTails.end(), + [&](Slice *S) { return S->endOffset() <= P.EndOffset; }), + P.SplitTails.end()); + assert(std::any_of(P.SplitTails.begin(), P.SplitTails.end(), + [&](Slice *S) { + return S->endOffset() == MaxSplitSliceEndOffset; + }) && + "Could not find the current max split slice offset!"); + assert(std::all_of(P.SplitTails.begin(), P.SplitTails.end(), + [&](Slice *S) { + return S->endOffset() <= MaxSplitSliceEndOffset; + }) && + "Max split slice end offset is not actually the max!"); + } + } + + // If P.SI is already at the end, then we've cleared the split tail and + // now have an end iterator. 
+ if (P.SI == SE) { + assert(P.SplitTails.empty() && "Failed to clear the split slices!"); + return; + } + + // If we had a non-empty partition previously, set up the state for + // subsequent partitions. + if (P.SI != P.SJ) { + // Accumulate all the splittable slices which started in the old + // partition into the split list. + for (Slice &S : P) + if (S.isSplittable() && S.endOffset() > P.EndOffset) { + P.SplitTails.push_back(&S); + MaxSplitSliceEndOffset = + std::max(S.endOffset(), MaxSplitSliceEndOffset); + } + + // Start from the end of the previous partition. + P.SI = P.SJ; + + // If P.SI is now at the end, we at most have a tail of split slices. + if (P.SI == SE) { + P.BeginOffset = P.EndOffset; + P.EndOffset = MaxSplitSliceEndOffset; + return; + } + + // If the we have split slices and the next slice is after a gap and is + // not splittable immediately form an empty partition for the split + // slices up until the next slice begins. + if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset && + !P.SI->isSplittable()) { + P.BeginOffset = P.EndOffset; + P.EndOffset = P.SI->beginOffset(); + return; + } + } + + // OK, we need to consume new slices. Set the end offset based on the + // current slice, and step SJ past it. The beginning offset of the + // parttion is the beginning offset of the next slice unless we have + // pre-existing split slices that are continuing, in which case we begin + // at the prior end offset. + P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset; + P.EndOffset = P.SI->endOffset(); + ++P.SJ; + + // There are two strategies to form a partition based on whether the + // partition starts with an unsplittable slice or a splittable slice. + if (!P.SI->isSplittable()) { + // When we're forming an unsplittable region, it must always start at + // the first slice and will extend through its end. + assert(P.BeginOffset == P.SI->beginOffset()); + + // Form a partition including all of the overlapping slices with this + // unsplittable slice. + while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) { + if (!P.SJ->isSplittable()) + P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset()); + ++P.SJ; + } + + // We have a partition across a set of overlapping unsplittable + // partitions. + return; + } + + // If we're starting with a splittable slice, then we need to form + // a synthetic partition spanning it and any other overlapping splittable + // splices. + assert(P.SI->isSplittable() && "Forming a splittable partition!"); + + // Collect all of the overlapping splittable slices. + while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset && + P.SJ->isSplittable()) { + P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset()); + ++P.SJ; + } + + // Back upiP.EndOffset if we ended the span early when encountering an + // unsplittable slice. This synthesizes the early end offset of + // a partition spanning only splittable slices. + if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) { + assert(!P.SJ->isSplittable()); + P.EndOffset = P.SJ->beginOffset(); + } + } + + public: + bool operator==(const partition_iterator &RHS) const { + assert(SE == RHS.SE && + "End iterators don't match between compared partition iterators!"); + + // The observed positions of partitions is marked by the P.SI iterator and + // the emptyness of the split slices. The latter is only relevant when + // P.SI == SE, as the end iterator will additionally have an empty split + // slices list, but the prior may have the same P.SI and a tail of split + // slices. 
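The advance() logic above is easier to see on a reduced model: with slices sorted by begin offset, a partition that starts with an unsplittable slice absorbs everything overlapping it and only other unsplittable slices push its end outward, while a splittable run is grouped until it hits an overlapping unsplittable slice. A much-simplified sketch that ignores split tails entirely; MiniSlice, MiniPartition, and formPartitions are invented for the example and are not the pass's API:

  #include <algorithm>
  #include <cstddef>
  #include <cstdint>
  #include <vector>

  struct MiniSlice {
    uint64_t Begin, End;
    bool Splittable;
  };

  struct MiniPartition {
    uint64_t Begin, End;
    std::size_t FirstSlice, LastSlice; // half-open index range into the input
  };

  // Group slices (sorted by Begin) into partitions.
  std::vector<MiniPartition> formPartitions(const std::vector<MiniSlice> &S) {
    std::vector<MiniPartition> Out;
    std::size_t I = 0;
    while (I != S.size()) {
      MiniPartition P{S[I].Begin, S[I].End, I, I + 1};
      if (!S[I].Splittable) {
        // Unsplittable start: absorb everything overlapping; only other
        // unsplittable slices may push the end offset further out.
        while (P.LastSlice != S.size() && S[P.LastSlice].Begin < P.End) {
          if (!S[P.LastSlice].Splittable)
            P.End = std::max(P.End, S[P.LastSlice].End);
          ++P.LastSlice;
        }
      } else {
        // Splittable start: group overlapping splittable slices, stopping
        // early at the first overlapping unsplittable one.
        while (P.LastSlice != S.size() && S[P.LastSlice].Begin < P.End &&
               S[P.LastSlice].Splittable) {
          P.End = std::max(P.End, S[P.LastSlice].End);
          ++P.LastSlice;
        }
        if (P.LastSlice != S.size() && S[P.LastSlice].Begin < P.End)
          P.End = S[P.LastSlice].Begin; // synthesize an early end offset
      }
      Out.push_back(P);
      I = P.LastSlice;
    }
    return Out;
  }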
+ if (P.SI == RHS.P.SI && + P.SplitTails.empty() == RHS.P.SplitTails.empty()) { + assert(P.SJ == RHS.P.SJ && + "Same set of slices formed two different sized partitions!"); + assert(P.SplitTails.size() == RHS.P.SplitTails.size() && + "Same slice position with differently sized non-empty split " + "slice tails!"); + return true; + } + return false; + } + + partition_iterator &operator++() { + advance(); + return *this; + } + + Partition &operator*() { return P; } + }; + + /// \brief A forward range over the partitions of the alloca's slices. + /// + /// This accesses an iterator range over the partitions of the alloca's + /// slices. It computes these partitions on the fly based on the overlapping + /// offsets of the slices and the ability to split them. It will visit "empty" + /// partitions to cover regions of the alloca only accessed via split + /// slices. + iterator_range<partition_iterator> partitions() { + return make_range(partition_iterator(begin(), end()), + partition_iterator(end(), end())); + } + /// \brief Access the dead users for this alloca. ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; } @@ -308,7 +602,7 @@ static Value *foldSelectInst(SelectInst &SI) { // being selected between, fold the select. Yes this does (rarely) happen // early on. if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition())) - return SI.getOperand(1+CI->isZero()); + return SI.getOperand(1 + CI->isZero()); if (SI.getOperand(1) == SI.getOperand(2)) return SI.getOperand(1); @@ -421,7 +715,8 @@ private: GEPOffset += APInt(Offset.getBitWidth(), SL->getElementOffset(ElementIdx)); } else { - // For array or vector indices, scale the index by the size of the type. + // For array or vector indices, scale the index by the size of the + // type. APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth()); GEPOffset += Index * APInt(Offset.getBitWidth(), DL.getTypeAllocSize(GTI.getIndexedType())); @@ -440,16 +735,10 @@ private: void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset, uint64_t Size, bool IsVolatile) { - // We allow splitting of loads and stores where the type is an integer type - // and cover the entire alloca. This prevents us from splitting over - // eagerly. - // FIXME: In the great blue eventually, we should eagerly split all integer - // loads and stores, and then have a separate step that merges adjacent - // alloca partitions into a single partition suitable for integer widening. - // Or we should skip the merge step and rely on GVN and other passes to - // merge adjacent loads and stores that survive mem2reg. - bool IsSplittable = - Ty->isIntegerTy() && !IsVolatile && Offset == 0 && Size >= AllocSize; + // We allow splitting of non-volatile loads and stores where the type is an + // integer type. These may be used to implement 'memcpy' or other "transfer + // of bits" patterns. + bool IsSplittable = Ty->isIntegerTy() && !IsVolatile; insertUse(I, Offset, Size, IsSplittable); } @@ -495,7 +784,6 @@ private: handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile()); } - void visitMemSetInst(MemSetInst &II) { assert(II.getRawDest() == *U && "Pointer use is not the destination?"); ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); @@ -507,9 +795,8 @@ private: if (!IsOffsetKnown) return PI.setAborted(&II); - insertUse(II, Offset, - Length ? Length->getLimitedValue() - : AllocSize - Offset.getLimitedValue(), + insertUse(II, Offset, Length ? 
Length->getLimitedValue() + : AllocSize - Offset.getLimitedValue(), (bool)Length); } @@ -533,15 +820,15 @@ private: // FIXME: Yet another place we really should bypass this when // instrumenting for ASan. if (Offset.uge(AllocSize)) { - SmallDenseMap<Instruction *, unsigned>::iterator MTPI = MemTransferSliceMap.find(&II); + SmallDenseMap<Instruction *, unsigned>::iterator MTPI = + MemTransferSliceMap.find(&II); if (MTPI != MemTransferSliceMap.end()) AS.Slices[MTPI->second].kill(); return markAsDead(II); } uint64_t RawOffset = Offset.getLimitedValue(); - uint64_t Size = Length ? Length->getLimitedValue() - : AllocSize - RawOffset; + uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset; // Check for the special case where the same exact value is used for both // source and dest. @@ -697,18 +984,12 @@ private: insertUse(I, Offset, Size); } - void visitPHINode(PHINode &PN) { - visitPHINodeOrSelectInst(PN); - } + void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(PN); } - void visitSelectInst(SelectInst &SI) { - visitPHINodeOrSelectInst(SI); - } + void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); } /// \brief Disable SROA entirely if there are unhandled users of the alloca. - void visitInstruction(Instruction &I) { - PI.setAborted(&I); - } + void visitInstruction(Instruction &I) { PI.setAborted(&I); } }; AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI) @@ -729,7 +1010,9 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI) } Slices.erase(std::remove_if(Slices.begin(), Slices.end(), - std::mem_fun_ref(&Slice::isDead)), + [](const Slice &S) { + return S.isDead(); + }), Slices.end()); #if __cplusplus >= 201103L && !defined(NDEBUG) @@ -749,6 +1032,7 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI) void AllocaSlices::print(raw_ostream &OS, const_iterator I, StringRef Indent) const { printSlice(OS, I, Indent); + OS << "\n"; printUse(OS, I, Indent); } @@ -756,7 +1040,7 @@ void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I, StringRef Indent) const { OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")" << " slice #" << (I - begin()) - << (I->isSplittable() ? " (splittable)" : "") << "\n"; + << (I->isSplittable() ? " (splittable)" : ""); } void AllocaSlices::printUse(raw_ostream &OS, const_iterator I, @@ -804,15 +1088,17 @@ public: AllocaInst &AI, DIBuilder &DIB) : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {} - void run(const SmallVectorImpl<Instruction*> &Insts) { + void run(const SmallVectorImpl<Instruction *> &Insts) { // Retain the debug information attached to the alloca for use when // rewriting loads and stores. 
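The hunk above also swaps the deprecated std::mem_fun_ref for a lambda in the usual erase/remove_if idiom when pruning dead slices. A tiny self-contained version of the same pattern, with Slice reduced to an invented Item struct purely for illustration:

  #include <algorithm>
  #include <vector>

  struct Item {
    bool Dead;
    bool isDead() const { return Dead; }
  };

  // Drop all dead items in one pass: remove_if compacts the live items to the
  // front and erase trims the leftover tail.
  void pruneDead(std::vector<Item> &Items) {
    Items.erase(std::remove_if(Items.begin(), Items.end(),
                               [](const Item &I) { return I.isDead(); }),
                Items.end());
  }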
- if (MDNode *DebugNode = MDNode::getIfExists(AI.getContext(), &AI)) { - for (User *U : DebugNode->users()) - if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U)) - DDIs.push_back(DDI); - else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U)) - DVIs.push_back(DVI); + if (auto *L = LocalAsMetadata::getIfExists(&AI)) { + if (auto *DebugNode = MetadataAsValue::getIfExists(AI.getContext(), L)) { + for (User *U : DebugNode->users()) + if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U)) + DDIs.push_back(DDI); + else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U)) + DVIs.push_back(DVI); + } } LoadAndStorePromoter::run(Insts); @@ -825,8 +1111,9 @@ public: DVIs.pop_back_val()->eraseFromParent(); } - bool isInstInList(Instruction *I, - const SmallVectorImpl<Instruction*> &Insts) const override { + bool + isInstInList(Instruction *I, + const SmallVectorImpl<Instruction *> &Insts) const override { Value *Ptr; if (LoadInst *LI = dyn_cast<LoadInst>(I)) Ptr = LI->getOperand(0); @@ -884,7 +1171,6 @@ public: }; } // end anon namespace - namespace { /// \brief An optimization pass providing Scalar Replacement of Aggregates. /// @@ -910,7 +1196,7 @@ class SROA : public FunctionPass { LLVMContext *C; const DataLayout *DL; DominatorTree *DT; - AssumptionTracker *AT; + AssumptionCache *AC; /// \brief Worklist of alloca instructions to simplify. /// @@ -919,12 +1205,12 @@ class SROA : public FunctionPass { /// directly promoted. Finally, each time we rewrite a use of an alloca other /// the one being actively rewritten, we add it back onto the list if not /// already present to ensure it is re-visited. - SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > Worklist; + SetVector<AllocaInst *, SmallVector<AllocaInst *, 16>> Worklist; /// \brief A collection of instructions to delete. /// We try to batch deletions to simplify code and make things a bit more /// efficient. - SetVector<Instruction *, SmallVector<Instruction *, 8> > DeadInsts; + SetVector<Instruction *, SmallVector<Instruction *, 8>> DeadInsts; /// \brief Post-promotion worklist. /// @@ -934,7 +1220,7 @@ class SROA : public FunctionPass { /// /// Note that we have to be very careful to clear allocas out of this list in /// the event they are deleted. - SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > PostPromotionWorklist; + SetVector<AllocaInst *, SmallVector<AllocaInst *, 16>> PostPromotionWorklist; /// \brief A collection of alloca instructions we can directly promote. std::vector<AllocaInst *> PromotableAllocas; @@ -944,7 +1230,7 @@ class SROA : public FunctionPass { /// All of these PHIs have been checked for the safety of speculation and by /// being speculated will allow promoting allocas currently in the promotable /// queue. - SetVector<PHINode *, SmallVector<PHINode *, 2> > SpeculatablePHIs; + SetVector<PHINode *, SmallVector<PHINode *, 2>> SpeculatablePHIs; /// \brief A worklist of select instructions to speculate prior to promoting /// allocas. @@ -952,12 +1238,12 @@ class SROA : public FunctionPass { /// All of these select instructions have been checked for the safety of /// speculation and by being speculated will allow promoting allocas /// currently in the promotable queue. 
- SetVector<SelectInst *, SmallVector<SelectInst *, 2> > SpeculatableSelects; + SetVector<SelectInst *, SmallVector<SelectInst *, 2>> SpeculatableSelects; public: SROA(bool RequiresDomTree = true) - : FunctionPass(ID), RequiresDomTree(RequiresDomTree), - C(nullptr), DL(nullptr), DT(nullptr) { + : FunctionPass(ID), RequiresDomTree(RequiresDomTree), C(nullptr), + DL(nullptr), DT(nullptr) { initializeSROAPass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override; @@ -970,10 +1256,9 @@ private: friend class PHIOrSelectSpeculator; friend class AllocaSliceRewriter; - bool rewritePartition(AllocaInst &AI, AllocaSlices &AS, - AllocaSlices::iterator B, AllocaSlices::iterator E, - int64_t BeginOffset, int64_t EndOffset, - ArrayRef<AllocaSlices::iterator> SplitUses); + bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS); + AllocaInst *rewritePartition(AllocaInst &AI, AllocaSlices &AS, + AllocaSlices::Partition &P); bool splitAlloca(AllocaInst &AI, AllocaSlices &AS); bool runOnAlloca(AllocaInst &AI); void clobberUse(Use &U); @@ -988,12 +1273,12 @@ FunctionPass *llvm::createSROAPass(bool RequiresDomTree) { return new SROA(RequiresDomTree); } -INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates", - false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionTracker) +INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates", false, + false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates", - false, false) +INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates", false, + false) /// Walk the range of a partitioning looking for a common type to cover this /// sequence of slices. @@ -1064,8 +1349,7 @@ static Type *findCommonType(AllocaSlices::const_iterator B, /// /// FIXME: This should be hoisted into a generic utility, likely in /// Transforms/Util/Local.h -static bool isSafePHIToSpeculate(PHINode &PN, - const DataLayout *DL = nullptr) { +static bool isSafePHIToSpeculate(PHINode &PN, const DataLayout *DL = nullptr) { // For now, we can only do this promotion if the load is in the same block // as the PHI, and if there are no stores between the phi and load. // TODO: Allow recursive phi users. @@ -1325,7 +1609,8 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL, SmallVectorImpl<Value *> &Indices, Twine NamePrefix) { if (Offset == 0) - return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices, NamePrefix); + return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices, + NamePrefix); // We can't recurse through pointer types. if (Ty->isPointerTy()) @@ -1433,8 +1718,7 @@ static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL, /// a single GEP as possible, thus making each GEP more independent of the /// surrounding code. static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, - APInt Offset, Type *PointerTy, - Twine NamePrefix) { + APInt Offset, Type *PointerTy, Twine NamePrefix) { // Even though we don't look through PHI nodes, we could be called on an // instruction in an unreachable block, which may be on a cycle. SmallPtrSet<Value *, 4> Visited; @@ -1443,8 +1727,9 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, // We may end up computing an offset pointer that has the wrong type. 
If we // never are able to compute one directly that has the correct type, we'll - // fall back to it, so keep it around here. + // fall back to it, so keep it and the base it was computed from around here. Value *OffsetPtr = nullptr; + Value *OffsetBasePtr; // Remember any i8 pointer we come across to re-use if we need to do a raw // byte offset. @@ -1469,16 +1754,19 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, Indices.clear(); if (Value *P = getNaturalGEPWithOffset(IRB, DL, Ptr, Offset, TargetTy, Indices, NamePrefix)) { - if (P->getType() == PointerTy) { - // Zap any offset pointer that we ended up computing in previous rounds. - if (OffsetPtr && OffsetPtr->use_empty()) - if (Instruction *I = dyn_cast<Instruction>(OffsetPtr)) - I->eraseFromParent(); + // If we have a new natural pointer at the offset, clear out any old + // offset pointer we computed. Unless it is the base pointer or + // a non-instruction, we built a GEP we don't need. Zap it. + if (OffsetPtr && OffsetPtr != OffsetBasePtr) + if (Instruction *I = dyn_cast<Instruction>(OffsetPtr)) { + assert(I->use_empty() && "Built a GEP with uses some how!"); + I->eraseFromParent(); + } + OffsetPtr = P; + OffsetBasePtr = Ptr; + // If we also found a pointer of the right type, we're done. + if (P->getType() == PointerTy) return P; - } - if (!OffsetPtr) { - OffsetPtr = P; - } } // Stash this pointer if we've found an i8*. @@ -1508,9 +1796,10 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, Int8PtrOffset = Offset; } - OffsetPtr = Int8PtrOffset == 0 ? Int8Ptr : - IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset), - NamePrefix + "sroa_raw_idx"); + OffsetPtr = Int8PtrOffset == 0 + ? Int8Ptr + : IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset), + NamePrefix + "sroa_raw_idx"); } Ptr = OffsetPtr; @@ -1521,6 +1810,27 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, return Ptr; } +/// \brief Compute the adjusted alignment for a load or store from an offset. +static unsigned getAdjustedAlignment(Instruction *I, uint64_t Offset, + const DataLayout &DL) { + unsigned Alignment; + Type *Ty; + if (auto *LI = dyn_cast<LoadInst>(I)) { + Alignment = LI->getAlignment(); + Ty = LI->getType(); + } else if (auto *SI = dyn_cast<StoreInst>(I)) { + Alignment = SI->getAlignment(); + Ty = SI->getValueOperand()->getType(); + } else { + llvm_unreachable("Only loads and stores are allowed!"); + } + + if (!Alignment) + Alignment = DL.getABITypeAlignment(Ty); + + return MinAlign(Alignment, Offset); +} + /// \brief Test whether we can convert a value from the old to the new type. /// /// This predicate should be used to guard calls to convertValue in order to @@ -1614,19 +1924,19 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, /// /// This function is called to test each entry in a partioning which is slated /// for a single slice. -static bool -isVectorPromotionViableForSlice(const DataLayout &DL, uint64_t SliceBeginOffset, - uint64_t SliceEndOffset, VectorType *Ty, - uint64_t ElementSize, const Slice &S) { +static bool isVectorPromotionViableForSlice(AllocaSlices::Partition &P, + const Slice &S, VectorType *Ty, + uint64_t ElementSize, + const DataLayout &DL) { // First validate the slice offsets. 
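The new getAdjustedAlignment helper above falls back to the type's ABI alignment when the load or store carries none, then combines that value with the byte offset via MinAlign. For power-of-two alignments the combined guarantee is just the lowest set bit of (Align | Offset); the following is a hedged local sketch of that computation, not the llvm::MinAlign declaration itself:

  #include <cstdint>

  // Largest power of two dividing both A and B: the alignment still
  // guaranteed after adding a B-byte offset to an A-aligned pointer.
  inline uint64_t minAlign(uint64_t A, uint64_t B) {
    return (A | B) & (~(A | B) + 1); // isolate the lowest set bit
  }

  // e.g. minAlign(16, 4) == 4, minAlign(16, 24) == 8, minAlign(8, 0) == 8.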
uint64_t BeginOffset = - std::max(S.beginOffset(), SliceBeginOffset) - SliceBeginOffset; + std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset(); uint64_t BeginIndex = BeginOffset / ElementSize; if (BeginIndex * ElementSize != BeginOffset || BeginIndex >= Ty->getNumElements()) return false; uint64_t EndOffset = - std::min(S.endOffset(), SliceEndOffset) - SliceBeginOffset; + std::min(S.endOffset(), P.endOffset()) - P.beginOffset(); uint64_t EndIndex = EndOffset / ElementSize; if (EndIndex * ElementSize != EndOffset || EndIndex > Ty->getNumElements()) return false; @@ -1658,7 +1968,7 @@ isVectorPromotionViableForSlice(const DataLayout &DL, uint64_t SliceBeginOffset, if (LI->isVolatile()) return false; Type *LTy = LI->getType(); - if (SliceBeginOffset > S.beginOffset() || SliceEndOffset < S.endOffset()) { + if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) { assert(LTy->isIntegerTy()); LTy = SplitIntTy; } @@ -1668,7 +1978,7 @@ isVectorPromotionViableForSlice(const DataLayout &DL, uint64_t SliceBeginOffset, if (SI->isVolatile()) return false; Type *STy = SI->getValueOperand()->getType(); - if (SliceBeginOffset > S.beginOffset() || SliceEndOffset < S.endOffset()) { + if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) { assert(STy->isIntegerTy()); STy = SplitIntTy; } @@ -1690,11 +2000,8 @@ isVectorPromotionViableForSlice(const DataLayout &DL, uint64_t SliceBeginOffset, /// SSA value. We only can ensure this for a limited set of operations, and we /// don't want to do the rewrites unless we are confident that the result will /// be promotable, so we have an early test here. -static VectorType * -isVectorPromotionViable(const DataLayout &DL, Type *AllocaTy, - uint64_t SliceBeginOffset, uint64_t SliceEndOffset, - AllocaSlices::const_range Slices, - ArrayRef<AllocaSlices::iterator> SplitUses) { +static VectorType *isVectorPromotionViable(AllocaSlices::Partition &P, + const DataLayout &DL) { // Collect the candidate types for vector-based promotion. Also track whether // we have different element types. SmallVector<VectorType *, 4> CandidateTys; @@ -1709,11 +2016,10 @@ isVectorPromotionViable(const DataLayout &DL, Type *AllocaTy, HaveCommonEltTy = false; } }; - CheckCandidateType(AllocaTy); // Consider any loads or stores that are the exact size of the slice. 
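The offset checks at the top of isVectorPromotionViableForSlice reject any slice whose byte range does not line up exactly with whole vector elements of the candidate type. The arithmetic in isolation, as a rough sketch with a made-up helper name and a final non-empty check added for completeness:

  #include <cstdint>

  // True if [Begin, End), given relative to the partition start, covers a
  // whole number of ElementSize-byte elements and stays inside a vector of
  // NumElements elements.
  bool coversWholeElements(uint64_t Begin, uint64_t End, uint64_t ElementSize,
                           uint64_t NumElements) {
    uint64_t BeginIndex = Begin / ElementSize;
    if (BeginIndex * ElementSize != Begin || BeginIndex >= NumElements)
      return false;
    uint64_t EndIndex = End / ElementSize;
    if (EndIndex * ElementSize != End || EndIndex > NumElements)
      return false;
    return EndIndex > BeginIndex; // at least one full element
  }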
- for (const auto &S : Slices) - if (S.beginOffset() == SliceBeginOffset && - S.endOffset() == SliceEndOffset) { + for (const Slice &S : P) + if (S.beginOffset() == P.beginOffset() && + S.endOffset() == P.endOffset()) { if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser())) CheckCandidateType(LI->getType()); else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser())) @@ -1780,14 +2086,12 @@ isVectorPromotionViable(const DataLayout &DL, Type *AllocaTy, "vector size not a multiple of element size?"); ElementSize /= 8; - for (const auto &S : Slices) - if (!isVectorPromotionViableForSlice(DL, SliceBeginOffset, SliceEndOffset, - VTy, ElementSize, S)) + for (const Slice &S : P) + if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL)) return false; - for (const auto &SI : SplitUses) - if (!isVectorPromotionViableForSlice(DL, SliceBeginOffset, SliceEndOffset, - VTy, ElementSize, *SI)) + for (const Slice *S : P.splitSliceTails()) + if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL)) return false; return true; @@ -1803,12 +2107,13 @@ isVectorPromotionViable(const DataLayout &DL, Type *AllocaTy, /// /// This implements the necessary checking for the \c isIntegerWideningViable /// test below on a single slice of the alloca. -static bool isIntegerWideningViableForSlice(const DataLayout &DL, - Type *AllocaTy, +static bool isIntegerWideningViableForSlice(const Slice &S, uint64_t AllocBeginOffset, - uint64_t Size, - const Slice &S, + Type *AllocaTy, + const DataLayout &DL, bool &WholeAllocaOp) { + uint64_t Size = DL.getTypeStoreSize(AllocaTy); + uint64_t RelBegin = S.beginOffset() - AllocBeginOffset; uint64_t RelEnd = S.endOffset() - AllocBeginOffset; @@ -1876,11 +2181,8 @@ static bool isIntegerWideningViableForSlice(const DataLayout &DL, /// This is a quick test to check whether we can rewrite the integer loads and /// stores to a particular alloca into wider loads and stores and be able to /// promote the resulting alloca. -static bool -isIntegerWideningViable(const DataLayout &DL, Type *AllocaTy, - uint64_t AllocBeginOffset, - AllocaSlices::const_range Slices, - ArrayRef<AllocaSlices::iterator> SplitUses) { +static bool isIntegerWideningViable(AllocaSlices::Partition &P, Type *AllocaTy, + const DataLayout &DL) { uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy); // Don't create integer types larger than the maximum bitwidth. if (SizeInBits > IntegerType::MAX_INT_BITS) @@ -1898,24 +2200,24 @@ isIntegerWideningViable(const DataLayout &DL, Type *AllocaTy, !canConvertValue(DL, IntTy, AllocaTy)) return false; - uint64_t Size = DL.getTypeStoreSize(AllocaTy); - // While examining uses, we ensure that the alloca has a covering load or // store. We don't want to widen the integer operations only to fail to // promote due to some other unsplittable entry (which we may make splittable // later). However, if there are only splittable uses, go ahead and assume // that we cover the alloca. + // FIXME: We shouldn't consider split slices that happen to start in the + // partition here... bool WholeAllocaOp = - Slices.begin() != Slices.end() ? false : DL.isLegalInteger(SizeInBits); + P.begin() != P.end() ? 
false : DL.isLegalInteger(SizeInBits); - for (const auto &S : Slices) - if (!isIntegerWideningViableForSlice(DL, AllocaTy, AllocBeginOffset, Size, - S, WholeAllocaOp)) + for (const Slice &S : P) + if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL, + WholeAllocaOp)) return false; - for (const auto &SI : SplitUses) - if (!isIntegerWideningViableForSlice(DL, AllocaTy, AllocBeginOffset, Size, - *SI, WholeAllocaOp)) + for (const Slice *S : P.splitSliceTails()) + if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL, + WholeAllocaOp)) return false; return WholeAllocaOp; @@ -1928,9 +2230,9 @@ static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V, IntegerType *IntTy = cast<IntegerType>(V->getType()); assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) && "Element extends past full value"); - uint64_t ShAmt = 8*Offset; + uint64_t ShAmt = 8 * Offset; if (DL.isBigEndian()) - ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); + ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); if (ShAmt) { V = IRB.CreateLShr(V, ShAmt, Name + ".shift"); DEBUG(dbgs() << " shifted: " << *V << "\n"); @@ -1957,9 +2259,9 @@ static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old, } assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) && "Element store outside of alloca store"); - uint64_t ShAmt = 8*Offset; + uint64_t ShAmt = 8 * Offset; if (DL.isBigEndian()) - ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); + ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); if (ShAmt) { V = IRB.CreateShl(V, ShAmt, Name + ".shift"); DEBUG(dbgs() << " shifted: " << *V << "\n"); @@ -1975,9 +2277,8 @@ static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old, return V; } -static Value *extractVector(IRBuilderTy &IRB, Value *V, - unsigned BeginIndex, unsigned EndIndex, - const Twine &Name) { +static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex, + unsigned EndIndex, const Twine &Name) { VectorType *VecTy = cast<VectorType>(V->getType()); unsigned NumElements = EndIndex - BeginIndex; assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); @@ -1992,13 +2293,12 @@ static Value *extractVector(IRBuilderTy &IRB, Value *V, return V; } - SmallVector<Constant*, 8> Mask; + SmallVector<Constant *, 8> Mask; Mask.reserve(NumElements); for (unsigned i = BeginIndex; i != EndIndex; ++i) Mask.push_back(IRB.getInt32(i)); V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()), - ConstantVector::get(Mask), - Name + ".extract"); + ConstantVector::get(Mask), Name + ".extract"); DEBUG(dbgs() << " shuffle: " << *V << "\n"); return V; } @@ -2013,7 +2313,7 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V, // Single element to insert. V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex), Name + ".insert"); - DEBUG(dbgs() << " insert: " << *V << "\n"); + DEBUG(dbgs() << " insert: " << *V << "\n"); return V; } @@ -2029,7 +2329,7 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V, // use a shuffle vector to widen it with undef elements, and then // a second shuffle vector to select between the loaded vector and the // incoming vector. 
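The extractInteger and insertInteger routines touched above turn a byte offset into a shift amount and mirror it for big-endian layouts, where the high-order bytes live at offset zero. The same shift computation on plain integers, as a sketch; extractAtOffset is an invented name and the value is assumed to fit in 64 bits:

  #include <cstdint>

  // Extract NarrowBytes bytes at byte offset Offset from a WideBytes-wide
  // integer, mirroring the ShAmt computation in extractInteger above.
  uint64_t extractAtOffset(uint64_t Wide, unsigned WideBytes,
                           unsigned NarrowBytes, unsigned Offset,
                           bool BigEndian) {
    unsigned ShAmt = 8 * Offset;
    if (BigEndian)
      ShAmt = 8 * (WideBytes - NarrowBytes - Offset);
    uint64_t Narrow = Wide >> ShAmt;
    if (NarrowBytes < 8)
      Narrow &= (1ULL << (8 * NarrowBytes)) - 1; // truncate to the narrow width
    return Narrow;
  }

  // e.g. byte 1 of the 4-byte value 0x11223344 is 0x33 little-endian
  // and 0x22 big-endian.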
- SmallVector<Constant*, 8> Mask; + SmallVector<Constant *, 8> Mask; Mask.reserve(VecTy->getNumElements()); for (unsigned i = 0; i != VecTy->getNumElements(); ++i) if (i >= BeginIndex && i < EndIndex) @@ -2037,8 +2337,7 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V, else Mask.push_back(UndefValue::get(IRB.getInt32Ty())); V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()), - ConstantVector::get(Mask), - Name + ".expand"); + ConstantVector::get(Mask), Name + ".expand"); DEBUG(dbgs() << " shuffle: " << *V << "\n"); Mask.clear(); @@ -2148,6 +2447,9 @@ public: IsSplittable = I->isSplittable(); IsSplit = BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset; + DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : "")); + DEBUG(AS.printSlice(dbgs(), I, "")); + DEBUG(dbgs() << "\n"); // Compute the intersecting offset range. assert(BeginOffset < NewAllocaEndOffset); @@ -2218,7 +2520,8 @@ private: ); } - /// \brief Compute suitable alignment to access this slice of the *new* alloca. + /// \brief Compute suitable alignment to access this slice of the *new* + /// alloca. /// /// You can optionally pass a type to this routine and if that type's ABI /// alignment is itself suitable, this will return zero. @@ -2226,7 +2529,8 @@ private: unsigned NewAIAlign = NewAI.getAlignment(); if (!NewAIAlign) NewAIAlign = DL.getABITypeAlignment(NewAI.getAllocatedType()); - unsigned Align = MinAlign(NewAIAlign, NewBeginOffset - NewAllocaBeginOffset); + unsigned Align = + MinAlign(NewAIAlign, NewBeginOffset - NewAllocaBeginOffset); return (Ty && Align == DL.getABITypeAlignment(Ty)) ? 0 : Align; } @@ -2250,16 +2554,14 @@ private: unsigned EndIndex = getIndex(NewEndOffset); assert(EndIndex > BeginIndex && "Empty vector!"); - Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - "load"); + Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load"); return extractVector(IRB, V, BeginIndex, EndIndex, "vec"); } Value *rewriteIntegerLoad(LoadInst &LI) { assert(IntTy && "We cannot insert an integer to the alloca"); assert(!LI.isVolatile()); - Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - "load"); + Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load"); V = convertValue(DL, IRB, V, IntTy); assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; @@ -2284,8 +2586,8 @@ private: V = rewriteIntegerLoad(LI); } else if (NewBeginOffset == NewAllocaBeginOffset && canConvertValue(DL, NewAllocaTy, LI.getType())) { - V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - LI.isVolatile(), LI.getName()); + V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), LI.isVolatile(), + LI.getName()); } else { Type *LTy = TargetTy->getPointerTo(); V = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy), @@ -2302,7 +2604,7 @@ private: assert(SliceSize < DL.getTypeStoreSize(LI.getType()) && "Split load isn't smaller than original load"); assert(LI.getType()->getIntegerBitWidth() == - DL.getTypeStoreSizeInBits(LI.getType()) && + DL.getTypeStoreSizeInBits(LI.getType()) && "Non-byte-multiple bit width"); // Move the insertion point just past the load so that we can refer to it. IRB.SetInsertPoint(std::next(BasicBlock::iterator(&LI))); @@ -2310,9 +2612,9 @@ private: // basis for the new value. This allows us to replace the uses of LI with // the computed value, and then replace the placeholder with LI, leaving // LI only used for this computation. 
- Value *Placeholder - = new LoadInst(UndefValue::get(LI.getType()->getPointerTo())); - V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset, + Value *Placeholder = + new LoadInst(UndefValue::get(LI.getType()->getPointerTo())); + V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset, "insert"); LI.replaceAllUsesWith(V); Placeholder->replaceAllUsesWith(&LI); @@ -2334,15 +2636,14 @@ private: assert(EndIndex > BeginIndex && "Empty vector!"); unsigned NumElements = EndIndex - BeginIndex; assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); - Type *SliceTy = - (NumElements == 1) ? ElementTy - : VectorType::get(ElementTy, NumElements); + Type *SliceTy = (NumElements == 1) + ? ElementTy + : VectorType::get(ElementTy, NumElements); if (V->getType() != SliceTy) V = convertValue(DL, IRB, V, SliceTy); // Mix in the existing elements. - Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - "load"); + Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load"); V = insertVector(IRB, Old, V, BeginIndex, "vec"); } StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); @@ -2357,13 +2658,12 @@ private: assert(IntTy && "We cannot extract an integer from the alloca"); assert(!SI.isVolatile()); if (DL.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) { - Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - "oldload"); + Value *Old = + IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload"); Old = convertValue(DL, IRB, Old, IntTy); assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); uint64_t Offset = BeginOffset - NewAllocaBeginOffset; - V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, - "insert"); + V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert"); } V = convertValue(DL, IRB, V, NewAllocaTy); StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); @@ -2391,10 +2691,10 @@ private: assert(V->getType()->isIntegerTy() && "Only integer type loads and stores are split"); assert(V->getType()->getIntegerBitWidth() == - DL.getTypeStoreSizeInBits(V->getType()) && + DL.getTypeStoreSizeInBits(V->getType()) && "Non-byte-multiple bit width"); IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8); - V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset, + V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset, "extract"); } @@ -2439,14 +2739,14 @@ private: if (Size == 1) return V; - Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size*8); - V = IRB.CreateMul(IRB.CreateZExt(V, SplatIntTy, "zext"), - ConstantExpr::getUDiv( - Constant::getAllOnesValue(SplatIntTy), - ConstantExpr::getZExt( - Constant::getAllOnesValue(V->getType()), - SplatIntTy)), - "isplat"); + Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8); + V = IRB.CreateMul( + IRB.CreateZExt(V, SplatIntTy, "zext"), + ConstantExpr::getUDiv( + Constant::getAllOnesValue(SplatIntTy), + ConstantExpr::getZExt(Constant::getAllOnesValue(V->getType()), + SplatIntTy)), + "isplat"); return V; } @@ -2483,12 +2783,11 @@ private: // If this doesn't map cleanly onto the alloca type, and that type isn't // a single value type, just emit a memset. 
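The reformatted "isplat" computation above widens a one-byte memset value by multiplying it with an all-0x01 pattern, built as all-ones divided by the all-ones value of the byte type. The same trick on ordinary integers, as a standalone sketch for widths up to 64 bits (splatByte is a made-up name):

  #include <cstdint>

  // Replicate the low byte of V across Size bytes, the way a memset value is
  // splatted before being stored as one wide integer.
  uint64_t splatByte(uint8_t V, unsigned Size) {
    uint64_t AllOnes = Size >= 8 ? ~0ULL : (1ULL << (8 * Size)) - 1;
    uint64_t Pattern = AllOnes / 0xFF; // 0x0101...01, one set byte per position
    return uint64_t(V) * Pattern;
  }

  // e.g. splatByte(0xAB, 4) == 0xABABABAB, splatByte(0x00, 8) == 0.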
if (!VecTy && !IntTy && - (BeginOffset > NewAllocaBeginOffset || - EndOffset < NewAllocaEndOffset || + (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset || SliceSize != DL.getTypeStoreSize(AllocaTy) || !AllocaTy->isSingleValueType() || !DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy)) || - DL.getTypeSizeInBits(ScalarTy)%8 != 0)) { + DL.getTypeSizeInBits(ScalarTy) % 8 != 0)) { Type *SizeTy = II.getLength()->getType(); Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset); CallInst *New = IRB.CreateMemSet( @@ -2522,8 +2821,8 @@ private: if (NumElements > 1) Splat = getVectorSplat(Splat, NumElements); - Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - "oldload"); + Value *Old = + IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload"); V = insertVector(IRB, Old, Splat, BeginIndex, "vec"); } else if (IntTy) { // If this is a memset on an alloca where we can widen stores, insert the @@ -2535,8 +2834,8 @@ private: if (IntTy && (BeginOffset != NewAllocaBeginOffset || EndOffset != NewAllocaBeginOffset)) { - Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - "oldload"); + Value *Old = + IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload"); Old = convertValue(DL, IRB, Old, IntTy); uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; V = insertInteger(DL, IRB, Old, V, Offset, "insert"); @@ -2633,8 +2932,8 @@ private: // Strip all inbounds GEPs and pointer casts to try to dig out any root // alloca that should be re-examined after rewriting this instruction. Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest(); - if (AllocaInst *AI - = dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) { + if (AllocaInst *AI = + dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) { assert(AI != &OldAI && AI != &NewAI && "Splittable transfers cannot reach the same alloca on both ends."); Pass.Worklist.insert(AI); @@ -2673,8 +2972,8 @@ private: unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0; unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0; unsigned NumElements = EndIndex - BeginIndex; - IntegerType *SubIntTy - = IntTy ? Type::getIntNTy(IntTy->getContext(), Size*8) : nullptr; + IntegerType *SubIntTy = + IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr; // Reset the other pointer type to match the register type we're going to // use, but using the address space of the original other pointer. 
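The "isplat" multiply in the memset rewriting above widens a single narrow integer (the memset byte) to the slice width by multiplying it with the 0x0101...01 pattern, which the patch materializes as AllOnes(WideTy) udiv zext(AllOnes(NarrowTy)). Below is a minimal standalone sketch of that arithmetic in plain C++; the helper name and signature are illustrative only, since the pass emits the equivalent computation through IRBuilder rather than calling anything like this.

#include <cassert>
#include <cstdint>

// Replicate the low NarrowBytes bytes of V across WideBytes bytes using the
// same divide-all-ones trick that the rewriter emits as IR ("isplat").
static uint64_t splatInteger(uint64_t V, unsigned NarrowBytes,
                             unsigned WideBytes) {
  assert(NarrowBytes >= 1 && NarrowBytes < 8 && WideBytes <= 8 &&
         WideBytes % NarrowBytes == 0);
  uint64_t NarrowOnes = (1ULL << (NarrowBytes * 8)) - 1;
  uint64_t WideOnes = WideBytes == 8 ? ~0ULL : (1ULL << (WideBytes * 8)) - 1;
  // For i8 -> i32 this is 0xFFFFFFFF / 0xFF == 0x01010101; multiplying by it
  // copies the narrow value into every NarrowBytes-sized lane of the result.
  uint64_t Replicator = WideOnes / NarrowOnes;
  return (V & NarrowOnes) * Replicator;
}

// For example, splatInteger(0xAB, /*NarrowBytes=*/1, /*WideBytes=*/4)
// evaluates to 0xABABABAB, which is exactly the value a 4-byte memset of
// 0xAB should store.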
@@ -2703,27 +3002,25 @@ private: Value *Src; if (VecTy && !IsWholeAlloca && !IsDest) { - Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - "load"); + Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load"); Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec"); } else if (IntTy && !IsWholeAlloca && !IsDest) { - Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - "load"); + Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load"); Src = convertValue(DL, IRB, Src, IntTy); uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract"); } else { - Src = IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(), - "copyload"); + Src = + IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(), "copyload"); } if (VecTy && !IsWholeAlloca && IsDest) { - Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - "oldload"); + Value *Old = + IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload"); Src = insertVector(IRB, Old, Src, BeginIndex, "vec"); } else if (IntTy && !IsWholeAlloca && IsDest) { - Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - "oldload"); + Value *Old = + IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload"); Old = convertValue(DL, IRB, Old, IntTy); uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; Src = insertInteger(DL, IRB, Old, Src, Offset, "insert"); @@ -2746,8 +3043,8 @@ private: // Record this instruction for deletion. Pass.DeadInsts.insert(&II); - ConstantInt *Size - = ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()), + ConstantInt *Size = + ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()), NewEndOffset - NewBeginOffset); Value *Ptr = getNewAllocaSlicePtr(IRB, OldPtr->getType()); Value *New; @@ -2814,7 +3111,6 @@ private: SelectUsers.insert(&SI); return true; } - }; } @@ -2869,8 +3165,7 @@ private: bool visitInstruction(Instruction &I) { return false; } /// \brief Generic recursive split emission class. - template <typename Derived> - class OpSplitter { + template <typename Derived> class OpSplitter { protected: /// The builder used to form new instructions. IRBuilderTy IRB; @@ -2887,7 +3182,7 @@ private: /// Initialize the splitter with an insertion point, Ptr and start with a /// single zero GEP index. OpSplitter(Instruction *InsertionPoint, Value *Ptr) - : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {} + : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {} public: /// \brief Generic recursive split emission routine. @@ -2943,7 +3238,7 @@ private: struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> { LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr) - : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {} + : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {} /// Emit a leaf load of a single value. This is called at the leaves of the /// recursive emission to actually load values. @@ -2974,7 +3269,7 @@ private: struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> { StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr) - : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {} + : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {} /// Emit a leaf store of a single value. This is called at the leaves of the /// recursive emission to actually produce stores. @@ -2982,8 +3277,8 @@ private: assert(Ty->isSingleValueType()); // Extract the single value and store it using the indices. 
Value *Store = IRB.CreateStore( - IRB.CreateExtractValue(Agg, Indices, Name + ".extract"), - IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep")); + IRB.CreateExtractValue(Agg, Indices, Name + ".extract"), + IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep")); (void)Store; DEBUG(dbgs() << " to: " << *Store << "\n"); } @@ -3069,8 +3364,8 @@ static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) { /// when the size or offset cause either end of type-based partition to be off. /// Also, this is a best-effort routine. It is reasonable to give up and not /// return a type if necessary. -static Type *getTypePartition(const DataLayout &DL, Type *Ty, - uint64_t Offset, uint64_t Size) { +static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset, + uint64_t Size) { if (Offset == 0 && DL.getTypeAllocSize(Ty) == Size) return stripAggregateTypeWrapping(DL, Ty); if (Offset > DL.getTypeAllocSize(Ty) || @@ -3162,8 +3457,8 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, } // Try to build up a sub-structure. - StructType *SubTy = StructType::get(STy->getContext(), makeArrayRef(EI, EE), - STy->isPacked()); + StructType *SubTy = + StructType::get(STy->getContext(), makeArrayRef(EI, EE), STy->isPacked()); const StructLayout *SubSL = DL.getStructLayout(SubTy); if (Size != SubSL->getSizeInBytes()) return nullptr; // The sub-struct doesn't have quite the size needed. @@ -3171,6 +3466,494 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, return SubTy; } +/// \brief Pre-split loads and stores to simplify rewriting. +/// +/// We want to break up the splittable load+store pairs as much as +/// possible. This is important to do as a preprocessing step, as once we +/// start rewriting the accesses to partitions of the alloca we lose the +/// necessary information to correctly split apart paired loads and stores +/// which both point into this alloca. The case to consider is something like +/// the following: +/// +/// %a = alloca [12 x i8] +/// %gep1 = getelementptr [12 x i8]* %a, i32 0, i32 0 +/// %gep2 = getelementptr [12 x i8]* %a, i32 0, i32 4 +/// %gep3 = getelementptr [12 x i8]* %a, i32 0, i32 8 +/// %iptr1 = bitcast i8* %gep1 to i64* +/// %iptr2 = bitcast i8* %gep2 to i64* +/// %fptr1 = bitcast i8* %gep1 to float* +/// %fptr2 = bitcast i8* %gep2 to float* +/// %fptr3 = bitcast i8* %gep3 to float* +/// store float 0.0, float* %fptr1 +/// store float 1.0, float* %fptr2 +/// %v = load i64* %iptr1 +/// store i64 %v, i64* %iptr2 +/// %f1 = load float* %fptr2 +/// %f2 = load float* %fptr3 +/// +/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and +/// promote everything so we recover the 2 SSA values that should have been +/// there all along. +/// +/// \returns true if any changes are made. +bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { + DEBUG(dbgs() << "Pre-splitting loads and stores\n"); + + // Track the loads and stores which are candidates for pre-splitting here, in + // the order they first appear during the partition scan. These give stable + // iteration order and a basis for tracking which loads and stores we + // actually split. + SmallVector<LoadInst *, 4> Loads; + SmallVector<StoreInst *, 4> Stores; + + // We need to accumulate the splits required of each load or store where we + // can find them via a direct lookup. This is important to cross-check loads + // and stores against each other. We also track the slice so that we can kill + // all the slices that end up split. 
+ struct SplitOffsets { + Slice *S; + std::vector<uint64_t> Splits; + }; + SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap; + + // Track loads out of this alloca which cannot, for any reason, be pre-split. + // This is important as we also cannot pre-split stores of those loads! + // FIXME: This is all pretty gross. It means that we can be more aggressive + // in pre-splitting when the load feeding the store happens to come from + // a separate alloca. Put another way, the effectiveness of SROA would be + // decreased by a frontend which just concatenated all of its local allocas + // into one big flat alloca. But defeating such patterns is exactly the job + // SROA is tasked with! Sadly, to not have this discrepancy we would have + // change store pre-splitting to actually force pre-splitting of the load + // that feeds it *and all stores*. That makes pre-splitting much harder, but + // maybe it would make it more principled? + SmallPtrSet<LoadInst *, 8> UnsplittableLoads; + + DEBUG(dbgs() << " Searching for candidate loads and stores\n"); + for (auto &P : AS.partitions()) { + for (Slice &S : P) { + Instruction *I = cast<Instruction>(S.getUse()->getUser()); + if (!S.isSplittable() ||S.endOffset() <= P.endOffset()) { + // If this was a load we have to track that it can't participate in any + // pre-splitting! + if (auto *LI = dyn_cast<LoadInst>(I)) + UnsplittableLoads.insert(LI); + continue; + } + assert(P.endOffset() > S.beginOffset() && + "Empty or backwards partition!"); + + // Determine if this is a pre-splittable slice. + if (auto *LI = dyn_cast<LoadInst>(I)) { + assert(!LI->isVolatile() && "Cannot split volatile loads!"); + + // The load must be used exclusively to store into other pointers for + // us to be able to arbitrarily pre-split it. The stores must also be + // simple to avoid changing semantics. + auto IsLoadSimplyStored = [](LoadInst *LI) { + for (User *LU : LI->users()) { + auto *SI = dyn_cast<StoreInst>(LU); + if (!SI || !SI->isSimple()) + return false; + } + return true; + }; + if (!IsLoadSimplyStored(LI)) { + UnsplittableLoads.insert(LI); + continue; + } + + Loads.push_back(LI); + } else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser())) { + if (!SI || + S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex())) + continue; + auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand()); + if (!StoredLoad || !StoredLoad->isSimple()) + continue; + assert(!SI->isVolatile() && "Cannot split volatile stores!"); + + Stores.push_back(SI); + } else { + // Other uses cannot be pre-split. + continue; + } + + // Record the initial split. + DEBUG(dbgs() << " Candidate: " << *I << "\n"); + auto &Offsets = SplitOffsetsMap[I]; + assert(Offsets.Splits.empty() && + "Should not have splits the first time we see an instruction!"); + Offsets.S = &S; + Offsets.Splits.push_back(P.endOffset() - S.beginOffset()); + } + + // Now scan the already split slices, and add a split for any of them which + // we're going to pre-split. 
+ for (Slice *S : P.splitSliceTails()) { + auto SplitOffsetsMapI = + SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser())); + if (SplitOffsetsMapI == SplitOffsetsMap.end()) + continue; + auto &Offsets = SplitOffsetsMapI->second; + + assert(Offsets.S == S && "Found a mismatched slice!"); + assert(!Offsets.Splits.empty() && + "Cannot have an empty set of splits on the second partition!"); + assert(Offsets.Splits.back() == + P.beginOffset() - Offsets.S->beginOffset() && + "Previous split does not end where this one begins!"); + + // Record each split. The last partition's end isn't needed as the size + // of the slice dictates that. + if (S->endOffset() > P.endOffset()) + Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset()); + } + } + + // We may have split loads where some of their stores are split stores. For + // such loads and stores, we can only pre-split them if their splits exactly + // match relative to their starting offset. We have to verify this prior to + // any rewriting. + Stores.erase( + std::remove_if(Stores.begin(), Stores.end(), + [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) { + // Lookup the load we are storing in our map of split + // offsets. + auto *LI = cast<LoadInst>(SI->getValueOperand()); + // If it was completely unsplittable, then we're done, + // and this store can't be pre-split. + if (UnsplittableLoads.count(LI)) + return true; + + auto LoadOffsetsI = SplitOffsetsMap.find(LI); + if (LoadOffsetsI == SplitOffsetsMap.end()) + return false; // Unrelated loads are definitely safe. + auto &LoadOffsets = LoadOffsetsI->second; + + // Now lookup the store's offsets. + auto &StoreOffsets = SplitOffsetsMap[SI]; + + // If the relative offsets of each split in the load and + // store match exactly, then we can split them and we + // don't need to remove them here. + if (LoadOffsets.Splits == StoreOffsets.Splits) + return false; + + DEBUG(dbgs() + << " Mismatched splits for load and store:\n" + << " " << *LI << "\n" + << " " << *SI << "\n"); + + // We've found a store and load that we need to split + // with mismatched relative splits. Just give up on them + // and remove both instructions from our list of + // candidates. + UnsplittableLoads.insert(LI); + return true; + }), + Stores.end()); + // Now we have to go *back* through all te stores, because a later store may + // have caused an earlier store's load to become unsplittable and if it is + // unsplittable for the later store, then we can't rely on it being split in + // the earlier store either. + Stores.erase(std::remove_if(Stores.begin(), Stores.end(), + [&UnsplittableLoads](StoreInst *SI) { + auto *LI = + cast<LoadInst>(SI->getValueOperand()); + return UnsplittableLoads.count(LI); + }), + Stores.end()); + // Once we've established all the loads that can't be split for some reason, + // filter any that made it into our list out. + Loads.erase(std::remove_if(Loads.begin(), Loads.end(), + [&UnsplittableLoads](LoadInst *LI) { + return UnsplittableLoads.count(LI); + }), + Loads.end()); + + + // If no loads or stores are left, there is no pre-splitting to be done for + // this alloca. + if (Loads.empty() && Stores.empty()) + return false; + + // From here on, we can't fail and will be building new accesses, so rig up + // an IR builder. + IRBuilderTy IRB(&AI); + + // Collect the new slices which we will merge into the alloca slices. + SmallVector<Slice, 4> NewSlices; + + // Track any allocas we end up splitting loads and stores for so we iterate + // on them. 
+ SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas; + + // At this point, we have collected all of the loads and stores we can + // pre-split, and the specific splits needed for them. We actually do the + // splitting in a specific order in order to handle when one of the loads in + // the value operand to one of the stores. + // + // First, we rewrite all of the split loads, and just accumulate each split + // load in a parallel structure. We also build the slices for them and append + // them to the alloca slices. + SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap; + std::vector<LoadInst *> SplitLoads; + for (LoadInst *LI : Loads) { + SplitLoads.clear(); + + IntegerType *Ty = cast<IntegerType>(LI->getType()); + uint64_t LoadSize = Ty->getBitWidth() / 8; + assert(LoadSize > 0 && "Cannot have a zero-sized integer load!"); + + auto &Offsets = SplitOffsetsMap[LI]; + assert(LoadSize == Offsets.S->endOffset() - Offsets.S->beginOffset() && + "Slice size should always match load size exactly!"); + uint64_t BaseOffset = Offsets.S->beginOffset(); + assert(BaseOffset + LoadSize > BaseOffset && + "Cannot represent alloca access size using 64-bit integers!"); + + Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand()); + IRB.SetInsertPoint(BasicBlock::iterator(LI)); + + DEBUG(dbgs() << " Splitting load: " << *LI << "\n"); + + uint64_t PartOffset = 0, PartSize = Offsets.Splits.front(); + int Idx = 0, Size = Offsets.Splits.size(); + for (;;) { + auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8); + auto *PartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace()); + LoadInst *PLoad = IRB.CreateAlignedLoad( + getAdjustedPtr(IRB, *DL, BasePtr, + APInt(DL->getPointerSizeInBits(), PartOffset), + PartPtrTy, BasePtr->getName() + "."), + getAdjustedAlignment(LI, PartOffset, *DL), /*IsVolatile*/ false, + LI->getName()); + + // Append this load onto the list of split loads so we can find it later + // to rewrite the stores. + SplitLoads.push_back(PLoad); + + // Now build a new slice for the alloca. + NewSlices.push_back( + Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize, + &PLoad->getOperandUse(PLoad->getPointerOperandIndex()), + /*IsSplittable*/ false)); + DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset() + << ", " << NewSlices.back().endOffset() << "): " << *PLoad + << "\n"); + + // See if we've handled all the splits. + if (Idx >= Size) + break; + + // Setup the next partition. + PartOffset = Offsets.Splits[Idx]; + ++Idx; + PartSize = (Idx < Size ? Offsets.Splits[Idx] : LoadSize) - PartOffset; + } + + // Now that we have the split loads, do the slow walk over all uses of the + // load and rewrite them as split stores, or save the split loads to use + // below if the store is going to be split there anyways. + bool DeferredStores = false; + for (User *LU : LI->users()) { + StoreInst *SI = cast<StoreInst>(LU); + if (!Stores.empty() && SplitOffsetsMap.count(SI)) { + DeferredStores = true; + DEBUG(dbgs() << " Deferred splitting of store: " << *SI << "\n"); + continue; + } + + Value *StoreBasePtr = SI->getPointerOperand(); + IRB.SetInsertPoint(BasicBlock::iterator(SI)); + + DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n"); + + for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) { + LoadInst *PLoad = SplitLoads[Idx]; + uint64_t PartOffset = Idx == 0 ? 
0 : Offsets.Splits[Idx - 1]; + auto *PartPtrTy = + PLoad->getType()->getPointerTo(SI->getPointerAddressSpace()); + + StoreInst *PStore = IRB.CreateAlignedStore( + PLoad, getAdjustedPtr(IRB, *DL, StoreBasePtr, + APInt(DL->getPointerSizeInBits(), PartOffset), + PartPtrTy, StoreBasePtr->getName() + "."), + getAdjustedAlignment(SI, PartOffset, *DL), /*IsVolatile*/ false); + (void)PStore; + DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n"); + } + + // We want to immediately iterate on any allocas impacted by splitting + // this store, and we have to track any promotable alloca (indicated by + // a direct store) as needing to be resplit because it is no longer + // promotable. + if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) { + ResplitPromotableAllocas.insert(OtherAI); + Worklist.insert(OtherAI); + } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>( + StoreBasePtr->stripInBoundsOffsets())) { + Worklist.insert(OtherAI); + } + + // Mark the original store as dead. + DeadInsts.insert(SI); + } + + // Save the split loads if there are deferred stores among the users. + if (DeferredStores) + SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads))); + + // Mark the original load as dead and kill the original slice. + DeadInsts.insert(LI); + Offsets.S->kill(); + } + + // Second, we rewrite all of the split stores. At this point, we know that + // all loads from this alloca have been split already. For stores of such + // loads, we can simply look up the pre-existing split loads. For stores of + // other loads, we split those loads first and then write split stores of + // them. + for (StoreInst *SI : Stores) { + auto *LI = cast<LoadInst>(SI->getValueOperand()); + IntegerType *Ty = cast<IntegerType>(LI->getType()); + uint64_t StoreSize = Ty->getBitWidth() / 8; + assert(StoreSize > 0 && "Cannot have a zero-sized integer store!"); + + auto &Offsets = SplitOffsetsMap[SI]; + assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() && + "Slice size should always match load size exactly!"); + uint64_t BaseOffset = Offsets.S->beginOffset(); + assert(BaseOffset + StoreSize > BaseOffset && + "Cannot represent alloca access size using 64-bit integers!"); + + Value *LoadBasePtr = LI->getPointerOperand(); + Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand()); + + DEBUG(dbgs() << " Splitting store: " << *SI << "\n"); + + // Check whether we have an already split load. + auto SplitLoadsMapI = SplitLoadsMap.find(LI); + std::vector<LoadInst *> *SplitLoads = nullptr; + if (SplitLoadsMapI != SplitLoadsMap.end()) { + SplitLoads = &SplitLoadsMapI->second; + assert(SplitLoads->size() == Offsets.Splits.size() + 1 && + "Too few split loads for the number of splits in the store!"); + } else { + DEBUG(dbgs() << " of load: " << *LI << "\n"); + } + + uint64_t PartOffset = 0, PartSize = Offsets.Splits.front(); + int Idx = 0, Size = Offsets.Splits.size(); + for (;;) { + auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8); + auto *PartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace()); + + // Either lookup a split load or create one. + LoadInst *PLoad; + if (SplitLoads) { + PLoad = (*SplitLoads)[Idx]; + } else { + IRB.SetInsertPoint(BasicBlock::iterator(LI)); + PLoad = IRB.CreateAlignedLoad( + getAdjustedPtr(IRB, *DL, LoadBasePtr, + APInt(DL->getPointerSizeInBits(), PartOffset), + PartPtrTy, LoadBasePtr->getName() + "."), + getAdjustedAlignment(LI, PartOffset, *DL), /*IsVolatile*/ false, + LI->getName()); + } + + // And store this partition. 
+ IRB.SetInsertPoint(BasicBlock::iterator(SI)); + StoreInst *PStore = IRB.CreateAlignedStore( + PLoad, getAdjustedPtr(IRB, *DL, StoreBasePtr, + APInt(DL->getPointerSizeInBits(), PartOffset), + PartPtrTy, StoreBasePtr->getName() + "."), + getAdjustedAlignment(SI, PartOffset, *DL), /*IsVolatile*/ false); + + // Now build a new slice for the alloca. + NewSlices.push_back( + Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize, + &PStore->getOperandUse(PStore->getPointerOperandIndex()), + /*IsSplittable*/ false)); + DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset() + << ", " << NewSlices.back().endOffset() << "): " << *PStore + << "\n"); + if (!SplitLoads) { + DEBUG(dbgs() << " of split load: " << *PLoad << "\n"); + } + + // See if we've finished all the splits. + if (Idx >= Size) + break; + + // Setup the next partition. + PartOffset = Offsets.Splits[Idx]; + ++Idx; + PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset; + } + + // We want to immediately iterate on any allocas impacted by splitting + // this load, which is only relevant if it isn't a load of this alloca and + // thus we didn't already split the loads above. We also have to keep track + // of any promotable allocas we split loads on as they can no longer be + // promoted. + if (!SplitLoads) { + if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) { + assert(OtherAI != &AI && "We can't re-split our own alloca!"); + ResplitPromotableAllocas.insert(OtherAI); + Worklist.insert(OtherAI); + } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>( + LoadBasePtr->stripInBoundsOffsets())) { + assert(OtherAI != &AI && "We can't re-split our own alloca!"); + Worklist.insert(OtherAI); + } + } + + // Mark the original store as dead now that we've split it up and kill its + // slice. Note that we leave the original load in place unless this store + // was its ownly use. It may in turn be split up if it is an alloca load + // for some other alloca, but it may be a normal load. This may introduce + // redundant loads, but where those can be merged the rest of the optimizer + // should handle the merging, and this uncovers SSA splits which is more + // important. In practice, the original loads will almost always be fully + // split and removed eventually, and the splits will be merged by any + // trivial CSE, including instcombine. + if (LI->hasOneUse()) { + assert(*LI->user_begin() == SI && "Single use isn't this store!"); + DeadInsts.insert(LI); + } + DeadInsts.insert(SI); + Offsets.S->kill(); + } + + // Remove the killed slices that have ben pre-split. + AS.erase(std::remove_if(AS.begin(), AS.end(), [](const Slice &S) { + return S.isDead(); + }), AS.end()); + + // Insert our new slices. This will sort and merge them into the sorted + // sequence. + AS.insert(NewSlices); + + DEBUG(dbgs() << " Pre-split slices:\n"); +#ifndef NDEBUG + for (auto I = AS.begin(), E = AS.end(); I != E; ++I) + DEBUG(AS.print(dbgs(), I, " ")); +#endif + + // Finally, don't try to promote any allocas that new require re-splitting. + // They have already been added to the worklist above. + PromotableAllocas.erase( + std::remove_if( + PromotableAllocas.begin(), PromotableAllocas.end(), + [&](AllocaInst *AI) { return ResplitPromotableAllocas.count(AI); }), + PromotableAllocas.end()); + + return true; +} + /// \brief Rewrite an alloca partition's users. /// /// This routine drives both of the rewriting goals of the SROA pass. 
It tries @@ -3181,40 +3964,31 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, /// appropriate new offsets. It also evaluates how successful the rewrite was /// at enabling promotion and if it was successful queues the alloca to be /// promoted. -bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, - AllocaSlices::iterator B, AllocaSlices::iterator E, - int64_t BeginOffset, int64_t EndOffset, - ArrayRef<AllocaSlices::iterator> SplitUses) { - assert(BeginOffset < EndOffset); - uint64_t SliceSize = EndOffset - BeginOffset; - +AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, + AllocaSlices::Partition &P) { // Try to compute a friendly type for this partition of the alloca. This // won't always succeed, in which case we fall back to a legal integer type // or an i8 array of an appropriate size. Type *SliceTy = nullptr; - if (Type *CommonUseTy = findCommonType(B, E, EndOffset)) - if (DL->getTypeAllocSize(CommonUseTy) >= SliceSize) + if (Type *CommonUseTy = findCommonType(P.begin(), P.end(), P.endOffset())) + if (DL->getTypeAllocSize(CommonUseTy) >= P.size()) SliceTy = CommonUseTy; if (!SliceTy) if (Type *TypePartitionTy = getTypePartition(*DL, AI.getAllocatedType(), - BeginOffset, SliceSize)) + P.beginOffset(), P.size())) SliceTy = TypePartitionTy; if ((!SliceTy || (SliceTy->isArrayTy() && SliceTy->getArrayElementType()->isIntegerTy())) && - DL->isLegalInteger(SliceSize * 8)) - SliceTy = Type::getIntNTy(*C, SliceSize * 8); + DL->isLegalInteger(P.size() * 8)) + SliceTy = Type::getIntNTy(*C, P.size() * 8); if (!SliceTy) - SliceTy = ArrayType::get(Type::getInt8Ty(*C), SliceSize); - assert(DL->getTypeAllocSize(SliceTy) >= SliceSize); + SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size()); + assert(DL->getTypeAllocSize(SliceTy) >= P.size()); - bool IsIntegerPromotable = isIntegerWideningViable( - *DL, SliceTy, BeginOffset, AllocaSlices::const_range(B, E), SplitUses); + bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, *DL); VectorType *VecTy = - IsIntegerPromotable - ? nullptr - : isVectorPromotionViable(*DL, SliceTy, BeginOffset, EndOffset, - AllocaSlices::const_range(B, E), SplitUses); + IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, *DL); if (VecTy) SliceTy = VecTy; @@ -3224,11 +3998,12 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, // perform phi and select speculation. AllocaInst *NewAI; if (SliceTy == AI.getAllocatedType()) { - assert(BeginOffset == 0 && + assert(P.beginOffset() == 0 && "Non-zero begin offset but same alloca type"); NewAI = &AI; // FIXME: We should be able to bail at this point with "nothing changed". // FIXME: We might want to defer PHI speculation until after here. + // FIXME: return nullptr; } else { unsigned Alignment = AI.getAlignment(); if (!Alignment) { @@ -3237,20 +4012,20 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, // type. Alignment = DL->getABITypeAlignment(AI.getAllocatedType()); } - Alignment = MinAlign(Alignment, BeginOffset); + Alignment = MinAlign(Alignment, P.beginOffset()); // If we will get at least this much alignment from the type alone, leave // the alloca's alignment unconstrained. if (Alignment <= DL->getABITypeAlignment(SliceTy)) Alignment = 0; - NewAI = - new AllocaInst(SliceTy, nullptr, Alignment, - AI.getName() + ".sroa." + Twine(B - AS.begin()), &AI); + NewAI = new AllocaInst( + SliceTy, nullptr, Alignment, + AI.getName() + ".sroa." 
+ Twine(P.begin() - AS.begin()), &AI); ++NumNewAllocas; } DEBUG(dbgs() << "Rewriting alloca partition " - << "[" << BeginOffset << "," << EndOffset << ") to: " << *NewAI - << "\n"); + << "[" << P.beginOffset() << "," << P.endOffset() + << ") to: " << *NewAI << "\n"); // Track the high watermark on the worklist as it is only relevant for // promoted allocas. We will reset it to this point if the alloca is not in @@ -3260,20 +4035,16 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, SmallPtrSet<PHINode *, 8> PHIUsers; SmallPtrSet<SelectInst *, 8> SelectUsers; - AllocaSliceRewriter Rewriter(*DL, AS, *this, AI, *NewAI, BeginOffset, - EndOffset, IsIntegerPromotable, VecTy, PHIUsers, - SelectUsers); + AllocaSliceRewriter Rewriter(*DL, AS, *this, AI, *NewAI, P.beginOffset(), + P.endOffset(), IsIntegerPromotable, VecTy, + PHIUsers, SelectUsers); bool Promotable = true; - for (auto & SplitUse : SplitUses) { - DEBUG(dbgs() << " rewriting split "); - DEBUG(AS.printSlice(dbgs(), SplitUse, "")); - Promotable &= Rewriter.visit(SplitUse); + for (Slice *S : P.splitSliceTails()) { + Promotable &= Rewriter.visit(S); ++NumUses; } - for (AllocaSlices::iterator I = B; I != E; ++I) { - DEBUG(dbgs() << " rewriting "); - DEBUG(AS.printSlice(dbgs(), I, "")); - Promotable &= Rewriter.visit(I); + for (Slice &S : P) { + Promotable &= Rewriter.visit(&S); ++NumUses; } @@ -3328,32 +4099,7 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, PostPromotionWorklist.pop_back(); } - return true; -} - -static void -removeFinishedSplitUses(SmallVectorImpl<AllocaSlices::iterator> &SplitUses, - uint64_t &MaxSplitUseEndOffset, uint64_t Offset) { - if (Offset >= MaxSplitUseEndOffset) { - SplitUses.clear(); - MaxSplitUseEndOffset = 0; - return; - } - - size_t SplitUsesOldSize = SplitUses.size(); - SplitUses.erase(std::remove_if(SplitUses.begin(), SplitUses.end(), - [Offset](const AllocaSlices::iterator &I) { - return I->endOffset() <= Offset; - }), - SplitUses.end()); - if (SplitUsesOldSize == SplitUses.size()) - return; - - // Recompute the max. While this is linear, so is remove_if. - MaxSplitUseEndOffset = 0; - for (AllocaSlices::iterator SplitUse : SplitUses) - MaxSplitUseEndOffset = - std::max(SplitUse->endOffset(), MaxSplitUseEndOffset); + return NewAI; } /// \brief Walks the slices of an alloca and form partitions based on them, @@ -3364,108 +4110,100 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { unsigned NumPartitions = 0; bool Changed = false; - SmallVector<AllocaSlices::iterator, 4> SplitUses; - uint64_t MaxSplitUseEndOffset = 0; - - uint64_t BeginOffset = AS.begin()->beginOffset(); - - for (AllocaSlices::iterator SI = AS.begin(), SJ = std::next(SI), - SE = AS.end(); - SI != SE; SI = SJ) { - uint64_t MaxEndOffset = SI->endOffset(); - - if (!SI->isSplittable()) { - // When we're forming an unsplittable region, it must always start at the - // first slice and will extend through its end. - assert(BeginOffset == SI->beginOffset()); - - // Form a partition including all of the overlapping slices with this - // unsplittable slice. - while (SJ != SE && SJ->beginOffset() < MaxEndOffset) { - if (!SJ->isSplittable()) - MaxEndOffset = std::max(MaxEndOffset, SJ->endOffset()); - ++SJ; - } - } else { - assert(SI->isSplittable()); // Established above. - - // Collect all of the overlapping splittable slices. 
- while (SJ != SE && SJ->beginOffset() < MaxEndOffset && - SJ->isSplittable()) { - MaxEndOffset = std::max(MaxEndOffset, SJ->endOffset()); - ++SJ; - } - - // Back up MaxEndOffset and SJ if we ended the span early when - // encountering an unsplittable slice. - if (SJ != SE && SJ->beginOffset() < MaxEndOffset) { - assert(!SJ->isSplittable()); - MaxEndOffset = SJ->beginOffset(); - } - } - - // Check if we have managed to move the end offset forward yet. If so, - // we'll have to rewrite uses and erase old split uses. - if (BeginOffset < MaxEndOffset) { - // Rewrite a sequence of overlapping slices. - Changed |= rewritePartition(AI, AS, SI, SJ, BeginOffset, MaxEndOffset, - SplitUses); - ++NumPartitions; - - removeFinishedSplitUses(SplitUses, MaxSplitUseEndOffset, MaxEndOffset); - } - // Accumulate all the splittable slices from the [SI,SJ) region which - // overlap going forward. - for (AllocaSlices::iterator SK = SI; SK != SJ; ++SK) - if (SK->isSplittable() && SK->endOffset() > MaxEndOffset) { - SplitUses.push_back(SK); - MaxSplitUseEndOffset = std::max(SK->endOffset(), MaxSplitUseEndOffset); - } - - // If we're already at the end and we have no split uses, we're done. - if (SJ == SE && SplitUses.empty()) - break; + // First try to pre-split loads and stores. + Changed |= presplitLoadsAndStores(AI, AS); - // If we have no split uses or no gap in offsets, we're ready to move to - // the next slice. - if (SplitUses.empty() || (SJ != SE && MaxEndOffset == SJ->beginOffset())) { - BeginOffset = SJ->beginOffset(); + // Now that we have identified any pre-splitting opportunities, mark any + // splittable (non-whole-alloca) loads and stores as unsplittable. If we fail + // to split these during pre-splitting, we want to force them to be + // rewritten into a partition. + bool IsSorted = true; + for (Slice &S : AS) { + if (!S.isSplittable()) continue; - } - - // Even if we have split slices, if the next slice is splittable and the - // split slices reach it, we can simply set up the beginning offset of the - // next iteration to bridge between them. - if (SJ != SE && SJ->isSplittable() && - MaxSplitUseEndOffset > SJ->beginOffset()) { - BeginOffset = MaxEndOffset; + // FIXME: We currently leave whole-alloca splittable loads and stores. This + // used to be the only splittable loads and stores and we need to be + // confident that the above handling of splittable loads and stores is + // completely sufficient before we forcibly disable the remaining handling. + if (S.beginOffset() == 0 && + S.endOffset() >= DL->getTypeAllocSize(AI.getAllocatedType())) continue; + if (isa<LoadInst>(S.getUse()->getUser()) || + isa<StoreInst>(S.getUse()->getUser())) { + S.makeUnsplittable(); + IsSorted = false; + } + } + if (!IsSorted) + std::sort(AS.begin(), AS.end()); + + /// \brief Describes the allocas introduced by rewritePartition + /// in order to migrate the debug info. + struct Piece { + AllocaInst *Alloca; + uint64_t Offset; + uint64_t Size; + Piece(AllocaInst *AI, uint64_t O, uint64_t S) + : Alloca(AI), Offset(O), Size(S) {} + }; + SmallVector<Piece, 4> Pieces; + + // Rewrite each partition. + for (auto &P : AS.partitions()) { + if (AllocaInst *NewAI = rewritePartition(AI, AS, P)) { + Changed = true; + if (NewAI != &AI) { + uint64_t SizeOfByte = 8; + uint64_t AllocaSize = DL->getTypeSizeInBits(NewAI->getAllocatedType()); + // Don't include any padding. 
+ uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte); + Pieces.push_back(Piece(NewAI, P.beginOffset() * SizeOfByte, Size)); + } } - - // Otherwise, we have a tail of split slices. Rewrite them with an empty - // range of slices. - uint64_t PostSplitEndOffset = - SJ == SE ? MaxSplitUseEndOffset : SJ->beginOffset(); - - Changed |= rewritePartition(AI, AS, SJ, SJ, MaxEndOffset, - PostSplitEndOffset, SplitUses); ++NumPartitions; - - if (SJ == SE) - break; // Skip the rest, we don't need to do any cleanup. - - removeFinishedSplitUses(SplitUses, MaxSplitUseEndOffset, - PostSplitEndOffset); - - // Now just reset the begin offset for the next iteration. - BeginOffset = SJ->beginOffset(); } NumAllocaPartitions += NumPartitions; MaxPartitionsPerAlloca = std::max<unsigned>(NumPartitions, MaxPartitionsPerAlloca); + // Migrate debug information from the old alloca to the new alloca(s) + // and the individial partitions. + if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(&AI)) { + DIVariable Var(DbgDecl->getVariable()); + DIExpression Expr(DbgDecl->getExpression()); + DIBuilder DIB(*AI.getParent()->getParent()->getParent(), + /*AllowUnresolved*/ false); + bool IsSplit = Pieces.size() > 1; + for (auto Piece : Pieces) { + // Create a piece expression describing the new partition or reuse AI's + // expression if there is only one partition. + DIExpression PieceExpr = Expr; + if (IsSplit || Expr.isBitPiece()) { + // If this alloca is already a scalar replacement of a larger aggregate, + // Piece.Offset describes the offset inside the scalar. + uint64_t Offset = Expr.isBitPiece() ? Expr.getBitPieceOffset() : 0; + uint64_t Start = Offset + Piece.Offset; + uint64_t Size = Piece.Size; + if (Expr.isBitPiece()) { + uint64_t AbsEnd = Expr.getBitPieceOffset() + Expr.getBitPieceSize(); + if (Start >= AbsEnd) + // No need to describe a SROAed padding. + continue; + Size = std::min(Size, AbsEnd - Start); + } + PieceExpr = DIB.createBitPieceExpression(Start, Size); + } + + // Remove any existing dbg.declare intrinsic describing the same alloca. + if (DbgDeclareInst *OldDDI = FindAllocaDbgDeclare(Piece.Alloca)) + OldDDI->eraseFromParent(); + + auto *NewDDI = DIB.insertDeclare(Piece.Alloca, Var, PieceExpr, &AI); + NewDDI->setDebugLoc(DbgDecl->getDebugLoc()); + } + } return Changed; } @@ -3561,7 +4299,8 @@ bool SROA::runOnAlloca(AllocaInst &AI) { /// /// We also record the alloca instructions deleted here so that they aren't /// subsequently handed to mem2reg to promote. 
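The bit-piece bookkeeping in the debug-info migration above works as follows: when the original dbg.declare already describes a piece of a larger variable, the new partition's offset is added to that piece's start, partitions that fall entirely in padding are dropped, and the size is clamped so the new piece never extends past the old one. Here is a rough sketch of just that clamping in plain C++, with illustrative names; the pass itself feeds the resulting start/size to DIBuilder::createBitPieceExpression.

#include <algorithm>
#include <cstdint>

// Mirrors the offset/size clamping above. Returns false when the partition
// only covers padding of the original bit piece (nothing to describe);
// otherwise fills in the start and size of the new piece, in bits.
static bool migratePiece(bool WasBitPiece, uint64_t OldStartBits,
                         uint64_t OldSizeBits, uint64_t PartitionOffsetBits,
                         uint64_t PartitionSizeBits, uint64_t &NewStartBits,
                         uint64_t &NewSizeBits) {
  uint64_t Offset = WasBitPiece ? OldStartBits : 0;
  NewStartBits = Offset + PartitionOffsetBits;
  NewSizeBits = PartitionSizeBits;
  if (WasBitPiece) {
    uint64_t AbsEnd = OldStartBits + OldSizeBits;
    if (NewStartBits >= AbsEnd)
      return false; // the partition is SROAed padding of the old piece
    NewSizeBits = std::min(NewSizeBits, AbsEnd - NewStartBits);
  }
  return true;
}

// For example, an old piece covering bits [32, 96) of the source variable and
// a new partition at bit offset 16 of size 32 yields a piece [48, 80); a
// partition at offset 80 is skipped entirely because 32 + 80 >= 96.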
-void SROA::deleteDeadInstructions(SmallPtrSetImpl<AllocaInst*> &DeletedAllocas) { +void SROA::deleteDeadInstructions( + SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) { while (!DeadInsts.empty()) { Instruction *I = DeadInsts.pop_back_val(); DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n"); @@ -3576,8 +4315,11 @@ void SROA::deleteDeadInstructions(SmallPtrSetImpl<AllocaInst*> &DeletedAllocas) DeadInsts.insert(U); } - if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) + if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) { DeletedAllocas.insert(AI); + if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(AI)) + DbgDecl->eraseFromParent(); + } ++NumDeleted; I->eraseFromParent(); @@ -3608,14 +4350,14 @@ bool SROA::promoteAllocas(Function &F) { if (DT && !ForceSSAUpdater) { DEBUG(dbgs() << "Promoting allocas with mem2reg...\n"); - PromoteMemToReg(PromotableAllocas, *DT, nullptr, AT); + PromoteMemToReg(PromotableAllocas, *DT, nullptr, AC); PromotableAllocas.clear(); return true; } DEBUG(dbgs() << "Promoting allocas with SSAUpdater...\n"); SSAUpdater SSA; - DIBuilder DIB(*F.getParent()); + DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false); SmallVector<Instruction *, 64> Insts; // We need a worklist to walk the uses of each alloca. @@ -3690,13 +4432,14 @@ bool SROA::runOnFunction(Function &F) { DominatorTreeWrapperPass *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); DT = DTWP ? &DTWP->getDomTree() : nullptr; - AT = &getAnalysis<AssumptionTracker>(); + AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); BasicBlock &EntryBB = F.getEntryBlock(); for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end()); - I != E; ++I) + I != E; ++I) { if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) Worklist.insert(AI); + } bool Changed = false; // A set of deleted alloca instruction pointers which should be removed from @@ -3711,9 +4454,7 @@ bool SROA::runOnFunction(Function &F) { // Remove the deleted allocas from various lists so that we don't try to // continue processing them. 
if (!DeletedAllocas.empty()) { - auto IsInSet = [&](AllocaInst *AI) { - return DeletedAllocas.count(AI); - }; + auto IsInSet = [&](AllocaInst *AI) { return DeletedAllocas.count(AI); }; Worklist.remove_if(IsInSet); PostPromotionWorklist.remove_if(IsInSet); PromotableAllocas.erase(std::remove_if(PromotableAllocas.begin(), @@ -3734,7 +4475,7 @@ bool SROA::runOnFunction(Function &F) { } void SROA::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<AssumptionTracker>(); + AU.addRequired<AssumptionCacheTracker>(); if (RequiresDomTree) AU.addRequired<DominatorTreeWrapperPass>(); AU.setPreservesCFG(); diff --git a/lib/Transforms/Scalar/SampleProfile.cpp b/lib/Transforms/Scalar/SampleProfile.cpp index 179bbf7..c7232a9 100644 --- a/lib/Transforms/Scalar/SampleProfile.cpp +++ b/lib/Transforms/Scalar/SampleProfile.cpp @@ -95,7 +95,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<LoopInfo>(); + AU.addRequired<LoopInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<PostDominatorTree>(); } @@ -731,7 +731,7 @@ INITIALIZE_PASS_BEGIN(SampleProfileLoader, "sample-profile", "Sample Profile loader", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(PostDominatorTree) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(AddDiscriminators) INITIALIZE_PASS_END(SampleProfileLoader, "sample-profile", "Sample Profile loader", false, false) @@ -762,7 +762,7 @@ bool SampleProfileLoader::runOnFunction(Function &F) { DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); PDT = &getAnalysis<PostDominatorTree>(); - LI = &getAnalysis<LoopInfo>(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); Ctx = &F.getParent()->getContext(); Samples = Reader->getSamplesFor(F); if (!Samples->empty()) diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index a16e9e2..621633b 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -20,7 +20,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/Verifier.h" #include "llvm/InitializePasses.h" -#include "llvm/PassManager.h" +#include "llvm/IR/LegacyPassManager.h" using namespace llvm; @@ -28,6 +28,7 @@ using namespace llvm; /// ScalarOpts library. 
void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeADCEPass(Registry); + initializeBDCEPass(Registry); initializeAlignmentFromAssumptionsPass(Registry); initializeSampleProfileLoaderPass(Registry); initializeConstantHoistingPass(Registry); @@ -38,12 +39,14 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeScalarizerPass(Registry); initializeDSEPass(Registry); initializeGVNPass(Registry); - initializeEarlyCSEPass(Registry); + initializeEarlyCSELegacyPassPass(Registry); initializeFlattenCFGPassPass(Registry); + initializeInductiveRangeCheckEliminationPass(Registry); initializeIndVarSimplifyPass(Registry); initializeJumpThreadingPass(Registry); initializeLICMPass(Registry); initializeLoopDeletionPass(Registry); + initializeLoopAccessAnalysisPass(Registry); initializeLoopInstSimplifyPass(Registry); initializeLoopRotatePass(Registry); initializeLoopStrengthReducePass(Registry); @@ -58,6 +61,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializePartiallyInlineLibCallsPass(Registry); initializeReassociatePass(Registry); initializeRegToMemPass(Registry); + initializeRewriteStatepointsForGCPass(Registry); initializeSCCPPass(Registry); initializeIPSCCPPass(Registry); initializeSROAPass(Registry); @@ -68,7 +72,10 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeSinkingPass(Registry); initializeTailCallElimPass(Registry); initializeSeparateConstOffsetFromGEPPass(Registry); + initializeStraightLineStrengthReducePass(Registry); initializeLoadCombinePass(Registry); + initializePlaceBackedgeSafepointsImplPass(Registry); + initializePlaceSafepointsPass(Registry); } void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) { @@ -79,6 +86,10 @@ void LLVMAddAggressiveDCEPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createAggressiveDCEPass()); } +void LLVMAddBitTrackingDCEPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createBitTrackingDCEPass()); +} + void LLVMAddAlignmentFromAssumptionsPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createAlignmentFromAssumptionsPass()); } diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp index f7fa917..5c49a55 100644 --- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -23,7 +23,7 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AssumptionTracker.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CallSite.h" @@ -198,7 +198,7 @@ namespace { // getAnalysisUsage - This pass does not require any passes, but we know it // will not alter the CFG, so say so. void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionTracker>(); + AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.setPreservesCFG(); } @@ -216,7 +216,7 @@ namespace { // getAnalysisUsage - This pass does not require any passes, but we know it // will not alter the CFG, so say so. 
void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionTracker>(); + AU.addRequired<AssumptionCacheTracker>(); AU.setPreservesCFG(); } }; @@ -228,14 +228,14 @@ char SROA_SSAUp::ID = 0; INITIALIZE_PASS_BEGIN(SROA_DT, "scalarrepl", "Scalar Replacement of Aggregates (DT)", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionTracker) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(SROA_DT, "scalarrepl", "Scalar Replacement of Aggregates (DT)", false, false) INITIALIZE_PASS_BEGIN(SROA_SSAUp, "scalarrepl-ssa", "Scalar Replacement of Aggregates (SSAUp)", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionTracker) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_END(SROA_SSAUp, "scalarrepl-ssa", "Scalar Replacement of Aggregates (SSAUp)", false, false) @@ -1068,12 +1068,14 @@ public: void run(AllocaInst *AI, const SmallVectorImpl<Instruction*> &Insts) { // Remember which alloca we're promoting (for isInstInList). this->AI = AI; - if (MDNode *DebugNode = MDNode::getIfExists(AI->getContext(), AI)) { - for (User *U : DebugNode->users()) - if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U)) - DDIs.push_back(DDI); - else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U)) - DVIs.push_back(DVI); + if (auto *L = LocalAsMetadata::getIfExists(AI)) { + if (auto *DebugNode = MetadataAsValue::getIfExists(AI->getContext(), L)) { + for (User *U : DebugNode->users()) + if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U)) + DDIs.push_back(DDI); + else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U)) + DVIs.push_back(DVI); + } } LoadAndStorePromoter::run(Insts); @@ -1417,10 +1419,11 @@ bool SROA::performPromotion(Function &F) { DominatorTree *DT = nullptr; if (HasDomTree) DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - AssumptionTracker *AT = &getAnalysis<AssumptionTracker>(); + AssumptionCache &AC = + getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); BasicBlock &BB = F.getEntryBlock(); // Get the entry node for the function - DIBuilder DIB(*F.getParent()); + DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false); bool Changed = false; SmallVector<Instruction*, 64> Insts; while (1) { @@ -1436,7 +1439,7 @@ bool SROA::performPromotion(Function &F) { if (Allocas.empty()) break; if (HasDomTree) - PromoteMemToReg(Allocas, *DT, nullptr, AT); + PromoteMemToReg(Allocas, *DT, nullptr, &AC); else { SSAUpdater SSA; for (unsigned i = 0, e = Allocas.size(); i != e; ++i) { diff --git a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 6157746..bffe8df 100644 --- a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -313,7 +313,8 @@ class SeparateConstOffsetFromGEP : public FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<DataLayoutPass>(); - AU.addRequired<TargetTransformInfo>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.setPreservesCFG(); } bool doInitialization(Module &M) override { @@ -384,7 +385,7 @@ INITIALIZE_PASS_BEGIN( SeparateConstOffsetFromGEP, "separate-const-offset-from-gep", "Split GEPs to a variadic base and a constant offset for better CSE", false, false) -INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DataLayoutPass) INITIALIZE_PASS_END( SeparateConstOffsetFromGEP, 
"separate-const-offset-from-gep", @@ -857,7 +858,9 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { // of variable indices. Therefore, we don't check for addressing modes in that // case. if (!LowerGEP) { - TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>(); + TargetTransformInfo &TTI = + getAnalysis<TargetTransformInfoWrapperPass>().getTTI( + *GEP->getParent()->getParent()); if (!TTI.isLegalAddressingMode(GEP->getType()->getElementType(), /*BaseGV=*/nullptr, AccumulativeByteOffset, /*HasBaseReg=*/true, /*Scale=*/0)) { @@ -910,7 +913,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { if (LowerGEP) { // As currently BasicAA does not analyze ptrtoint/inttoptr, do not lower to // arithmetic operations if the target uses alias analysis in codegen. - if (TM && TM->getSubtarget<TargetSubtargetInfo>().useAA()) + if (TM && TM->getSubtargetImpl(*GEP->getParent()->getParent())->useAA()) lowerToSingleIndexGEPs(GEP, AccumulativeByteOffset); else lowerToArithmetics(GEP, AccumulativeByteOffset); @@ -996,6 +999,9 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { } bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) { + if (skipOptnoneFunction(F)) + return false; + if (DisableSeparateConstOffsetFromGEP) return false; diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp index 046a7cb..fb8fe38 100644 --- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -21,11 +21,11 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/SimplifyCFG.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AssumptionTracker.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/CFG.h" @@ -37,6 +37,7 @@ #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Scalar.h" using namespace llvm; #define DEBUG_TYPE "simplifycfg" @@ -47,36 +48,6 @@ UserBonusInstThreshold("bonus-inst-threshold", cl::Hidden, cl::init(1), STATISTIC(NumSimpl, "Number of blocks simplified"); -namespace { -struct CFGSimplifyPass : public FunctionPass { - static char ID; // Pass identification, replacement for typeid - unsigned BonusInstThreshold; - CFGSimplifyPass(int T = -1) : FunctionPass(ID) { - BonusInstThreshold = (T == -1) ? UserBonusInstThreshold : unsigned(T); - initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry()); - } - bool runOnFunction(Function &F) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionTracker>(); - AU.addRequired<TargetTransformInfo>(); - } -}; -} - -char CFGSimplifyPass::ID = 0; -INITIALIZE_PASS_BEGIN(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false, - false) -INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) -INITIALIZE_PASS_DEPENDENCY(AssumptionTracker) -INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false, - false) - -// Public interface to the CFGSimplification pass -FunctionPass *llvm::createCFGSimplificationPass(int Threshold) { - return new CFGSimplifyPass(Threshold); -} - /// mergeEmptyReturnBlocks - If we have more than one empty (other than phi /// node) return blocks, merge them together to promote recursive block merging. 
static bool mergeEmptyReturnBlocks(Function &F) { @@ -156,8 +127,7 @@ static bool mergeEmptyReturnBlocks(Function &F) { /// iterativelySimplifyCFG - Call SimplifyCFG on all the blocks in the function, /// iterating until no more changes are made. static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, - const DataLayout *DL, - AssumptionTracker *AT, + const DataLayout *DL, AssumptionCache *AC, unsigned BonusInstThreshold) { bool Changed = false; bool LocalChange = true; @@ -167,7 +137,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, // Loop over all of the basic blocks and remove them if they are unneeded... // for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) { - if (SimplifyCFG(BBIt++, TTI, BonusInstThreshold, DL, AT)) { + if (SimplifyCFG(BBIt++, TTI, BonusInstThreshold, DL, AC)) { LocalChange = true; ++NumSimpl; } @@ -177,20 +147,12 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, return Changed; } -// It is possible that we may require multiple passes over the code to fully -// simplify the CFG. -// -bool CFGSimplifyPass::runOnFunction(Function &F) { - if (skipOptnoneFunction(F)) - return false; - - AssumptionTracker *AT = &getAnalysis<AssumptionTracker>(); - const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>(); - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr; +static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI, + const DataLayout *DL, AssumptionCache *AC, + int BonusInstThreshold) { bool EverChanged = removeUnreachableBlocks(F); EverChanged |= mergeEmptyReturnBlocks(F); - EverChanged |= iterativelySimplifyCFG(F, TTI, DL, AT, BonusInstThreshold); + EverChanged |= iterativelySimplifyCFG(F, TTI, DL, AC, BonusInstThreshold); // If neither pass changed anything, we're done. if (!EverChanged) return false; @@ -204,9 +166,69 @@ bool CFGSimplifyPass::runOnFunction(Function &F) { return true; do { - EverChanged = iterativelySimplifyCFG(F, TTI, DL, AT, BonusInstThreshold); + EverChanged = iterativelySimplifyCFG(F, TTI, DL, AC, BonusInstThreshold); EverChanged |= removeUnreachableBlocks(F); } while (EverChanged); return true; } + +SimplifyCFGPass::SimplifyCFGPass() + : BonusInstThreshold(UserBonusInstThreshold) {} + +SimplifyCFGPass::SimplifyCFGPass(int BonusInstThreshold) + : BonusInstThreshold(BonusInstThreshold) {} + +PreservedAnalyses SimplifyCFGPass::run(Function &F, + AnalysisManager<Function> *AM) { + auto *DL = F.getParent()->getDataLayout(); + auto &TTI = AM->getResult<TargetIRAnalysis>(F); + auto &AC = AM->getResult<AssumptionAnalysis>(F); + + if (!simplifyFunctionCFG(F, TTI, DL, &AC, BonusInstThreshold)) + return PreservedAnalyses::none(); + + return PreservedAnalyses::all(); +} + +namespace { +struct CFGSimplifyPass : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + unsigned BonusInstThreshold; + CFGSimplifyPass(int T = -1) : FunctionPass(ID) { + BonusInstThreshold = (T == -1) ? 
UserBonusInstThreshold : unsigned(T); + initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry()); + } + bool runOnFunction(Function &F) override { + if (skipOptnoneFunction(F)) + return false; + + AssumptionCache *AC = + &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + const TargetTransformInfo &TTI = + getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr; + return simplifyFunctionCFG(F, TTI, DL, AC, BonusInstThreshold); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + } +}; +} + +char CFGSimplifyPass::ID = 0; +INITIALIZE_PASS_BEGIN(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false, + false) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false, + false) + +// Public interface to the CFGSimplification pass +FunctionPass *llvm::createCFGSimplificationPass(int Threshold) { + return new CFGSimplifyPass(Threshold); +} + diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp index 903b675..d0ee0a6 100644 --- a/lib/Transforms/Scalar/Sink.cpp +++ b/lib/Transforms/Scalar/Sink.cpp @@ -50,9 +50,9 @@ namespace { FunctionPass::getAnalysisUsage(AU); AU.addRequired<AliasAnalysis>(); AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfo>(); + AU.addRequired<LoopInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<LoopInfo>(); + AU.addPreserved<LoopInfoWrapperPass>(); } private: bool ProcessBlock(BasicBlock &BB); @@ -64,7 +64,7 @@ namespace { char Sinking::ID = 0; INITIALIZE_PASS_BEGIN(Sinking, "sink", "Code sinking", false, false) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_END(Sinking, "sink", "Code sinking", false, false) @@ -98,7 +98,7 @@ bool Sinking::AllUsesDominatedByBlock(Instruction *Inst, bool Sinking::runOnFunction(Function &F) { DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - LI = &getAnalysis<LoopInfo>(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); AA = &getAnalysis<AliasAnalysis>(); DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); DL = DLP ? &DLP->getDataLayout() : nullptr; diff --git a/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp new file mode 100644 index 0000000..4edc86c --- /dev/null +++ b/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp @@ -0,0 +1,274 @@ +//===-- StraightLineStrengthReduce.cpp - ------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements straight-line strength reduction (SLSR). Unlike loop +// strength reduction, this algorithm is designed to reduce arithmetic +// redundancy in straight-line code instead of loops. It has proven to be +// effective in simplifying arithmetic statements derived from an unrolled loop. +// It can also simplify the logic of SeparateConstOffsetFromGEP. 
+// +// There are many optimizations we can perform in the domain of SLSR. This file +// for now contains only an initial step. Specifically, we look for strength +// reduction candidate in the form of +// +// (B + i) * S +// +// where B and S are integer constants or variables, and i is a constant +// integer. If we found two such candidates +// +// S1: X = (B + i) * S S2: Y = (B + i') * S +// +// and S1 dominates S2, we call S1 a basis of S2, and can replace S2 with +// +// Y = X + (i' - i) * S +// +// where (i' - i) * S is folded to the extent possible. When S2 has multiple +// bases, we pick the one that is closest to S2, or S2's "immediate" basis. +// +// TODO: +// +// - Handle candidates in the form of B + i * S +// +// - Handle candidates in the form of pointer arithmetics. e.g., B[i * S] +// +// - Floating point arithmetics when fast math is enabled. +// +// - SLSR may decrease ILP at the architecture level. Targets that are very +// sensitive to ILP may want to disable it. Having SLSR to consider ILP is +// left as future work. +#include <vector> + +#include "llvm/ADT/DenseSet.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" + +using namespace llvm; +using namespace PatternMatch; + +namespace { + +class StraightLineStrengthReduce : public FunctionPass { + public: + // SLSR candidate. Such a candidate must be in the form of + // (Base + Index) * Stride + struct Candidate : public ilist_node<Candidate> { + Candidate(Value *B = nullptr, ConstantInt *Idx = nullptr, + Value *S = nullptr, Instruction *I = nullptr) + : Base(B), Index(Idx), Stride(S), Ins(I), Basis(nullptr) {} + Value *Base; + ConstantInt *Index; + Value *Stride; + // The instruction this candidate corresponds to. It helps us to rewrite a + // candidate with respect to its immediate basis. Note that one instruction + // can corresponds to multiple candidates depending on how you associate the + // expression. For instance, + // + // (a + 1) * (b + 2) + // + // can be treated as + // + // <Base: a, Index: 1, Stride: b + 2> + // + // or + // + // <Base: b, Index: 2, Stride: a + 1> + Instruction *Ins; + // Points to the immediate basis of this candidate, or nullptr if we cannot + // find any basis for this candidate. + Candidate *Basis; + }; + + static char ID; + + StraightLineStrengthReduce() : FunctionPass(ID), DT(nullptr) { + initializeStraightLineStrengthReducePass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DominatorTreeWrapperPass>(); + // We do not modify the shape of the CFG. + AU.setPreservesCFG(); + } + + bool runOnFunction(Function &F) override; + + private: + // Returns true if Basis is a basis for C, i.e., Basis dominates C and they + // share the same base and stride. + bool isBasisFor(const Candidate &Basis, const Candidate &C); + // Checks whether I is in a candidate form. If so, adds all the matching forms + // to Candidates, and tries to find the immediate basis for each of them. + void allocateCandidateAndFindBasis(Instruction *I); + // Given that I is in the form of "(B + Idx) * S", adds this form to + // Candidates, and finds its immediate basis. + void allocateCandidateAndFindBasis(Value *B, ConstantInt *Idx, Value *S, + Instruction *I); + // Rewrites candidate C with respect to Basis. 
+ void rewriteCandidateWithBasis(const Candidate &C, const Candidate &Basis); + + DominatorTree *DT; + ilist<Candidate> Candidates; + // Temporarily holds all instructions that are unlinked (but not deleted) by + // rewriteCandidateWithBasis. These instructions will be actually removed + // after all rewriting finishes. + DenseSet<Instruction *> UnlinkedInstructions; +}; +} // anonymous namespace + +char StraightLineStrengthReduce::ID = 0; +INITIALIZE_PASS_BEGIN(StraightLineStrengthReduce, "slsr", + "Straight line strength reduction", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(StraightLineStrengthReduce, "slsr", + "Straight line strength reduction", false, false) + +FunctionPass *llvm::createStraightLineStrengthReducePass() { + return new StraightLineStrengthReduce(); +} + +bool StraightLineStrengthReduce::isBasisFor(const Candidate &Basis, + const Candidate &C) { + return (Basis.Ins != C.Ins && // skip the same instruction + // Basis must dominate C in order to rewrite C with respect to Basis. + DT->dominates(Basis.Ins->getParent(), C.Ins->getParent()) && + // They share the same base and stride. + Basis.Base == C.Base && + Basis.Stride == C.Stride); +} + +// TODO: We currently implement an algorithm whose time complexity is linear to +// the number of existing candidates. However, a better algorithm exists. We +// could depth-first search the dominator tree, and maintain a hash table that +// contains all candidates that dominate the node being traversed. This hash +// table is indexed by the base and the stride of a candidate. Therefore, +// finding the immediate basis of a candidate boils down to one hash-table look +// up. +void StraightLineStrengthReduce::allocateCandidateAndFindBasis(Value *B, + ConstantInt *Idx, + Value *S, + Instruction *I) { + Candidate C(B, Idx, S, I); + // Try to compute the immediate basis of C. + unsigned NumIterations = 0; + // Limit the scan radius to avoid running forever. + static const unsigned MaxNumIterations = 50; + for (auto Basis = Candidates.rbegin(); + Basis != Candidates.rend() && NumIterations < MaxNumIterations; + ++Basis, ++NumIterations) { + if (isBasisFor(*Basis, C)) { + C.Basis = &(*Basis); + break; + } + } + // Regardless of whether we find a basis for C, we need to push C to the + // candidate list. + Candidates.push_back(C); +} + +void StraightLineStrengthReduce::allocateCandidateAndFindBasis(Instruction *I) { + Value *B = nullptr; + ConstantInt *Idx = nullptr; + // "(Base + Index) * Stride" must be a Mul instruction at the first hand. + if (I->getOpcode() == Instruction::Mul) { + if (IntegerType *ITy = dyn_cast<IntegerType>(I->getType())) { + Value *LHS = I->getOperand(0), *RHS = I->getOperand(1); + for (unsigned Swapped = 0; Swapped < 2; ++Swapped) { + // Only handle the canonical operand ordering. + if (match(LHS, m_Add(m_Value(B), m_ConstantInt(Idx)))) { + // If LHS is in the form of "Base + Index", then I is in the form of + // "(Base + Index) * RHS". + allocateCandidateAndFindBasis(B, Idx, RHS, I); + } else { + // Otherwise, at least try the form (LHS + 0) * RHS. + allocateCandidateAndFindBasis(LHS, ConstantInt::get(ITy, 0), RHS, I); + } + // Swap LHS and RHS so that we also cover the cases where LHS is the + // stride. + if (LHS == RHS) + break; + std::swap(LHS, RHS); + } + } + } +} + +void StraightLineStrengthReduce::rewriteCandidateWithBasis( + const Candidate &C, const Candidate &Basis) { + // An instruction can correspond to multiple candidates. 
Therefore, instead of + // simply deleting an instruction when we rewrite it, we mark its parent as + // nullptr (i.e. unlink it) so that we can skip the candidates whose + // instruction is already rewritten. + if (!C.Ins->getParent()) + return; + assert(C.Base == Basis.Base && C.Stride == Basis.Stride); + // Basis = (B + i) * S + // C = (B + i') * S + // ==> + // C = Basis + (i' - i) * S + IRBuilder<> Builder(C.Ins); + ConstantInt *IndexOffset = ConstantInt::get( + C.Ins->getContext(), C.Index->getValue() - Basis.Index->getValue()); + Value *Reduced; + // TODO: preserve nsw/nuw in some cases. + if (IndexOffset->isOne()) { + // If (i' - i) is 1, fold C into Basis + S. + Reduced = Builder.CreateAdd(Basis.Ins, C.Stride); + } else if (IndexOffset->isMinusOne()) { + // If (i' - i) is -1, fold C into Basis - S. + Reduced = Builder.CreateSub(Basis.Ins, C.Stride); + } else { + Value *Bump = Builder.CreateMul(C.Stride, IndexOffset); + Reduced = Builder.CreateAdd(Basis.Ins, Bump); + } + Reduced->takeName(C.Ins); + C.Ins->replaceAllUsesWith(Reduced); + C.Ins->dropAllReferences(); + // Unlink C.Ins so that we can skip other candidates also corresponding to + // C.Ins. The actual deletion is postponed to the end of runOnFunction. + C.Ins->removeFromParent(); + UnlinkedInstructions.insert(C.Ins); +} + +bool StraightLineStrengthReduce::runOnFunction(Function &F) { + if (skipOptnoneFunction(F)) + return false; + + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + // Traverse the dominator tree in the depth-first order. This order makes sure + // all bases of a candidate are in Candidates when we process it. + for (auto node = GraphTraits<DominatorTree *>::nodes_begin(DT); + node != GraphTraits<DominatorTree *>::nodes_end(DT); ++node) { + BasicBlock *B = node->getBlock(); + for (auto I = B->begin(); I != B->end(); ++I) { + allocateCandidateAndFindBasis(I); + } + } + + // Rewrite candidates in the reverse depth-first order. This order makes sure + // a candidate being rewritten is not a basis for any other candidate. + while (!Candidates.empty()) { + const Candidate &C = Candidates.back(); + if (C.Basis != nullptr) { + rewriteCandidateWithBasis(C, *C.Basis); + } + Candidates.pop_back(); + } + + // Delete all unlink instructions. 
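A worked instance of the rewrite implemented above (the numbers are illustrative, not taken from the patch):

    // Basis:  X = (B + 1) * S        C:  Y = (B + 3) * S
    // IndexOffset = 3 - 1 = 2   =>   Y is rebuilt as X + 2 * S
    //                                (Bump = CreateMul(S, 2), then CreateAdd(X, Bump))
    // IndexOffset == 1          =>   Y = X + S   (single CreateAdd, no multiply)
    // IndexOffset == -1         =>   Y = X - S   (single CreateSub, no multiply)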
+ for (auto I : UnlinkedInstructions) { + delete I; + } + bool Ret = !UnlinkedInstructions.empty(); + UnlinkedInstructions.clear(); + return Ret; +} diff --git a/lib/Transforms/Scalar/StructurizeCFG.cpp b/lib/Transforms/Scalar/StructurizeCFG.cpp index b9673ed..aaf6f9a 100644 --- a/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -10,11 +10,14 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/RegionInfo.h" #include "llvm/Analysis/RegionIterator.h" #include "llvm/Analysis/RegionPass.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/Support/Debug.h" #include "llvm/Transforms/Utils/SSAUpdater.h" using namespace llvm; @@ -166,6 +169,7 @@ class StructurizeCFG : public RegionPass { Region *ParentRegion; DominatorTree *DT; + LoopInfo *LI; RNVector Order; BBSet Visited; @@ -247,6 +251,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequiredID(LowerSwitchID); AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); RegionPass::getAnalysisUsage(AU); } @@ -278,11 +283,65 @@ bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) { /// \brief Build up the general order of nodes void StructurizeCFG::orderNodes() { - scc_iterator<Region *> I = scc_begin(ParentRegion); - for (Order.clear(); !I.isAtEnd(); ++I) { - const std::vector<RegionNode *> &Nodes = *I; - Order.append(Nodes.begin(), Nodes.end()); + RNVector TempOrder; + ReversePostOrderTraversal<Region*> RPOT(ParentRegion); + TempOrder.append(RPOT.begin(), RPOT.end()); + + std::map<Loop*, unsigned> LoopBlocks; + + + // The reverse post-order traversal of the list gives us an ordering close + // to what we want. The only problem with it is that sometimes backedges + // for outer loops will be visited before backedges for inner loops. + for (RegionNode *RN : TempOrder) { + BasicBlock *BB = RN->getEntry(); + Loop *Loop = LI->getLoopFor(BB); + if (!LoopBlocks.count(Loop)) { + LoopBlocks[Loop] = 1; + continue; + } + LoopBlocks[Loop]++; } + + unsigned CurrentLoopDepth = 0; + Loop *CurrentLoop = nullptr; + BBSet TempVisited; + for (RNVector::iterator I = TempOrder.begin(), E = TempOrder.end(); I != E; ++I) { + BasicBlock *BB = (*I)->getEntry(); + unsigned LoopDepth = LI->getLoopDepth(BB); + + if (std::find(Order.begin(), Order.end(), *I) != Order.end()) + continue; + + if (LoopDepth < CurrentLoopDepth) { + // Make sure we have visited all blocks in this loop before moving back to + // the outer loop. + + RNVector::iterator LoopI = I; + while(LoopBlocks[CurrentLoop]) { + LoopI++; + BasicBlock *LoopBB = (*LoopI)->getEntry(); + if (LI->getLoopFor(LoopBB) == CurrentLoop) { + LoopBlocks[CurrentLoop]--; + Order.push_back(*LoopI); + } + } + } + + CurrentLoop = LI->getLoopFor(BB); + if (CurrentLoop) { + LoopBlocks[CurrentLoop]--; + } + + CurrentLoopDepth = LoopDepth; + Order.push_back(*I); + } + + // This pass originally used a post-order traversal and then operated on + // the list in reverse. Now that we are using a reverse post-order traversal + // rather than re-working the whole pass to operate on the list in order, + // we just reverse the list and continue to operate on it in reverse. 
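For readers unfamiliar with the traversal used above, a minimal sketch of the general reverse post-order idiom (ordinary LLVM usage, not code from this patch; the sketch walks a Function, whereas the pass instantiates the traversal over a Region):

    #include "llvm/ADT/PostOrderIterator.h"
    #include "llvm/IR/Function.h"

    // Visits every block before its successors (ignoring back edges), which is
    // why only the loop back-edge ordering needs fixing up afterwards.
    void walkInRPO(llvm::Function &F) {
      llvm::ReversePostOrderTraversal<llvm::Function *> RPOT(&F);
      for (llvm::BasicBlock *BB : RPOT)
        (void)BB; // per-block work would go here
    }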
+ std::reverse(Order.begin(), Order.end()); } /// \brief Determine the end of the loops @@ -301,8 +360,9 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) { for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { BasicBlock *Succ = Term->getSuccessor(i); - if (Visited.count(Succ)) + if (Visited.count(Succ)) { Loops[Succ] = BB; + } } } } @@ -437,6 +497,10 @@ void StructurizeCFG::collectInfos() { for (RNVector::reverse_iterator OI = Order.rbegin(), OE = Order.rend(); OI != OE; ++OI) { + DEBUG(dbgs() << "Visiting: " << + ((*OI)->isSubRegion() ? "SubRegion with entry: " : "") << + (*OI)->getEntry()->getName() << " Loop Depth: " << LI->getLoopDepth((*OI)->getEntry()) << "\n"); + // Analyze all the conditions leading to a node gatherPredicates(*OI); @@ -862,6 +926,7 @@ bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) { ParentRegion = R; DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); orderNodes(); collectInfos(); diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp index f3c3e30..715ddeb 100644 --- a/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -126,7 +126,7 @@ namespace { char TailCallElim::ID = 0; INITIALIZE_PASS_BEGIN(TailCallElim, "tailcallelim", "Tail Call Elimination", false, false) -INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(TailCallElim, "tailcallelim", "Tail Call Elimination", false, false) @@ -136,7 +136,7 @@ FunctionPass *llvm::createTailCallEliminationPass() { } void TailCallElim::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<TargetTransformInfo>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); } /// \brief Scan the specified function for alloca instructions. @@ -386,7 +386,7 @@ bool TailCallElim::runTRE(Function &F) { // right, so don't even try to convert it... if (F.getFunctionType()->isVarArg()) return false; - TTI = &getAnalysis<TargetTransformInfo>(); + TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); BasicBlock *OldEntry = nullptr; bool TailCallsAreMarkedTail = false; SmallVector<PHINode*, 8> ArgumentPHIs; diff --git a/lib/Transforms/Utils/ASanStackFrameLayout.cpp b/lib/Transforms/Utils/ASanStackFrameLayout.cpp index cce016a..03c3a80 100644 --- a/lib/Transforms/Utils/ASanStackFrameLayout.cpp +++ b/lib/Transforms/Utils/ASanStackFrameLayout.cpp @@ -13,6 +13,7 @@ #include "llvm/Transforms/Utils/ASanStackFrameLayout.h" #include "llvm/ADT/SmallString.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/MathExtras.h" #include <algorithm> namespace llvm { @@ -33,11 +34,6 @@ static inline bool CompareVars(const ASanStackVariableDescription &a, // with e.g. alignment 1 and alignment 16 do not get reordered by CompareVars. static const size_t kMinAlignment = 16; -static size_t RoundUpTo(size_t X, size_t RoundTo) { - assert((RoundTo & (RoundTo - 1)) == 0); - return (X + RoundTo - 1) & ~(RoundTo - 1); -} - // The larger the variable Size the larger is the redzone. // The resulting frame size is a multiple of Alignment. 
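A worked example of the sizing performed below (the numbers are illustrative and use only the tiers visible in this hunk): a 600-byte variable falls in the "<= 4096" tier, so Res = 600 + 128 = 728, and RoundUpToAlignment(728, 32) rounds that to 736 as the final variable-plus-redzone size under 32-byte alignment.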
static size_t VarAndRedzoneSize(size_t Size, size_t Alignment) { @@ -48,7 +44,7 @@ static size_t VarAndRedzoneSize(size_t Size, size_t Alignment) { else if (Size <= 512) Res = Size + 64; else if (Size <= 4096) Res = Size + 128; else Res = Size + 256; - return RoundUpTo(Res, Alignment); + return RoundUpToAlignment(Res, Alignment); } void diff --git a/lib/Transforms/Utils/AddDiscriminators.cpp b/lib/Transforms/Utils/AddDiscriminators.cpp index f8e5af5..820544b 100644 --- a/lib/Transforms/Utils/AddDiscriminators.cpp +++ b/lib/Transforms/Utils/AddDiscriminators.cpp @@ -167,7 +167,7 @@ bool AddDiscriminators::runOnFunction(Function &F) { bool Changed = false; Module *M = F.getParent(); LLVMContext &Ctx = M->getContext(); - DIBuilder Builder(*M); + DIBuilder Builder(*M, /*AllowUnresolved*/ false); // Traverse all the blocks looking for instructions in different // blocks that are at the same file:line location. diff --git a/lib/Transforms/Utils/Android.mk b/lib/Transforms/Utils/Android.mk index e20dc0a..4d24928 100644 --- a/lib/Transforms/Utils/Android.mk +++ b/lib/Transforms/Utils/Android.mk @@ -22,7 +22,6 @@ transforms_utils_SRC_FILES := \ LoopSimplify.cpp \ LoopUnroll.cpp \ LoopUnrollRuntime.cpp \ - LowerExpectIntrinsic.cpp \ LowerInvoke.cpp \ LowerSwitch.cpp \ Mem2Reg.cpp \ diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp index 983f025..b455257 100644 --- a/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -65,16 +65,10 @@ void llvm::DeleteDeadBlock(BasicBlock *BB) { /// any single-entry PHI nodes in it, fold them away. This handles the case /// when all entries to the PHI nodes in a block are guaranteed equal, such as /// when the block has exactly one predecessor. -void llvm::FoldSingleEntryPHINodes(BasicBlock *BB, Pass *P) { +void llvm::FoldSingleEntryPHINodes(BasicBlock *BB, AliasAnalysis *AA, + MemoryDependenceAnalysis *MemDep) { if (!isa<PHINode>(BB->begin())) return; - AliasAnalysis *AA = nullptr; - MemoryDependenceAnalysis *MemDep = nullptr; - if (P) { - AA = P->getAnalysisIfAvailable<AliasAnalysis>(); - MemDep = P->getAnalysisIfAvailable<MemoryDependenceAnalysis>(); - } - while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) { if (PN->getIncomingValue(0) != PN) PN->replaceAllUsesWith(PN->getIncomingValue(0)); @@ -113,7 +107,9 @@ bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI) { /// MergeBlockIntoPredecessor - Attempts to merge a block into its predecessor, /// if possible. The return value indicates success or failure. -bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, Pass *P) { +bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT, + LoopInfo *LI, AliasAnalysis *AA, + MemoryDependenceAnalysis *MemDep) { // Don't merge away blocks who have their address taken. if (BB->hasAddressTaken()) return false; @@ -149,7 +145,7 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, Pass *P) { // Begin by getting rid of unneeded PHIs. if (isa<PHINode>(BB->front())) - FoldSingleEntryPHINodes(BB, P); + FoldSingleEntryPHINodes(BB, AA, MemDep); // Delete the unconditional branch from the predecessor... PredBB->getInstList().pop_back(); @@ -166,28 +162,23 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, Pass *P) { PredBB->takeName(BB); // Finally, erase the old block and update dominator info. 
- if (P) { - if (DominatorTreeWrapperPass *DTWP = - P->getAnalysisIfAvailable<DominatorTreeWrapperPass>()) { - DominatorTree &DT = DTWP->getDomTree(); - if (DomTreeNode *DTN = DT.getNode(BB)) { - DomTreeNode *PredDTN = DT.getNode(PredBB); - SmallVector<DomTreeNode*, 8> Children(DTN->begin(), DTN->end()); - for (SmallVectorImpl<DomTreeNode *>::iterator DI = Children.begin(), - DE = Children.end(); DI != DE; ++DI) - DT.changeImmediateDominator(*DI, PredDTN); - - DT.eraseNode(BB); - } + if (DT) + if (DomTreeNode *DTN = DT->getNode(BB)) { + DomTreeNode *PredDTN = DT->getNode(PredBB); + SmallVector<DomTreeNode *, 8> Children(DTN->begin(), DTN->end()); + for (SmallVectorImpl<DomTreeNode *>::iterator DI = Children.begin(), + DE = Children.end(); + DI != DE; ++DI) + DT->changeImmediateDominator(*DI, PredDTN); + + DT->eraseNode(BB); + } - if (LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>()) - LI->removeBlock(BB); + if (LI) + LI->removeBlock(BB); - if (MemoryDependenceAnalysis *MD = - P->getAnalysisIfAvailable<MemoryDependenceAnalysis>()) - MD->invalidateCachedPredecessors(); - } - } + if (MemDep) + MemDep->invalidateCachedPredecessors(); BB->eraseFromParent(); return true; @@ -240,12 +231,14 @@ void llvm::ReplaceInstWithInst(Instruction *From, Instruction *To) { /// SplitEdge - Split the edge connecting specified block. Pass P must /// not be NULL. -BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, Pass *P) { +BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, DominatorTree *DT, + LoopInfo *LI) { unsigned SuccNum = GetSuccessorNumber(BB, Succ); // If this is a critical edge, let SplitCriticalEdge do it. TerminatorInst *LatchTerm = BB->getTerminator(); - if (SplitCriticalEdge(LatchTerm, SuccNum, P)) + if (SplitCriticalEdge(LatchTerm, SuccNum, CriticalEdgeSplittingOptions(DT, LI) + .setPreserveLCSSA())) return LatchTerm->getSuccessor(SuccNum); // If the edge isn't critical, then BB has a single successor or Succ has a @@ -255,23 +248,25 @@ BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, Pass *P) { // block. assert(SP == BB && "CFG broken"); SP = nullptr; - return SplitBlock(Succ, Succ->begin(), P); + return SplitBlock(Succ, Succ->begin(), DT, LI); } // Otherwise, if BB has a single successor, split it at the bottom of the // block. assert(BB->getTerminator()->getNumSuccessors() == 1 && "Should have a single succ!"); - return SplitBlock(BB, BB->getTerminator(), P); + return SplitBlock(BB, BB->getTerminator(), DT, LI); } -unsigned llvm::SplitAllCriticalEdges(Function &F, Pass *P) { +unsigned +llvm::SplitAllCriticalEdges(Function &F, + const CriticalEdgeSplittingOptions &Options) { unsigned NumBroken = 0; for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { TerminatorInst *TI = I->getTerminator(); if (TI->getNumSuccessors() > 1 && !isa<IndirectBrInst>(TI)) for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) - if (SplitCriticalEdge(TI, i, P)) + if (SplitCriticalEdge(TI, i, Options)) ++NumBroken; } return NumBroken; @@ -282,7 +277,8 @@ unsigned llvm::SplitAllCriticalEdges(Function &F, Pass *P) { /// to a new block. The two blocks are joined by an unconditional branch and /// the loop info is updated. 
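Caller-side sketch of the updated interfaces above (a hypothetical snippet, not from the patch): rather than threading a Pass* through these utilities, callers now hand over whichever analyses they actually hold, or nullptr when an analysis need not be kept up to date.

    // DT and LI may each be null if the caller has no such analysis.
    BasicBlock *Split = SplitEdge(BB, Succ, DT, LI);

    // Critical-edge splitting takes an options struct rather than a Pass*.
    SplitCriticalEdge(TI, SuccNum,
                      CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());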
/// -BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt, Pass *P) { +BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt, + DominatorTree *DT, LoopInfo *LI) { BasicBlock::iterator SplitIt = SplitPt; while (isa<PHINode>(SplitIt) || isa<LandingPadInst>(SplitIt)) ++SplitIt; @@ -290,26 +286,23 @@ BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt, Pass *P) { // The new block lives in whichever loop the old one did. This preserves // LCSSA as well, because we force the split point to be after any PHI nodes. - if (LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>()) + if (LI) if (Loop *L = LI->getLoopFor(Old)) - L->addBasicBlockToLoop(New, LI->getBase()); + L->addBasicBlockToLoop(New, *LI); - if (DominatorTreeWrapperPass *DTWP = - P->getAnalysisIfAvailable<DominatorTreeWrapperPass>()) { - DominatorTree &DT = DTWP->getDomTree(); + if (DT) // Old dominates New. New node dominates all other nodes dominated by Old. - if (DomTreeNode *OldNode = DT.getNode(Old)) { + if (DomTreeNode *OldNode = DT->getNode(Old)) { std::vector<DomTreeNode *> Children; for (DomTreeNode::iterator I = OldNode->begin(), E = OldNode->end(); I != E; ++I) Children.push_back(*I); - DomTreeNode *NewNode = DT.addNewBlock(New, Old); + DomTreeNode *NewNode = DT->addNewBlock(New, Old); for (std::vector<DomTreeNode *>::iterator I = Children.begin(), E = Children.end(); I != E; ++I) - DT.changeImmediateDominator(*I, NewNode); + DT->changeImmediateDominator(*I, NewNode); } - } return New; } @@ -318,45 +311,46 @@ BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt, Pass *P) { /// analysis information. static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB, ArrayRef<BasicBlock *> Preds, - Pass *P, bool &HasLoopExit) { - if (!P) return; + DominatorTree *DT, LoopInfo *LI, + bool PreserveLCSSA, bool &HasLoopExit) { + // Update dominator tree if available. + if (DT) + DT->splitBlock(NewBB); + + // The rest of the logic is only relevant for updating the loop structures. + if (!LI) + return; - LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>(); - Loop *L = LI ? LI->getLoopFor(OldBB) : nullptr; + Loop *L = LI->getLoopFor(OldBB); // If we need to preserve loop analyses, collect some information about how // this split will affect loops. bool IsLoopEntry = !!L; bool SplitMakesNewLoopHeader = false; - if (LI) { - bool PreserveLCSSA = P->mustPreserveAnalysisID(LCSSAID); - for (ArrayRef<BasicBlock*>::iterator - i = Preds.begin(), e = Preds.end(); i != e; ++i) { - BasicBlock *Pred = *i; - - // If we need to preserve LCSSA, determine if any of the preds is a loop - // exit. - if (PreserveLCSSA) - if (Loop *PL = LI->getLoopFor(Pred)) - if (!PL->contains(OldBB)) - HasLoopExit = true; - - // If we need to preserve LoopInfo, note whether any of the preds crosses - // an interesting loop boundary. - if (!L) continue; - if (L->contains(Pred)) - IsLoopEntry = false; - else - SplitMakesNewLoopHeader = true; - } + for (ArrayRef<BasicBlock *>::iterator i = Preds.begin(), e = Preds.end(); + i != e; ++i) { + BasicBlock *Pred = *i; + + // If we need to preserve LCSSA, determine if any of the preds is a loop + // exit. + if (PreserveLCSSA) + if (Loop *PL = LI->getLoopFor(Pred)) + if (!PL->contains(OldBB)) + HasLoopExit = true; + + // If we need to preserve LoopInfo, note whether any of the preds crosses + // an interesting loop boundary. 
+ if (!L) + continue; + if (L->contains(Pred)) + IsLoopEntry = false; + else + SplitMakesNewLoopHeader = true; } - // Update dominator tree if available. - if (DominatorTreeWrapperPass *DTWP = - P->getAnalysisIfAvailable<DominatorTreeWrapperPass>()) - DTWP->getDomTree().splitBlock(NewBB); - - if (!L) return; + // Unless we have a loop for OldBB, nothing else to do here. + if (!L) + return; if (IsLoopEntry) { // Add the new block to the nearest enclosing loop (and not an adjacent @@ -382,9 +376,9 @@ static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB, } if (InnermostPredLoop) - InnermostPredLoop->addBasicBlockToLoop(NewBB, LI->getBase()); + InnermostPredLoop->addBasicBlockToLoop(NewBB, *LI); } else { - L->addBasicBlockToLoop(NewBB, LI->getBase()); + L->addBasicBlockToLoop(NewBB, *LI); if (SplitMakesNewLoopHeader) L->moveToHeader(NewBB); } @@ -393,10 +387,9 @@ static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB, /// UpdatePHINodes - Update the PHI nodes in OrigBB to include the values coming /// from NewBB. This also updates AliasAnalysis, if available. static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB, - ArrayRef<BasicBlock*> Preds, BranchInst *BI, - Pass *P, bool HasLoopExit) { + ArrayRef<BasicBlock *> Preds, BranchInst *BI, + AliasAnalysis *AA, bool HasLoopExit) { // Otherwise, create a new PHI node in NewBB for each PHI node in OrigBB. - AliasAnalysis *AA = P ? P->getAnalysisIfAvailable<AliasAnalysis>() : nullptr; SmallPtrSet<BasicBlock *, 16> PredSet(Preds.begin(), Preds.end()); for (BasicBlock::iterator I = OrigBB->begin(); isa<PHINode>(I); ) { PHINode *PN = cast<PHINode>(I++); @@ -461,11 +454,15 @@ static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB, } } -/// SplitBlockPredecessors - This method transforms BB by introducing a new -/// basic block into the function, and moving some of the predecessors of BB to -/// be predecessors of the new block. The new predecessors are indicated by the -/// Preds array, which has NumPreds elements in it. The new block is given a -/// suffix of 'Suffix'. +/// SplitBlockPredecessors - This method introduces at least one new basic block +/// into the function and moves some of the predecessors of BB to be +/// predecessors of the new block. The new predecessors are indicated by the +/// Preds array. The new block is given a suffix of 'Suffix'. Returns new basic +/// block to which predecessors from Preds are now pointing. +/// +/// If BB is a landingpad block then additional basicblock might be introduced. +/// It will have suffix of 'Suffix'+".split_lp". +/// See SplitLandingPadPredecessors for more details on this case. /// /// This currently updates the LLVM IR, AliasAnalysis, DominatorTree, /// LoopInfo, and LCCSA but no other analyses. In particular, it does not @@ -473,8 +470,21 @@ static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB, /// of the edges being split is an exit of a loop with other exits). /// BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB, - ArrayRef<BasicBlock*> Preds, - const char *Suffix, Pass *P) { + ArrayRef<BasicBlock *> Preds, + const char *Suffix, AliasAnalysis *AA, + DominatorTree *DT, LoopInfo *LI, + bool PreserveLCSSA) { + // For the landingpads we need to act a bit differently. + // Delegate this work to the SplitLandingPadPredecessors. 
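A hypothetical call site for the signature just introduced (illustrative only): the caller gathers the predecessors it wants redirected and passes its analyses explicitly; when BB is a landing pad, the call transparently delegates to SplitLandingPadPredecessors and returns the first of the blocks it creates.

    SmallVector<BasicBlock *, 4> Preds(pred_begin(BB), pred_end(BB));
    BasicBlock *NewPred = SplitBlockPredecessors(
        BB, Preds, ".split", /*AA=*/nullptr, DT, LI, /*PreserveLCSSA=*/true);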
+ if (BB->isLandingPad()) { + SmallVector<BasicBlock*, 2> NewBBs; + std::string NewName = std::string(Suffix) + ".split-lp"; + + SplitLandingPadPredecessors(BB, Preds, Suffix, NewName.c_str(), + NewBBs, AA, DT, LI, PreserveLCSSA); + return NewBBs[0]; + } + // Create new basic block, insert right before the original block. BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), BB->getName()+Suffix, BB->getParent(), BB); @@ -505,10 +515,11 @@ BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB, // Update DominatorTree, LoopInfo, and LCCSA analysis information. bool HasLoopExit = false; - UpdateAnalysisInformation(BB, NewBB, Preds, P, HasLoopExit); + UpdateAnalysisInformation(BB, NewBB, Preds, DT, LI, PreserveLCSSA, + HasLoopExit); // Update the PHI nodes in BB with the values coming from NewBB. - UpdatePHINodes(BB, NewBB, Preds, BI, P, HasLoopExit); + UpdatePHINodes(BB, NewBB, Preds, BI, AA, HasLoopExit); return NewBB; } @@ -526,10 +537,11 @@ BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB, /// exits). /// void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB, - ArrayRef<BasicBlock*> Preds, + ArrayRef<BasicBlock *> Preds, const char *Suffix1, const char *Suffix2, - Pass *P, - SmallVectorImpl<BasicBlock*> &NewBBs) { + SmallVectorImpl<BasicBlock *> &NewBBs, + AliasAnalysis *AA, DominatorTree *DT, + LoopInfo *LI, bool PreserveLCSSA) { assert(OrigBB->isLandingPad() && "Trying to split a non-landing pad!"); // Create a new basic block for OrigBB's predecessors listed in Preds. Insert @@ -552,12 +564,12 @@ void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB, Preds[i]->getTerminator()->replaceUsesOfWith(OrigBB, NewBB1); } - // Update DominatorTree, LoopInfo, and LCCSA analysis information. bool HasLoopExit = false; - UpdateAnalysisInformation(OrigBB, NewBB1, Preds, P, HasLoopExit); + UpdateAnalysisInformation(OrigBB, NewBB1, Preds, DT, LI, PreserveLCSSA, + HasLoopExit); // Update the PHI nodes in OrigBB with the values coming from NewBB1. - UpdatePHINodes(OrigBB, NewBB1, Preds, BI1, P, HasLoopExit); + UpdatePHINodes(OrigBB, NewBB1, Preds, BI1, AA, HasLoopExit); // Move the remaining edges from OrigBB to point to NewBB2. SmallVector<BasicBlock*, 8> NewBB2Preds; @@ -589,10 +601,11 @@ void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB, // Update DominatorTree, LoopInfo, and LCCSA analysis information. HasLoopExit = false; - UpdateAnalysisInformation(OrigBB, NewBB2, NewBB2Preds, P, HasLoopExit); + UpdateAnalysisInformation(OrigBB, NewBB2, NewBB2Preds, DT, LI, + PreserveLCSSA, HasLoopExit); // Update the PHI nodes in OrigBB with the values coming from NewBB2. - UpdatePHINodes(OrigBB, NewBB2, NewBB2Preds, BI2, P, HasLoopExit); + UpdatePHINodes(OrigBB, NewBB2, NewBB2Preds, BI2, AA, HasLoopExit); } LandingPadInst *LPad = OrigBB->getLandingPadInst(); diff --git a/lib/Transforms/Utils/BreakCriticalEdges.cpp b/lib/Transforms/Utils/BreakCriticalEdges.cpp index eda22cf..7e83c9e 100644 --- a/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -18,6 +18,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/CFG.h" @@ -41,14 +42,19 @@ namespace { } bool runOnFunction(Function &F) override { - unsigned N = SplitAllCriticalEdges(F, this); + auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + auto *DT = DTWP ? 
&DTWP->getDomTree() : nullptr; + auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>(); + auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; + unsigned N = + SplitAllCriticalEdges(F, CriticalEdgeSplittingOptions(DT, LI)); NumBroken += N; return N > 0; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<LoopInfo>(); + AU.addPreserved<LoopInfoWrapperPass>(); // No loop canonicalization guarantees are broken by this pass. AU.addPreservedID(LoopSimplifyID); @@ -125,10 +131,9 @@ static void createPHIsForSplitLoopExit(ArrayRef<BasicBlock *> Preds, /// to. /// BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, - Pass *P, bool MergeIdenticalEdges, - bool DontDeleteUselessPhis, - bool SplitLandingPads) { - if (!isCriticalEdge(TI, SuccNum, MergeIdenticalEdges)) return nullptr; + const CriticalEdgeSplittingOptions &Options) { + if (!isCriticalEdge(TI, SuccNum, Options.MergeIdenticalEdges)) + return nullptr; assert(!isa<IndirectBrInst>(TI) && "Cannot split critical edge from IndirectBrInst"); @@ -179,29 +184,22 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, // If there are any other edges from TIBB to DestBB, update those to go // through the split block, making those edges non-critical as well (and // reducing the number of phi entries in the DestBB if relevant). - if (MergeIdenticalEdges) { + if (Options.MergeIdenticalEdges) { for (unsigned i = SuccNum+1, e = TI->getNumSuccessors(); i != e; ++i) { if (TI->getSuccessor(i) != DestBB) continue; // Remove an entry for TIBB from DestBB phi nodes. - DestBB->removePredecessor(TIBB, DontDeleteUselessPhis); + DestBB->removePredecessor(TIBB, Options.DontDeleteUselessPHIs); // We found another edge to DestBB, go to NewBB instead. TI->setSuccessor(i, NewBB); } } - - - // If we don't have a pass object, we can't update anything... - if (!P) return NewBB; - - DominatorTreeWrapperPass *DTWP = - P->getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr; - LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>(); - // If we have nothing to update, just return. + auto *AA = Options.AA; + auto *DT = Options.DT; + auto *LI = Options.LI; if (!DT && !LI) return NewBB; @@ -268,13 +266,13 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, if (Loop *DestLoop = LI->getLoopFor(DestBB)) { if (TIL == DestLoop) { // Both in the same loop, the NewBB joins loop. - DestLoop->addBasicBlockToLoop(NewBB, LI->getBase()); + DestLoop->addBasicBlockToLoop(NewBB, *LI); } else if (TIL->contains(DestLoop)) { // Edge from an outer loop to an inner loop. Add to the outer loop. - TIL->addBasicBlockToLoop(NewBB, LI->getBase()); + TIL->addBasicBlockToLoop(NewBB, *LI); } else if (DestLoop->contains(TIL)) { // Edge from an inner loop to an outer loop. Add to the outer loop. - DestLoop->addBasicBlockToLoop(NewBB, LI->getBase()); + DestLoop->addBasicBlockToLoop(NewBB, *LI); } else { // Edge from two loops with no containment relation. 
Because these // are natural loops, we know that the destination block must be the @@ -283,19 +281,20 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, assert(DestLoop->getHeader() == DestBB && "Should not create irreducible loops!"); if (Loop *P = DestLoop->getParentLoop()) - P->addBasicBlockToLoop(NewBB, LI->getBase()); + P->addBasicBlockToLoop(NewBB, *LI); } } + // If TIBB is in a loop and DestBB is outside of that loop, we may need // to update LoopSimplify form and LCSSA form. - if (!TIL->contains(DestBB) && - P->mustPreserveAnalysisID(LoopSimplifyID)) { + if (!TIL->contains(DestBB)) { assert(!TIL->contains(NewBB) && "Split point for loop exit is contained in loop!"); // Update LCSSA form in the newly created exit block. - if (P->mustPreserveAnalysisID(LCSSAID)) + if (Options.PreserveLCSSA) { createPHIsForSplitLoopExit(TIBB, NewBB, DestBB); + } // The only that we can break LoopSimplify form by splitting a critical // edge is if after the split there exists some edge from TIL to DestBB @@ -322,20 +321,12 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, if (!LoopPreds.empty()) { assert(!DestBB->isLandingPad() && "We don't split edges to landing pads!"); - BasicBlock *NewExitBB = - SplitBlockPredecessors(DestBB, LoopPreds, "split", P); - if (P->mustPreserveAnalysisID(LCSSAID)) + BasicBlock *NewExitBB = SplitBlockPredecessors( + DestBB, LoopPreds, "split", AA, DT, LI, Options.PreserveLCSSA); + if (Options.PreserveLCSSA) createPHIsForSplitLoopExit(LoopPreds, NewExitBB, DestBB); } } - // LCSSA form was updated above for the case where LoopSimplify is - // available, which means that all predecessors of loop exit blocks - // are within the loop. Without LoopSimplify form, it would be - // necessary to insert a new phi. - assert((!P->mustPreserveAnalysisID(LCSSAID) || - P->mustPreserveAnalysisID(LoopSimplifyID)) && - "SplitCriticalEdge doesn't know how to update LCCSA form " - "without LoopSimplify!"); } } diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp index 112d26c..762a83f 100644 --- a/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/lib/Transforms/Utils/BuildLibCalls.cpp @@ -21,7 +21,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" using namespace llvm; @@ -486,135 +486,3 @@ Value *llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File, CI->setCallingConv(Fn->getCallingConv()); return CI; } - -SimplifyFortifiedLibCalls::~SimplifyFortifiedLibCalls() { } - -bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const DataLayout *TD, - const TargetLibraryInfo *TLI) { - // We really need DataLayout for later. - if (!TD) return false; - - this->CI = CI; - Function *Callee = CI->getCalledFunction(); - StringRef Name = Callee->getName(); - FunctionType *FT = Callee->getFunctionType(); - LLVMContext &Context = CI->getParent()->getContext(); - IRBuilder<> B(CI); - - if (Name == "__memcpy_chk") { - // Check if this has the right signature. 
- if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) || - !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isPointerTy() || - FT->getParamType(2) != TD->getIntPtrType(Context) || - FT->getParamType(3) != TD->getIntPtrType(Context)) - return false; - - if (isFoldable(3, 2, false)) { - B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), 1); - replaceCall(CI->getArgOperand(0)); - return true; - } - return false; - } - - // Should be similar to memcpy. - if (Name == "__mempcpy_chk") { - return false; - } - - if (Name == "__memmove_chk") { - // Check if this has the right signature. - if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) || - !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isPointerTy() || - FT->getParamType(2) != TD->getIntPtrType(Context) || - FT->getParamType(3) != TD->getIntPtrType(Context)) - return false; - - if (isFoldable(3, 2, false)) { - B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), 1); - replaceCall(CI->getArgOperand(0)); - return true; - } - return false; - } - - if (Name == "__memset_chk") { - // Check if this has the right signature. - if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) || - !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isIntegerTy() || - FT->getParamType(2) != TD->getIntPtrType(Context) || - FT->getParamType(3) != TD->getIntPtrType(Context)) - return false; - - if (isFoldable(3, 2, false)) { - Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), - false); - B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1); - replaceCall(CI->getArgOperand(0)); - return true; - } - return false; - } - - if (Name == "__strcpy_chk" || Name == "__stpcpy_chk") { - // Check if this has the right signature. - if (FT->getNumParams() != 3 || - FT->getReturnType() != FT->getParamType(0) || - FT->getParamType(0) != FT->getParamType(1) || - FT->getParamType(0) != Type::getInt8PtrTy(Context) || - FT->getParamType(2) != TD->getIntPtrType(Context)) - return 0; - - - // If a) we don't have any length information, or b) we know this will - // fit then just lower to a plain st[rp]cpy. Otherwise we'll keep our - // st[rp]cpy_chk call which may fail at runtime if the size is too long. - // TODO: It might be nice to get a maximum length out of the possible - // string lengths for varying. - if (isFoldable(2, 1, true)) { - Value *Ret = EmitStrCpy(CI->getArgOperand(0), CI->getArgOperand(1), B, TD, - TLI, Name.substr(2, 6)); - if (!Ret) - return false; - replaceCall(Ret); - return true; - } - return false; - } - - if (Name == "__strncpy_chk" || Name == "__stpncpy_chk") { - // Check if this has the right signature. 
- if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) || - FT->getParamType(0) != FT->getParamType(1) || - FT->getParamType(0) != Type::getInt8PtrTy(Context) || - !FT->getParamType(2)->isIntegerTy() || - FT->getParamType(3) != TD->getIntPtrType(Context)) - return false; - - if (isFoldable(3, 2, false)) { - Value *Ret = EmitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), B, TD, TLI, - Name.substr(2, 7)); - if (!Ret) - return false; - replaceCall(Ret); - return true; - } - return false; - } - - if (Name == "__strcat_chk") { - return false; - } - - if (Name == "__strncat_chk") { - return false; - } - - return false; -} diff --git a/lib/Transforms/Utils/CMakeLists.txt b/lib/Transforms/Utils/CMakeLists.txt index 6ce22b1..01e811f 100644 --- a/lib/Transforms/Utils/CMakeLists.txt +++ b/lib/Transforms/Utils/CMakeLists.txt @@ -21,7 +21,6 @@ add_llvm_library(LLVMTransformUtils LoopSimplify.cpp LoopUnroll.cpp LoopUnrollRuntime.cpp - LowerExpectIntrinsic.cpp LowerInvoke.cpp LowerSwitch.cpp Mem2Reg.cpp @@ -37,6 +36,10 @@ add_llvm_library(LLVMTransformUtils UnifyFunctionExitNodes.cpp Utils.cpp ValueMapper.cpp + + ADDITIONAL_HEADER_DIRS + ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms + ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms/Utils ) add_dependencies(LLVMTransformUtils intrinsics_gen) diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp index 5c8f20d..09279b6 100644 --- a/lib/Transforms/Utils/CloneFunction.cpp +++ b/lib/Transforms/Utils/CloneFunction.cpp @@ -164,14 +164,13 @@ static MDNode* FindSubprogram(const Function *F, DebugInfoFinder &Finder) { // Add an operand to an existing MDNode. The new operand will be added at the // back of the operand list. -static void AddOperand(MDNode *Node, Value *Operand) { - SmallVector<Value*, 16> Operands; - for (unsigned i = 0; i < Node->getNumOperands(); i++) { - Operands.push_back(Node->getOperand(i)); - } - Operands.push_back(Operand); - MDNode *NewNode = MDNode::get(Node->getContext(), Operands); - Node->replaceAllUsesWith(NewNode); +static void AddOperand(DICompileUnit CU, DIArray SPs, Metadata *NewSP) { + SmallVector<Metadata *, 16> NewSPs; + NewSPs.reserve(SPs->getNumOperands() + 1); + for (unsigned I = 0, E = SPs->getNumOperands(); I != E; ++I) + NewSPs.push_back(SPs->getOperand(I)); + NewSPs.push_back(NewSP); + CU.replaceSubprograms(DIArray(MDNode::get(CU->getContext(), NewSPs))); } // Clone the module-level debug info associated with OldFunc. The cloned data @@ -187,7 +186,7 @@ static void CloneDebugInfoMetadata(Function *NewFunc, const Function *OldFunc, // Ensure that OldFunc appears in the map. // (if it's already there it must point to NewFunc anyway) VMap[OldFunc] = NewFunc; - DISubprogram NewSubprogram(MapValue(OldSubprogramMDNode, VMap)); + DISubprogram NewSubprogram(MapMetadata(OldSubprogramMDNode, VMap)); for (DICompileUnit CU : Finder.compile_units()) { DIArray Subprograms(CU.getSubprograms()); @@ -196,7 +195,8 @@ static void CloneDebugInfoMetadata(Function *NewFunc, const Function *OldFunc, // also contain the new one. 
for (unsigned i = 0; i < Subprograms.getNumElements(); i++) { if ((MDNode*)Subprograms.getElement(i) == OldSubprogramMDNode) { - AddOperand(Subprograms, NewSubprogram); + AddOperand(CU, Subprograms, NewSubprogram); + break; } } } @@ -260,21 +260,36 @@ namespace { const char *NameSuffix; ClonedCodeInfo *CodeInfo; const DataLayout *DL; + CloningDirector *Director; + ValueMapTypeRemapper *TypeMapper; + ValueMaterializer *Materializer; + public: PruningFunctionCloner(Function *newFunc, const Function *oldFunc, ValueToValueMapTy &valueMap, bool moduleLevelChanges, const char *nameSuffix, ClonedCodeInfo *codeInfo, - const DataLayout *DL) + const DataLayout *DL, + CloningDirector *Director) : NewFunc(newFunc), OldFunc(oldFunc), VMap(valueMap), ModuleLevelChanges(moduleLevelChanges), - NameSuffix(nameSuffix), CodeInfo(codeInfo), DL(DL) { + NameSuffix(nameSuffix), CodeInfo(codeInfo), DL(DL), + Director(Director) { + // These are optional components. The Director may return null. + if (Director) { + TypeMapper = Director->getTypeRemapper(); + Materializer = Director->getValueMaterializer(); + } else { + TypeMapper = nullptr; + Materializer = nullptr; + } } /// CloneBlock - The specified block is found to be reachable, clone it and /// anything that it can reach. - void CloneBlock(const BasicBlock *BB, + void CloneBlock(const BasicBlock *BB, + BasicBlock::const_iterator StartingInst, std::vector<const BasicBlock*> &ToClone); }; } @@ -282,6 +297,7 @@ namespace { /// CloneBlock - The specified block is found to be reachable, clone it and /// anything that it can reach. void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, + BasicBlock::const_iterator StartingInst, std::vector<const BasicBlock*> &ToClone){ WeakVH &BBEntry = VMap[BB]; @@ -307,21 +323,39 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, const_cast<BasicBlock*>(BB)); VMap[OldBBAddr] = BlockAddress::get(NewFunc, NewBB); } - bool hasCalls = false, hasDynamicAllocas = false, hasStaticAllocas = false; - + // Loop over all instructions, and copy them over, DCE'ing as we go. This // loop doesn't include the terminator. - for (BasicBlock::const_iterator II = BB->begin(), IE = --BB->end(); + for (BasicBlock::const_iterator II = StartingInst, IE = --BB->end(); II != IE; ++II) { + // If the "Director" remaps the instruction, don't clone it. + if (Director) { + CloningDirector::CloningAction Action + = Director->handleInstruction(VMap, II, NewBB); + // If the cloning director says stop, we want to stop everything, not + // just break out of the loop (which would cause the terminator to be + // cloned). The cloning director is responsible for inserting a proper + // terminator into the new basic block in this case. + if (Action == CloningDirector::StopCloningBB) + return; + // If the cloning director says skip, continue to the next instruction. + // In this case, the cloning director is responsible for mapping the + // skipped instruction to some value that is defined in the new + // basic block. + if (Action == CloningDirector::SkipInstruction) + continue; + } + Instruction *NewInst = II->clone(); // Eagerly remap operands to the newly cloned instruction, except for PHI // nodes for which we defer processing until we update the CFG. if (!isa<PHINode>(NewInst)) { RemapInstruction(NewInst, VMap, - ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges); + ModuleLevelChanges ? 
RF_None : RF_NoModuleLevelChanges, + TypeMapper, Materializer); // If we can simplify this instruction to some other value, simply add // a mapping to that value rather than inserting a new instruction into @@ -354,6 +388,18 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, // Finally, clone over the terminator. const TerminatorInst *OldTI = BB->getTerminator(); bool TerminatorDone = false; + if (Director) { + CloningDirector::CloningAction Action + = Director->handleInstruction(VMap, OldTI, NewBB); + // If the cloning director says stop, we want to stop everything, not + // just break out of the loop (which would cause the terminator to be + // cloned). The cloning director is responsible for inserting a proper + // terminator into the new basic block in this case. + if (Action == CloningDirector::StopCloningBB) + return; + assert(Action != CloningDirector::SkipInstruction && + "SkipInstruction is not valid for terminators."); + } if (const BranchInst *BI = dyn_cast<BranchInst>(OldTI)) { if (BI->isConditional()) { // If the condition was a known constant in the callee... @@ -409,39 +455,55 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, } } -/// CloneAndPruneFunctionInto - This works exactly like CloneFunctionInto, -/// except that it does some simple constant prop and DCE on the fly. The -/// effect of this is to copy significantly less code in cases where (for -/// example) a function call with constant arguments is inlined, and those -/// constant arguments cause a significant amount of code in the callee to be -/// dead. Since this doesn't produce an exact copy of the input, it can't be -/// used for things like CloneFunction or CloneModule. -void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, +/// CloneAndPruneIntoFromInst - This works like CloneAndPruneFunctionInto, except +/// that it does not clone the entire function. Instead it starts at an +/// instruction provided by the caller and copies (and prunes) only the code +/// reachable from that instruction. +void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, + const Instruction *StartingInst, ValueToValueMapTy &VMap, bool ModuleLevelChanges, - SmallVectorImpl<ReturnInst*> &Returns, + SmallVectorImpl<ReturnInst *> &Returns, const char *NameSuffix, ClonedCodeInfo *CodeInfo, const DataLayout *DL, - Instruction *TheCall) { + CloningDirector *Director) { assert(NameSuffix && "NameSuffix cannot be null!"); - + + ValueMapTypeRemapper *TypeMapper = nullptr; + ValueMaterializer *Materializer = nullptr; + + if (Director) { + TypeMapper = Director->getTypeRemapper(); + Materializer = Director->getValueMaterializer(); + } + #ifndef NDEBUG - for (Function::const_arg_iterator II = OldFunc->arg_begin(), - E = OldFunc->arg_end(); II != E; ++II) - assert(VMap.count(II) && "No mapping from source argument specified!"); + // If the cloning starts at the begining of the function, verify that + // the function arguments are mapped. 
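A minimal sketch of a CloningDirector client for the hooks used above (hypothetical; the base-class details, including the name of the default keep-cloning action and whether the remapper hooks have default implementations, are inferred from this hunk rather than quoted from Cloning.h):

    // Hypothetical director that drops debug intrinsics while cloning.
    class DropDebugDirector : public CloningDirector {
    public:
      CloningAction handleInstruction(ValueToValueMapTy &VMap,
                                      const Instruction *Inst,
                                      BasicBlock *NewBB) override {
        if (isa<DbgInfoIntrinsic>(Inst))
          return SkipInstruction;   // void result, so nothing needs remapping
        return CloneInstruction;    // assumed name of the default action
      }
      // getTypeRemapper() and getValueMaterializer() are left alone here on
      // the assumption that the base class provides null-returning defaults.
    };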
+ if (!StartingInst) + for (Function::const_arg_iterator II = OldFunc->arg_begin(), + E = OldFunc->arg_end(); II != E; ++II) + assert(VMap.count(II) && "No mapping from source argument specified!"); #endif PruningFunctionCloner PFC(NewFunc, OldFunc, VMap, ModuleLevelChanges, - NameSuffix, CodeInfo, DL); + NameSuffix, CodeInfo, DL, Director); + const BasicBlock *StartingBB; + if (StartingInst) + StartingBB = StartingInst->getParent(); + else { + StartingBB = &OldFunc->getEntryBlock(); + StartingInst = StartingBB->begin(); + } // Clone the entry block, and anything recursively reachable from it. std::vector<const BasicBlock*> CloneWorklist; - CloneWorklist.push_back(&OldFunc->getEntryBlock()); + PFC.CloneBlock(StartingBB, StartingInst, CloneWorklist); while (!CloneWorklist.empty()) { const BasicBlock *BB = CloneWorklist.back(); CloneWorklist.pop_back(); - PFC.CloneBlock(BB, CloneWorklist); + PFC.CloneBlock(BB, BB->begin(), CloneWorklist); } // Loop over all of the basic blocks in the old function. If the block was @@ -470,7 +532,8 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, // Finally, remap the terminator instructions, as those can't be remapped // until all BBs are mapped. RemapInstruction(NewBB->getTerminator(), VMap, - ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges); + ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, + TypeMapper, Materializer); } // Defer PHI resolution until rest of function is resolved, PHI resolution @@ -569,7 +632,7 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, // and zap unconditional fall-through branches. This happen all the time when // specializing code: code specialization turns conditional branches into // uncond branches, and this code folds them. - Function::iterator Begin = cast<BasicBlock>(VMap[&OldFunc->getEntryBlock()]); + Function::iterator Begin = cast<BasicBlock>(VMap[StartingBB]); Function::iterator I = Begin; while (I != NewFunc->end()) { // Check if this block has become dead during inlining or other @@ -620,9 +683,30 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, // Make a final pass over the basic blocks from theh old function to gather // any return instructions which survived folding. We have to do this here // because we can iteratively remove and merge returns above. - for (Function::iterator I = cast<BasicBlock>(VMap[&OldFunc->getEntryBlock()]), + for (Function::iterator I = cast<BasicBlock>(VMap[StartingBB]), E = NewFunc->end(); I != E; ++I) if (ReturnInst *RI = dyn_cast<ReturnInst>(I->getTerminator())) Returns.push_back(RI); } + + +/// CloneAndPruneFunctionInto - This works exactly like CloneFunctionInto, +/// except that it does some simple constant prop and DCE on the fly. The +/// effect of this is to copy significantly less code in cases where (for +/// example) a function call with constant arguments is inlined, and those +/// constant arguments cause a significant amount of code in the callee to be +/// dead. Since this doesn't produce an exact copy of the input, it can't be +/// used for things like CloneFunction or CloneModule. 
+void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, + ValueToValueMapTy &VMap, + bool ModuleLevelChanges, + SmallVectorImpl<ReturnInst*> &Returns, + const char *NameSuffix, + ClonedCodeInfo *CodeInfo, + const DataLayout *DL, + Instruction *TheCall) { + CloneAndPruneIntoFromInst(NewFunc, OldFunc, OldFunc->front().begin(), + VMap, ModuleLevelChanges, Returns, NameSuffix, + CodeInfo, DL, nullptr); +} diff --git a/lib/Transforms/Utils/CloneModule.cpp b/lib/Transforms/Utils/CloneModule.cpp index d078c96..fae9ff5 100644 --- a/lib/Transforms/Utils/CloneModule.cpp +++ b/lib/Transforms/Utils/CloneModule.cpp @@ -109,7 +109,7 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) { I != E; ++I) { GlobalAlias *GA = cast<GlobalAlias>(VMap[I]); if (const Constant *C = I->getAliasee()) - GA->setAliasee(cast<GlobalObject>(MapValue(C, VMap))); + GA->setAliasee(MapValue(C, VMap)); } // And named metadata.... @@ -118,7 +118,7 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) { const NamedMDNode &NMD = *I; NamedMDNode *NewNMD = New->getOrInsertNamedMetadata(NMD.getName()); for (unsigned i = 0, e = NMD.getNumOperands(); i != e; ++i) - NewNMD->addOperand(MapValue(NMD.getOperand(i), VMap)); + NewNMD->addOperand(MapMetadata(NMD.getOperand(i), VMap)); } return New; diff --git a/lib/Transforms/Utils/DemoteRegToStack.cpp b/lib/Transforms/Utils/DemoteRegToStack.cpp index 9972b22..003da58 100644 --- a/lib/Transforms/Utils/DemoteRegToStack.cpp +++ b/lib/Transforms/Utils/DemoteRegToStack.cpp @@ -39,6 +39,19 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads, F->getEntryBlock().begin()); } + // We cannot demote invoke instructions to the stack if their normal edge + // is critical. Therefore, split the critical edge and create a basic block + // into which the store can be inserted. + if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) { + if (!II->getNormalDest()->getSinglePredecessor()) { + unsigned SuccNum = GetSuccessorNumber(II->getParent(), II->getNormalDest()); + assert(isCriticalEdge(II, SuccNum) && "Expected a critical edge!"); + BasicBlock *BB = SplitCriticalEdge(II, SuccNum); + assert(BB && "Unable to split critical edge."); + (void)BB; + } + } + // Change all of the users of the instruction to read from the stack slot. while (!I.use_empty()) { Instruction *U = cast<Instruction>(I.user_back()); @@ -71,7 +84,6 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads, } } - // Insert stores of the computed value into the stack slot. We have to be // careful if I is an invoke instruction, because we can't insert the store // AFTER the terminator instruction. @@ -79,27 +91,13 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads, if (!isa<TerminatorInst>(I)) { InsertPt = &I; ++InsertPt; + for (; isa<PHINode>(InsertPt) || isa<LandingPadInst>(InsertPt); ++InsertPt) + /* empty */; // Don't insert before PHI nodes or landingpad instrs. } else { InvokeInst &II = cast<InvokeInst>(I); - if (II.getNormalDest()->getSinglePredecessor()) - InsertPt = II.getNormalDest()->getFirstInsertionPt(); - else { - // We cannot demote invoke instructions to the stack if their normal edge - // is critical. Therefore, split the critical edge and insert the store - // in the newly created basic block. 
- unsigned SuccNum = GetSuccessorNumber(I.getParent(), II.getNormalDest()); - TerminatorInst *TI = &cast<TerminatorInst>(I); - assert (isCriticalEdge(TI, SuccNum) && - "Expected a critical edge!"); - BasicBlock *BB = SplitCriticalEdge(TI, SuccNum); - assert (BB && "Unable to split critical edge."); - InsertPt = BB->getFirstInsertionPt(); - } + InsertPt = II.getNormalDest()->getFirstInsertionPt(); } - for (; isa<PHINode>(InsertPt) || isa<LandingPadInst>(InsertPt); ++InsertPt) - /* empty */; // Don't insert before PHI nodes or landingpad instrs. - new StoreInst(&I, Slot, InsertPt); return Slot; } diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp index 2d0b7dc..c2ef1ac 100644 --- a/lib/Transforms/Utils/InlineFunction.cpp +++ b/lib/Transforms/Utils/InlineFunction.cpp @@ -18,7 +18,7 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/AssumptionTracker.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/InstructionSimplify.h" @@ -30,6 +30,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/DIBuilder.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" @@ -308,7 +309,7 @@ static void CloneAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap) { // Walk the existing metadata, adding the complete (perhaps cyclic) chain to // the set. - SmallVector<const Value *, 16> Queue(MD.begin(), MD.end()); + SmallVector<const Metadata *, 16> Queue(MD.begin(), MD.end()); while (!Queue.empty()) { const MDNode *M = cast<MDNode>(Queue.pop_back_val()); for (unsigned i = 0, ie = M->getNumOperands(); i != ie; ++i) @@ -319,13 +320,12 @@ static void CloneAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap) { // Now we have a complete set of all metadata in the chains used to specify // the noalias scopes and the lists of those scopes. - SmallVector<MDNode *, 16> DummyNodes; - DenseMap<const MDNode *, TrackingVH<MDNode> > MDMap; + SmallVector<TempMDTuple, 16> DummyNodes; + DenseMap<const MDNode *, TrackingMDNodeRef> MDMap; for (SetVector<const MDNode *>::iterator I = MD.begin(), IE = MD.end(); I != IE; ++I) { - MDNode *Dummy = MDNode::getTemporary(CalledFunc->getContext(), None); - DummyNodes.push_back(Dummy); - MDMap[*I] = Dummy; + DummyNodes.push_back(MDTuple::getTemporary(CalledFunc->getContext(), None)); + MDMap[*I].reset(DummyNodes.back().get()); } // Create new metadata nodes to replace the dummy nodes, replacing old @@ -333,17 +333,18 @@ static void CloneAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap) { // node. 
for (SetVector<const MDNode *>::iterator I = MD.begin(), IE = MD.end(); I != IE; ++I) { - SmallVector<Value *, 4> NewOps; + SmallVector<Metadata *, 4> NewOps; for (unsigned i = 0, ie = (*I)->getNumOperands(); i != ie; ++i) { - const Value *V = (*I)->getOperand(i); + const Metadata *V = (*I)->getOperand(i); if (const MDNode *M = dyn_cast<MDNode>(V)) NewOps.push_back(MDMap[M]); else - NewOps.push_back(const_cast<Value *>(V)); + NewOps.push_back(const_cast<Metadata *>(V)); } - MDNode *NewM = MDNode::get(CalledFunc->getContext(), NewOps), - *TempM = MDMap[*I]; + MDNode *NewM = MDNode::get(CalledFunc->getContext(), NewOps); + MDTuple *TempM = cast<MDTuple>(MDMap[*I]); + assert(TempM->isTemporary() && "Expected temporary node"); TempM->replaceAllUsesWith(NewM); } @@ -388,10 +389,6 @@ static void CloneAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap) { NI->setMetadata(LLVMContext::MD_noalias, M); } } - - // Now that everything has been replaced, delete the dummy nodes. - for (unsigned i = 0, ie = DummyNodes.size(); i != ie; ++i) - MDNode::deleteTemporary(DummyNodes[i]); } /// AddAliasScopeMetadata - If the inlined function has noalias arguments, then @@ -516,7 +513,7 @@ static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap, // need to go through several PHIs to see it, and thus could be // repeated in the Objects list. SmallPtrSet<const Value *, 4> ObjSet; - SmallVector<Value *, 4> Scopes, NoAliases; + SmallVector<Metadata *, 4> Scopes, NoAliases; SmallSetVector<const Argument *, 4> NAPtrArgs; for (unsigned i = 0, ie = PtrArgs.size(); i != ie; ++i) { @@ -633,9 +630,10 @@ static void AddAlignmentAssumptions(CallSite CS, InlineFunctionInfo &IFI) { DominatorTree DT; bool DTCalculated = false; - const Function *CalledFunc = CS.getCalledFunction(); - for (Function::const_arg_iterator I = CalledFunc->arg_begin(), - E = CalledFunc->arg_end(); I != E; ++I) { + Function *CalledFunc = CS.getCalledFunction(); + for (Function::arg_iterator I = CalledFunc->arg_begin(), + E = CalledFunc->arg_end(); + I != E; ++I) { unsigned Align = I->getType()->isPointerTy() ? I->getParamAlignment() : 0; if (Align && !I->hasByValOrInAllocaAttr() && !I->hasNUses(0)) { if (!DTCalculated) { @@ -647,8 +645,9 @@ static void AddAlignmentAssumptions(CallSite CS, InlineFunctionInfo &IFI) { // If we can already prove the asserted alignment in the context of the // caller, then don't bother inserting the assumption. Value *Arg = CS.getArgument(I->getArgNo()); - if (getKnownAlignment(Arg, IFI.DL, IFI.AT, CS.getInstruction(), - &DT) >= Align) + if (getKnownAlignment(Arg, IFI.DL, + &IFI.ACT->getAssumptionCache(*CalledFunc), + CS.getInstruction(), &DT) >= Align) continue; IRBuilder<>(CS.getInstruction()).CreateAlignmentAssumption(*IFI.DL, Arg, @@ -748,6 +747,8 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall, PointerType *ArgTy = cast<PointerType>(Arg->getType()); Type *AggTy = ArgTy->getElementType(); + Function *Caller = TheCall->getParent()->getParent(); + // If the called function is readonly, then it could not mutate the caller's // copy of the byval'd memory. In this case, it is safe to elide the copy and // temporary. @@ -760,8 +761,9 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall, // If the pointer is already known to be sufficiently aligned, or if we can // round it up to a larger alignment, then we don't need a temporary. 
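The CloneAliasScopeMetadata hunks above keep the existing two-phase strategy for copying a possibly cyclic metadata graph: hand out temporary placeholder nodes first, then build the real nodes and replace the placeholders. Because metadata nodes are immutable once created, the patch does this with MDTuple::getTemporary plus replaceAllUsesWith; with ordinary mutable structs the same idea reduces to the following sketch (types and names invented here):

    #include <map>
    #include <memory>
    #include <vector>

    struct Node {
      std::vector<Node *> Ops;
    };

    // Phase 1 allocates an empty clone for every node so cycles have something
    // to point at; phase 2 wires up operands through the map. Operands outside
    // the cloned set are kept as-is, mirroring the non-MDNode operands above.
    std::map<const Node *, Node *>
    cloneCyclicGraph(const std::vector<const Node *> &All,
                     std::vector<std::unique_ptr<Node>> &Storage) {
      std::map<const Node *, Node *> Map;
      for (const Node *N : All) {
        Storage.push_back(std::make_unique<Node>());
        Map[N] = Storage.back().get();
      }
      for (const Node *N : All)
        for (Node *Op : N->Ops)
          Map[N]->Ops.push_back(Map.count(Op) ? Map[Op] : Op);
      return Map;
    }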
- if (getOrEnforceKnownAlignment(Arg, ByValAlignment, - IFI.DL, IFI.AT, TheCall) >= ByValAlignment) + if (getOrEnforceKnownAlignment(Arg, ByValAlignment, IFI.DL, + &IFI.ACT->getAssumptionCache(*Caller), + TheCall) >= ByValAlignment) return Arg; // Otherwise, we have to make a memcpy to get a safe alignment. This is bad @@ -778,8 +780,6 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall, // pointer inside the callee). Align = std::max(Align, ByValAlignment); - Function *Caller = TheCall->getParent()->getParent(); - Value *NewAlloca = new AllocaInst(AggTy, nullptr, Align, Arg->getName(), &*Caller->begin()->begin()); IFI.StaticAllocas.push_back(cast<AllocaInst>(NewAlloca)); @@ -824,20 +824,42 @@ static bool hasLifetimeMarkers(AllocaInst *AI) { return false; } -/// updateInlinedAtInfo - Helper function used by fixupLineNumbers to -/// recursively update InlinedAtEntry of a DebugLoc. -static DebugLoc updateInlinedAtInfo(const DebugLoc &DL, - const DebugLoc &InlinedAtDL, - LLVMContext &Ctx) { - if (MDNode *IA = DL.getInlinedAt(Ctx)) { - DebugLoc NewInlinedAtDL - = updateInlinedAtInfo(DebugLoc::getFromDILocation(IA), InlinedAtDL, Ctx); - return DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(Ctx), - NewInlinedAtDL.getAsMDNode(Ctx)); +/// Rebuild the entire inlined-at chain for this instruction so that the top of +/// the chain now is inlined-at the new call site. +static DebugLoc +updateInlinedAtInfo(DebugLoc DL, MDLocation *InlinedAtNode, + LLVMContext &Ctx, + DenseMap<const MDLocation *, MDLocation *> &IANodes) { + SmallVector<MDLocation*, 3> InlinedAtLocations; + MDLocation *Last = InlinedAtNode; + DebugLoc CurInlinedAt = DL; + + // Gather all the inlined-at nodes + while (MDLocation *IA = + cast_or_null<MDLocation>(CurInlinedAt.getInlinedAt(Ctx))) { + // Skip any we've already built nodes for + if (MDLocation *Found = IANodes[IA]) { + Last = Found; + break; + } + + InlinedAtLocations.push_back(IA); + CurInlinedAt = DebugLoc::getFromDILocation(IA); + } + + // Starting from the top, rebuild the nodes to point to the new inlined-at + // location (then rebuilding the rest of the chain behind it) and update the + // map of already-constructed inlined-at nodes. + for (auto I = InlinedAtLocations.rbegin(), E = InlinedAtLocations.rend(); + I != E; ++I) { + const MDLocation *MD = *I; + Last = IANodes[MD] = MDLocation::getDistinct( + Ctx, MD->getLine(), MD->getColumn(), MD->getScope(), Last); } - return DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(Ctx), - InlinedAtDL.getAsMDNode(Ctx)); + // And finally create the normal location for this instruction, referring to + // the new inlined-at chain. + return DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(Ctx), Last); } /// fixupLineNumbers - Update inlined instructions' line numbers to @@ -848,6 +870,20 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI, if (TheCallDL.isUnknown()) return; + auto &Ctx = Fn->getContext(); + auto *InlinedAtNode = cast<MDLocation>(TheCallDL.getAsMDNode(Ctx)); + + // Create a unique call site, not to be confused with any other call from the + // same location. + InlinedAtNode = MDLocation::getDistinct( + Ctx, InlinedAtNode->getLine(), InlinedAtNode->getColumn(), + InlinedAtNode->getScope(), InlinedAtNode->getInlinedAt()); + + // Cache the inlined-at nodes as they're built so they are reused, without + // this every instruction's inlined-at chain would become distinct from each + // other. 
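The rewritten updateInlinedAtInfo above walks an instruction's inlined-at chain, rebuilds it so its outermost link is the new call site, and memoizes rebuilt links in IANodes so instructions coming from the same inlining stack share them. Ignoring the debug-metadata API, the algorithm is a cached re-parenting of a singly linked chain; a hypothetical standalone version:

    #include <map>
    #include <memory>
    #include <vector>

    struct Loc {
      unsigned Line, Col;
      const Loc *InlinedAt;  // next link of the inlined-at chain, null at the top
    };

    // Rebuild DL's inlined-at chain so its outermost entry becomes NewCallSite,
    // reusing already-rebuilt links via Cache (the IANodes analogue).
    const Loc *reparentChain(const Loc *DL, const Loc *NewCallSite,
                             std::map<const Loc *, const Loc *> &Cache,
                             std::vector<std::unique_ptr<Loc>> &Storage) {
      std::vector<const Loc *> Chain;
      const Loc *Last = NewCallSite;
      for (const Loc *IA = DL->InlinedAt; IA; IA = IA->InlinedAt) {
        auto Found = Cache.find(IA);
        if (Found != Cache.end()) {    // already rebuilt: reuse and stop walking
          Last = Found->second;
          break;
        }
        Chain.push_back(IA);
      }
      // Rebuild from the outermost frame inwards, so every new link points at
      // the already-rebuilt link it was inlined at.
      for (auto I = Chain.rbegin(), E = Chain.rend(); I != E; ++I) {
        Storage.push_back(std::make_unique<Loc>(Loc{(*I)->Line, (*I)->Col, Last}));
        Last = Cache[*I] = Storage.back().get();
      }
      // Finally, a fresh location for the instruction itself.
      Storage.push_back(std::make_unique<Loc>(Loc{DL->Line, DL->Col, Last}));
      return Storage.back().get();
    }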
+ DenseMap<const MDLocation *, MDLocation *> IANodes; + for (; FI != Fn->end(); ++FI) { for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ++BI) { @@ -865,12 +901,19 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI, BI->setDebugLoc(TheCallDL); } else { - BI->setDebugLoc(updateInlinedAtInfo(DL, TheCallDL, BI->getContext())); + BI->setDebugLoc(updateInlinedAtInfo(DL, InlinedAtNode, BI->getContext(), IANodes)); if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(BI)) { LLVMContext &Ctx = BI->getContext(); MDNode *InlinedAt = BI->getDebugLoc().getInlinedAt(Ctx); - DVI->setOperand(2, createInlinedVariable(DVI->getVariable(), - InlinedAt, Ctx)); + DVI->setOperand(2, MetadataAsValue::get( + Ctx, createInlinedVariable(DVI->getVariable(), + InlinedAt, Ctx))); + } else if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(BI)) { + LLVMContext &Ctx = BI->getContext(); + MDNode *InlinedAt = BI->getDebugLoc().getInlinedAt(Ctx); + DDI->setOperand(1, MetadataAsValue::get( + Ctx, createInlinedVariable(DDI->getVariable(), + InlinedAt, Ctx))); } } } @@ -1026,8 +1069,8 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // FIXME: We could register any cloned assumptions instead of clearing the // whole function's cache. - if (IFI.AT) - IFI.AT->forgetCachedAssumptions(Caller); + if (IFI.ACT) + IFI.ACT->getAssumptionCache(*Caller).clear(); } // If there are any alloca instructions in the block that used to be the entry @@ -1069,6 +1112,10 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, FirstNewBlock->getInstList(), AI, I); } + // Move any dbg.declares describing the allocas into the entry basic block. + DIBuilder DIB(*Caller->getParent()); + for (auto &AI : IFI.StaticAllocas) + replaceDbgDeclareForAlloca(AI, AI, DIB, /*Deref=*/false); } bool InlinedMustTailCalls = false; @@ -1398,7 +1445,8 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // the entries are the same or undef). If so, remove the PHI so it doesn't // block other optimizations. if (PHI) { - if (Value *V = SimplifyInstruction(PHI, IFI.DL, nullptr, nullptr, IFI.AT)) { + if (Value *V = SimplifyInstruction(PHI, IFI.DL, nullptr, nullptr, + &IFI.ACT->getAssumptionCache(*Caller))) { PHI->replaceAllUsesWith(V); PHI->eraseFromParent(); } diff --git a/lib/Transforms/Utils/LCSSA.cpp b/lib/Transforms/Utils/LCSSA.cpp index 51a3d9c..1cba367 100644 --- a/lib/Transforms/Utils/LCSSA.cpp +++ b/lib/Transforms/Utils/LCSSA.cpp @@ -61,7 +61,7 @@ static bool isExitBlock(BasicBlock *BB, /// uses. static bool processInstruction(Loop &L, Instruction &Inst, DominatorTree &DT, const SmallVectorImpl<BasicBlock *> &ExitBlocks, - PredIteratorCache &PredCache) { + PredIteratorCache &PredCache, LoopInfo *LI) { SmallVector<Use *, 16> UsesToRewrite; BasicBlock *InstBB = Inst.getParent(); @@ -94,6 +94,7 @@ static bool processInstruction(Loop &L, Instruction &Inst, DominatorTree &DT, DomTreeNode *DomNode = DT.getNode(DomBB); SmallVector<PHINode *, 16> AddedPHIs; + SmallVector<PHINode *, 8> PostProcessPHIs; SSAUpdater SSAUpdate; SSAUpdate.Initialize(Inst.getType(), Inst.getName()); @@ -131,6 +132,18 @@ static bool processInstruction(Loop &L, Instruction &Inst, DominatorTree &DT, // Remember that this phi makes the value alive in this block. SSAUpdate.AddAvailableValue(ExitBB, PN); + + // LoopSimplify might fail to simplify some loops (e.g. when indirect + // branches are involved). In such situations, it might happen that an exit + // for Loop L1 is the header of a disjoint Loop L2. 
Thus, when we create + // PHIs in such an exit block, we are also inserting PHIs into L2's header. + // This could break LCSSA form for L2 because these inserted PHIs can also + // have uses outside of L2. Remember all PHIs in such situation as to + // revisit than later on. FIXME: Remove this if indirectbr support into + // LoopSimplify gets improved. + if (auto *OtherLoop = LI->getLoopFor(ExitBB)) + if (!L.contains(OtherLoop)) + PostProcessPHIs.push_back(PN); } // Rewrite all uses outside the loop in terms of the new PHIs we just @@ -157,6 +170,25 @@ static bool processInstruction(Loop &L, Instruction &Inst, DominatorTree &DT, SSAUpdate.RewriteUse(*UsesToRewrite[i]); } + // Post process PHI instructions that were inserted into another disjoint loop + // and update their exits properly. + for (auto *I : PostProcessPHIs) { + if (I->use_empty()) + continue; + + BasicBlock *PHIBB = I->getParent(); + Loop *OtherLoop = LI->getLoopFor(PHIBB); + SmallVector<BasicBlock *, 8> EBs; + OtherLoop->getExitBlocks(EBs); + if (EBs.empty()) + continue; + + // Recurse and re-process each PHI instruction. FIXME: we should really + // convert this entire thing to a worklist approach where we process a + // vector of instructions... + processInstruction(*OtherLoop, *I, DT, EBs, PredCache, LI); + } + // Remove PHI nodes that did not have any uses rewritten. for (unsigned i = 0, e = AddedPHIs.size(); i != e; ++i) { if (AddedPHIs[i]->use_empty()) @@ -180,7 +212,8 @@ blockDominatesAnExit(BasicBlock *BB, return false; } -bool llvm::formLCSSA(Loop &L, DominatorTree &DT, ScalarEvolution *SE) { +bool llvm::formLCSSA(Loop &L, DominatorTree &DT, LoopInfo *LI, + ScalarEvolution *SE) { bool Changed = false; // Get the set of exiting blocks. @@ -212,7 +245,7 @@ bool llvm::formLCSSA(Loop &L, DominatorTree &DT, ScalarEvolution *SE) { !isa<PHINode>(I->user_back()))) continue; - Changed |= processInstruction(L, *I, DT, ExitBlocks, PredCache); + Changed |= processInstruction(L, *I, DT, ExitBlocks, PredCache, LI); } } @@ -228,15 +261,15 @@ bool llvm::formLCSSA(Loop &L, DominatorTree &DT, ScalarEvolution *SE) { } /// Process a loop nest depth first. -bool llvm::formLCSSARecursively(Loop &L, DominatorTree &DT, +bool llvm::formLCSSARecursively(Loop &L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution *SE) { bool Changed = false; // Recurse depth-first through inner loops. - for (Loop::iterator LI = L.begin(), LE = L.end(); LI != LE; ++LI) - Changed |= formLCSSARecursively(**LI, DT, SE); + for (Loop::iterator I = L.begin(), E = L.end(); I != E; ++I) + Changed |= formLCSSARecursively(**I, DT, LI, SE); - Changed |= formLCSSA(L, DT, SE); + Changed |= formLCSSA(L, DT, LI, SE); return Changed; } @@ -261,7 +294,7 @@ struct LCSSA : public FunctionPass { AU.setPreservesCFG(); AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfo>(); + AU.addRequired<LoopInfoWrapperPass>(); AU.addPreservedID(LoopSimplifyID); AU.addPreserved<AliasAnalysis>(); AU.addPreserved<ScalarEvolution>(); @@ -275,7 +308,7 @@ private: char LCSSA::ID = 0; INITIALIZE_PASS_BEGIN(LCSSA, "lcssa", "Loop-Closed SSA Form Pass", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_END(LCSSA, "lcssa", "Loop-Closed SSA Form Pass", false, false) Pass *llvm::createLCSSAPass() { return new LCSSA(); } @@ -285,13 +318,13 @@ char &llvm::LCSSAID = LCSSA::ID; /// Process all loops in the function, inner-most out. 
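formLCSSA and formLCSSARecursively now take LoopInfo so the PHI post-processing above can ask which loop a problematic exit block belongs to; the traversal order itself is unchanged: inner loops are put into LCSSA form before their parents. A tiny generic sketch of that inner-most-out walk (LoopNode is a made-up stand-in for Loop, and Process stands for the per-loop formLCSSA step):

    #include <vector>

    struct LoopNode {
      std::vector<LoopNode *> SubLoops;
    };

    // Depth-first over the loop tree: children first, then the loop itself, so
    // values escaping an inner loop are already rewritten through PHIs by the
    // time the enclosing loop is processed.
    template <typename Fn>
    bool processInnerMostOut(LoopNode &L, Fn Process) {
      bool Changed = false;
      for (LoopNode *Sub : L.SubLoops)
        Changed |= processInnerMostOut(*Sub, Process);
      Changed |= Process(L);
      return Changed;
    }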
bool LCSSA::runOnFunction(Function &F) { bool Changed = false; - LI = &getAnalysis<LoopInfo>(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); SE = getAnalysisIfAvailable<ScalarEvolution>(); // Simplify each loop nest in the function. for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) - Changed |= formLCSSARecursively(**I, *DT, SE); + Changed |= formLCSSARecursively(**I, *DT, LI, SE); return Changed; } diff --git a/lib/Transforms/Utils/LLVMBuild.txt b/lib/Transforms/Utils/LLVMBuild.txt index 88b2ffe..6b2d405 100644 --- a/lib/Transforms/Utils/LLVMBuild.txt +++ b/lib/Transforms/Utils/LLVMBuild.txt @@ -19,4 +19,4 @@ type = Library name = TransformUtils parent = Transforms -required_libraries = Analysis Core IPA Support Target +required_libraries = Analysis Core IPA Support diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index c963c51..4830568 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/LibCallSemantics.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/ValueTracking.h" @@ -110,11 +111,17 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, } if (SwitchInst *SI = dyn_cast<SwitchInst>(T)) { - // If we are switching on a constant, we can convert the switch into a - // single branch instruction! + // If we are switching on a constant, we can convert the switch to an + // unconditional branch. ConstantInt *CI = dyn_cast<ConstantInt>(SI->getCondition()); - BasicBlock *TheOnlyDest = SI->getDefaultDest(); - BasicBlock *DefaultDest = TheOnlyDest; + BasicBlock *DefaultDest = SI->getDefaultDest(); + BasicBlock *TheOnlyDest = DefaultDest; + + // If the default is unreachable, ignore it when searching for TheOnlyDest. + if (isa<UnreachableInst>(DefaultDest->getFirstNonPHIOrDbg()) && + SI->getNumCases() > 0) { + TheOnlyDest = SI->case_begin().getCaseSuccessor(); + } // Figure out which case it goes to. for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); @@ -137,7 +144,8 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, SmallVector<uint32_t, 8> Weights; for (unsigned MD_i = 1, MD_e = MD->getNumOperands(); MD_i < MD_e; ++MD_i) { - ConstantInt* CI = dyn_cast<ConstantInt>(MD->getOperand(MD_i)); + ConstantInt *CI = + mdconst::dyn_extract<ConstantInt>(MD->getOperand(MD_i)); assert(CI); Weights.push_back(CI->getValue().getZExtValue()); } @@ -208,8 +216,10 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, SI->getDefaultDest()); MDNode *MD = SI->getMetadata(LLVMContext::MD_prof); if (MD && MD->getNumOperands() == 3) { - ConstantInt *SICase = dyn_cast<ConstantInt>(MD->getOperand(2)); - ConstantInt *SIDef = dyn_cast<ConstantInt>(MD->getOperand(1)); + ConstantInt *SICase = + mdconst::dyn_extract<ConstantInt>(MD->getOperand(2)); + ConstantInt *SIDef = + mdconst::dyn_extract<ConstantInt>(MD->getOperand(1)); assert(SICase && SIDef); // The TrueWeight should be the weight for the single case of SI. NewBr->setMetadata(LLVMContext::MD_prof, @@ -486,7 +496,7 @@ void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred, /// between them, moving the instructions in the predecessor into DestBB and /// deleting the predecessor block. 
/// -void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, Pass *P) { +void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, DominatorTree *DT) { // If BB has single-entry PHI nodes, fold them. while (PHINode *PN = dyn_cast<PHINode>(DestBB->begin())) { Value *NewVal = PN->getIncomingValue(0); @@ -522,14 +532,10 @@ void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, Pass *P) { if (PredBB == &DestBB->getParent()->getEntryBlock()) DestBB->moveAfter(PredBB); - if (P) { - if (DominatorTreeWrapperPass *DTWP = - P->getAnalysisIfAvailable<DominatorTreeWrapperPass>()) { - DominatorTree &DT = DTWP->getDomTree(); - BasicBlock *PredBBIDom = DT.getNode(PredBB)->getIDom()->getBlock(); - DT.changeImmediateDominator(DestBB, PredBBIDom); - DT.eraseNode(PredBB); - } + if (DT) { + BasicBlock *PredBBIDom = DT->getNode(PredBB)->getIDom()->getBlock(); + DT->changeImmediateDominator(DestBB, PredBBIDom); + DT->eraseNode(PredBB); } // Nuke BB. PredBB->eraseFromParent(); @@ -940,7 +946,7 @@ static unsigned enforceKnownAlignment(Value *V, unsigned Align, /// increase the alignment of the ultimate object, making this check succeed. unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign, const DataLayout *DL, - AssumptionTracker *AT, + AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { assert(V->getType()->isPointerTy() && @@ -948,7 +954,7 @@ unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign, unsigned BitWidth = DL ? DL->getPointerTypeSizeInBits(V->getType()) : 64; APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); - computeKnownBits(V, KnownZero, KnownOne, DL, 0, AT, CxtI, DT); + computeKnownBits(V, KnownZero, KnownOne, DL, 0, AC, CxtI, DT); unsigned TrailZ = KnownZero.countTrailingOnes(); // Avoid trouble with ridiculously large TrailZ values, such as @@ -1048,7 +1054,7 @@ static bool isArray(AllocaInst *AI) { /// LowerDbgDeclare - Lowers llvm.dbg.declare intrinsics into appropriate set /// of llvm.dbg.value intrinsics. bool llvm::LowerDbgDeclare(Function &F) { - DIBuilder DIB(*F.getParent()); + DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false); SmallVector<DbgDeclareInst *, 4> Dbgs; for (auto &FI : F) for (BasicBlock::iterator BI : FI) @@ -1091,19 +1097,21 @@ bool llvm::LowerDbgDeclare(Function &F) { /// FindAllocaDbgDeclare - Finds the llvm.dbg.declare intrinsic describing the /// alloca 'V', if any. 
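Further up in this hunk, getOrEnforceKnownAlignment merely swaps AssumptionTracker for AssumptionCache; the underlying computation is untouched: computeKnownBits reports which low bits of the pointer are provably zero, and each such bit doubles the provable alignment. A standalone sketch of that arithmetic (the cap below is an arbitrary value for the sketch, not LLVM's exact constant):

    #include <algorithm>
    #include <cstdint>

    // Turn a mask of known-zero bits into a provable alignment: count the
    // contiguous known-zero low bits, cap the count to keep the shift sane,
    // and return 2^count.
    unsigned alignmentFromKnownZeroBits(uint64_t KnownZeroMask) {
      unsigned TrailZ = 0;
      while (TrailZ < 64 && (KnownZeroMask & (1ULL << TrailZ)))
        ++TrailZ;
      TrailZ = std::min(TrailZ, 29u);  // illustrative cap against huge shifts
      return 1u << TrailZ;
    }

For example, a pointer whose low four bits are known zero yields a provable alignment of 16 bytes.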
DbgDeclareInst *llvm::FindAllocaDbgDeclare(Value *V) { - if (MDNode *DebugNode = MDNode::getIfExists(V->getContext(), V)) - for (User *U : DebugNode->users()) - if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U)) - return DDI; + if (auto *L = LocalAsMetadata::getIfExists(V)) + if (auto *MDV = MetadataAsValue::getIfExists(V->getContext(), L)) + for (User *U : MDV->users()) + if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U)) + return DDI; return nullptr; } bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, - DIBuilder &Builder) { + DIBuilder &Builder, bool Deref) { DbgDeclareInst *DDI = FindAllocaDbgDeclare(AI); if (!DDI) return false; + DebugLoc Loc = DDI->getDebugLoc(); DIVariable DIVar(DDI->getVariable()); DIExpression DIExpr(DDI->getExpression()); assert((!DIVar || DIVar.isVariable()) && @@ -1111,23 +1119,24 @@ bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, if (!DIVar) return false; - // Create a copy of the original DIDescriptor for user variable, appending - // "deref" operation to a list of address elements, as new llvm.dbg.declare - // will take a value storing address of the memory for variable, not - // alloca itself. - SmallVector<int64_t, 4> NewDIExpr; - if (DIExpr) { - for (unsigned i = 0, n = DIExpr.getNumElements(); i < n; ++i) { - NewDIExpr.push_back(DIExpr.getElement(i)); - } + if (Deref) { + // Create a copy of the original DIDescriptor for user variable, prepending + // "deref" operation to a list of address elements, as new llvm.dbg.declare + // will take a value storing address of the memory for variable, not + // alloca itself. + SmallVector<uint64_t, 4> NewDIExpr; + NewDIExpr.push_back(dwarf::DW_OP_deref); + if (DIExpr) + for (unsigned i = 0, n = DIExpr.getNumElements(); i < n; ++i) + NewDIExpr.push_back(DIExpr.getElement(i)); + DIExpr = Builder.createExpression(NewDIExpr); } - NewDIExpr.push_back(dwarf::DW_OP_deref); // Insert llvm.dbg.declare in the same basic block as the original alloca, // and remove old llvm.dbg.declare. BasicBlock *BB = AI->getParent(); - Builder.insertDeclare(NewAllocaAddress, DIVar, - Builder.createExpression(NewDIExpr), BB); + Builder.insertDeclare(NewAllocaAddress, DIVar, DIExpr, BB) + ->setDebugLoc(Loc); DDI->eraseFromParent(); return true; } @@ -1252,7 +1261,7 @@ static bool markAliveBlocks(BasicBlock *BB, if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) { changeToUnreachable(II, true); Changed = true; - } else if (II->doesNotThrow()) { + } else if (II->doesNotThrow() && canSimplifyInvokeNoUnwind(II)) { if (II->use_empty() && II->onlyReadsMemory()) { // jump to the normal destination branch. 
BranchInst::Create(II->getNormalDest(), II); @@ -1326,6 +1335,8 @@ void llvm::combineMetadata(Instruction *K, const Instruction *J, ArrayRef<unsign K->setMetadata(Kind, MDNode::getMostGenericTBAA(JMD, KMD)); break; case LLVMContext::MD_alias_scope: + K->setMetadata(Kind, MDNode::getMostGenericAliasScope(JMD, KMD)); + break; case LLVMContext::MD_noalias: K->setMetadata(Kind, MDNode::intersect(JMD, KMD)); break; diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp index af0501f..a0f8268 100644 --- a/lib/Transforms/Utils/LoopSimplify.cpp +++ b/lib/Transforms/Utils/LoopSimplify.cpp @@ -44,7 +44,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/AssumptionTracker.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/DependenceAnalysis.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" @@ -113,6 +113,14 @@ static void placeSplitBlockCarefully(BasicBlock *NewBB, BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, Pass *PP) { BasicBlock *Header = L->getHeader(); + // Get analyses that we try to update. + auto *AA = PP->getAnalysisIfAvailable<AliasAnalysis>(); + auto *DTWP = PP->getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; + auto *LIWP = PP->getAnalysisIfAvailable<LoopInfoWrapperPass>(); + auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; + bool PreserveLCSSA = PP->mustPreserveAnalysisID(LCSSAID); + // Compute the set of predecessors of the loop that are not in the loop. SmallVector<BasicBlock*, 8> OutsideBlocks; for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header); @@ -131,15 +139,8 @@ BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, Pass *PP) { // Split out the loop pre-header. BasicBlock *PreheaderBB; - if (!Header->isLandingPad()) { - PreheaderBB = SplitBlockPredecessors(Header, OutsideBlocks, ".preheader", - PP); - } else { - SmallVector<BasicBlock*, 2> NewBBs; - SplitLandingPadPredecessors(Header, OutsideBlocks, ".preheader", - ".split-lp", PP, NewBBs); - PreheaderBB = NewBBs[0]; - } + PreheaderBB = SplitBlockPredecessors(Header, OutsideBlocks, ".preheader", + AA, DT, LI, PreserveLCSSA); PreheaderBB->getTerminator()->setDebugLoc( Header->getFirstNonPHI()->getDebugLoc()); @@ -157,7 +158,9 @@ BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, Pass *PP) { /// /// This method is used to split exit blocks that have predecessors outside of /// the loop. 
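Both InsertPreheaderForLoop above and rewriteLoopExitBlock below now funnel their edge splitting through SplitBlockPredecessors and no longer special-case landing-pad targets themselves; what remains in each caller is partitioning a block's predecessors by loop membership. A hypothetical sketch of that first step for the dedicated-exit case:

    #include <set>
    #include <vector>

    struct BB {
      std::vector<BB *> Preds;
    };

    // Collect the predecessors of Exit that are inside the loop; these are the
    // edges that get redirected through a new dedicated exit block, so Exit is
    // afterwards reached from the loop only via that block.
    std::vector<BB *> loopPredsOfExit(BB *Exit, const std::set<BB *> &LoopBlocks) {
      std::vector<BB *> LoopPreds;
      for (BB *P : Exit->Preds)
        if (LoopBlocks.count(P))
          LoopPreds.push_back(P);
      return LoopPreds;
    }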
-static BasicBlock *rewriteLoopExitBlock(Loop *L, BasicBlock *Exit, Pass *PP) { +static BasicBlock *rewriteLoopExitBlock(Loop *L, BasicBlock *Exit, + AliasAnalysis *AA, DominatorTree *DT, + LoopInfo *LI, Pass *PP) { SmallVector<BasicBlock*, 8> LoopBlocks; for (pred_iterator I = pred_begin(Exit), E = pred_end(Exit); I != E; ++I) { BasicBlock *P = *I; @@ -172,15 +175,10 @@ static BasicBlock *rewriteLoopExitBlock(Loop *L, BasicBlock *Exit, Pass *PP) { assert(!LoopBlocks.empty() && "No edges coming in from outside the loop?"); BasicBlock *NewExitBB = nullptr; - if (Exit->isLandingPad()) { - SmallVector<BasicBlock*, 2> NewBBs; - SplitLandingPadPredecessors(Exit, LoopBlocks, - ".loopexit", ".nonloopexit", - PP, NewBBs); - NewExitBB = NewBBs[0]; - } else { - NewExitBB = SplitBlockPredecessors(Exit, LoopBlocks, ".loopexit", PP); - } + bool PreserveLCSSA = PP->mustPreserveAnalysisID(LCSSAID); + + NewExitBB = SplitBlockPredecessors(Exit, LoopBlocks, ".loopexit", AA, DT, + LI, PreserveLCSSA); DEBUG(dbgs() << "LoopSimplify: Creating dedicated exit block " << NewExitBB->getName() << "\n"); @@ -210,11 +208,11 @@ static void addBlockAndPredsToSet(BasicBlock *InputBB, BasicBlock *StopBlock, /// us how to partition the loops. static PHINode *findPHIToPartitionLoops(Loop *L, AliasAnalysis *AA, DominatorTree *DT, - AssumptionTracker *AT) { + AssumptionCache *AC) { for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ) { PHINode *PN = cast<PHINode>(I); ++I; - if (Value *V = SimplifyInstruction(PN, nullptr, nullptr, DT, AT)) { + if (Value *V = SimplifyInstruction(PN, nullptr, nullptr, DT, AC)) { // This is a degenerate PHI already, don't modify it! PN->replaceAllUsesWith(V); if (AA) AA->deleteValue(PN); @@ -254,7 +252,7 @@ static PHINode *findPHIToPartitionLoops(Loop *L, AliasAnalysis *AA, static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader, AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, Pass *PP, - AssumptionTracker *AT) { + AssumptionCache *AC) { // Don't try to separate loops without a preheader. if (!Preheader) return nullptr; @@ -263,7 +261,7 @@ static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader, assert(!L->getHeader()->isLandingPad() && "Can't insert backedge to landing pad"); - PHINode *PN = findPHIToPartitionLoops(L, AA, DT, AT); + PHINode *PN = findPHIToPartitionLoops(L, AA, DT, AC); if (!PN) return nullptr; // No known way to partition. // Pull out all predecessors that have varying values in the loop. This @@ -287,9 +285,11 @@ static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader, if (SE) SE->forgetLoop(L); + bool PreserveLCSSA = PP->mustPreserveAnalysisID(LCSSAID); + BasicBlock *Header = L->getHeader(); - BasicBlock *NewBB = - SplitBlockPredecessors(Header, OuterLoopPreds, ".outer", PP); + BasicBlock *NewBB = SplitBlockPredecessors(Header, OuterLoopPreds, ".outer", + AA, DT, LI, PreserveLCSSA); // Make sure that NewBB is put someplace intelligent, which doesn't mess up // code layout too horribly. @@ -460,7 +460,7 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader, // Update Loop Information - we know that this block is now in the current // loop and all parent loops. - L->addBasicBlockToLoop(BEBlock, LI->getBase()); + L->addBasicBlockToLoop(BEBlock, *LI); // Update dominator information DT->splitBlock(BEBlock); @@ -476,8 +476,8 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader, /// explicit if they accepted the analysis directly and then updated it. 
static bool simplifyOneLoop(Loop *L, SmallVectorImpl<Loop *> &Worklist, AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI, - ScalarEvolution *SE, Pass *PP, - const DataLayout *DL, AssumptionTracker *AT) { + ScalarEvolution *SE, Pass *PP, const DataLayout *DL, + AssumptionCache *AC) { bool Changed = false; ReprocessLoop: @@ -567,7 +567,7 @@ ReprocessLoop: // Must be exactly this loop: no subloops, parent loops, or non-loop preds // allowed. if (!L->contains(*PI)) { - if (rewriteLoopExitBlock(L, ExitBlock, PP)) { + if (rewriteLoopExitBlock(L, ExitBlock, AA, DT, LI, PP)) { ++NumInserted; Changed = true; } @@ -583,8 +583,8 @@ ReprocessLoop: // this for loops with a giant number of backedges, just factor them into a // common backedge instead. if (L->getNumBackEdges() < 8) { - if (Loop *OuterL = separateNestedLoop(L, Preheader, AA, DT, LI, SE, - PP, AT)) { + if (Loop *OuterL = + separateNestedLoop(L, Preheader, AA, DT, LI, SE, PP, AC)) { ++NumNested; // Enqueue the outer loop as it should be processed next in our // depth-first nest walk. @@ -614,7 +614,7 @@ ReprocessLoop: PHINode *PN; for (BasicBlock::iterator I = L->getHeader()->begin(); (PN = dyn_cast<PHINode>(I++)); ) - if (Value *V = SimplifyInstruction(PN, nullptr, nullptr, DT, AT)) { + if (Value *V = SimplifyInstruction(PN, nullptr, nullptr, DT, AC)) { if (AA) AA->deleteValue(PN); if (SE) SE->forgetValue(PN); PN->replaceAllUsesWith(V); @@ -714,7 +714,7 @@ ReprocessLoop: bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, Pass *PP, AliasAnalysis *AA, ScalarEvolution *SE, - const DataLayout *DL, AssumptionTracker *AT) { + const DataLayout *DL, AssumptionCache *AC) { bool Changed = false; // Worklist maintains our depth-first queue of loops in this nest to process. @@ -726,13 +726,12 @@ bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, Pass *PP, // order. We can use this simple process because loops form a tree. for (unsigned Idx = 0; Idx != Worklist.size(); ++Idx) { Loop *L2 = Worklist[Idx]; - for (Loop::iterator I = L2->begin(), E = L2->end(); I != E; ++I) - Worklist.push_back(*I); + Worklist.append(L2->begin(), L2->end()); } while (!Worklist.empty()) Changed |= simplifyOneLoop(Worklist.pop_back_val(), Worklist, AA, DT, LI, - SE, PP, DL, AT); + SE, PP, DL, AC); return Changed; } @@ -751,19 +750,19 @@ namespace { LoopInfo *LI; ScalarEvolution *SE; const DataLayout *DL; - AssumptionTracker *AT; + AssumptionCache *AC; bool runOnFunction(Function &F) override; void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionTracker>(); + AU.addRequired<AssumptionCacheTracker>(); // We need loop information to identify the loops... 
AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfo>(); - AU.addPreserved<LoopInfo>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); AU.addPreserved<AliasAnalysis>(); AU.addPreserved<ScalarEvolution>(); @@ -779,9 +778,9 @@ namespace { char LoopSimplify::ID = 0; INITIALIZE_PASS_BEGIN(LoopSimplify, "loop-simplify", "Canonicalize natural loops", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionTracker) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_END(LoopSimplify, "loop-simplify", "Canonicalize natural loops", false, false) @@ -795,16 +794,16 @@ Pass *llvm::createLoopSimplifyPass() { return new LoopSimplify(); } bool LoopSimplify::runOnFunction(Function &F) { bool Changed = false; AA = getAnalysisIfAvailable<AliasAnalysis>(); - LI = &getAnalysis<LoopInfo>(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); SE = getAnalysisIfAvailable<ScalarEvolution>(); DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); DL = DLP ? &DLP->getDataLayout() : nullptr; - AT = &getAnalysis<AssumptionTracker>(); + AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); // Simplify each loop nest in the function. for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) - Changed |= simplifyLoop(*I, DT, LI, this, AA, SE, DL, AT); + Changed |= simplifyLoop(*I, DT, LI, this, AA, SE, DL, AC); return Changed; } diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp index 0e1baa1..accb731 100644 --- a/lib/Transforms/Utils/LoopUnroll.cpp +++ b/lib/Transforms/Utils/LoopUnroll.cpp @@ -19,7 +19,7 @@ #include "llvm/Transforms/Utils/UnrollLoop.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AssumptionTracker.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/LoopPass.h" @@ -154,9 +154,8 @@ FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, LPPassManager *LPM, /// This utility preserves LoopInfo. If DominatorTree or ScalarEvolution are /// available from the Pass it must also preserve those analyses. bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, - bool AllowRuntime, unsigned TripMultiple, - LoopInfo *LI, Pass *PP, LPPassManager *LPM, - AssumptionTracker *AT) { + bool AllowRuntime, unsigned TripMultiple, LoopInfo *LI, + Pass *PP, LPPassManager *LPM, AssumptionCache *AC) { BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) { DEBUG(dbgs() << " Can't unroll; loop preheader-insertion failed.\n"); @@ -312,7 +311,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, // Tell LI about New. if (*BB == Header) { assert(LI->getLoopFor(*BB) == L && "Header should not be in a sub-loop"); - L->addBasicBlockToLoop(New, LI->getBase()); + L->addBasicBlockToLoop(New, *LI); } else { // Figure out which loop New is in. 
const Loop *OldLoop = LI->getLoopFor(*BB); @@ -334,7 +333,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, if (SE) SE->forgetLoop(OldLoop); } - NewLoop->addBasicBlockToLoop(New, LI->getBase()); + NewLoop->addBasicBlockToLoop(New, *LI); } if (*BB == Header) @@ -473,7 +472,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, // FIXME: We could register any cloned assumptions instead of clearing the // whole function's cache. - AT->forgetCachedAssumptions(F); + AC->clear(); DominatorTree *DT = nullptr; if (PP) { @@ -534,7 +533,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, if (OuterL) { DataLayoutPass *DLP = PP->getAnalysisIfAvailable<DataLayoutPass>(); const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr; - simplifyLoop(OuterL, DT, LI, PP, /*AliasAnalysis*/ nullptr, SE, DL, AT); + simplifyLoop(OuterL, DT, LI, PP, /*AliasAnalysis*/ nullptr, SE, DL, AC); // LCSSA must be performed on the outermost affected loop. The unrolled // loop's last loop latch is guaranteed to be in the outermost loop after @@ -544,9 +543,32 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, while (OuterL->getParentLoop() != LatchLoop) OuterL = OuterL->getParentLoop(); - formLCSSARecursively(*OuterL, *DT, SE); + formLCSSARecursively(*OuterL, *DT, LI, SE); } } return true; } + +/// Given an llvm.loop loop id metadata node, returns the loop hint metadata +/// node with the given name (for example, "llvm.loop.unroll.count"). If no +/// such metadata node exists, then nullptr is returned. +MDNode *llvm::GetUnrollMetadata(MDNode *LoopID, StringRef Name) { + // First operand should refer to the loop id itself. + assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); + assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); + + for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) { + MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); + if (!MD) + continue; + + MDString *S = dyn_cast<MDString>(MD->getOperand(0)); + if (!S) + continue; + + if (Name.equals(S->getString())) + return MD; + } + return nullptr; +} diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp index 3d91336..91b688c 100644 --- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -23,14 +23,17 @@ #include "llvm/Transforms/Utils/UnrollLoop.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Metadata.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include <algorithm> @@ -55,10 +58,11 @@ STATISTIC(NumRuntimeUnrolled, /// - Branch around the original loop if the trip count is less /// than the unroll factor. 
/// -static void ConnectProlog(Loop *L, Value *TripCount, unsigned Count, +static void ConnectProlog(Loop *L, Value *BECount, unsigned Count, BasicBlock *LastPrologBB, BasicBlock *PrologEnd, BasicBlock *OrigPH, BasicBlock *NewPH, - ValueToValueMapTy &VMap, Pass *P) { + ValueToValueMapTy &VMap, AliasAnalysis *AA, + DominatorTree *DT, LoopInfo *LI, Pass *P) { BasicBlock *Latch = L->getLoopLatch(); assert(Latch && "Loop must have a latch"); @@ -105,23 +109,25 @@ static void ConnectProlog(Loop *L, Value *TripCount, unsigned Count, } } - // Create a branch around the orignal loop, which is taken if the - // trip count is less than the unroll factor. + // Create a branch around the orignal loop, which is taken if there are no + // iterations remaining to be executed after running the prologue. Instruction *InsertPt = PrologEnd->getTerminator(); + + assert(Count != 0 && "nonsensical Count!"); + + // If BECount <u (Count - 1) then (BECount + 1) & (Count - 1) == (BECount + 1) + // (since Count is a power of 2). This means %xtraiter is (BECount + 1) and + // and all of the iterations of this loop were executed by the prologue. Note + // that if BECount <u (Count - 1) then (BECount + 1) cannot unsigned-overflow. Instruction *BrLoopExit = - new ICmpInst(InsertPt, ICmpInst::ICMP_ULT, TripCount, - ConstantInt::get(TripCount->getType(), Count)); + new ICmpInst(InsertPt, ICmpInst::ICMP_ULT, BECount, + ConstantInt::get(BECount->getType(), Count - 1)); BasicBlock *Exit = L->getUniqueExitBlock(); assert(Exit && "Loop must have a single exit block only"); // Split the exit to maintain loop canonicalization guarantees SmallVector<BasicBlock*, 4> Preds(pred_begin(Exit), pred_end(Exit)); - if (!Exit->isLandingPad()) { - SplitBlockPredecessors(Exit, Preds, ".unr-lcssa", P); - } else { - SmallVector<BasicBlock*, 2> NewBBs; - SplitLandingPadPredecessors(Exit, Preds, ".unr1-lcssa", ".unr2-lcssa", - P, NewBBs); - } + SplitBlockPredecessors(Exit, Preds, ".unr-lcssa", AA, DT, LI, + P->mustPreserveAnalysisID(LCSSAID)); // Add the branch to the exit block (around the unrolled loop) BranchInst::Create(Exit, NewPH, BrLoopExit, InsertPt); InsertPt->eraseFromParent(); @@ -160,9 +166,9 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog, NewBlocks.push_back(NewBB); if (NewLoop) - NewLoop->addBasicBlockToLoop(NewBB, LI->getBase()); + NewLoop->addBasicBlockToLoop(NewBB, *LI); else if (ParentLoop) - ParentLoop->addBasicBlockToLoop(NewBB, LI->getBase()); + ParentLoop->addBasicBlockToLoop(NewBB, *LI); VMap[*BB] = NewBB; if (Header == *BB) { @@ -217,9 +223,9 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog, } if (NewLoop) { // Add unroll disable metadata to disable future unrolling for this loop. - SmallVector<Value *, 4> Vals; + SmallVector<Metadata *, 4> MDs; // Reserve first location for self reference to the LoopID metadata node. - Vals.push_back(nullptr); + MDs.push_back(nullptr); MDNode *LoopID = NewLoop->getLoopID(); if (LoopID) { // First remove any existing loop unrolling metadata. 
@@ -230,17 +236,18 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog, const MDString *S = dyn_cast<MDString>(MD->getOperand(0)); IsUnrollMetadata = S && S->getString().startswith("llvm.loop.unroll."); } - if (!IsUnrollMetadata) Vals.push_back(LoopID->getOperand(i)); + if (!IsUnrollMetadata) + MDs.push_back(LoopID->getOperand(i)); } } LLVMContext &Context = NewLoop->getHeader()->getContext(); - SmallVector<Value *, 1> DisableOperands; + SmallVector<Metadata *, 1> DisableOperands; DisableOperands.push_back(MDString::get(Context, "llvm.loop.unroll.disable")); MDNode *DisableNode = MDNode::get(Context, DisableOperands); - Vals.push_back(DisableNode); + MDs.push_back(DisableNode); - MDNode *NewLoopID = MDNode::get(Context, Vals); + MDNode *NewLoopID = MDNode::get(Context, MDs); // Set operand 0 to refer to the loop id itself. NewLoopID->replaceOperandWith(0, NewLoopID); NewLoop->setLoopID(NewLoopID); @@ -291,23 +298,28 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI, // Only unroll loops with a computable trip count and the trip count needs // to be an int value (allowing a pointer type is a TODO item) - const SCEV *BECount = SE->getBackedgeTakenCount(L); - if (isa<SCEVCouldNotCompute>(BECount) || !BECount->getType()->isIntegerTy()) + const SCEV *BECountSC = SE->getBackedgeTakenCount(L); + if (isa<SCEVCouldNotCompute>(BECountSC) || + !BECountSC->getType()->isIntegerTy()) return false; - // If BECount is INT_MAX, we can't compute trip-count without overflow. - if (BECount->isAllOnesValue()) - return false; + unsigned BEWidth = cast<IntegerType>(BECountSC->getType())->getBitWidth(); // Add 1 since the backedge count doesn't include the first loop iteration const SCEV *TripCountSC = - SE->getAddExpr(BECount, SE->getConstant(BECount->getType(), 1)); + SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1)); if (isa<SCEVCouldNotCompute>(TripCountSC)) return false; // We only handle cases when the unroll factor is a power of 2. // Count is the loop unroll factor, the number of extra copies added + 1. - if ((Count & (Count-1)) != 0) + if (!isPowerOf2_32(Count)) + return false; + + // This constraint lets us deal with an overflowing trip count easily; see the + // comment on ModVal below. This check is equivalent to `Log2(Count) < + // BEWidth`. + if (static_cast<uint64_t>(Count) > (1ULL << BEWidth)) return false; // If this loop is nested, then the loop unroller changes the code in @@ -315,13 +327,17 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI, if (Loop *ParentLoop = L->getParentLoop()) SE->forgetLoop(ParentLoop); + // Grab analyses that we preserve. + auto *DTWP = LPM->getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + auto *DT = DTWP ? 
&DTWP->getDomTree() : nullptr; + BasicBlock *PH = L->getLoopPreheader(); BasicBlock *Header = L->getHeader(); BasicBlock *Latch = L->getLoopLatch(); // It helps to splits the original preheader twice, one for the end of the // prolog code and one for a new loop preheader - BasicBlock *PEnd = SplitEdge(PH, Header, LPM->getAsPass()); - BasicBlock *NewPH = SplitBlock(PEnd, PEnd->getTerminator(), LPM->getAsPass()); + BasicBlock *PEnd = SplitEdge(PH, Header, DT, LI); + BasicBlock *NewPH = SplitBlock(PEnd, PEnd->getTerminator(), DT, LI); BranchInst *PreHeaderBR = cast<BranchInst>(PH->getTerminator()); // Compute the number of extra iterations required, which is: @@ -329,16 +345,23 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI, SCEVExpander Expander(*SE, "loop-unroll"); Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(), PreHeaderBR); + Value *BECount = Expander.expandCodeFor(BECountSC, BECountSC->getType(), + PreHeaderBR); IRBuilder<> B(PreHeaderBR); Value *ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter"); - // Check if for no extra iterations, then jump to cloned/unrolled loop. - // We have to check that the trip count computation didn't overflow when - // adding one to the backedge taken count. - Value *LCmp = B.CreateIsNotNull(ModVal, "lcmp.mod"); - Value *OverflowCheck = B.CreateIsNull(TripCount, "lcmp.overflow"); - Value *BranchVal = B.CreateOr(OverflowCheck, LCmp, "lcmp.or"); + // If ModVal is zero, we know that either + // 1. there are no iteration to be run in the prologue loop + // OR + // 2. the addition computing TripCount overflowed + // + // If (2) is true, we know that TripCount really is (1 << BEWidth) and so the + // number of iterations that remain to be run in the original loop is a + // multiple Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth (we + // explicitly check this above). + + Value *BranchVal = B.CreateIsNotNull(ModVal, "lcmp.mod"); // Branch to either the extra iterations or the cloned/unrolled loop // We will fix up the true branch label when adding loop body copies @@ -361,10 +384,7 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI, std::vector<BasicBlock *> NewBlocks; ValueToValueMapTy VMap; - // If unroll count is 2 and we can't overflow in tripcount computation (which - // is BECount + 1), then we don't need a loop for prologue, and we can unroll - // it. We can be sure that we don't overflow only if tripcount is a constant. - bool UnrollPrologue = (Count == 2 && isa<ConstantInt>(TripCount)); + bool UnrollPrologue = Count == 2; // Clone all the basic blocks in the loop. If Count is 2, we don't clone // the loop, otherwise we create a cloned loop to execute the extra @@ -390,8 +410,8 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI, // Connect the prolog code to the original loop and update the // PHI functions. 
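The prologue arithmetic above leans on Count being a power of two no larger than 2^BEWidth: xtraiter is computed as (BECount + 1) & (Count - 1), and even if BECount + 1 wraps around, the masked result is still the right number of prologue iterations. A small worked example under an assumed 8-bit backedge-taken count:

    #include <cassert>
    #include <cstdint>

    int main() {
      // BECount == 255 in 8 bits means the real trip count is 256, which wraps
      // to 0; because Count (4) divides 2^8, the wrapped value still gives the
      // right answer: no prologue iterations are needed, and the remaining 256
      // iterations are an exact multiple of the unroll factor.
      uint8_t BECount = 255;
      unsigned Count = 4;                                 // power of two, <= 2^8
      uint8_t TripCount = uint8_t(BECount + 1);           // wraps to 0
      uint8_t ModVal = uint8_t(TripCount & (Count - 1));  // xtraiter == 0
      assert(ModVal == 0 && 256 % Count == 0);

      // The common, non-wrapping case: 11 iterations unrolled by 4 leave
      // 11 & 3 == 3 iterations for the prologue and 8 for the unrolled body.
      unsigned TripCount2 = 11;
      assert((TripCount2 & (Count - 1)) == 3 && (TripCount2 - 3) % Count == 0);
      return 0;
    }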
BasicBlock *LastLoopBB = cast<BasicBlock>(VMap[Latch]); - ConnectProlog(L, TripCount, Count, LastLoopBB, PEnd, PH, NewPH, VMap, - LPM->getAsPass()); + ConnectProlog(L, BECount, Count, LastLoopBB, PEnd, PH, NewPH, VMap, + /*AliasAnalysis*/ nullptr, DT, LI, LPM->getAsPass()); NumRuntimeUnrolled++; return true; } diff --git a/lib/Transforms/Utils/LowerSwitch.cpp b/lib/Transforms/Utils/LowerSwitch.cpp index a0105c2..b3bdae4 100644 --- a/lib/Transforms/Utils/LowerSwitch.cpp +++ b/lib/Transforms/Utils/LowerSwitch.cpp @@ -32,6 +32,23 @@ using namespace llvm; #define DEBUG_TYPE "lower-switch" namespace { + struct IntRange { + int64_t Low, High; + }; + // Return true iff R is covered by Ranges. + static bool IsInRanges(const IntRange &R, + const std::vector<IntRange> &Ranges) { + // Note: Ranges must be sorted, non-overlapping and non-adjacent. + + // Find the first range whose High field is >= R.High, + // then check if the Low field is <= R.Low. If so, we + // have a Range that covers R. + auto I = std::lower_bound( + Ranges.begin(), Ranges.end(), R, + [](const IntRange &A, const IntRange &B) { return A.High < B.High; }); + return I != Ranges.end() && I->Low <= R.Low; + } + /// LowerSwitch Pass - Replace all SwitchInst instructions with chained branch /// instructions. class LowerSwitch : public FunctionPass { @@ -46,18 +63,16 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { // This is a cluster of orthogonal Transforms AU.addPreserved<UnifyFunctionExitNodes>(); - AU.addPreserved("mem2reg"); AU.addPreservedID(LowerInvokePassID); } struct CaseRange { - Constant* Low; - Constant* High; + ConstantInt* Low; + ConstantInt* High; BasicBlock* BB; - CaseRange(Constant *low = nullptr, Constant *high = nullptr, - BasicBlock *bb = nullptr) : - Low(low), High(high), BB(bb) { } + CaseRange(ConstantInt *low, ConstantInt *high, BasicBlock *bb) + : Low(low), High(high), BB(bb) {} }; typedef std::vector<CaseRange> CaseVector; @@ -68,7 +83,8 @@ namespace { BasicBlock *switchConvert(CaseItr Begin, CaseItr End, ConstantInt *LowerBound, ConstantInt *UpperBound, Value *Val, BasicBlock *Predecessor, - BasicBlock *OrigBlock, BasicBlock *Default); + BasicBlock *OrigBlock, BasicBlock *Default, + const std::vector<IntRange> &UnreachableRanges); BasicBlock *newLeafBlock(CaseRange &Leaf, Value *Val, BasicBlock *OrigBlock, BasicBlock *Default); unsigned Clusterify(CaseVector &Cases, SwitchInst *SI); @@ -131,25 +147,39 @@ static raw_ostream& operator<<(raw_ostream &O, return O << "]"; } -/// \brief Update the first occurrence of the "switch statement" BB in the PHI -/// node with the "new" BB. The other occurrences will be updated by subsequent -/// calls to this function. -/// -/// Switch statements may have more than one incoming edge into the same BB if -/// they all have the same value. When the switch statement is converted these -/// incoming edges are now coming from multiple BBs. -static void fixPhis(BasicBlock *SuccBB, BasicBlock *OrigBB, BasicBlock *NewBB) { - for (BasicBlock::iterator I = SuccBB->begin(), E = SuccBB->getFirstNonPHI(); - I != E; ++I) { +// \brief Update the first occurrence of the "switch statement" BB in the PHI +// node with the "new" BB. The other occurrences will: +// +// 1) Be updated by subsequent calls to this function. Switch statements may +// have more than one outcoming edge into the same BB if they all have the same +// value. When the switch statement is converted these incoming edges are now +// coming from multiple BBs. 
+// 2) Removed if subsequent incoming values now share the same case, i.e., +// multiple outcome edges are condensed into one. This is necessary to keep the +// number of phi values equal to the number of branches to SuccBB. +static void fixPhis(BasicBlock *SuccBB, BasicBlock *OrigBB, BasicBlock *NewBB, + unsigned NumMergedCases) { + for (BasicBlock::iterator I = SuccBB->begin(), IE = SuccBB->getFirstNonPHI(); + I != IE; ++I) { PHINode *PN = cast<PHINode>(I); // Only update the first occurence. - for (unsigned Idx = 0, E = PN->getNumIncomingValues(); Idx != E; ++Idx) { + unsigned Idx = 0, E = PN->getNumIncomingValues(); + unsigned LocalNumMergedCases = NumMergedCases; + for (; Idx != E; ++Idx) { if (PN->getIncomingBlock(Idx) == OrigBB) { PN->setIncomingBlock(Idx, NewBB); break; } } + + // Remove additional occurences coming from condensed cases and keep the + // number of incoming values equal to the number of branches to SuccBB. + for (++Idx; LocalNumMergedCases > 0 && Idx < E; ++Idx) + if (PN->getIncomingBlock(Idx) == OrigBB) { + PN->removeIncomingValue(Idx); + LocalNumMergedCases--; + } } } @@ -158,12 +188,12 @@ static void fixPhis(BasicBlock *SuccBB, BasicBlock *OrigBB, BasicBlock *NewBB) { // LowerBound and UpperBound are used to keep track of the bounds for Val // that have already been checked by a block emitted by one of the previous // calls to switchConvert in the call stack. -BasicBlock *LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, - ConstantInt *LowerBound, - ConstantInt *UpperBound, Value *Val, - BasicBlock *Predecessor, - BasicBlock *OrigBlock, - BasicBlock *Default) { +BasicBlock * +LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, ConstantInt *LowerBound, + ConstantInt *UpperBound, Value *Val, + BasicBlock *Predecessor, BasicBlock *OrigBlock, + BasicBlock *Default, + const std::vector<IntRange> &UnreachableRanges) { unsigned Size = End - Begin; if (Size == 1) { @@ -172,7 +202,11 @@ BasicBlock *LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, // emitting the code that checks if the value actually falls in the range // because the bounds already tell us so. if (Begin->Low == LowerBound && Begin->High == UpperBound) { - fixPhis(Begin->BB, OrigBlock, Predecessor); + unsigned NumMergedCases = 0; + if (LowerBound && UpperBound) + NumMergedCases = + UpperBound->getSExtValue() - LowerBound->getSExtValue(); + fixPhis(Begin->BB, OrigBlock, Predecessor, NumMergedCases); return Begin->BB; } return newLeafBlock(*Begin, Val, OrigBlock, Default); @@ -186,32 +220,32 @@ BasicBlock *LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, CaseRange &Pivot = *(Begin + Mid); DEBUG(dbgs() << "Pivot ==> " - << cast<ConstantInt>(Pivot.Low)->getValue() - << " -" << cast<ConstantInt>(Pivot.High)->getValue() << "\n"); + << Pivot.Low->getValue() + << " -" << Pivot.High->getValue() << "\n"); // NewLowerBound here should never be the integer minimal value. // This is because it is computed from a case range that is never // the smallest, so there is always a case range that has at least // a smaller value. - ConstantInt *NewLowerBound = cast<ConstantInt>(Pivot.Low); - ConstantInt *NewUpperBound; - - // If we don't have a Default block then it means that we can never - // have a value outside of a case range, so set the UpperBound to the highest - // value in the LHS part of the case ranges. - if (Default != nullptr) { - // Because NewLowerBound is never the smallest representable integer - // it is safe here to subtract one. 
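The IsInRanges helper introduced at the top of this LowerSwitch change relies on Ranges being sorted by High, non-overlapping and non-adjacent, so a single lower_bound finds the only range that could possibly cover R. A self-contained copy with a few assumed test values to show the intended usage:

    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    #include <vector>

    struct IntRange { int64_t Low, High; };

    static bool IsInRanges(const IntRange &R, const std::vector<IntRange> &Ranges) {
      // First range whose High is >= R.High; it covers R iff its Low is <= R.Low.
      auto I = std::lower_bound(
          Ranges.begin(), Ranges.end(), R,
          [](const IntRange &A, const IntRange &B) { return A.High < B.High; });
      return I != Ranges.end() && I->Low <= R.Low;
    }

    int main() {
      std::vector<IntRange> Unreachable = {{0, 9}, {20, 29}, {40, INT64_MAX}};
      assert(IsInRanges({3, 7}, Unreachable));    // inside the first range
      assert(!IsInRanges({8, 12}, Unreachable));  // straddles a reachable gap
      assert(IsInRanges({50, 60}, Unreachable));  // inside the open-ended range
      return 0;
    }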
- NewUpperBound = ConstantInt::get(NewLowerBound->getContext(), - NewLowerBound->getValue() - 1); - } else { - CaseItr LastLHS = LHS.begin() + LHS.size() - 1; - NewUpperBound = cast<ConstantInt>(LastLHS->High); + ConstantInt *NewLowerBound = Pivot.Low; + + // Because NewLowerBound is never the smallest representable integer + // it is safe here to subtract one. + ConstantInt *NewUpperBound = ConstantInt::get(NewLowerBound->getContext(), + NewLowerBound->getValue() - 1); + + if (!UnreachableRanges.empty()) { + // Check if the gap between LHS's highest and NewLowerBound is unreachable. + int64_t GapLow = LHS.back().High->getSExtValue() + 1; + int64_t GapHigh = NewLowerBound->getSExtValue() - 1; + IntRange Gap = { GapLow, GapHigh }; + if (GapHigh >= GapLow && IsInRanges(Gap, UnreachableRanges)) + NewUpperBound = LHS.back().High; } DEBUG(dbgs() << "LHS Bounds ==> "; if (LowerBound) { - dbgs() << cast<ConstantInt>(LowerBound)->getSExtValue(); + dbgs() << LowerBound->getSExtValue(); } else { dbgs() << "NONE"; } @@ -219,7 +253,7 @@ BasicBlock *LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, dbgs() << "RHS Bounds ==> "; dbgs() << NewLowerBound->getSExtValue() << " - "; if (UpperBound) { - dbgs() << cast<ConstantInt>(UpperBound)->getSExtValue() << "\n"; + dbgs() << UpperBound->getSExtValue() << "\n"; } else { dbgs() << "NONE\n"; }); @@ -234,10 +268,10 @@ BasicBlock *LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, BasicBlock *LBranch = switchConvert(LHS.begin(), LHS.end(), LowerBound, NewUpperBound, Val, NewNode, OrigBlock, - Default); + Default, UnreachableRanges); BasicBlock *RBranch = switchConvert(RHS.begin(), RHS.end(), NewLowerBound, UpperBound, Val, NewNode, OrigBlock, - Default); + Default, UnreachableRanges); Function::iterator FI = OrigBlock; F->getBasicBlockList().insert(++FI, NewNode); @@ -270,11 +304,11 @@ BasicBlock* LowerSwitch::newLeafBlock(CaseRange& Leaf, Value* Val, Leaf.Low, "SwitchLeaf"); } else { // Make range comparison - if (cast<ConstantInt>(Leaf.Low)->isMinValue(true /*isSigned*/)) { + if (Leaf.Low->isMinValue(true /*isSigned*/)) { // Val >= Min && Val <= Hi --> Val <= Hi Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_SLE, Val, Leaf.High, "SwitchLeaf"); - } else if (cast<ConstantInt>(Leaf.Low)->isZero()) { + } else if (Leaf.Low->isZero()) { // Val >= 0 && Val <= Hi --> Val <=u Hi Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_ULE, Val, Leaf.High, "SwitchLeaf"); @@ -299,8 +333,8 @@ BasicBlock* LowerSwitch::newLeafBlock(CaseRange& Leaf, Value* Val, for (BasicBlock::iterator I = Succ->begin(); isa<PHINode>(I); ++I) { PHINode* PN = cast<PHINode>(I); // Remove all but one incoming entries from the cluster - uint64_t Range = cast<ConstantInt>(Leaf.High)->getSExtValue() - - cast<ConstantInt>(Leaf.Low)->getSExtValue(); + uint64_t Range = Leaf.High->getSExtValue() - + Leaf.Low->getSExtValue(); for (uint64_t j = 0; j < Range; ++j) { PN->removeIncomingValue(OrigBlock); } @@ -328,8 +362,8 @@ unsigned LowerSwitch::Clusterify(CaseVector& Cases, SwitchInst *SI) { if (Cases.size()>=2) for (CaseItr I = Cases.begin(), J = std::next(Cases.begin()); J != Cases.end();) { - int64_t nextValue = cast<ConstantInt>(J->Low)->getSExtValue(); - int64_t currentValue = cast<ConstantInt>(I->High)->getSExtValue(); + int64_t nextValue = J->Low->getSExtValue(); + int64_t currentValue = I->High->getSExtValue(); BasicBlock* nextBB = J->BB; BasicBlock* currentBB = I->BB; @@ -362,26 +396,102 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI) { Value *Val = SI->getCondition(); // The value we are 
switching on... BasicBlock* Default = SI->getDefaultDest(); - // If there is only the default destination, don't bother with the code below. + // If there is only the default destination, just branch. if (!SI->getNumCases()) { - BranchInst::Create(SI->getDefaultDest(), CurBlock); - CurBlock->getInstList().erase(SI); + BranchInst::Create(Default, CurBlock); + SI->eraseFromParent(); return; } - const bool DefaultIsUnreachable = - Default->size() == 1 && isa<UnreachableInst>(Default->getTerminator()); + // Prepare cases vector. + CaseVector Cases; + unsigned numCmps = Clusterify(Cases, SI); + DEBUG(dbgs() << "Clusterify finished. Total clusters: " << Cases.size() + << ". Total compares: " << numCmps << "\n"); + DEBUG(dbgs() << "Cases: " << Cases << "\n"); + (void)numCmps; + + ConstantInt *LowerBound = nullptr; + ConstantInt *UpperBound = nullptr; + std::vector<IntRange> UnreachableRanges; + + if (isa<UnreachableInst>(Default->getFirstNonPHIOrDbg())) { + // Make the bounds tightly fitted around the case value range, becase we + // know that the value passed to the switch must be exactly one of the case + // values. + assert(!Cases.empty()); + LowerBound = Cases.front().Low; + UpperBound = Cases.back().High; + + DenseMap<BasicBlock *, unsigned> Popularity; + unsigned MaxPop = 0; + BasicBlock *PopSucc = nullptr; + + IntRange R = { INT64_MIN, INT64_MAX }; + UnreachableRanges.push_back(R); + for (const auto &I : Cases) { + int64_t Low = I.Low->getSExtValue(); + int64_t High = I.High->getSExtValue(); + + IntRange &LastRange = UnreachableRanges.back(); + if (LastRange.Low == Low) { + // There is nothing left of the previous range. + UnreachableRanges.pop_back(); + } else { + // Terminate the previous range. + assert(Low > LastRange.Low); + LastRange.High = Low - 1; + } + if (High != INT64_MAX) { + IntRange R = { High + 1, INT64_MAX }; + UnreachableRanges.push_back(R); + } + + // Count popularity. + int64_t N = High - Low + 1; + unsigned &Pop = Popularity[I.BB]; + if ((Pop += N) > MaxPop) { + MaxPop = Pop; + PopSucc = I.BB; + } + } +#ifndef NDEBUG + /* UnreachableRanges should be sorted and the ranges non-adjacent. */ + for (auto I = UnreachableRanges.begin(), E = UnreachableRanges.end(); + I != E; ++I) { + assert(I->Low <= I->High); + auto Next = I + 1; + if (Next != E) { + assert(Next->Low > I->High); + } + } +#endif + + // Use the most popular block as the new default, reducing the number of + // cases. + assert(MaxPop > 0 && PopSucc); + Default = PopSucc; + for (CaseItr I = Cases.begin(); I != Cases.end();) { + if (I->BB == PopSucc) + I = Cases.erase(I); + else + ++I; + } + + // If there are no cases left, just branch. + if (Cases.empty()) { + BranchInst::Create(Default, CurBlock); + SI->eraseFromParent(); + return; + } + } + // Create a new, empty default block so that the new hierarchy of // if-then statements go to this and the PHI nodes are happy. - // if the default block is set as an unreachable we avoid creating one - // because will never be a valid target. - BasicBlock *NewDefault = nullptr; - if (!DefaultIsUnreachable) { - NewDefault = BasicBlock::Create(SI->getContext(), "NewDefault"); - F->getBasicBlockList().insert(Default, NewDefault); - - BranchInst::Create(Default, NewDefault); - } + BasicBlock *NewDefault = BasicBlock::Create(SI->getContext(), "NewDefault"); + F->getBasicBlockList().insert(Default, NewDefault); + BranchInst::Create(Default, NewDefault); + // If there is an entry in any PHI nodes for the default edge, make sure // to update them as well. 
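// [Editor's note: standalone illustrative sketch, not part of the patch.]
// Model of the new unreachable-default preparation: walking the sorted,
// disjoint clusters produced by Clusterify, the code builds the complement
// of the covered value space (ranges the condition can provably never hit)
// and counts how many case values target each successor so the most popular
// one can be promoted to the new default. int64_t and strings stand in for
// the LLVM types; names are hypothetical.
#include <cstdint>
#include <map>
#include <string>
#include <vector>

struct RangeModel { int64_t Low, High; };
struct ClusterModel { int64_t Low, High; std::string Succ; };

static void prepareUnreachableDefault(const std::vector<ClusterModel> &Cases,
                                      std::vector<RangeModel> &Unreachable,
                                      std::string &PopSucc) {
  Unreachable.push_back({INT64_MIN, INT64_MAX}); // everything, until carved up
  std::map<std::string, uint64_t> Popularity;
  uint64_t MaxPop = 0;
  for (const ClusterModel &C : Cases) {
    RangeModel &Last = Unreachable.back();
    if (Last.Low == C.Low)
      Unreachable.pop_back();     // nothing left of the open-ended range
    else
      Last.High = C.Low - 1;      // close it just below this cluster
    if (C.High != INT64_MAX)
      Unreachable.push_back({C.High + 1, INT64_MAX});

    // Popularity is weighted by the number of values a cluster covers.
    uint64_t N = uint64_t(C.High) - uint64_t(C.Low) + 1;
    uint64_t &Pop = Popularity[C.Succ];
    if ((Pop += N) > MaxPop) {
      MaxPop = Pop;
      PopSucc = C.Succ;
    }
  }
}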
for (BasicBlock::iterator I = Default->begin(); isa<PHINode>(I); ++I) { @@ -391,40 +501,18 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI) { PN->setIncomingBlock((unsigned)BlockIdx, NewDefault); } - // Prepare cases vector. - CaseVector Cases; - unsigned numCmps = Clusterify(Cases, SI); - - DEBUG(dbgs() << "Clusterify finished. Total clusters: " << Cases.size() - << ". Total compares: " << numCmps << "\n"); - DEBUG(dbgs() << "Cases: " << Cases << "\n"); - (void)numCmps; - - ConstantInt *UpperBound = nullptr; - ConstantInt *LowerBound = nullptr; - - // Optimize the condition where Default is an unreachable block. In this case - // we can make the bounds tightly fitted around the case value ranges, - // because we know that the value passed to the switch should always be - // exactly one of the case values. - if (DefaultIsUnreachable) { - CaseItr LastCase = Cases.begin() + Cases.size() - 1; - UpperBound = cast<ConstantInt>(LastCase->High); - LowerBound = cast<ConstantInt>(Cases.begin()->Low); - } BasicBlock *SwitchBlock = switchConvert(Cases.begin(), Cases.end(), LowerBound, UpperBound, Val, - OrigBlock, OrigBlock, NewDefault); + OrigBlock, OrigBlock, NewDefault, UnreachableRanges); // Branch to our shiny new if-then stuff... BranchInst::Create(SwitchBlock, OrigBlock); // We are now done with the switch instruction, delete it. + BasicBlock *OldDefault = SI->getDefaultDest(); CurBlock->getInstList().erase(SI); - pred_iterator PI = pred_begin(Default), E = pred_end(Default); - // If the Default block has no more predecessors just remove it - if (PI == E) { - DeleteDeadBlock(Default); - } + // If the Default block has no more predecessors just remove it. + if (pred_begin(OldDefault) == pred_end(OldDefault)) + DeleteDeadBlock(OldDefault); } diff --git a/lib/Transforms/Utils/Mem2Reg.cpp b/lib/Transforms/Utils/Mem2Reg.cpp index 477ee7a..00cf4e6 100644 --- a/lib/Transforms/Utils/Mem2Reg.cpp +++ b/lib/Transforms/Utils/Mem2Reg.cpp @@ -14,7 +14,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AssumptionTracker.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" @@ -39,7 +39,7 @@ namespace { bool runOnFunction(Function &F) override; void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionTracker>(); + AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.setPreservesCFG(); // This is a cluster of orthogonal Transforms @@ -53,7 +53,7 @@ namespace { char PromotePass::ID = 0; INITIALIZE_PASS_BEGIN(PromotePass, "mem2reg", "Promote Memory to Register", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionTracker) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(PromotePass, "mem2reg", "Promote Memory to Register", false, false) @@ -66,7 +66,8 @@ bool PromotePass::runOnFunction(Function &F) { bool Changed = false; DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - AssumptionTracker *AT = &getAnalysis<AssumptionTracker>(); + AssumptionCache &AC = + getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); while (1) { Allocas.clear(); @@ -80,7 +81,7 @@ bool PromotePass::runOnFunction(Function &F) { if (Allocas.empty()) break; - PromoteMemToReg(Allocas, DT, nullptr, AT); + PromoteMemToReg(Allocas, DT, nullptr, &AC); NumPromoted += Allocas.size(); Changed = true; } diff --git 
a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index 1fd7071..dabadb7 100644 --- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -239,7 +239,7 @@ struct PromoteMem2Reg { AliasSetTracker *AST; /// A cache of @llvm.assume intrinsics used by SimplifyInstruction. - AssumptionTracker *AT; + AssumptionCache *AC; /// Reverse mapping of Allocas. DenseMap<AllocaInst *, unsigned> AllocaLookup; @@ -282,9 +282,10 @@ struct PromoteMem2Reg { public: PromoteMem2Reg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT, - AliasSetTracker *AST, AssumptionTracker *AT) + AliasSetTracker *AST, AssumptionCache *AC) : Allocas(Allocas.begin(), Allocas.end()), DT(DT), - DIB(*DT.getRoot()->getParent()->getParent()), AST(AST), AT(AT) {} + DIB(*DT.getRoot()->getParent()->getParent(), /*AllowUnresolved*/ false), + AST(AST), AC(AC) {} void run(); @@ -415,7 +416,8 @@ static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info, // Record debuginfo for the store and remove the declaration's // debuginfo. if (DbgDeclareInst *DDI = Info.DbgDeclare) { - DIBuilder DIB(*AI->getParent()->getParent()->getParent()); + DIBuilder DIB(*AI->getParent()->getParent()->getParent(), + /*AllowUnresolved*/ false); ConvertDebugDeclareToDebugValue(DDI, Info.OnlyStore, DIB); DDI->eraseFromParent(); LBI.deleteValue(DDI); @@ -498,7 +500,8 @@ static void promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, StoreInst *SI = cast<StoreInst>(AI->user_back()); // Record debuginfo for the store before removing it. if (DbgDeclareInst *DDI = Info.DbgDeclare) { - DIBuilder DIB(*AI->getParent()->getParent()->getParent()); + DIBuilder DIB(*AI->getParent()->getParent()->getParent(), + /*AllowUnresolved*/ false); ConvertDebugDeclareToDebugValue(DDI, SI, DIB); } SI->eraseFromParent(); @@ -688,7 +691,7 @@ void PromoteMem2Reg::run() { PHINode *PN = I->second; // If this PHI node merges one value and/or undefs, get the value. - if (Value *V = SimplifyInstruction(PN, nullptr, nullptr, &DT, AT)) { + if (Value *V = SimplifyInstruction(PN, nullptr, nullptr, &DT, AC)) { if (AST && PN->getType()->isPointerTy()) AST->deleteValue(PN); PN->replaceAllUsesWith(V); @@ -1068,10 +1071,10 @@ NextIteration: } void llvm::PromoteMemToReg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT, - AliasSetTracker *AST, AssumptionTracker *AT) { + AliasSetTracker *AST, AssumptionCache *AC) { // If there is nothing to do, bail out... if (Allocas.empty()) return; - PromoteMem2Reg(Allocas, DT, AST, AT).run(); + PromoteMem2Reg(Allocas, DT, AST, AC).run(); } diff --git a/lib/Transforms/Utils/SSAUpdater.cpp b/lib/Transforms/Utils/SSAUpdater.cpp index 3fcb789..c057b06 100644 --- a/lib/Transforms/Utils/SSAUpdater.cpp +++ b/lib/Transforms/Utils/SSAUpdater.cpp @@ -150,8 +150,8 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { ProtoName, &BB->front()); // Fill in all the predecessors of the PHI. - for (unsigned i = 0, e = PredValues.size(); i != e; ++i) - InsertedPHI->addIncoming(PredValues[i].second, PredValues[i].first); + for (const auto &PredValue : PredValues) + InsertedPHI->addIncoming(PredValue.second, PredValue.first); // See if the PHI node can be merged to a single value. This can happen in // loop cases when we get a PHI of itself and one other value. @@ -245,8 +245,7 @@ public: // but it is relatively slow. If we already have PHI nodes in this // block, walk one of them to get the predecessor list instead. 
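// [Editor's note: standalone illustrative sketch, not part of the patch.]
// Conceptual model of SSAUpdater::GetValueInMiddleOfBlock, whose body is
// touched here: the live-in value of a block is a PHI over the value
// available at the end of each predecessor, except that when every
// predecessor supplies the same value the PHI is redundant and that value
// is returned directly. Hypothetical string stand-ins for blocks and
// values; no LLVM API involved.
#include <string>
#include <utility>
#include <vector>

// Returns the common value if all predecessors agree, or "" to signal that
// a real PHI merging the per-predecessor values is needed.
static std::string liveInValue(
    const std::vector<std::pair<std::string, std::string>> &PredValues) {
  std::string Common;
  for (const auto &PV : PredValues) {
    if (Common.empty())
      Common = PV.second;
    else if (Common != PV.second)
      return std::string();
  }
  return Common;
}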
if (PHINode *SomePhi = dyn_cast<PHINode>(BB->begin())) { - for (unsigned PI = 0, E = SomePhi->getNumIncomingValues(); PI != E; ++PI) - Preds->push_back(SomePhi->getIncomingBlock(PI)); + Preds->append(SomePhi->block_begin(), SomePhi->block_end()); } else { for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) Preds->push_back(*PI); @@ -344,20 +343,17 @@ run(const SmallVectorImpl<Instruction*> &Insts) const { // This is important because we have to handle multiple defs/uses in a block // ourselves: SSAUpdater is purely for cross-block references. DenseMap<BasicBlock*, TinyPtrVector<Instruction*> > UsesByBlock; - - for (unsigned i = 0, e = Insts.size(); i != e; ++i) { - Instruction *User = Insts[i]; + + for (Instruction *User : Insts) UsesByBlock[User->getParent()].push_back(User); - } // Okay, now we can iterate over all the blocks in the function with uses, // processing them. Keep track of which loads are loading a live-in value. // Walk the uses in the use-list order to be determinstic. SmallVector<LoadInst*, 32> LiveInLoads; DenseMap<Value*, Value*> ReplacedLoads; - - for (unsigned i = 0, e = Insts.size(); i != e; ++i) { - Instruction *User = Insts[i]; + + for (Instruction *User : Insts) { BasicBlock *BB = User->getParent(); TinyPtrVector<Instruction*> &BlockUses = UsesByBlock[BB]; @@ -380,8 +376,8 @@ run(const SmallVectorImpl<Instruction*> &Insts) const { // Otherwise, check to see if this block is all loads. bool HasStore = false; - for (unsigned i = 0, e = BlockUses.size(); i != e; ++i) { - if (isa<StoreInst>(BlockUses[i])) { + for (Instruction *I : BlockUses) { + if (isa<StoreInst>(I)) { HasStore = true; break; } @@ -391,8 +387,8 @@ run(const SmallVectorImpl<Instruction*> &Insts) const { // efficient way to tell which on is first in the block and don't want to // scan large blocks, so just add all loads as live ins. if (!HasStore) { - for (unsigned i = 0, e = BlockUses.size(); i != e; ++i) - LiveInLoads.push_back(cast<LoadInst>(BlockUses[i])); + for (Instruction *I : BlockUses) + LiveInLoads.push_back(cast<LoadInst>(I)); BlockUses.clear(); continue; } @@ -403,8 +399,8 @@ run(const SmallVectorImpl<Instruction*> &Insts) const { // block is a load, then it uses the live in value. The last store defines // the live out value. We handle this by doing a linear scan of the block. Value *StoredValue = nullptr; - for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ++II) { - if (LoadInst *L = dyn_cast<LoadInst>(II)) { + for (Instruction &I : *BB) { + if (LoadInst *L = dyn_cast<LoadInst>(&I)) { // If this is a load from an unrelated pointer, ignore it. if (!isInstInList(L, Insts)) continue; @@ -419,8 +415,8 @@ run(const SmallVectorImpl<Instruction*> &Insts) const { } continue; } - - if (StoreInst *SI = dyn_cast<StoreInst>(II)) { + + if (StoreInst *SI = dyn_cast<StoreInst>(&I)) { // If this is a store to an unrelated pointer, ignore it. if (!isInstInList(SI, Insts)) continue; updateDebugInfo(SI); @@ -438,8 +434,7 @@ run(const SmallVectorImpl<Instruction*> &Insts) const { // Okay, now we rewrite all loads that use live-in values in the loop, // inserting PHI nodes as necessary. 
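// [Editor's note: standalone illustrative sketch, not part of the patch.]
// The loop rewritten above groups uses by block and then walks each block
// once: if a block contains no store, all of its loads need the live-in
// value; otherwise loads seen before the first store are live-in, later
// loads just forward the most recent stored value, and the last store
// defines the block's live-out. Modeled with hypothetical plain types:
#include <cstddef>
#include <optional>
#include <string>
#include <vector>

struct AccessModel {
  bool IsStore;
  std::string Value; // value written (stores); ignored for loads
};

struct ScanResult {
  std::vector<std::size_t> LiveInLoads; // loads that need the live-in value
  std::optional<std::string> LiveOut;   // value of the last store, if any
};

static ScanResult scanBlock(const std::vector<AccessModel> &Accesses) {
  ScanResult R;
  std::optional<std::string> Current; // most recently stored value
  for (std::size_t I = 0; I != Accesses.size(); ++I) {
    if (Accesses[I].IsStore)
      Current = Accesses[I].Value;
    else if (!Current)
      R.LiveInLoads.push_back(I); // load before any store
    // else: this load would be rewritten to *Current
  }
  R.LiveOut = Current;
  return R;
}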
- for (unsigned i = 0, e = LiveInLoads.size(); i != e; ++i) { - LoadInst *ALoad = LiveInLoads[i]; + for (LoadInst *ALoad : LiveInLoads) { Value *NewVal = SSA.GetValueInMiddleOfBlock(ALoad->getParent()); replaceLoadWithValue(ALoad, NewVal); @@ -454,9 +449,7 @@ run(const SmallVectorImpl<Instruction*> &Insts) const { // Now that everything is rewritten, delete the old instructions from the // function. They should all be dead now. - for (unsigned i = 0, e = Insts.size(); i != e; ++i) { - Instruction *User = Insts[i]; - + for (Instruction *User : Insts) { // If this is a load that still has uses, then the load must have been added // as a live value in the SSAUpdate data structure for a block (e.g. because // the loaded value was stored later). In this case, we need to recursively diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index 92fd56a..3248a83 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -53,9 +53,13 @@ using namespace PatternMatch; #define DEBUG_TYPE "simplifycfg" +// Chosen as 2 so as to be cheap, but still to have enough power to fold +// a select, so the "clamp" idiom (of a min followed by a max) will be caught. +// To catch this, we need to fold a compare and a select, hence '2' being the +// minimum reasonable default. static cl::opt<unsigned> -PHINodeFoldingThreshold("phi-node-folding-threshold", cl::Hidden, cl::init(1), - cl::desc("Control the amount of phi node folding to perform (default = 1)")); +PHINodeFoldingThreshold("phi-node-folding-threshold", cl::Hidden, cl::init(2), + cl::desc("Control the amount of phi node folding to perform (default = 2)")); static cl::opt<bool> DupRet("simplifycfg-dup-ret", cl::Hidden, cl::init(false), @@ -73,6 +77,7 @@ STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps"); STATISTIC(NumLinearMaps, "Number of switch instructions turned into linear mapping"); STATISTIC(NumLookupTables, "Number of switch instructions turned into lookup tables"); STATISTIC(NumLookupTablesHoles, "Number of switch instructions turned into lookup tables (holes checked)"); +STATISTIC(NumTableCmpReuses, "Number of reused switch table lookup compares"); STATISTIC(NumSinkCommons, "Number of common instructions sunk down to the end block"); STATISTIC(NumSpeculations, "Number of speculative executed instructions"); @@ -107,7 +112,7 @@ class SimplifyCFGOpt { const TargetTransformInfo &TTI; unsigned BonusInstThreshold; const DataLayout *const DL; - AssumptionTracker *AT; + AssumptionCache *AC; Value *isValueEqualityComparison(TerminatorInst *TI); BasicBlock *GetValueEqualityComparisonCases(TerminatorInst *TI, std::vector<ValueEqualityComparisonCase> &Cases); @@ -127,8 +132,8 @@ class SimplifyCFGOpt { public: SimplifyCFGOpt(const TargetTransformInfo &TTI, unsigned BonusInstThreshold, - const DataLayout *DL, AssumptionTracker *AT) - : TTI(TTI), BonusInstThreshold(BonusInstThreshold), DL(DL), AT(AT) {} + const DataLayout *DL, AssumptionCache *AC) + : TTI(TTI), BonusInstThreshold(BonusInstThreshold), DL(DL), AC(AC) {} bool run(BasicBlock *BB); }; } @@ -215,45 +220,15 @@ static void AddPredecessorToBlock(BasicBlock *Succ, BasicBlock *NewPred, } /// ComputeSpeculationCost - Compute an abstract "cost" of speculating the -/// given instruction, which is assumed to be safe to speculate. 1 means -/// cheap, 2 means less cheap, and UINT_MAX means prohibitively expensive. 
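// [Editor's note: illustrative example only, not part of the patch.]
// The default phi-node-folding-threshold goes from 1 to 2 so that a branch
// arm holding a compare that feeds a select -- the shape the clamp idiom
// below tends to lower to -- still fits within the speculation budget and
// can be flattened into selects rather than kept as control flow.
#include <algorithm>

// Clamp x into [lo, hi]: a min followed by a max; each half typically costs
// one compare plus one select, i.e. two "basic" units.
static int clampExample(int x, int lo, int hi) {
  return std::max(lo, std::min(x, hi));
}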
-static unsigned ComputeSpeculationCost(const User *I, const DataLayout *DL) { +/// given instruction, which is assumed to be safe to speculate. TCC_Free means +/// cheap, TCC_Basic means less cheap, and TCC_Expensive means prohibitively +/// expensive. +static unsigned ComputeSpeculationCost(const User *I, const DataLayout *DL, + const TargetTransformInfo &TTI) { assert(isSafeToSpeculativelyExecute(I, DL) && "Instruction is not safe to speculatively execute!"); - switch (Operator::getOpcode(I)) { - default: - // In doubt, be conservative. - return UINT_MAX; - case Instruction::GetElementPtr: - // GEPs are cheap if all indices are constant. - if (!cast<GEPOperator>(I)->hasAllConstantIndices()) - return UINT_MAX; - return 1; - case Instruction::ExtractValue: - case Instruction::Load: - case Instruction::Add: - case Instruction::Sub: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::ICmp: - case Instruction::Trunc: - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::BitCast: - case Instruction::ExtractElement: - case Instruction::InsertElement: - return 1; // These are all cheap. - - case Instruction::Call: - case Instruction::Select: - return 2; - } + return TTI.getUserCost(I); } - /// DominatesMergePoint - If we have a merge point of an "if condition" as /// accepted above, return true if the specified value dominates the block. We /// don't handle the true generality of domination here, just a special case @@ -274,7 +249,8 @@ static unsigned ComputeSpeculationCost(const User *I, const DataLayout *DL) { static bool DominatesMergePoint(Value *V, BasicBlock *BB, SmallPtrSetImpl<Instruction*> *AggressiveInsts, unsigned &CostRemaining, - const DataLayout *DL) { + const DataLayout *DL, + const TargetTransformInfo &TTI) { Instruction *I = dyn_cast<Instruction>(V); if (!I) { // Non-instructions all dominate instructions, but not all constantexprs @@ -310,7 +286,7 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB, if (!isSafeToSpeculativelyExecute(I, DL)) return false; - unsigned Cost = ComputeSpeculationCost(I, DL); + unsigned Cost = ComputeSpeculationCost(I, DL, TTI); if (Cost > CostRemaining) return false; @@ -320,7 +296,7 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB, // Okay, we can only really hoist these out if their operands do // not take us over the cost threshold. for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i) - if (!DominatesMergePoint(*i, BB, AggressiveInsts, CostRemaining, DL)) + if (!DominatesMergePoint(*i, BB, AggressiveInsts, CostRemaining, DL, TTI)) return false; // Okay, it's safe to do this! Remember this instruction. 
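// [Editor's note: standalone illustrative sketch, not part of the patch.]
// ComputeSpeculationCost now defers to TTI.getUserCost, and
// DominatesMergePoint charges each hoistable instruction against a shared
// budget before recursing into its operands. Only that bookkeeping is
// modeled below; the safety checks of the real code are omitted, and the
// node type is hypothetical.
#include <vector>

struct ExprNodeModel {
  unsigned Cost;                       // stand-in for a TTI user cost
  std::vector<ExprNodeModel *> Ops;    // operands that must be hoisted too
};

static bool fitsInBudget(const ExprNodeModel *N, unsigned &Remaining) {
  if (!N)
    return true;                       // constants/arguments: free to use
  if (N->Cost > Remaining)
    return false;
  Remaining -= N->Cost;                // charge this node
  for (const ExprNodeModel *Op : N->Ops)
    if (!fitsInBudget(Op, Remaining))
      return false;
  return true;
}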
AggressiveInsts->insert(I); @@ -383,10 +359,9 @@ struct ConstantComparesGatherer { } /// Prevent copy - ConstantComparesGatherer(const ConstantComparesGatherer &) - LLVM_DELETED_FUNCTION; + ConstantComparesGatherer(const ConstantComparesGatherer &) = delete; ConstantComparesGatherer & - operator=(const ConstantComparesGatherer &) LLVM_DELETED_FUNCTION; + operator=(const ConstantComparesGatherer &) = delete; private: @@ -712,8 +687,7 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, if (HasWeight) for (unsigned MD_i = 1, MD_e = MD->getNumOperands(); MD_i < MD_e; ++MD_i) { - ConstantInt* CI = dyn_cast<ConstantInt>(MD->getOperand(MD_i)); - assert(CI); + ConstantInt *CI = mdconst::extract<ConstantInt>(MD->getOperand(MD_i)); Weights.push_back(CI->getValue().getZExtValue()); } for (SwitchInst::CaseIt i = SI->case_end(), e = SI->case_begin(); i != e;) { @@ -818,7 +792,7 @@ static void GetBranchWeights(TerminatorInst *TI, MDNode *MD = TI->getMetadata(LLVMContext::MD_prof); assert(MD); for (unsigned i = 1, e = MD->getNumOperands(); i < e; ++i) { - ConstantInt *CI = cast<ConstantInt>(MD->getOperand(i)); + ConstantInt *CI = mdconst::extract<ConstantInt>(MD->getOperand(i)); Weights.push_back(CI->getValue().getZExtValue()); } @@ -1079,7 +1053,8 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I); /// HoistThenElseCodeToIf - Given a conditional branch that goes to BB1 and /// BB2, hoist any common code in the two blocks up into the branch block. The /// caller of this function guarantees that BI's block dominates BB1 and BB2. -static bool HoistThenElseCodeToIf(BranchInst *BI, const DataLayout *DL) { +static bool HoistThenElseCodeToIf(BranchInst *BI, const DataLayout *DL, + const TargetTransformInfo &TTI) { // This does very trivial matching, with limited scanning, to find identical // instructions in the two blocks. In particular, we don't want to get into // O(M*N) situations here where M and N are the sizes of BB1 and BB2. As @@ -1114,6 +1089,9 @@ static bool HoistThenElseCodeToIf(BranchInst *BI, const DataLayout *DL) { if (isa<TerminatorInst>(I1)) goto HoistTerminator; + if (!TTI.isProfitableToHoist(I1) || !TTI.isProfitableToHoist(I2)) + return Changed; + // For a normal instruction, we just move one to right before the branch, // then replace all uses of the other with the first. Finally, we remove // the now redundant second instruction. @@ -1244,14 +1222,13 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) { return false; // Gather the PHI nodes in BBEnd. - std::map<Value*, std::pair<Value*, PHINode*> > MapValueFromBB1ToBB2; + SmallDenseMap<std::pair<Value *, Value *>, PHINode *> JointValueMap; Instruction *FirstNonPhiInBBEnd = nullptr; - for (BasicBlock::iterator I = BBEnd->begin(), E = BBEnd->end(); - I != E; ++I) { + for (BasicBlock::iterator I = BBEnd->begin(), E = BBEnd->end(); I != E; ++I) { if (PHINode *PN = dyn_cast<PHINode>(I)) { Value *BB1V = PN->getIncomingValueForBlock(BB1); Value *BB2V = PN->getIncomingValueForBlock(BB2); - MapValueFromBB1ToBB2[BB1V] = std::make_pair(BB2V, PN); + JointValueMap[std::make_pair(BB1V, BB2V)] = PN; } else { FirstNonPhiInBBEnd = &*I; break; @@ -1260,13 +1237,13 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) { if (!FirstNonPhiInBBEnd) return false; - // This does very trivial matching, with limited scanning, to find identical // instructions in the two blocks. We scan backward for obviously identical // instructions in an identical order. 
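// [Editor's note: standalone illustrative example, not part of the patch.]
// The sinking code now keys its map on the *pair* of incoming values rather
// than on the BB1 value alone. With the old key, two PHIs merging the same
// BB1 value but different BB2 values would collide; the pair key keeps them
// distinct and also lets a PHI created for one differing-operand pair be
// found again for an identical pair later. Hypothetical strings stand in
// for llvm::Value pointers.
#include <map>
#include <string>
#include <utility>

int main() {
  std::map<std::pair<std::string, std::string>, std::string> JointValueMap;
  JointValueMap[{"a", "x"}] = "phi1"; // PHI of (a from BB1, x from BB2)
  JointValueMap[{"a", "y"}] = "phi2"; // same BB1 value, different BB2 value
  // Both survive; a map keyed only on "a" could hold just one of them.
  return JointValueMap.size() == 2 ? 0 : 1;
}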
BasicBlock::InstListType::reverse_iterator RI1 = BB1->getInstList().rbegin(), - RE1 = BB1->getInstList().rend(), RI2 = BB2->getInstList().rbegin(), - RE2 = BB2->getInstList().rend(); + RE1 = BB1->getInstList().rend(), + RI2 = BB2->getInstList().rbegin(), + RE2 = BB2->getInstList().rend(); // Skip debug info. while (RI1 != RE1 && isa<DbgInfoIntrinsic>(&*RI1)) ++RI1; if (RI1 == RE1) @@ -1289,6 +1266,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) { return Changed; Instruction *I1 = &*RI1, *I2 = &*RI2; + auto InstPair = std::make_pair(I1, I2); // I1 and I2 should have a single use in the same PHI node, and they // perform the same operation. // Cannot move control-flow-involving, volatile loads, vaarg, etc. @@ -1299,11 +1277,11 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) { I1->mayHaveSideEffects() || I2->mayHaveSideEffects() || I1->mayReadOrWriteMemory() || I2->mayReadOrWriteMemory() || !I1->hasOneUse() || !I2->hasOneUse() || - MapValueFromBB1ToBB2.find(I1) == MapValueFromBB1ToBB2.end() || - MapValueFromBB1ToBB2[I1].first != I2) + !JointValueMap.count(InstPair)) return Changed; // Check whether we should swap the operands of ICmpInst. + // TODO: Add support of communativity. ICmpInst *ICmp1 = dyn_cast<ICmpInst>(I1), *ICmp2 = dyn_cast<ICmpInst>(I2); bool SwapOpnds = false; if (ICmp1 && ICmp2 && @@ -1324,16 +1302,13 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) { // with a PHI node after sinking. We only handle the case where there is // a single pair of different operands. Value *DifferentOp1 = nullptr, *DifferentOp2 = nullptr; - unsigned Op1Idx = 0; + unsigned Op1Idx = ~0U; for (unsigned I = 0, E = I1->getNumOperands(); I != E; ++I) { if (I1->getOperand(I) == I2->getOperand(I)) continue; - // Early exit if we have more-than one pair of different operands or - // the different operand is already in MapValueFromBB1ToBB2. - // Early exit if we need a PHI node to replace a constant. - if (DifferentOp1 || - MapValueFromBB1ToBB2.find(I1->getOperand(I)) != - MapValueFromBB1ToBB2.end() || + // Early exit if we have more-than one pair of different operands or if + // we need a PHI node to replace a constant. + if (Op1Idx != ~0U || isa<Constant>(I1->getOperand(I)) || isa<Constant>(I2->getOperand(I))) { // If we can't sink the instructions, undo the swapping. @@ -1346,24 +1321,27 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) { DifferentOp2 = I2->getOperand(I); } - // We insert the pair of different operands to MapValueFromBB1ToBB2 and - // remove (I1, I2) from MapValueFromBB1ToBB2. - if (DifferentOp1) { - PHINode *NewPN = PHINode::Create(DifferentOp1->getType(), 2, - DifferentOp1->getName() + ".sink", - BBEnd->begin()); - MapValueFromBB1ToBB2[DifferentOp1] = std::make_pair(DifferentOp2, NewPN); + DEBUG(dbgs() << "SINK common instructions " << *I1 << "\n"); + DEBUG(dbgs() << " " << *I2 << "\n"); + + // We insert the pair of different operands to JointValueMap and + // remove (I1, I2) from JointValueMap. + if (Op1Idx != ~0U) { + auto &NewPN = JointValueMap[std::make_pair(DifferentOp1, DifferentOp2)]; + if (!NewPN) { + NewPN = + PHINode::Create(DifferentOp1->getType(), 2, + DifferentOp1->getName() + ".sink", BBEnd->begin()); + NewPN->addIncoming(DifferentOp1, BB1); + NewPN->addIncoming(DifferentOp2, BB2); + DEBUG(dbgs() << "Create PHI node " << *NewPN << "\n";); + } // I1 should use NewPN instead of DifferentOp1. 
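// [Editor's note: standalone illustrative sketch, not part of the patch.]
// The matching loop above walks both predecessor blocks backwards from
// their terminators and keeps sinking while the trailing instructions look
// identical. Reduced to sequences of opaque instruction signatures, that is
// a longest-common-suffix scan; the real code additionally tolerates one
// differing operand pair per instruction (patched up with a PHI), which
// this sketch omits. Signatures here are hypothetical strings.
#include <cstddef>
#include <string>
#include <vector>

static std::size_t commonSuffixLength(const std::vector<std::string> &BB1,
                                      const std::vector<std::string> &BB2) {
  std::size_t N = 0;
  while (N < BB1.size() && N < BB2.size() &&
         BB1[BB1.size() - 1 - N] == BB2[BB2.size() - 1 - N])
    ++N;
  return N;
}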
I1->setOperand(Op1Idx, NewPN); - NewPN->addIncoming(DifferentOp1, BB1); - NewPN->addIncoming(DifferentOp2, BB2); - DEBUG(dbgs() << "Create PHI node " << *NewPN << "\n";); } - PHINode *OldPN = MapValueFromBB1ToBB2[I1].second; - MapValueFromBB1ToBB2.erase(I1); + PHINode *OldPN = JointValueMap[InstPair]; + JointValueMap.erase(InstPair); - DEBUG(dbgs() << "SINK common instructions " << *I1 << "\n";); - DEBUG(dbgs() << " " << *I2 << "\n";); // We need to update RE1 and RE2 if we are going to sink the first // instruction in the basic block down. bool UpdateRE1 = (I1 == BB1->begin()), UpdateRE2 = (I2 == BB2->begin()); @@ -1489,7 +1467,8 @@ static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB, /// /// \returns true if the conditional block is removed. static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, - const DataLayout *DL) { + const DataLayout *DL, + const TargetTransformInfo &TTI) { // Be conservative for now. FP select instruction can often be expensive. Value *BrCond = BI->getCondition(); if (isa<FCmpInst>(BrCond)) @@ -1538,7 +1517,8 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, EndBB)))) return false; if (!SpeculatedStoreValue && - ComputeSpeculationCost(I, DL) > PHINodeFoldingThreshold) + ComputeSpeculationCost(I, DL, TTI) > PHINodeFoldingThreshold * + TargetTransformInfo::TCC_Basic) return false; // Store the store speculation candidate. @@ -1597,9 +1577,11 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, if ((ThenCE && !isSafeToSpeculativelyExecute(ThenCE, DL)) || (OrigCE && !isSafeToSpeculativelyExecute(OrigCE, DL))) return false; - unsigned OrigCost = OrigCE ? ComputeSpeculationCost(OrigCE, DL) : 0; - unsigned ThenCost = ThenCE ? ComputeSpeculationCost(ThenCE, DL) : 0; - if (OrigCost + ThenCost > 2 * PHINodeFoldingThreshold) + unsigned OrigCost = OrigCE ? ComputeSpeculationCost(OrigCE, DL, TTI) : 0; + unsigned ThenCost = ThenCE ? ComputeSpeculationCost(ThenCE, DL, TTI) : 0; + unsigned MaxCost = 2 * PHINodeFoldingThreshold * + TargetTransformInfo::TCC_Basic; + if (OrigCost + ThenCost > MaxCost) return false; // Account for the cost of an unfolded ConstantExpr which could end up @@ -1804,7 +1786,8 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout *DL) { /// FoldTwoEntryPHINode - Given a BB that starts with the specified two-entry /// PHI node, see if we can eliminate it. -static bool FoldTwoEntryPHINode(PHINode *PN, const DataLayout *DL) { +static bool FoldTwoEntryPHINode(PHINode *PN, const DataLayout *DL, + const TargetTransformInfo &TTI) { // Ok, this is a two entry PHI node. Check to see if this is a simple "if // statement", which has a very simple dominance structure. 
Basically, we // are trying to find the condition that is being branched on, which @@ -1835,6 +1818,8 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const DataLayout *DL) { SmallPtrSet<Instruction*, 4> AggressiveInsts; unsigned MaxCostVal0 = PHINodeFoldingThreshold, MaxCostVal1 = PHINodeFoldingThreshold; + MaxCostVal0 *= TargetTransformInfo::TCC_Basic; + MaxCostVal1 *= TargetTransformInfo::TCC_Basic; for (BasicBlock::iterator II = BB->begin(); isa<PHINode>(II);) { PHINode *PN = cast<PHINode>(II++); @@ -1845,9 +1830,9 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const DataLayout *DL) { } if (!DominatesMergePoint(PN->getIncomingValue(0), BB, &AggressiveInsts, - MaxCostVal0, DL) || + MaxCostVal0, DL, TTI) || !DominatesMergePoint(PN->getIncomingValue(1), BB, &AggressiveInsts, - MaxCostVal1, DL)) + MaxCostVal1, DL, TTI)) return false; } @@ -2036,8 +2021,10 @@ static bool ExtractBranchMetadata(BranchInst *BI, "Looking for probabilities on unconditional branch?"); MDNode *ProfileData = BI->getMetadata(LLVMContext::MD_prof); if (!ProfileData || ProfileData->getNumOperands() != 3) return false; - ConstantInt *CITrue = dyn_cast<ConstantInt>(ProfileData->getOperand(1)); - ConstantInt *CIFalse = dyn_cast<ConstantInt>(ProfileData->getOperand(2)); + ConstantInt *CITrue = + mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(1)); + ConstantInt *CIFalse = + mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(2)); if (!CITrue || !CIFalse) return false; ProbTrue = CITrue->getValue().getZExtValue(); ProbFalse = CIFalse->getValue().getZExtValue(); @@ -2534,17 +2521,15 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) { // The weight to CommonDest should be PredCommon * SuccTotal + // PredOther * SuccCommon. // The weight to OtherDest should be PredOther * SuccOther. - SmallVector<uint64_t, 2> NewWeights; - NewWeights.push_back(PredCommon * (SuccCommon + SuccOther) + - PredOther * SuccCommon); - NewWeights.push_back(PredOther * SuccOther); + uint64_t NewWeights[2] = {PredCommon * (SuccCommon + SuccOther) + + PredOther * SuccCommon, + PredOther * SuccOther}; // Halve the weights if any of them cannot fit in an uint32_t FitWeights(NewWeights); - SmallVector<uint32_t, 2> MDWeights(NewWeights.begin(),NewWeights.end()); PBI->setMetadata(LLVMContext::MD_prof, - MDBuilder(BI->getContext()). - createBranchWeights(MDWeights)); + MDBuilder(BI->getContext()) + .createBranchWeights(NewWeights[0], NewWeights[1])); } // OtherDest may have phi nodes. If so, add an entry from PBI's @@ -2718,7 +2703,7 @@ static bool SimplifyIndirectBrOnSelect(IndirectBrInst *IBI, SelectInst *SI) { /// the PHI, merging the third icmp into the switch. static bool TryToSimplifyUncondBranchWithICmpInIt( ICmpInst *ICI, IRBuilder<> &Builder, const TargetTransformInfo &TTI, - unsigned BonusInstThreshold, const DataLayout *DL, AssumptionTracker *AT) { + unsigned BonusInstThreshold, const DataLayout *DL, AssumptionCache *AC) { BasicBlock *BB = ICI->getParent(); // If the block has any PHIs in it or the icmp has multiple uses, it is too @@ -2751,7 +2736,7 @@ static bool TryToSimplifyUncondBranchWithICmpInIt( ICI->eraseFromParent(); } // BB is now empty, so it is likely to simplify away. - return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true; + return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true; } // Ok, the block is reachable from the default dest. 
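// [Editor's note: standalone illustrative sketch, not part of the patch.]
// The branch-weight update above follows the comment's probability algebra:
// after folding, CommonDest is reached either when the predecessor branch
// goes there directly or when it falls through and the successor branch
// then picks CommonDest; OtherDest is reached only when both take the
// "other" edge. Restated as a plain function:
#include <cstdint>
#include <utility>

static std::pair<uint64_t, uint64_t>
mergedBranchWeights(uint64_t PredCommon, uint64_t PredOther,
                    uint64_t SuccCommon, uint64_t SuccOther) {
  uint64_t ToCommon =
      PredCommon * (SuccCommon + SuccOther) + PredOther * SuccCommon;
  uint64_t ToOther = PredOther * SuccOther;
  return {ToCommon, ToOther}; // still halved by FitWeights before emission
}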
If the constant we're @@ -2767,7 +2752,7 @@ static bool TryToSimplifyUncondBranchWithICmpInIt( ICI->replaceAllUsesWith(V); ICI->eraseFromParent(); // BB is now empty, so it is likely to simplify away. - return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true; + return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true; } // The use of the icmp has to be in the 'end' block, by the only PHI node in @@ -2947,20 +2932,9 @@ bool SimplifyCFGOpt::SimplifyResume(ResumeInst *RI, IRBuilder<> &Builder) { return false; // Turn all invokes that unwind here into calls and delete the basic block. - bool InvokeRequiresTableEntry = false; - bool Changed = false; for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE;) { InvokeInst *II = cast<InvokeInst>((*PI++)->getTerminator()); - - if (II->hasFnAttr(Attribute::UWTable)) { - // Don't remove an `invoke' instruction if the ABI requires an entry into - // the table. - InvokeRequiresTableEntry = true; - continue; - } - SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3); - // Insert a call instruction before the invoke. CallInst *Call = CallInst::Create(II->getCalledValue(), Args, "", II); Call->takeName(II); @@ -2980,14 +2954,11 @@ bool SimplifyCFGOpt::SimplifyResume(ResumeInst *RI, IRBuilder<> &Builder) { // Finally, delete the invoke instruction! II->eraseFromParent(); - Changed = true; } - if (!InvokeRequiresTableEntry) - // The landingpad is now unreachable. Zap it. - BB->eraseFromParent(); - - return Changed; + // The landingpad is now unreachable. Zap it. + BB->eraseFromParent(); + return true; } bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) { @@ -3018,7 +2989,7 @@ bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) { } // If we eliminated all predecessors of the block, delete the block now. - if (pred_begin(BB) == pred_end(BB)) + if (pred_empty(BB)) // We know there are no successors, so just nuke the block. BB->eraseFromParent(); @@ -3119,55 +3090,6 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { --i; --e; Changed = true; } - // If the default value is unreachable, figure out the most popular - // destination and make it the default. - if (SI->getDefaultDest() == BB) { - std::map<BasicBlock*, std::pair<unsigned, unsigned> > Popularity; - for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); - i != e; ++i) { - std::pair<unsigned, unsigned> &entry = - Popularity[i.getCaseSuccessor()]; - if (entry.first == 0) { - entry.first = 1; - entry.second = i.getCaseIndex(); - } else { - entry.first++; - } - } - - // Find the most popular block. - unsigned MaxPop = 0; - unsigned MaxIndex = 0; - BasicBlock *MaxBlock = nullptr; - for (std::map<BasicBlock*, std::pair<unsigned, unsigned> >::iterator - I = Popularity.begin(), E = Popularity.end(); I != E; ++I) { - if (I->second.first > MaxPop || - (I->second.first == MaxPop && MaxIndex > I->second.second)) { - MaxPop = I->second.first; - MaxIndex = I->second.second; - MaxBlock = I->first; - } - } - if (MaxBlock) { - // Make this the new default, allowing us to delete any explicit - // edges to it. - SI->setDefaultDest(MaxBlock); - Changed = true; - - // If MaxBlock has phinodes in it, remove MaxPop-1 entries from - // it. 
- if (isa<PHINode>(MaxBlock->begin())) - for (unsigned i = 0; i != MaxPop-1; ++i) - MaxBlock->removePredecessor(SI->getParent()); - - for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); - i != e; ++i) - if (i.getCaseSuccessor() == MaxBlock) { - SI->removeCase(i); - --i; --e; - } - } - } } else if (InvokeInst *II = dyn_cast<InvokeInst>(TI)) { if (II->getUnwindDest() == BB) { // Convert the invoke to a call instruction. This would be a good @@ -3191,7 +3113,7 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { } // If this block is now dead, remove it. - if (pred_begin(BB) == pred_end(BB) && + if (pred_empty(BB) && BB != &BB->getParent()->getEntryBlock()) { // We know there are no successors, so just nuke the block. BB->eraseFromParent(); @@ -3201,70 +3123,122 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { return Changed; } -/// TurnSwitchRangeIntoICmp - Turns a switch with that contains only a -/// integer range comparison into a sub, an icmp and a branch. -static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) { - assert(SI->getNumCases() > 1 && "Degenerate switch?"); +static bool CasesAreContiguous(SmallVectorImpl<ConstantInt *> &Cases) { + assert(Cases.size() >= 1); - // Make sure all cases point to the same destination and gather the values. - SmallVector<ConstantInt *, 16> Cases; - SwitchInst::CaseIt I = SI->case_begin(); - Cases.push_back(I.getCaseValue()); - SwitchInst::CaseIt PrevI = I++; - for (SwitchInst::CaseIt E = SI->case_end(); I != E; PrevI = I++) { - if (PrevI.getCaseSuccessor() != I.getCaseSuccessor()) + array_pod_sort(Cases.begin(), Cases.end(), ConstantIntSortPredicate); + for (size_t I = 1, E = Cases.size(); I != E; ++I) { + if (Cases[I - 1]->getValue() != Cases[I]->getValue() + 1) return false; - Cases.push_back(I.getCaseValue()); } - assert(Cases.size() == SI->getNumCases() && "Not all cases gathered"); + return true; +} - // Sort the case values, then check if they form a range we can transform. - array_pod_sort(Cases.begin(), Cases.end(), ConstantIntSortPredicate); - for (unsigned I = 1, E = Cases.size(); I != E; ++I) { - if (Cases[I-1]->getValue() != Cases[I]->getValue()+1) - return false; +/// Turn a switch with two reachable destinations into an integer range +/// comparison and branch. +static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) { + assert(SI->getNumCases() > 1 && "Degenerate switch?"); + + bool HasDefault = + !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg()); + + // Partition the cases into two sets with different destinations. + BasicBlock *DestA = HasDefault ? SI->getDefaultDest() : nullptr; + BasicBlock *DestB = nullptr; + SmallVector <ConstantInt *, 16> CasesA; + SmallVector <ConstantInt *, 16> CasesB; + + for (SwitchInst::CaseIt I : SI->cases()) { + BasicBlock *Dest = I.getCaseSuccessor(); + if (!DestA) DestA = Dest; + if (Dest == DestA) { + CasesA.push_back(I.getCaseValue()); + continue; + } + if (!DestB) DestB = Dest; + if (Dest == DestB) { + CasesB.push_back(I.getCaseValue()); + continue; + } + return false; // More than two destinations. 
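// [Editor's note: standalone illustrative sketch, not part of the patch.]
// TurnSwitchRangeIntoICmp now only requires that one of the two destination
// sets forms a contiguous run of case values. The patch sorts with
// ConstantIntSortPredicate; this standalone version sorts ascending, and
// the adjacency test is the same either way.
#include <algorithm>
#include <cstdint>
#include <vector>

static bool casesAreContiguousModel(std::vector<int64_t> Cases) {
  std::sort(Cases.begin(), Cases.end());
  for (std::size_t I = 1; I < Cases.size(); ++I)
    if (Cases[I] != Cases[I - 1] + 1)
      return false;
  return true; // a single case is trivially contiguous
}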
} - Constant *Offset = ConstantExpr::getNeg(Cases.back()); - Constant *NumCases = ConstantInt::get(Offset->getType(), SI->getNumCases()); + assert(DestA && DestB && "Single-destination switch should have been folded."); + assert(DestA != DestB); + assert(DestB != SI->getDefaultDest()); + assert(!CasesB.empty() && "There must be non-default cases."); + assert(!CasesA.empty() || HasDefault); + + // Figure out if one of the sets of cases form a contiguous range. + SmallVectorImpl<ConstantInt *> *ContiguousCases = nullptr; + BasicBlock *ContiguousDest = nullptr; + BasicBlock *OtherDest = nullptr; + if (!CasesA.empty() && CasesAreContiguous(CasesA)) { + ContiguousCases = &CasesA; + ContiguousDest = DestA; + OtherDest = DestB; + } else if (CasesAreContiguous(CasesB)) { + ContiguousCases = &CasesB; + ContiguousDest = DestB; + OtherDest = DestA; + } else + return false; + + // Start building the compare and branch. + + Constant *Offset = ConstantExpr::getNeg(ContiguousCases->back()); + Constant *NumCases = ConstantInt::get(Offset->getType(), ContiguousCases->size()); Value *Sub = SI->getCondition(); if (!Offset->isNullValue()) - Sub = Builder.CreateAdd(Sub, Offset, Sub->getName()+".off"); + Sub = Builder.CreateAdd(Sub, Offset, Sub->getName() + ".off"); + Value *Cmp; // If NumCases overflowed, then all possible values jump to the successor. - if (NumCases->isNullValue() && SI->getNumCases() != 0) + if (NumCases->isNullValue() && !ContiguousCases->empty()) Cmp = ConstantInt::getTrue(SI->getContext()); else Cmp = Builder.CreateICmpULT(Sub, NumCases, "switch"); - BranchInst *NewBI = Builder.CreateCondBr( - Cmp, SI->case_begin().getCaseSuccessor(), SI->getDefaultDest()); + BranchInst *NewBI = Builder.CreateCondBr(Cmp, ContiguousDest, OtherDest); // Update weight for the newly-created conditional branch. - SmallVector<uint64_t, 8> Weights; - bool HasWeights = HasBranchWeights(SI); - if (HasWeights) { + if (HasBranchWeights(SI)) { + SmallVector<uint64_t, 8> Weights; GetBranchWeights(SI, Weights); if (Weights.size() == 1 + SI->getNumCases()) { - // Combine all weights for the cases to be the true weight of NewBI. - // We assume that the sum of all weights for a Terminator can fit into 32 - // bits. - uint32_t NewTrueWeight = 0; - for (unsigned I = 1, E = Weights.size(); I != E; ++I) - NewTrueWeight += (uint32_t)Weights[I]; + uint64_t TrueWeight = 0; + uint64_t FalseWeight = 0; + for (size_t I = 0, E = Weights.size(); I != E; ++I) { + if (SI->getSuccessor(I) == ContiguousDest) + TrueWeight += Weights[I]; + else + FalseWeight += Weights[I]; + } + while (TrueWeight > UINT32_MAX || FalseWeight > UINT32_MAX) { + TrueWeight /= 2; + FalseWeight /= 2; + } NewBI->setMetadata(LLVMContext::MD_prof, - MDBuilder(SI->getContext()). - createBranchWeights(NewTrueWeight, - (uint32_t)Weights[0])); + MDBuilder(SI->getContext()).createBranchWeights( + (uint32_t)TrueWeight, (uint32_t)FalseWeight)); } } - // Prune obsolete incoming values off the successor's PHI nodes. - for (BasicBlock::iterator BBI = SI->case_begin().getCaseSuccessor()->begin(); - isa<PHINode>(BBI); ++BBI) { - for (unsigned I = 0, E = SI->getNumCases()-1; I != E; ++I) + // Prune obsolete incoming values off the successors' PHI nodes. 
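// [Editor's note: standalone illustrative sketch, not part of the patch.]
// The new weight handling sums 64-bit per-case weights for each destination
// and then halves both sums until they fit in 32 bits, since branch-weight
// metadata is stored as uint32_t; the ratio is preserved approximately.
#include <cstdint>
#include <utility>

static std::pair<uint32_t, uint32_t> fitWeights(uint64_t TrueW,
                                                uint64_t FalseW) {
  while (TrueW > UINT32_MAX || FalseW > UINT32_MAX) {
    TrueW /= 2;
    FalseW /= 2;
  }
  return {static_cast<uint32_t>(TrueW), static_cast<uint32_t>(FalseW)};
}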
+ for (auto BBI = ContiguousDest->begin(); isa<PHINode>(BBI); ++BBI) { + unsigned PreviousEdges = ContiguousCases->size(); + if (ContiguousDest == SI->getDefaultDest()) ++PreviousEdges; + for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I) + cast<PHINode>(BBI)->removeIncomingValue(SI->getParent()); + } + for (auto BBI = OtherDest->begin(); isa<PHINode>(BBI); ++BBI) { + unsigned PreviousEdges = SI->getNumCases() - ContiguousCases->size(); + if (OtherDest == SI->getDefaultDest()) ++PreviousEdges; + for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I) cast<PHINode>(BBI)->removeIncomingValue(SI->getParent()); } + + // Drop the switch. SI->eraseFromParent(); return true; @@ -3273,11 +3247,11 @@ static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) { /// EliminateDeadSwitchCases - Compute masked bits for the condition of a switch /// and use it to remove dead cases. static bool EliminateDeadSwitchCases(SwitchInst *SI, const DataLayout *DL, - AssumptionTracker *AT) { + AssumptionCache *AC) { Value *Cond = SI->getCondition(); unsigned Bits = Cond->getType()->getIntegerBitWidth(); APInt KnownZero(Bits, 0), KnownOne(Bits, 0); - computeKnownBits(Cond, KnownZero, KnownOne, DL, 0, AT, SI); + computeKnownBits(Cond, KnownZero, KnownOne, DL, 0, AC, SI); // Gather dead cases. SmallVector<ConstantInt*, 8> DeadCases; @@ -3484,6 +3458,21 @@ GetCaseResults(SwitchInst *SI, continue; } else if (Constant *C = ConstantFold(I, ConstantPool, DL)) { // Instruction is side-effect free and constant. + + // If the instruction has uses outside this block or a phi node slot for + // the block, it is not safe to bypass the instruction since it would then + // no longer dominate all its uses. + for (auto &Use : I->uses()) { + User *User = Use.getUser(); + if (Instruction *I = dyn_cast<Instruction>(User)) + if (I->getParent() == CaseDest) + continue; + if (PHINode *Phi = dyn_cast<PHINode>(User)) + if (Phi->getIncomingBlock(Use) == CaseDest) + continue; + return false; + } + ConstantPool.insert(std::make_pair(I, C)); } else { break; @@ -3509,12 +3498,6 @@ GetCaseResults(SwitchInst *SI, if (!ConstVal) return false; - // Note: If the constant comes from constant-propagating the case value - // through the CaseDest basic block, it will be safe to remove the - // instructions in that block. They cannot be used (except in the phi nodes - // we visit) outside CaseDest, because that block does not dominate its - // successor. If it did, we would not be in this phi node. - // Be conservative about which kinds of constants we support. if (!ValidLookupTableConstant(ConstVal)) return false; @@ -3655,7 +3638,7 @@ static void RemoveSwitchAfterSelectConversion(SwitchInst *SI, PHINode *PHI, /// phi nodes in a common successor block with only two different /// constant values, replace the switch with select. static bool SwitchToSelect(SwitchInst *SI, IRBuilder<> &Builder, - const DataLayout *DL, AssumptionTracker *AT) { + const DataLayout *DL, AssumptionCache *AC) { Value *const Cond = SI->getCondition(); PHINode *PHI = nullptr; BasicBlock *CommonDest = nullptr; @@ -3982,6 +3965,89 @@ static bool ShouldBuildLookupTable(SwitchInst *SI, return SI->getNumCases() * 10 >= TableSize * 4; } +/// Try to reuse the switch table index compare. Following pattern: +/// \code +/// if (idx < tablesize) +/// r = table[idx]; // table does not contain default_value +/// else +/// r = default_value; +/// if (r != default_value) +/// ... 
+/// \endcode +/// Is optimized to: +/// \code +/// cond = idx < tablesize; +/// if (cond) +/// r = table[idx]; +/// else +/// r = default_value; +/// if (cond) +/// ... +/// \endcode +/// Jump threading will then eliminate the second if(cond). +static void reuseTableCompare(User *PhiUser, BasicBlock *PhiBlock, + BranchInst *RangeCheckBranch, Constant *DefaultValue, + const SmallVectorImpl<std::pair<ConstantInt*, Constant*> >& Values) { + + ICmpInst *CmpInst = dyn_cast<ICmpInst>(PhiUser); + if (!CmpInst) + return; + + // We require that the compare is in the same block as the phi so that jump + // threading can do its work afterwards. + if (CmpInst->getParent() != PhiBlock) + return; + + Constant *CmpOp1 = dyn_cast<Constant>(CmpInst->getOperand(1)); + if (!CmpOp1) + return; + + Value *RangeCmp = RangeCheckBranch->getCondition(); + Constant *TrueConst = ConstantInt::getTrue(RangeCmp->getType()); + Constant *FalseConst = ConstantInt::getFalse(RangeCmp->getType()); + + // Check if the compare with the default value is constant true or false. + Constant *DefaultConst = ConstantExpr::getICmp(CmpInst->getPredicate(), + DefaultValue, CmpOp1, true); + if (DefaultConst != TrueConst && DefaultConst != FalseConst) + return; + + // Check if the compare with the case values is distinct from the default + // compare result. + for (auto ValuePair : Values) { + Constant *CaseConst = ConstantExpr::getICmp(CmpInst->getPredicate(), + ValuePair.second, CmpOp1, true); + if (!CaseConst || CaseConst == DefaultConst) + return; + assert((CaseConst == TrueConst || CaseConst == FalseConst) && + "Expect true or false as compare result."); + } + + // Check if the branch instruction dominates the phi node. It's a simple + // dominance check, but sufficient for our needs. + // Although this check is invariant in the calling loops, it's better to do it + // at this late stage. Practically we do it at most once for a switch. + BasicBlock *BranchBlock = RangeCheckBranch->getParent(); + for (auto PI = pred_begin(PhiBlock), E = pred_end(PhiBlock); PI != E; ++PI) { + BasicBlock *Pred = *PI; + if (Pred != BranchBlock && Pred->getUniquePredecessor() != BranchBlock) + return; + } + + if (DefaultConst == FalseConst) { + // The compare yields the same result. We can replace it. + CmpInst->replaceAllUsesWith(RangeCmp); + ++NumTableCmpReuses; + } else { + // The compare yields the same result, just inverted. We can replace it. + Value *InvertedTableCmp = BinaryOperator::CreateXor(RangeCmp, + ConstantInt::get(RangeCmp->getType(), 1), "inverted.cmp", + RangeCheckBranch); + CmpInst->replaceAllUsesWith(InvertedTableCmp); + ++NumTableCmpReuses; + } +} + /// SwitchToLookupTable - If the switch is only used to initialize one or more /// phi nodes in a common successor block with different constant values, /// replace the switch with lookup tables. @@ -4058,11 +4124,8 @@ static bool SwitchToLookupTable(SwitchInst *SI, // If the table has holes, we need a constant result for the default case // or a bitmask that fits in a register. 
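// [Editor's note: standalone illustrative demo, not part of the patch.]
// reuseTableCompare replaces a compare of the looked-up result against a
// constant with the already-computed range check: the compare against the
// default value must fold to a constant and every case result must fold to
// the opposite constant; the compare then equals the range check itself or
// its inversion (handled with an xor). The demo below verifies that
// equivalence on a concrete table in which no entry equals the default
// value; all numbers are hypothetical.
#include <cassert>
#include <cstddef>

int main() {
  const int Table[] = {7, 9, 11, 13};
  const std::size_t TableSize = sizeof(Table) / sizeof(Table[0]);
  const int DefaultValue = 0;

  for (std::size_t Idx = 0; Idx < TableSize + 4; ++Idx) {
    bool Cond = Idx < TableSize;         // the range check emitted anyway
    int R = Cond ? Table[Idx] : DefaultValue;
    bool Original = (R != DefaultValue); // the user's follow-up compare
    assert(Original == Cond);            // ...can simply reuse Cond
  }
  return 0;
}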
SmallVector<std::pair<PHINode*, Constant*>, 4> DefaultResultsList; - bool HasDefaultResults = false; - if (TableHasHoles) { - HasDefaultResults = GetCaseResults(SI, nullptr, SI->getDefaultDest(), + bool HasDefaultResults = GetCaseResults(SI, nullptr, SI->getDefaultDest(), &CommonDest, DefaultResultsList, DL); - } bool NeedMask = (TableHasHoles && !HasDefaultResults); if (NeedMask) { @@ -4102,21 +4165,24 @@ static bool SwitchToLookupTable(SwitchInst *SI, "It is impossible for a switch to have more entries than the max " "representable value of its input integer type's size."); - // If we have a fully covered lookup table, unconditionally branch to the - // lookup table BB. Otherwise, check if the condition value is within the case - // range. If it is so, branch to the new BB. Otherwise branch to SI's default - // destination. - const bool GeneratingCoveredLookupTable = MaxTableSize == TableSize; - if (GeneratingCoveredLookupTable) { + // If the default destination is unreachable, or if the lookup table covers + // all values of the conditional variable, branch directly to the lookup table + // BB. Otherwise, check that the condition is within the case range. + const bool DefaultIsReachable = + !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg()); + const bool GeneratingCoveredLookupTable = (MaxTableSize == TableSize); + BranchInst *RangeCheckBranch = nullptr; + + if (!DefaultIsReachable || GeneratingCoveredLookupTable) { Builder.CreateBr(LookupBB); // We cached PHINodes in PHIs, to avoid accessing deleted PHINodes later, // do not delete PHINodes here. SI->getDefaultDest()->removePredecessor(SI->getParent(), - true/*DontDeleteUselessPHIs*/); + /*DontDeleteUselessPHIs=*/true); } else { Value *Cmp = Builder.CreateICmpULT(TableIndex, ConstantInt::get( MinCaseVal->getType(), TableSize)); - Builder.CreateCondBr(Cmp, LookupBB, SI->getDefaultDest()); + RangeCheckBranch = Builder.CreateCondBr(Cmp, LookupBB, SI->getDefaultDest()); } // Populate the BB that does the lookups. @@ -4167,11 +4233,11 @@ static bool SwitchToLookupTable(SwitchInst *SI, bool ReturnedEarly = false; for (size_t I = 0, E = PHIs.size(); I != E; ++I) { PHINode *PHI = PHIs[I]; + const ResultListTy &ResultList = ResultLists[PHI]; // If using a bitmask, use any value to fill the lookup table holes. Constant *DV = NeedMask ? ResultLists[PHI][0].second : DefaultResults[PHI]; - SwitchLookupTable Table(Mod, TableSize, MinCaseVal, ResultLists[PHI], - DV, DL); + SwitchLookupTable Table(Mod, TableSize, MinCaseVal, ResultList, DV, DL); Value *Result = Table.BuildLookup(TableIndex, Builder); @@ -4184,6 +4250,16 @@ static bool SwitchToLookupTable(SwitchInst *SI, break; } + // Do a small peephole optimization: re-use the switch table compare if + // possible. + if (!TableHasHoles && HasDefaultResults && RangeCheckBranch) { + BasicBlock *PhiBlock = PHI->getParent(); + // Search for compare instructions which use the phi. + for (auto *User : PHI->users()) { + reuseTableCompare(User, PhiBlock, RangeCheckBranch, DV, ResultList); + } + } + PHI->addIncoming(Result, LookupBB); } @@ -4214,12 +4290,12 @@ bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { // see if that predecessor totally determines the outcome of this switch. 
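// [Editor's note: standalone illustrative analogue, not part of the patch.]
// What SwitchToLookupTable emits, written as ordinary C++: the condition is
// rebased to the smallest case value and used to index a constant table,
// guarded by the range check that becomes RangeCheckBranch. When the
// default is unreachable or the table covers every possible input, that
// guard is dropped and the lookup block is entered unconditionally. Table
// contents and bounds below are hypothetical.
#include <cstdint>

static int loweredSwitch(int32_t X) {
  static const int Table[] = {10, 20, 30, 40}; // results for MinCase..MinCase+3
  const int32_t MinCase = 5;
  const uint32_t TableSize = 4;
  const int DefaultResult = -1;

  uint32_t Index =
      static_cast<uint32_t>(X) - static_cast<uint32_t>(MinCase); // TableIndex
  if (Index < TableSize)                                         // RangeCheckBranch
    return Table[Index];
  return DefaultResult;
}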
if (BasicBlock *OnlyPred = BB->getSinglePredecessor()) if (SimplifyEqualityComparisonWithOnlyPredecessor(SI, OnlyPred, Builder)) - return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true; + return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true; Value *Cond = SI->getCondition(); if (SelectInst *Select = dyn_cast<SelectInst>(Cond)) if (SimplifySwitchOnSelect(SI, Select)) - return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true; + return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true; // If the block only contains the switch, see if we can fold the block // away into any preds. @@ -4229,25 +4305,25 @@ bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { ++BBI; if (SI == &*BBI) if (FoldValueComparisonIntoPredecessors(SI, Builder)) - return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true; + return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true; } // Try to transform the switch into an icmp and a branch. if (TurnSwitchRangeIntoICmp(SI, Builder)) - return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true; + return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true; // Remove unreachable cases. - if (EliminateDeadSwitchCases(SI, DL, AT)) - return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true; + if (EliminateDeadSwitchCases(SI, DL, AC)) + return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true; - if (SwitchToSelect(SI, Builder, DL, AT)) - return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true; + if (SwitchToSelect(SI, Builder, DL, AC)) + return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true; if (ForwardSwitchConditionToPHI(SI)) - return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true; + return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true; if (SwitchToLookupTable(SI, Builder, TTI, DL)) - return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true; + return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true; return false; } @@ -4284,7 +4360,7 @@ bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) { if (SelectInst *SI = dyn_cast<SelectInst>(IBI->getAddress())) { if (SimplifyIndirectBrOnSelect(IBI, SI)) - return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true; + return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true; } return Changed; } @@ -4309,7 +4385,7 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){ ; if (I->isTerminator() && TryToSimplifyUncondBranchWithICmpInIt(ICI, Builder, TTI, - BonusInstThreshold, DL, AT)) + BonusInstThreshold, DL, AC)) return true; } @@ -4318,7 +4394,7 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){ // predecessor and use logical operations to update the incoming value // for PHI nodes in common successor. if (FoldBranchToCommonDest(BI, DL, BonusInstThreshold)) - return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true; + return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true; return false; } @@ -4333,7 +4409,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { // switch. if (BasicBlock *OnlyPred = BB->getSinglePredecessor()) if (SimplifyEqualityComparisonWithOnlyPredecessor(BI, OnlyPred, Builder)) - return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true; + return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true; // This block must be empty, except for the setcond inst, if it exists. // Ignore dbg intrinsics. 
@@ -4343,14 +4419,14 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { ++I; if (&*I == BI) { if (FoldValueComparisonIntoPredecessors(BI, Builder)) - return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true; + return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true; } else if (&*I == cast<Instruction>(BI->getCondition())){ ++I; // Ignore dbg intrinsics. while (isa<DbgInfoIntrinsic>(I)) ++I; if (&*I == BI && FoldValueComparisonIntoPredecessors(BI, Builder)) - return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true; + return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true; } } @@ -4362,7 +4438,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { // branches to us and one of our successors, fold the comparison into the // predecessor and use logical operations to pick the right destination. if (FoldBranchToCommonDest(BI, DL, BonusInstThreshold)) - return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true; + return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true; // We have a conditional branch to two blocks that are only reachable // from BI. We know that the condbr dominates the two blocks, so see if @@ -4370,16 +4446,16 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { // can hoist it up to the branching block. if (BI->getSuccessor(0)->getSinglePredecessor()) { if (BI->getSuccessor(1)->getSinglePredecessor()) { - if (HoistThenElseCodeToIf(BI, DL)) - return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true; + if (HoistThenElseCodeToIf(BI, DL, TTI)) + return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true; } else { // If Successor #1 has multiple preds, we may be able to conditionally // execute Successor #0 if it branches to Successor #1. TerminatorInst *Succ0TI = BI->getSuccessor(0)->getTerminator(); if (Succ0TI->getNumSuccessors() == 1 && Succ0TI->getSuccessor(0) == BI->getSuccessor(1)) - if (SpeculativelyExecuteBB(BI, BI->getSuccessor(0), DL)) - return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true; + if (SpeculativelyExecuteBB(BI, BI->getSuccessor(0), DL, TTI)) + return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true; } } else if (BI->getSuccessor(1)->getSinglePredecessor()) { // If Successor #0 has multiple preds, we may be able to conditionally @@ -4387,8 +4463,8 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { TerminatorInst *Succ1TI = BI->getSuccessor(1)->getTerminator(); if (Succ1TI->getNumSuccessors() == 1 && Succ1TI->getSuccessor(0) == BI->getSuccessor(0)) - if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1), DL)) - return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true; + if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1), DL, TTI)) + return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true; } // If this is a branch on a phi node in the current block, thread control @@ -4396,14 +4472,14 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { if (PHINode *PN = dyn_cast<PHINode>(BI->getCondition())) if (PN->getParent() == BI->getParent()) if (FoldCondBranchOnPHI(BI, DL)) - return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true; + return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true; // Scan predecessor blocks for conditional branches. 
for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator())) if (PBI != BI && PBI->isConditional()) if (SimplifyCondBranchToCondBranch(PBI, BI)) - return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true; + return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true; return false; } @@ -4484,7 +4560,7 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) { // Remove basic blocks that have no predecessors (except the entry block)... // or that just have themself as a predecessor. These are unreachable. - if ((pred_begin(BB) == pred_end(BB) && + if ((pred_empty(BB) && BB != &BB->getParent()->getEntryBlock()) || BB->getSinglePredecessor() == BB) { DEBUG(dbgs() << "Removing BB: \n" << *BB); @@ -4515,7 +4591,7 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) { // eliminate it, do so now. if (PHINode *PN = dyn_cast<PHINode>(BB->begin())) if (PN->getNumIncomingValues() == 2) - Changed |= FoldTwoEntryPHINode(PN, DL); + Changed |= FoldTwoEntryPHINode(PN, DL, TTI); Builder.SetInsertPoint(BB->getTerminator()); if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) { @@ -4547,7 +4623,7 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) { /// of the CFG. It returns true if a modification was made. /// bool llvm::SimplifyCFG(BasicBlock *BB, const TargetTransformInfo &TTI, - unsigned BonusInstThreshold, - const DataLayout *DL, AssumptionTracker *AT) { - return SimplifyCFGOpt(TTI, BonusInstThreshold, DL, AT).run(BB); + unsigned BonusInstThreshold, const DataLayout *DL, + AssumptionCache *AC) { + return SimplifyCFGOpt(TTI, BonusInstThreshold, DL, AC).run(BB); } diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp index a4fdd55..6a5d885 100644 --- a/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -48,22 +48,15 @@ namespace { Loop *L; LoopInfo *LI; ScalarEvolution *SE; - const DataLayout *DL; // May be NULL SmallVectorImpl<WeakVH> &DeadInsts; bool Changed; public: - SimplifyIndvar(Loop *Loop, ScalarEvolution *SE, LPPassManager *LPM, - SmallVectorImpl<WeakVH> &Dead, IVUsers *IVU = nullptr) : - L(Loop), - LI(LPM->getAnalysisIfAvailable<LoopInfo>()), - SE(SE), - DeadInsts(Dead), - Changed(false) { - DataLayoutPass *DLP = LPM->getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : nullptr; + SimplifyIndvar(Loop *Loop, ScalarEvolution *SE, LoopInfo *LI, + SmallVectorImpl<WeakVH> &Dead, IVUsers *IVU = nullptr) + : L(Loop), LI(LI), SE(SE), DeadInsts(Dead), Changed(false) { assert(LI && "IV simplification requires LoopInfo"); } @@ -80,6 +73,7 @@ namespace { void eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand); void eliminateIVRemainder(BinaryOperator *Rem, Value *IVOperand, bool IsSigned); + bool strengthenOverflowingOperation(BinaryOperator *OBO, Value *IVOperand); Instruction *splitOverflowIntrinsic(Instruction *IVUser, const DominatorTree *DT); @@ -271,6 +265,107 @@ bool SimplifyIndvar::eliminateIVUser(Instruction *UseInst, return true; } +/// Annotate BO with nsw / nuw if it provably does not signed-overflow / +/// unsigned-overflow. Returns true if anything changed, false otherwise. 
+bool SimplifyIndvar::strengthenOverflowingOperation(BinaryOperator *BO, + Value *IVOperand) { + + // Currently we only handle instructions of the form "add <indvar> <value>" + unsigned Op = BO->getOpcode(); + if (Op != Instruction::Add) + return false; + + // If BO is already both nuw and nsw then there is nothing left to do + if (BO->hasNoUnsignedWrap() && BO->hasNoSignedWrap()) + return false; + + IntegerType *IT = cast<IntegerType>(IVOperand->getType()); + Value *OtherOperand = nullptr; + if (BO->getOperand(0) == IVOperand) { + OtherOperand = BO->getOperand(1); + } else { + assert(BO->getOperand(1) == IVOperand && "only other use!"); + OtherOperand = BO->getOperand(0); + } + + bool Changed = false; + const SCEV *OtherOpSCEV = SE->getSCEV(OtherOperand); + if (OtherOpSCEV == SE->getCouldNotCompute()) + return false; + + const SCEV *IVOpSCEV = SE->getSCEV(IVOperand); + const SCEV *ZeroSCEV = SE->getConstant(IVOpSCEV->getType(), 0); + + if (!BO->hasNoSignedWrap()) { + // Upgrade the add to an "add nsw" if we can prove that it will never + // sign-overflow or sign-underflow. + + const SCEV *SignedMax = + SE->getConstant(APInt::getSignedMaxValue(IT->getBitWidth())); + const SCEV *SignedMin = + SE->getConstant(APInt::getSignedMinValue(IT->getBitWidth())); + + // The addition "IVOperand + OtherOp" does not sign-overflow if the result + // is sign-representable in 2's complement in the given bit-width. + // + // If OtherOp is SLT 0, then for an IVOperand in [SignedMin - OtherOp, + // SignedMax], "IVOperand + OtherOp" is in [SignedMin, SignedMax + OtherOp]. + // Everything in [SignedMin, SignedMax + OtherOp] is representable since + // SignedMax + OtherOp is at least -1. + // + // If OtherOp is SGE 0, then for an IVOperand in [SignedMin, SignedMax - + // OtherOp], "IVOperand + OtherOp" is in [SignedMin + OtherOp, SignedMax]. + // Everything in [SignedMin + OtherOp, SignedMax] is representable since + // SignedMin + OtherOp is at most -1. + // + // It follows that for all values of IVOperand in [SignedMin - smin(0, + // OtherOp), SignedMax - smax(0, OtherOp)] the result of the add is + // representable (i.e. there is no sign-overflow). + + const SCEV *UpperDelta = SE->getSMaxExpr(ZeroSCEV, OtherOpSCEV); + const SCEV *UpperLimit = SE->getMinusSCEV(SignedMax, UpperDelta); + + bool NeverSignedOverflows = + SE->isKnownPredicate(ICmpInst::ICMP_SLE, IVOpSCEV, UpperLimit); + + if (NeverSignedOverflows) { + const SCEV *LowerDelta = SE->getSMinExpr(ZeroSCEV, OtherOpSCEV); + const SCEV *LowerLimit = SE->getMinusSCEV(SignedMin, LowerDelta); + + bool NeverSignedUnderflows = + SE->isKnownPredicate(ICmpInst::ICMP_SGE, IVOpSCEV, LowerLimit); + if (NeverSignedUnderflows) { + BO->setHasNoSignedWrap(true); + Changed = true; + } + } + } + + if (!BO->hasNoUnsignedWrap()) { + // Upgrade the add computing "IVOperand + OtherOp" to an "add nuw" if we can + // prove that it will never unsigned-overflow (i.e. the result will always + // be representable in the given bit-width). + // + // "IVOperand + OtherOp" is unsigned-representable in 2's complement iff it + // does not produce a carry. "IVOperand + OtherOp" produces no carry iff + // IVOperand ULE (UnsignedMax - OtherOp). 
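To make the nsw/nuw bounds described in the comments above concrete, here is a small standalone sketch (illustrative only, not part of the patch) that instantiates them for an i8 induction variable and a constant OtherOp of 5. When ScalarEvolution can prove IVOperand stays within these ranges, the add is tagged nsw and/or nuw as in the code above.

    #include <algorithm>
    #include <cassert>

    int main() {
      const int SignedMax = 127, SignedMin = -128, UnsignedMax = 255; // i8 limits
      const int OtherOp = 5;

      // nsw: IVOperand must lie in
      //   [SignedMin - smin(0, OtherOp), SignedMax - smax(0, OtherOp)] = [-128, 122]
      const int UpperLimit = SignedMax - std::max(0, OtherOp); // 122
      const int LowerLimit = SignedMin - std::min(0, OtherOp); // -128
      assert(UpperLimit + OtherOp == SignedMax);
      assert(LowerLimit + OtherOp >= SignedMin);

      // nuw: an unsigned IVOperand must satisfy IVOperand <= UnsignedMax - OtherOp,
      // i.e. <= 250, so the addition produces no carry.
      const int NoCarryLimit = UnsignedMax - OtherOp; // 250
      assert(NoCarryLimit + OtherOp == UnsignedMax);

      return 0;
    }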
+ + const SCEV *UnsignedMax = + SE->getConstant(APInt::getMaxValue(IT->getBitWidth())); + const SCEV *UpperLimit = SE->getMinusSCEV(UnsignedMax, OtherOpSCEV); + + bool NeverUnsignedOverflows = + SE->isKnownPredicate(ICmpInst::ICMP_ULE, IVOpSCEV, UpperLimit); + + if (NeverUnsignedOverflows) { + BO->setHasNoUnsignedWrap(true); + Changed = true; + } + } + + return Changed; +} + /// \brief Split sadd.with.overflow into add + sadd.with.overflow to allow /// analysis and optimization. /// @@ -430,6 +525,16 @@ void SimplifyIndvar::simplifyUsers(PHINode *CurrIV, IVVisitor *V) { pushIVUsers(IVOperand, Simplified, SimpleIVUsers); continue; } + + if (BinaryOperator *BO = dyn_cast<BinaryOperator>(UseOper.first)) { + if (isa<OverflowingBinaryOperator>(BO) && + strengthenOverflowingOperation(BO, IVOperand)) { + // re-queue uses of the now modified binary operator and fall + // through to the checks that remain. + pushIVUsers(IVOperand, Simplified, SimpleIVUsers); + } + } + CastInst *Cast = dyn_cast<CastInst>(UseOper.first); if (V && Cast) { V->visitCast(Cast); @@ -450,8 +555,8 @@ void IVVisitor::anchor() { } bool simplifyUsersOfIV(PHINode *CurrIV, ScalarEvolution *SE, LPPassManager *LPM, SmallVectorImpl<WeakVH> &Dead, IVVisitor *V) { - LoopInfo *LI = &LPM->getAnalysis<LoopInfo>(); - SimplifyIndvar SIV(LI->getLoopFor(CurrIV->getParent()), SE, LPM, Dead); + LoopInfo *LI = &LPM->getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + SimplifyIndvar SIV(LI->getLoopFor(CurrIV->getParent()), SE, LI, Dead); SIV.simplifyUsers(CurrIV, V); return SIV.hasChanged(); } diff --git a/lib/Transforms/Utils/SimplifyInstructions.cpp b/lib/Transforms/Utils/SimplifyInstructions.cpp index 5632095..55a4455 100644 --- a/lib/Transforms/Utils/SimplifyInstructions.cpp +++ b/lib/Transforms/Utils/SimplifyInstructions.cpp @@ -18,14 +18,14 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AssumptionTracker.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/Type.h" #include "llvm/Pass.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -42,8 +42,8 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<AssumptionTracker>(); - AU.addRequired<TargetLibraryInfo>(); + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } /// runOnFunction - Remove instructions that simplify. @@ -53,8 +53,10 @@ namespace { const DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr; DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr; - const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); - AssumptionTracker *AT = &getAnalysis<AssumptionTracker>(); + const TargetLibraryInfo *TLI = + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + AssumptionCache *AC = + &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); SmallPtrSet<const Instruction*, 8> S1, S2, *ToSimplify = &S1, *Next = &S2; bool Changed = false; @@ -71,7 +73,7 @@ namespace { continue; // Don't waste time simplifying unused instructions. 
if (!I->use_empty()) - if (Value *V = SimplifyInstruction(I, DL, TLI, DT, AT)) { + if (Value *V = SimplifyInstruction(I, DL, TLI, DT, AC)) { // Mark all uses for resimplification next time round the loop. for (User *U : I->users()) Next->insert(cast<Instruction>(U)); @@ -104,8 +106,8 @@ namespace { char InstSimplifier::ID = 0; INITIALIZE_PASS_BEGIN(InstSimplifier, "instsimplify", "Remove redundant instructions", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionTracker) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(InstSimplifier, "instsimplify", "Remove redundant instructions", false, false) char &llvm::InstructionSimplifierID = InstSimplifier::ID; diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp index a39f128..fb1d83f 100644 --- a/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -30,7 +30,7 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" using namespace llvm; @@ -116,207 +116,68 @@ static bool hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty, } } -//===----------------------------------------------------------------------===// -// Fortified Library Call Optimizations -//===----------------------------------------------------------------------===// - -static bool isFortifiedCallFoldable(CallInst *CI, unsigned SizeCIOp, unsigned SizeArgOp, - bool isString) { - if (CI->getArgOperand(SizeCIOp) == CI->getArgOperand(SizeArgOp)) - return true; - if (ConstantInt *SizeCI = - dyn_cast<ConstantInt>(CI->getArgOperand(SizeCIOp))) { - if (SizeCI->isAllOnesValue()) - return true; - if (isString) { - uint64_t Len = GetStringLength(CI->getArgOperand(SizeArgOp)); - // If the length is 0 we don't know how long it is and so we can't - // remove the check. - if (Len == 0) - return false; - return SizeCI->getZExtValue() >= Len; - } - if (ConstantInt *Arg = dyn_cast<ConstantInt>(CI->getArgOperand(SizeArgOp))) - return SizeCI->getZExtValue() >= Arg->getZExtValue(); - } - return false; -} - -Value *LibCallSimplifier::optimizeMemCpyChk(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); - FunctionType *FT = Callee->getFunctionType(); - LLVMContext &Context = CI->getContext(); - - // Check if this has the right signature. - if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) || - !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isPointerTy() || - FT->getParamType(2) != DL->getIntPtrType(Context) || - FT->getParamType(3) != DL->getIntPtrType(Context)) - return nullptr; - - if (isFortifiedCallFoldable(CI, 3, 2, false)) { - B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), 1); - return CI->getArgOperand(0); - } - return nullptr; -} - -Value *LibCallSimplifier::optimizeMemMoveChk(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); - FunctionType *FT = Callee->getFunctionType(); - LLVMContext &Context = CI->getContext(); - - // Check if this has the right signature. 
- if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) || - !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isPointerTy() || - FT->getParamType(2) != DL->getIntPtrType(Context) || - FT->getParamType(3) != DL->getIntPtrType(Context)) - return nullptr; - - if (isFortifiedCallFoldable(CI, 3, 2, false)) { - B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), 1); - return CI->getArgOperand(0); - } - return nullptr; -} - -Value *LibCallSimplifier::optimizeMemSetChk(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); - FunctionType *FT = Callee->getFunctionType(); - LLVMContext &Context = CI->getContext(); - - // Check if this has the right signature. - if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) || - !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isIntegerTy() || - FT->getParamType(2) != DL->getIntPtrType(Context) || - FT->getParamType(3) != DL->getIntPtrType(Context)) - return nullptr; - - if (isFortifiedCallFoldable(CI, 3, 2, false)) { - Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false); - B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1); - return CI->getArgOperand(0); - } - return nullptr; -} - -Value *LibCallSimplifier::optimizeStrCpyChk(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); - StringRef Name = Callee->getName(); - FunctionType *FT = Callee->getFunctionType(); - LLVMContext &Context = CI->getContext(); - - // Check if this has the right signature. - if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || - FT->getParamType(0) != FT->getParamType(1) || - FT->getParamType(0) != Type::getInt8PtrTy(Context) || - FT->getParamType(2) != DL->getIntPtrType(Context)) - return nullptr; - - Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); - if (Dst == Src) // __strcpy_chk(x,x) -> x - return Src; - - // If a) we don't have any length information, or b) we know this will - // fit then just lower to a plain strcpy. Otherwise we'll keep our - // strcpy_chk call which may fail at runtime if the size is too long. - // TODO: It might be nice to get a maximum length out of the possible - // string lengths for varying. - if (isFortifiedCallFoldable(CI, 2, 1, true)) { - Value *Ret = EmitStrCpy(Dst, Src, B, DL, TLI, Name.substr(2, 6)); - return Ret; - } else { - // Maybe we can stil fold __strcpy_chk to __memcpy_chk. - uint64_t Len = GetStringLength(Src); - if (Len == 0) - return nullptr; - - // This optimization require DataLayout. - if (!DL) - return nullptr; - - Value *Ret = EmitMemCpyChk( - Dst, Src, ConstantInt::get(DL->getIntPtrType(Context), Len), - CI->getArgOperand(2), B, DL, TLI); - return Ret; - } - return nullptr; -} - -Value *LibCallSimplifier::optimizeStpCpyChk(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); - StringRef Name = Callee->getName(); - FunctionType *FT = Callee->getFunctionType(); - LLVMContext &Context = CI->getContext(); - - // Check if this has the right signature. - if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || - FT->getParamType(0) != FT->getParamType(1) || - FT->getParamType(0) != Type::getInt8PtrTy(Context) || - FT->getParamType(2) != DL->getIntPtrType(FT->getParamType(0))) - return nullptr; +/// \brief Returns whether \p F matches the signature expected for the +/// string/memory copying library function \p Func. 
+/// Acceptable functions are st[rp][n]?cpy, memove, memcpy, and memset. +/// Their fortified (_chk) counterparts are also accepted. +static bool checkStringCopyLibFuncSignature(Function *F, LibFunc::Func Func, + const DataLayout *DL) { + FunctionType *FT = F->getFunctionType(); + LLVMContext &Context = F->getContext(); + Type *PCharTy = Type::getInt8PtrTy(Context); + Type *SizeTTy = DL ? DL->getIntPtrType(Context) : nullptr; + unsigned NumParams = FT->getNumParams(); - Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); - if (Dst == Src) { // stpcpy(x,x) -> x+strlen(x) - Value *StrLen = EmitStrLen(Src, B, DL, TLI); - return StrLen ? B.CreateInBoundsGEP(Dst, StrLen) : nullptr; - } - - // If a) we don't have any length information, or b) we know this will - // fit then just lower to a plain stpcpy. Otherwise we'll keep our - // stpcpy_chk call which may fail at runtime if the size is too long. - // TODO: It might be nice to get a maximum length out of the possible - // string lengths for varying. - if (isFortifiedCallFoldable(CI, 2, 1, true)) { - Value *Ret = EmitStrCpy(Dst, Src, B, DL, TLI, Name.substr(2, 6)); - return Ret; - } else { - // Maybe we can stil fold __stpcpy_chk to __memcpy_chk. - uint64_t Len = GetStringLength(Src); - if (Len == 0) - return nullptr; - - // This optimization require DataLayout. - if (!DL) - return nullptr; - - Type *PT = FT->getParamType(0); - Value *LenV = ConstantInt::get(DL->getIntPtrType(PT), Len); - Value *DstEnd = - B.CreateGEP(Dst, ConstantInt::get(DL->getIntPtrType(PT), Len - 1)); - if (!EmitMemCpyChk(Dst, Src, LenV, CI->getArgOperand(2), B, DL, TLI)) - return nullptr; - return DstEnd; - } - return nullptr; -} - -Value *LibCallSimplifier::optimizeStrNCpyChk(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); - StringRef Name = Callee->getName(); - FunctionType *FT = Callee->getFunctionType(); - LLVMContext &Context = CI->getContext(); - - // Check if this has the right signature. - if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) || - FT->getParamType(0) != FT->getParamType(1) || - FT->getParamType(0) != Type::getInt8PtrTy(Context) || - !FT->getParamType(2)->isIntegerTy() || - FT->getParamType(3) != DL->getIntPtrType(Context)) - return nullptr; + // All string libfuncs return the same type as the first parameter. 
+ if (FT->getReturnType() != FT->getParamType(0)) + return false; - if (isFortifiedCallFoldable(CI, 3, 2, false)) { - Value *Ret = - EmitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), B, DL, TLI, Name.substr(2, 7)); - return Ret; - } - return nullptr; + switch (Func) { + default: + llvm_unreachable("Can't check signature for non-string-copy libfunc."); + case LibFunc::stpncpy_chk: + case LibFunc::strncpy_chk: + --NumParams; // fallthrough + case LibFunc::stpncpy: + case LibFunc::strncpy: { + if (NumParams != 3 || FT->getParamType(0) != FT->getParamType(1) || + FT->getParamType(0) != PCharTy || !FT->getParamType(2)->isIntegerTy()) + return false; + break; + } + case LibFunc::strcpy_chk: + case LibFunc::stpcpy_chk: + --NumParams; // fallthrough + case LibFunc::stpcpy: + case LibFunc::strcpy: { + if (NumParams != 2 || FT->getParamType(0) != FT->getParamType(1) || + FT->getParamType(0) != PCharTy) + return false; + break; + } + case LibFunc::memmove_chk: + case LibFunc::memcpy_chk: + --NumParams; // fallthrough + case LibFunc::memmove: + case LibFunc::memcpy: { + if (NumParams != 3 || !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isPointerTy() || FT->getParamType(2) != SizeTTy) + return false; + break; + } + case LibFunc::memset_chk: + --NumParams; // fallthrough + case LibFunc::memset: { + if (NumParams != 3 || !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isIntegerTy() || FT->getParamType(2) != SizeTTy) + return false; + break; + } + } + // If this is a fortified libcall, the last parameter is a size_t. + if (NumParams == FT->getNumParams() - 1) + return FT->getParamType(FT->getNumParams() - 1) == SizeTTy; + return true; } //===----------------------------------------------------------------------===// @@ -600,11 +461,8 @@ Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilder<> &B) { Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - // Verify the "strcpy" function prototype. - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) || - FT->getParamType(0) != FT->getParamType(1) || - FT->getParamType(0) != B.getInt8PtrTy()) + + if (!checkStringCopyLibFuncSignature(Callee, LibFunc::strcpy, DL)) return nullptr; Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); @@ -631,9 +489,8 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); // Verify the "stpcpy" function prototype. FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) || - FT->getParamType(0) != FT->getParamType(1) || - FT->getParamType(0) != B.getInt8PtrTy()) + + if (!checkStringCopyLibFuncSignature(Callee, LibFunc::stpcpy, DL)) return nullptr; // These optimizations require DataLayout. 
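The checkStringCopyLibFuncSignature helper introduced above relies on a "--NumParams; // fallthrough" pattern: a _chk variant first drops its trailing object-size parameter, is validated against the plain function's shape, and the dropped parameter is then confirmed to be size_t. A minimal, self-contained sketch of that idea (Ty, Proto and checkMemCpyLikeSig are hypothetical stand-ins, not LLVM types):

    #include <cstddef>
    #include <vector>

    enum class Ty { Ptr, Int, SizeT };
    struct Proto { std::vector<Ty> Params; };

    static bool checkMemCpyLikeSig(const Proto &P, bool IsChkVariant) {
      std::size_t NumParams = P.Params.size();
      if (IsChkVariant)
        --NumParams;                  // drop the object-size argument; fall through
      if (NumParams != 3 || P.Params[0] != Ty::Ptr || P.Params[1] != Ty::Ptr ||
          P.Params[2] != Ty::SizeT)
        return false;                 // shape of plain memcpy/memmove
      if (IsChkVariant)               // the dropped parameter must itself be size_t
        return P.Params.back() == Ty::SizeT;
      return true;
    }

    int main() {
      Proto MemCpyChk{{Ty::Ptr, Ty::Ptr, Ty::SizeT, Ty::SizeT}}; // __memcpy_chk shape
      Proto MemCpy{{Ty::Ptr, Ty::Ptr, Ty::SizeT}};               // memcpy shape
      return (checkMemCpyLikeSig(MemCpyChk, true) &&
              checkMemCpyLikeSig(MemCpy, false)) ? 0 : 1;
    }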
@@ -665,10 +522,8 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) { Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || - FT->getParamType(0) != FT->getParamType(1) || - FT->getParamType(0) != B.getInt8PtrTy() || - !FT->getParamType(2)->isIntegerTy()) + + if (!checkStringCopyLibFuncSignature(Callee, LibFunc::strncpy, DL)) return nullptr; Value *Dst = CI->getArgOperand(0); @@ -976,11 +831,7 @@ Value *LibCallSimplifier::optimizeMemCpy(CallInst *CI, IRBuilder<> &B) { if (!DL) return nullptr; - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || - !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isPointerTy() || - FT->getParamType(2) != DL->getIntPtrType(CI->getContext())) + if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memcpy, DL)) return nullptr; // memcpy(x, y, n) -> llvm.memcpy(x, y, n, 1) @@ -995,11 +846,7 @@ Value *LibCallSimplifier::optimizeMemMove(CallInst *CI, IRBuilder<> &B) { if (!DL) return nullptr; - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || - !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isPointerTy() || - FT->getParamType(2) != DL->getIntPtrType(CI->getContext())) + if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memmove, DL)) return nullptr; // memmove(x, y, n) -> llvm.memmove(x, y, n, 1) @@ -1014,11 +861,7 @@ Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilder<> &B) { if (!DL) return nullptr; - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || - !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isIntegerTy() || - FT->getParamType(2) != DL->getIntPtrType(FT->getParamType(0))) + if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memset, DL)) return nullptr; // memset(p, v, n) -> llvm.memset(p, v, n, 1) @@ -1031,6 +874,28 @@ Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilder<> &B) { // Math Library Optimizations //===----------------------------------------------------------------------===// +/// Return a variant of Val with float type. +/// Currently this works in two cases: If Val is an FPExtension of a float +/// value to something bigger, simply return the operand. +/// If Val is a ConstantFP but can be converted to a float ConstantFP without +/// loss of precision do so. +static Value *valueHasFloatPrecision(Value *Val) { + if (FPExtInst *Cast = dyn_cast<FPExtInst>(Val)) { + Value *Op = Cast->getOperand(0); + if (Op->getType()->isFloatTy()) + return Op; + } + if (ConstantFP *Const = dyn_cast<ConstantFP>(Val)) { + APFloat F = Const->getValueAPF(); + bool losesInfo; + (void)F.convert(APFloat::IEEEsingle, APFloat::rmNearestTiesToEven, + &losesInfo); + if (!losesInfo) + return ConstantFP::get(Const->getContext(), F); + } + return nullptr; +} + //===----------------------------------------------------------------------===// // Double -> Float Shrinking Optimizations for Unary Functions like 'floor' @@ -1052,12 +917,11 @@ Value *LibCallSimplifier::optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B, } // If this is something like 'floor((double)floatval)', convert to floorf. 
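The valueHasFloatPrecision helper added above extends this shrinking beyond fpext operands: a ConstantFP that converts to float without losing precision now also qualifies, which is what lets binary calls such as fmin(1.0, (double)f) shrink as well. A source-level sketch of the intent (whether a given call is actually rewritten still depends on the TargetLibraryInfo and the other checks in these routines):

    #include <cmath>

    double shrink_examples(float f) {
      double a = std::floor((double)f);      // floor((double)f)  -> (double)floorf(f)
      double b = std::fmin(1.0, (double)f);  // 1.0 converts to 1.0f exactly, so this
                                             // can now become (double)fminf(1.0f, f)
      return a + b;
    }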
- FPExtInst *Cast = dyn_cast<FPExtInst>(CI->getArgOperand(0)); - if (!Cast || !Cast->getOperand(0)->getType()->isFloatTy()) + Value *V = valueHasFloatPrecision(CI->getArgOperand(0)); + if (V == nullptr) return nullptr; // floor((double)floatval) -> (double)floorf(floatval) - Value *V = Cast->getOperand(0); if (Callee->isIntrinsic()) { Module *M = CI->getParent()->getParent()->getParent(); Intrinsic::ID IID = (Intrinsic::ID) Callee->getIntrinsicID(); @@ -1083,21 +947,19 @@ Value *LibCallSimplifier::optimizeBinaryDoubleFP(CallInst *CI, IRBuilder<> &B) { return nullptr; // If this is something like 'fmin((double)floatval1, (double)floatval2)', - // we convert it to fminf. - FPExtInst *Cast1 = dyn_cast<FPExtInst>(CI->getArgOperand(0)); - FPExtInst *Cast2 = dyn_cast<FPExtInst>(CI->getArgOperand(1)); - if (!Cast1 || !Cast1->getOperand(0)->getType()->isFloatTy() || !Cast2 || - !Cast2->getOperand(0)->getType()->isFloatTy()) + // or fmin(1.0, (double)floatval), then we convert it to fminf. + Value *V1 = valueHasFloatPrecision(CI->getArgOperand(0)); + if (V1 == nullptr) + return nullptr; + Value *V2 = valueHasFloatPrecision(CI->getArgOperand(1)); + if (V2 == nullptr) return nullptr; // fmin((double)floatval1, (double)floatval2) - // -> (double)fmin(floatval1, floatval2) - Value *V = nullptr; - Value *V1 = Cast1->getOperand(0); - Value *V2 = Cast2->getOperand(0); + // -> (double)fminf(floatval1, floatval2) // TODO: Handle intrinsics in the same way as in optimizeUnaryDoubleFP(). - V = EmitBinaryFloatFnCall(V1, V2, Callee->getName(), B, - Callee->getAttributes()); + Value *V = EmitBinaryFloatFnCall(V1, V2, Callee->getName(), B, + Callee->getAttributes()); return B.CreateFPExt(V, B.getDoubleTy()); } @@ -1995,53 +1857,18 @@ bool LibCallSimplifier::hasFloatVersion(StringRef FuncName) { return false; } -Value *LibCallSimplifier::optimizeCall(CallInst *CI) { - if (CI->isNoBuiltin()) - return nullptr; - +Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI, + IRBuilder<> &Builder) { LibFunc::Func Func; Function *Callee = CI->getCalledFunction(); StringRef FuncName = Callee->getName(); - IRBuilder<> Builder(CI); - bool isCallingConvC = CI->getCallingConv() == llvm::CallingConv::C; - - // Command-line parameter overrides function attribute. - if (EnableUnsafeFPShrink.getNumOccurrences() > 0) - UnsafeFPShrink = EnableUnsafeFPShrink; - else if (Callee->hasFnAttribute("unsafe-fp-math")) { - // FIXME: This is the same problem as described in optimizeSqrt(). - // If calls gain access to IR-level FMF, then use that instead of a - // function attribute. - // Check for unsafe-fp-math = true. - Attribute Attr = Callee->getFnAttribute("unsafe-fp-math"); - if (Attr.getValueAsString() == "true") - UnsafeFPShrink = true; - } - - // First, check for intrinsics. - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) { - if (!isCallingConvC) - return nullptr; - switch (II->getIntrinsicID()) { - case Intrinsic::pow: - return optimizePow(CI, Builder); - case Intrinsic::exp2: - return optimizeExp2(CI, Builder); - case Intrinsic::fabs: - return optimizeFabs(CI, Builder); - case Intrinsic::sqrt: - return optimizeSqrt(CI, Builder); - default: - return nullptr; - } - } - - // Then check for known library functions. + // Check for string/memory library functions. if (TLI->getLibFunc(FuncName, Func) && TLI->has(Func)) { - // We never change the calling convention. - if (!ignoreCallingConv(Func) && !isCallingConvC) - return nullptr; + // Make sure we never change the calling convention. 
+ assert((ignoreCallingConv(Func) || + CI->getCallingConv() == llvm::CallingConv::C) && + "Optimizing string/memory libcall would change the calling convention"); switch (Func) { case LibFunc::strcat: return optimizeStrCat(CI, Builder); @@ -2087,6 +1914,77 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) { return optimizeMemMove(CI, Builder); case LibFunc::memset: return optimizeMemSet(CI, Builder); + default: + break; + } + } + return nullptr; +} + +Value *LibCallSimplifier::optimizeCall(CallInst *CI) { + if (CI->isNoBuiltin()) + return nullptr; + + LibFunc::Func Func; + Function *Callee = CI->getCalledFunction(); + StringRef FuncName = Callee->getName(); + IRBuilder<> Builder(CI); + bool isCallingConvC = CI->getCallingConv() == llvm::CallingConv::C; + + // Command-line parameter overrides function attribute. + if (EnableUnsafeFPShrink.getNumOccurrences() > 0) + UnsafeFPShrink = EnableUnsafeFPShrink; + else if (Callee->hasFnAttribute("unsafe-fp-math")) { + // FIXME: This is the same problem as described in optimizeSqrt(). + // If calls gain access to IR-level FMF, then use that instead of a + // function attribute. + + // Check for unsafe-fp-math = true. + Attribute Attr = Callee->getFnAttribute("unsafe-fp-math"); + if (Attr.getValueAsString() == "true") + UnsafeFPShrink = true; + } + + // First, check for intrinsics. + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) { + if (!isCallingConvC) + return nullptr; + switch (II->getIntrinsicID()) { + case Intrinsic::pow: + return optimizePow(CI, Builder); + case Intrinsic::exp2: + return optimizeExp2(CI, Builder); + case Intrinsic::fabs: + return optimizeFabs(CI, Builder); + case Intrinsic::sqrt: + return optimizeSqrt(CI, Builder); + default: + return nullptr; + } + } + + // Also try to simplify calls to fortified library functions. + if (Value *SimplifiedFortifiedCI = FortifiedSimplifier.optimizeCall(CI)) { + // Try to further simplify the result. + CallInst *SimplifiedCI = dyn_cast<CallInst>(SimplifiedFortifiedCI); + if (SimplifiedCI && SimplifiedCI->getCalledFunction()) + if (Value *V = optimizeStringMemoryLibCall(SimplifiedCI, Builder)) { + // If we were able to further simplify, remove the now redundant call. + SimplifiedCI->replaceAllUsesWith(V); + SimplifiedCI->eraseFromParent(); + return V; + } + return SimplifiedFortifiedCI; + } + + // Then check for known library functions. + if (TLI->getLibFunc(FuncName, Func) && TLI->has(Func)) { + // We never change the calling convention. 
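The restructured optimizeCall above now runs in two stages: FortifiedSimplifier.optimizeCall lowers a fortified call first, and optimizeStringMemoryLibCall then tries to fold the lowered call further, erasing the intermediate call if that succeeds. A hedged source-level sketch of that pipeline, written with the Clang/GCC fortification builtin rather than actual pass output:

    void two_stage(char *dst) {
      // If __builtin_object_size folds to -1 (unknown object size), stage 1 lowers
      // the checked call to strcpy(dst, "abc"); stage 2 then folds that strcpy to
      // llvm.memcpy(dst, "abc", 4, 1) because the source length is known, and the
      // now-redundant strcpy call is erased.
      __builtin___strcpy_chk(dst, "abc", __builtin_object_size(dst, 0));
    }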
+ if (!ignoreCallingConv(Func) && !isCallingConvC) + return nullptr; + if (Value *V = optimizeStringMemoryLibCall(CI, Builder)) + return V; + switch (Func) { case LibFunc::cosf: case LibFunc::cos: case LibFunc::cosl: @@ -2177,40 +2075,32 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) { if (UnsafeFPShrink && hasFloatVersion(FuncName)) return optimizeUnaryDoubleFP(CI, Builder, true); return nullptr; + case LibFunc::copysign: case LibFunc::fmin: case LibFunc::fmax: if (hasFloatVersion(FuncName)) return optimizeBinaryDoubleFP(CI, Builder); return nullptr; - case LibFunc::memcpy_chk: - return optimizeMemCpyChk(CI, Builder); - case LibFunc::memmove_chk: - return optimizeMemMoveChk(CI, Builder); - case LibFunc::memset_chk: - return optimizeMemSetChk(CI, Builder); - case LibFunc::strcpy_chk: - return optimizeStrCpyChk(CI, Builder); - case LibFunc::stpcpy_chk: - return optimizeStpCpyChk(CI, Builder); - case LibFunc::stpncpy_chk: - case LibFunc::strncpy_chk: - return optimizeStrNCpyChk(CI, Builder); default: return nullptr; } } - return nullptr; } -LibCallSimplifier::LibCallSimplifier(const DataLayout *DL, - const TargetLibraryInfo *TLI) : - DL(DL), - TLI(TLI), - UnsafeFPShrink(false) { +LibCallSimplifier::LibCallSimplifier( + const DataLayout *DL, const TargetLibraryInfo *TLI, + function_ref<void(Instruction *, Value *)> Replacer) + : FortifiedSimplifier(DL, TLI), DL(DL), TLI(TLI), UnsafeFPShrink(false), + Replacer(Replacer) {} + +void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) { + // Indirect through the replacer used in this instance. + Replacer(I, With); } -void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) const { +/*static*/ void LibCallSimplifier::replaceAllUsesWithDefault(Instruction *I, + Value *With) { I->replaceAllUsesWith(With); I->eraseFromParent(); } @@ -2262,3 +2152,184 @@ void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) const { // * trunc(cnst) -> cnst' // // + +//===----------------------------------------------------------------------===// +// Fortified Library Call Optimizations +//===----------------------------------------------------------------------===// + +bool FortifiedLibCallSimplifier::isFortifiedCallFoldable(CallInst *CI, + unsigned ObjSizeOp, + unsigned SizeOp, + bool isString) { + if (CI->getArgOperand(ObjSizeOp) == CI->getArgOperand(SizeOp)) + return true; + if (ConstantInt *ObjSizeCI = + dyn_cast<ConstantInt>(CI->getArgOperand(ObjSizeOp))) { + if (ObjSizeCI->isAllOnesValue()) + return true; + // If the object size wasn't -1 (unknown), bail out if we were asked to. + if (OnlyLowerUnknownSize) + return false; + if (isString) { + uint64_t Len = GetStringLength(CI->getArgOperand(SizeOp)); + // If the length is 0 we don't know how long it is and so we can't + // remove the check. 
+ if (Len == 0) + return false; + return ObjSizeCI->getZExtValue() >= Len; + } + if (ConstantInt *SizeCI = dyn_cast<ConstantInt>(CI->getArgOperand(SizeOp))) + return ObjSizeCI->getZExtValue() >= SizeCI->getZExtValue(); + } + return false; +} + +Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI, IRBuilder<> &B) { + Function *Callee = CI->getCalledFunction(); + + if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memcpy_chk, DL)) + return nullptr; + + if (isFortifiedCallFoldable(CI, 3, 2, false)) { + B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), 1); + return CI->getArgOperand(0); + } + return nullptr; +} + +Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI, IRBuilder<> &B) { + Function *Callee = CI->getCalledFunction(); + + if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memmove_chk, DL)) + return nullptr; + + if (isFortifiedCallFoldable(CI, 3, 2, false)) { + B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), 1); + return CI->getArgOperand(0); + } + return nullptr; +} + +Value *FortifiedLibCallSimplifier::optimizeMemSetChk(CallInst *CI, IRBuilder<> &B) { + Function *Callee = CI->getCalledFunction(); + + if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memset_chk, DL)) + return nullptr; + + if (isFortifiedCallFoldable(CI, 3, 2, false)) { + Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false); + B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1); + return CI->getArgOperand(0); + } + return nullptr; +} + +Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI, + IRBuilder<> &B, + LibFunc::Func Func) { + Function *Callee = CI->getCalledFunction(); + StringRef Name = Callee->getName(); + + if (!checkStringCopyLibFuncSignature(Callee, Func, DL)) + return nullptr; + + Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1), + *ObjSize = CI->getArgOperand(2); + + // __stpcpy_chk(x,x,...) -> x+strlen(x) + if (Func == LibFunc::stpcpy_chk && !OnlyLowerUnknownSize && Dst == Src) { + Value *StrLen = EmitStrLen(Src, B, DL, TLI); + return StrLen ? B.CreateInBoundsGEP(Dst, StrLen) : nullptr; + } + + // If a) we don't have any length information, or b) we know this will + // fit then just lower to a plain st[rp]cpy. Otherwise we'll keep our + // st[rp]cpy_chk call which may fail at runtime if the size is too long. + // TODO: It might be nice to get a maximum length out of the possible + // string lengths for varying. + if (isFortifiedCallFoldable(CI, 2, 1, true)) { + Value *Ret = EmitStrCpy(Dst, Src, B, DL, TLI, Name.substr(2, 6)); + return Ret; + } else if (!OnlyLowerUnknownSize) { + // Maybe we can stil fold __st[rp]cpy_chk to __memcpy_chk. + uint64_t Len = GetStringLength(Src); + if (Len == 0) + return nullptr; + + // This optimization requires DataLayout. + if (!DL) + return nullptr; + + Type *SizeTTy = DL->getIntPtrType(CI->getContext()); + Value *LenV = ConstantInt::get(SizeTTy, Len); + Value *Ret = EmitMemCpyChk(Dst, Src, LenV, ObjSize, B, DL, TLI); + // If the function was an __stpcpy_chk, and we were able to fold it into + // a __memcpy_chk, we still need to return the correct end pointer. 
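Concretely, isFortifiedCallFoldable above folds a checked call when the object-size argument is the same value as the length argument, is -1 (unknown), or is a constant at least as large as the known length; with OnlyLowerUnknownSize set, only the unknown-size case is lowered. A hedged sketch using the Clang/GCC builtin (whether a frontend emits __memcpy_chk at all depends on _FORTIFY_SOURCE):

    #include <stddef.h>

    void fold_cases(char *dst, const char *src) {
      __builtin___memcpy_chk(dst, src, 16, (size_t)-1); // unknown size: folded to llvm.memcpy
      __builtin___memcpy_chk(dst, src, 16, 32);         // 32 >= 16: folded (unless OnlyLowerUnknownSize)
      __builtin___memcpy_chk(dst, src, 16, 8);          // 8 < 16: kept, so the runtime check can fire
    }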
+ if (Ret && Func == LibFunc::stpcpy_chk) + return B.CreateGEP(Dst, ConstantInt::get(SizeTTy, Len - 1)); + return Ret; + } + return nullptr; +} + +Value *FortifiedLibCallSimplifier::optimizeStrpNCpyChk(CallInst *CI, + IRBuilder<> &B, + LibFunc::Func Func) { + Function *Callee = CI->getCalledFunction(); + StringRef Name = Callee->getName(); + + if (!checkStringCopyLibFuncSignature(Callee, Func, DL)) + return nullptr; + if (isFortifiedCallFoldable(CI, 3, 2, false)) { + Value *Ret = + EmitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), B, DL, TLI, Name.substr(2, 7)); + return Ret; + } + return nullptr; +} + +Value *FortifiedLibCallSimplifier::optimizeCall(CallInst *CI) { + if (CI->isNoBuiltin()) + return nullptr; + + LibFunc::Func Func; + Function *Callee = CI->getCalledFunction(); + StringRef FuncName = Callee->getName(); + IRBuilder<> Builder(CI); + bool isCallingConvC = CI->getCallingConv() == llvm::CallingConv::C; + + // First, check that this is a known library functions. + if (!TLI->getLibFunc(FuncName, Func) || !TLI->has(Func)) + return nullptr; + + // We never change the calling convention. + if (!ignoreCallingConv(Func) && !isCallingConvC) + return nullptr; + + switch (Func) { + case LibFunc::memcpy_chk: + return optimizeMemCpyChk(CI, Builder); + case LibFunc::memmove_chk: + return optimizeMemMoveChk(CI, Builder); + case LibFunc::memset_chk: + return optimizeMemSetChk(CI, Builder); + case LibFunc::stpcpy_chk: + case LibFunc::strcpy_chk: + return optimizeStrpCpyChk(CI, Builder, Func); + case LibFunc::stpncpy_chk: + case LibFunc::strncpy_chk: + return optimizeStrpNCpyChk(CI, Builder, Func); + default: + break; + } + return nullptr; +} + +FortifiedLibCallSimplifier:: +FortifiedLibCallSimplifier(const DataLayout *DL, const TargetLibraryInfo *TLI, + bool OnlyLowerUnknownSize) + : DL(DL), TLI(TLI), OnlyLowerUnknownSize(OnlyLowerUnknownSize) { +} diff --git a/lib/Transforms/Utils/SymbolRewriter.cpp b/lib/Transforms/Utils/SymbolRewriter.cpp index aacc945..b343cc4 100644 --- a/lib/Transforms/Utils/SymbolRewriter.cpp +++ b/lib/Transforms/Utils/SymbolRewriter.cpp @@ -60,7 +60,7 @@ #define DEBUG_TYPE "symbol-rewriter" #include "llvm/CodeGen/Passes.h" #include "llvm/Pass.h" -#include "llvm/PassManager.h" +#include "llvm/IR/LegacyPassManager.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MemoryBuffer.h" @@ -79,6 +79,19 @@ static cl::list<std::string> RewriteMapFiles("rewrite-map-file", namespace llvm { namespace SymbolRewriter { +void rewriteComdat(Module &M, GlobalObject *GO, const std::string &Source, + const std::string &Target) { + if (Comdat *CD = GO->getComdat()) { + auto &Comdats = M.getComdatSymbolTable(); + + Comdat *C = M.getOrInsertComdat(Target); + C->setSelectionKind(CD->getSelectionKind()); + GO->setComdat(C); + + Comdats.erase(Comdats.find(Source)); + } +} + template <RewriteDescriptor::Type DT, typename ValueType, ValueType *(llvm::Module::*Get)(StringRef) const> class ExplicitRewriteDescriptor : public RewriteDescriptor { @@ -102,10 +115,14 @@ template <RewriteDescriptor::Type DT, typename ValueType, bool ExplicitRewriteDescriptor<DT, ValueType, Get>::performOnModule(Module &M) { bool Changed = false; if (ValueType *S = (M.*Get)(Source)) { + if (GlobalObject *GO = dyn_cast<GlobalObject>(S)) + rewriteComdat(M, GO, Source, Target); + if (Value *T = (M.*Get)(Target)) S->setValueName(T->getValueName()); else S->setName(Target); + Changed = true; } return Changed; @@ -113,7 +130,8 @@ bool 
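The rewriteComdat helper introduced in the SymbolRewriter change above keeps comdat membership consistent with a rename: the rewritten global is moved into a comdat named after the target, the original selection kind is preserved, and the stale entry is erased from the module's comdat symbol table. A conceptual sketch of the call as it is made from performOnModule (the symbol names are hypothetical, M is assumed to be an llvm::Module in scope, and rewriteComdat is file-local to SymbolRewriter.cpp, so this is not a public API):

    if (Function *F = M.getFunction("source_func")) {
      SymbolRewriter::rewriteComdat(M, F, "source_func", "target_func");
      F->setName("target_func");
    }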
ExplicitRewriteDescriptor<DT, ValueType, Get>::performOnModule(Module &M) { template <RewriteDescriptor::Type DT, typename ValueType, ValueType *(llvm::Module::*Get)(StringRef) const, - iterator_range<typename iplist<ValueType>::iterator> (llvm::Module::*Iterator)()> + iterator_range<typename iplist<ValueType>::iterator> + (llvm::Module::*Iterator)()> class PatternRewriteDescriptor : public RewriteDescriptor { public: const std::string Pattern; @@ -131,7 +149,8 @@ public: template <RewriteDescriptor::Type DT, typename ValueType, ValueType *(llvm::Module::*Get)(StringRef) const, - iterator_range<typename iplist<ValueType>::iterator> (llvm::Module::*Iterator)()> + iterator_range<typename iplist<ValueType>::iterator> + (llvm::Module::*Iterator)()> bool PatternRewriteDescriptor<DT, ValueType, Get, Iterator>:: performOnModule(Module &M) { bool Changed = false; @@ -143,6 +162,12 @@ performOnModule(Module &M) { report_fatal_error("unable to transforn " + C.getName() + " in " + M.getModuleIdentifier() + ": " + Error); + if (C.getName() == Name) + continue; + + if (GlobalObject *GO = dyn_cast<GlobalObject>(&C)) + rewriteComdat(M, GO, C.getName(), Name); + if (Value *V = (M.*Get)(Name)) C.setValueName(V->getValueName()); else @@ -492,7 +517,7 @@ RewriteSymbols::RewriteSymbols() : ModulePass(ID) { RewriteSymbols::RewriteSymbols(SymbolRewriter::RewriteDescriptorList &DL) : ModulePass(ID) { - std::swap(Descriptors, DL); + Descriptors.splice(Descriptors.begin(), DL); } bool RewriteSymbols::runOnModule(Module &M) { diff --git a/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp b/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp index 0c2fc0a..7e00a80 100644 --- a/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp +++ b/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp @@ -35,7 +35,6 @@ void UnifyFunctionExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{ // We preserve the non-critical-edgeness property AU.addPreservedID(BreakCriticalEdgesID); // This is a cluster of orthogonal Transforms - AU.addPreserved("mem2reg"); AU.addPreservedID(LowerSwitchID); } diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp index a2f69d1..49c0902 100644 --- a/lib/Transforms/Utils/ValueMapper.cpp +++ b/lib/Transforms/Utils/ValueMapper.cpp @@ -40,7 +40,7 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, // Global values do not need to be seeded into the VM if they // are using the identity mapping. - if (isa<GlobalValue>(V) || isa<MDString>(V)) + if (isa<GlobalValue>(V)) return VM[V] = const_cast<Value*>(V); if (const InlineAsm *IA = dyn_cast<InlineAsm>(V)) { @@ -56,57 +56,24 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, return VM[V] = const_cast<Value*>(V); } - - if (const MDNode *MD = dyn_cast<MDNode>(V)) { + if (const auto *MDV = dyn_cast<MetadataAsValue>(V)) { + const Metadata *MD = MDV->getMetadata(); // If this is a module-level metadata and we know that nothing at the module // level is changing, then use an identity mapping. - if (!MD->isFunctionLocal() && (Flags & RF_NoModuleLevelChanges)) - return VM[V] = const_cast<Value*>(V); - - // Create a dummy node in case we have a metadata cycle. - MDNode *Dummy = MDNode::getTemporary(V->getContext(), None); - VM[V] = Dummy; - - // Check all operands to see if any need to be remapped. 
- for (unsigned i = 0, e = MD->getNumOperands(); i != e; ++i) { - Value *OP = MD->getOperand(i); - if (!OP) continue; - Value *Mapped_OP = MapValue(OP, VM, Flags, TypeMapper, Materializer); - // Use identity map if Mapped_Op is null and we can ignore missing - // entries. - if (Mapped_OP == OP || - (Mapped_OP == nullptr && (Flags & RF_IgnoreMissingEntries))) - continue; - - // Ok, at least one operand needs remapping. - SmallVector<Value*, 4> Elts; - Elts.reserve(MD->getNumOperands()); - for (i = 0; i != e; ++i) { - Value *Op = MD->getOperand(i); - if (!Op) - Elts.push_back(nullptr); - else { - Value *Mapped_Op = MapValue(Op, VM, Flags, TypeMapper, Materializer); - // Use identity map if Mapped_Op is null and we can ignore missing - // entries. - if (Mapped_Op == nullptr && (Flags & RF_IgnoreMissingEntries)) - Mapped_Op = Op; - Elts.push_back(Mapped_Op); - } - } - MDNode *NewMD = MDNode::get(V->getContext(), Elts); - Dummy->replaceAllUsesWith(NewMD); - VM[V] = NewMD; - MDNode::deleteTemporary(Dummy); - return NewMD; - } + if (!isa<LocalAsMetadata>(MD) && (Flags & RF_NoModuleLevelChanges)) + return VM[V] = const_cast<Value *>(V); - VM[V] = const_cast<Value*>(V); - MDNode::deleteTemporary(Dummy); + auto *MappedMD = MapMetadata(MD, VM, Flags, TypeMapper, Materializer); + if (MD == MappedMD || (!MappedMD && (Flags & RF_IgnoreMissingEntries))) + return VM[V] = const_cast<Value *>(V); - // No operands needed remapping. Use an identity mapping. - return const_cast<Value*>(V); + // FIXME: This assert crashes during bootstrap, but I think it should be + // correct. For now, just match behaviour from before the metadata/value + // split. + // + // assert(MappedMD && "Referenced metadata value not in value map"); + return VM[V] = MetadataAsValue::get(V->getContext(), MappedMD); } // Okay, this either must be a constant (which may or may not be mappable) or @@ -177,6 +144,198 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, return VM[V] = ConstantPointerNull::get(cast<PointerType>(NewTy)); } +static Metadata *mapToMetadata(ValueToValueMapTy &VM, const Metadata *Key, + Metadata *Val) { + VM.MD()[Key].reset(Val); + return Val; +} + +static Metadata *mapToSelf(ValueToValueMapTy &VM, const Metadata *MD) { + return mapToMetadata(VM, MD, const_cast<Metadata *>(MD)); +} + +static Metadata *MapMetadataImpl(const Metadata *MD, + SmallVectorImpl<MDNode *> &Cycles, + ValueToValueMapTy &VM, RemapFlags Flags, + ValueMapTypeRemapper *TypeMapper, + ValueMaterializer *Materializer); + +static Metadata *mapMetadataOp(Metadata *Op, SmallVectorImpl<MDNode *> &Cycles, + ValueToValueMapTy &VM, RemapFlags Flags, + ValueMapTypeRemapper *TypeMapper, + ValueMaterializer *Materializer) { + if (!Op) + return nullptr; + if (Metadata *MappedOp = + MapMetadataImpl(Op, Cycles, VM, Flags, TypeMapper, Materializer)) + return MappedOp; + // Use identity map if MappedOp is null and we can ignore missing entries. + if (Flags & RF_IgnoreMissingEntries) + return Op; + + // FIXME: This assert crashes during bootstrap, but I think it should be + // correct. For now, just match behaviour from before the metadata/value + // split. + // + // llvm_unreachable("Referenced metadata not in value map!"); + return nullptr; +} + +/// \brief Remap nodes. +/// +/// Insert \c NewNode in the value map, and then remap \c OldNode's operands. +/// Assumes that \c NewNode is already a clone of \c OldNode. +/// +/// \pre \c NewNode is a clone of \c OldNode. 
+static bool remap(const MDNode *OldNode, MDNode *NewNode, + SmallVectorImpl<MDNode *> &Cycles, ValueToValueMapTy &VM, + RemapFlags Flags, ValueMapTypeRemapper *TypeMapper, + ValueMaterializer *Materializer) { + assert(OldNode->getNumOperands() == NewNode->getNumOperands() && + "Expected nodes to match"); + assert(OldNode->isResolved() && "Expected resolved node"); + assert(!NewNode->isUniqued() && "Expected non-uniqued node"); + + // Map the node upfront so it's available for cyclic references. + mapToMetadata(VM, OldNode, NewNode); + bool AnyChanged = false; + for (unsigned I = 0, E = OldNode->getNumOperands(); I != E; ++I) { + Metadata *Old = OldNode->getOperand(I); + assert(NewNode->getOperand(I) == Old && + "Expected old operands to already be in place"); + + Metadata *New = mapMetadataOp(OldNode->getOperand(I), Cycles, VM, Flags, + TypeMapper, Materializer); + if (Old != New) { + AnyChanged = true; + NewNode->replaceOperandWith(I, New); + } + } + + return AnyChanged; +} + +/// \brief Map a distinct MDNode. +/// +/// Distinct nodes are not uniqued, so they must always recreated. +static Metadata *mapDistinctNode(const MDNode *Node, + SmallVectorImpl<MDNode *> &Cycles, + ValueToValueMapTy &VM, RemapFlags Flags, + ValueMapTypeRemapper *TypeMapper, + ValueMaterializer *Materializer) { + assert(Node->isDistinct() && "Expected distinct node"); + + MDNode *NewMD = MDNode::replaceWithDistinct(Node->clone()); + remap(Node, NewMD, Cycles, VM, Flags, TypeMapper, Materializer); + + // Track any cycles beneath this node. + for (Metadata *Op : NewMD->operands()) + if (auto *Node = dyn_cast_or_null<MDNode>(Op)) + if (!Node->isResolved()) + Cycles.push_back(Node); + + return NewMD; +} + +/// \brief Map a uniqued MDNode. +/// +/// Uniqued nodes may not need to be recreated (they may map to themselves). +static Metadata *mapUniquedNode(const MDNode *Node, + SmallVectorImpl<MDNode *> &Cycles, + ValueToValueMapTy &VM, RemapFlags Flags, + ValueMapTypeRemapper *TypeMapper, + ValueMaterializer *Materializer) { + assert(Node->isUniqued() && "Expected uniqued node"); + + // Create a temporary node upfront in case we have a metadata cycle. + auto ClonedMD = Node->clone(); + if (!remap(Node, ClonedMD.get(), Cycles, VM, Flags, TypeMapper, Materializer)) + // No operands changed, so use the identity mapping. + return mapToSelf(VM, Node); + + // At least one operand has changed, so uniquify the cloned node. + return mapToMetadata(VM, Node, + MDNode::replaceWithUniqued(std::move(ClonedMD))); +} + +static Metadata *MapMetadataImpl(const Metadata *MD, + SmallVectorImpl<MDNode *> &Cycles, + ValueToValueMapTy &VM, RemapFlags Flags, + ValueMapTypeRemapper *TypeMapper, + ValueMaterializer *Materializer) { + // If the value already exists in the map, use it. + if (Metadata *NewMD = VM.MD().lookup(MD).get()) + return NewMD; + + if (isa<MDString>(MD)) + return mapToSelf(VM, MD); + + if (isa<ConstantAsMetadata>(MD)) + if ((Flags & RF_NoModuleLevelChanges)) + return mapToSelf(VM, MD); + + if (const auto *VMD = dyn_cast<ValueAsMetadata>(MD)) { + Value *MappedV = + MapValue(VMD->getValue(), VM, Flags, TypeMapper, Materializer); + if (VMD->getValue() == MappedV || + (!MappedV && (Flags & RF_IgnoreMissingEntries))) + return mapToSelf(VM, MD); + + // FIXME: This assert crashes during bootstrap, but I think it should be + // correct. For now, just match behaviour from before the metadata/value + // split. 
+ // + // assert(MappedV && "Referenced metadata not in value map!"); + if (MappedV) + return mapToMetadata(VM, MD, ValueAsMetadata::get(MappedV)); + return nullptr; + } + + const MDNode *Node = cast<MDNode>(MD); + assert(Node->isResolved() && "Unexpected unresolved node"); + + // If this is a module-level metadata and we know that nothing at the + // module level is changing, then use an identity mapping. + if (Flags & RF_NoModuleLevelChanges) + return mapToSelf(VM, MD); + + if (Node->isDistinct()) + return mapDistinctNode(Node, Cycles, VM, Flags, TypeMapper, Materializer); + + return mapUniquedNode(Node, Cycles, VM, Flags, TypeMapper, Materializer); +} + +Metadata *llvm::MapMetadata(const Metadata *MD, ValueToValueMapTy &VM, + RemapFlags Flags, ValueMapTypeRemapper *TypeMapper, + ValueMaterializer *Materializer) { + SmallVector<MDNode *, 8> Cycles; + Metadata *NewMD = + MapMetadataImpl(MD, Cycles, VM, Flags, TypeMapper, Materializer); + + // Resolve cycles underneath MD. + if (NewMD && NewMD != MD) { + if (auto *N = dyn_cast<MDNode>(NewMD)) + if (!N->isResolved()) + N->resolveCycles(); + + for (MDNode *N : Cycles) + if (!N->isResolved()) + N->resolveCycles(); + } else { + // Shouldn't get unresolved cycles if nothing was remapped. + assert(Cycles.empty() && "Expected no unresolved cycles"); + } + + return NewMD; +} + +MDNode *llvm::MapMetadata(const MDNode *MD, ValueToValueMapTy &VM, + RemapFlags Flags, ValueMapTypeRemapper *TypeMapper, + ValueMaterializer *Materializer) { + return cast<MDNode>(MapMetadata(static_cast<const Metadata *>(MD), VM, Flags, + TypeMapper, Materializer)); +} + /// RemapInstruction - Convert the instruction operands from referencing the /// current values into those specified by VMap. /// @@ -215,7 +374,7 @@ void llvm::RemapInstruction(Instruction *I, ValueToValueMapTy &VMap, ME = MDs.end(); MI != ME; ++MI) { MDNode *Old = MI->second; - MDNode *New = MapValue(Old, VMap, Flags, TypeMapper, Materializer); + MDNode *New = MapMetadata(Old, VMap, Flags, TypeMapper, Materializer); if (New != Old) I->setMetadata(MI->first, New); } diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp index b4991bc..525c050 100644 --- a/lib/Transforms/Vectorize/BBVectorize.cpp +++ b/lib/Transforms/Vectorize/BBVectorize.cpp @@ -201,14 +201,16 @@ namespace { initializeBBVectorizePass(*PassRegistry::getPassRegistry()); } - BBVectorize(Pass *P, const VectorizeConfig &C) + BBVectorize(Pass *P, Function &F, const VectorizeConfig &C) : BasicBlockPass(ID), Config(C) { AA = &P->getAnalysis<AliasAnalysis>(); DT = &P->getAnalysis<DominatorTreeWrapperPass>().getDomTree(); SE = &P->getAnalysis<ScalarEvolution>(); DataLayoutPass *DLP = P->getAnalysisIfAvailable<DataLayoutPass>(); DL = DLP ? &DLP->getDataLayout() : nullptr; - TTI = IgnoreTargetInfo ? nullptr : &P->getAnalysis<TargetTransformInfo>(); + TTI = IgnoreTargetInfo + ? nullptr + : &P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); } typedef std::pair<Value *, Value *> ValuePair; @@ -442,7 +444,10 @@ namespace { SE = &getAnalysis<ScalarEvolution>(); DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); DL = DLP ? &DLP->getDataLayout() : nullptr; - TTI = IgnoreTargetInfo ? nullptr : &getAnalysis<TargetTransformInfo>(); + TTI = IgnoreTargetInfo + ? 
nullptr + : &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( + *BB.getParent()); return vectorizeBB(BB); } @@ -452,7 +457,7 @@ namespace { AU.addRequired<AliasAnalysis>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<ScalarEvolution>(); - AU.addRequired<TargetTransformInfo>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); AU.addPreserved<AliasAnalysis>(); AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<ScalarEvolution>(); @@ -1277,7 +1282,7 @@ namespace { CostSavings, FixedOrder)) continue; // J is a candidate for merging with I. - if (!PairableInsts.size() || + if (PairableInsts.empty() || PairableInsts[PairableInsts.size()-1] != I) { PairableInsts.push_back(I); } @@ -2609,7 +2614,6 @@ namespace { true, o, 1)); NewI1->insertBefore(IBeforeJ ? J : I); I1 = NewI1; - I1T = I2T; I1Elem = I2Elem; } else if (I1Elem > I2Elem) { std::vector<Constant *> Mask(I1Elem); @@ -2626,8 +2630,6 @@ namespace { true, o, 1)); NewI2->insertBefore(IBeforeJ ? J : I); I2 = NewI2; - I2T = I1T; - I2Elem = I1Elem; } // Now that both I1 and I2 are the same length we can shuffle them @@ -3195,7 +3197,7 @@ char BBVectorize::ID = 0; static const char bb_vectorize_name[] = "Basic-Block Vectorization"; INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) -INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) @@ -3206,7 +3208,7 @@ BasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) { bool llvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C) { - BBVectorize BBVectorizer(P, C); + BBVectorize BBVectorizer(P, *BB.getParent(), C); return BBVectorizer.vectorizeBB(BB); } diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt index 07967d8..905c069 100644 --- a/lib/Transforms/Vectorize/CMakeLists.txt +++ b/lib/Transforms/Vectorize/CMakeLists.txt @@ -3,6 +3,9 @@ add_llvm_library(LLVMVectorize Vectorize.cpp LoopVectorize.cpp SLPVectorizer.cpp + + ADDITIONAL_HEADER_DIRS + ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms ) add_dependencies(LLVMVectorize intrinsics_gen) diff --git a/lib/Transforms/Vectorize/LLVMBuild.txt b/lib/Transforms/Vectorize/LLVMBuild.txt index b57ce6c..be00294 100644 --- a/lib/Transforms/Vectorize/LLVMBuild.txt +++ b/lib/Transforms/Vectorize/LLVMBuild.txt @@ -20,4 +20,4 @@ type = Library name = Vectorize parent = Transforms library_name = Vectorize -required_libraries = Analysis Core Support Target TransformUtils +required_libraries = Analysis Core Support TransformUtils diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 35b2ecf..6142306 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -55,9 +55,10 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" -#include "llvm/Analysis/AssumptionTracker.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/LoopPass.h" @@ -105,15 +106,6 @@ using namespace llvm::PatternMatch; 
STATISTIC(LoopsVectorized, "Number of loops vectorized"); STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); -static cl::opt<unsigned> -VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, - cl::desc("Sets the SIMD width. Zero is autoselect.")); - -static cl::opt<unsigned> -VectorizationInterleave("force-vector-interleave", cl::init(0), cl::Hidden, - cl::desc("Sets the vectorization interleave count. " - "Zero is autoselect.")); - static cl::opt<bool> EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, cl::desc("Enable if-conversion during vectorization.")); @@ -144,13 +136,6 @@ static cl::opt<bool> EnableMemAccessVersioning( /// We don't unroll loops with a known constant trip count below this number. static const unsigned TinyTripCountUnrollThreshold = 128; -/// When performing memory disambiguation checks at runtime do not make more -/// than this number of comparisons. -static const unsigned RuntimeMemoryCheckThreshold = 8; - -/// Maximum simd width. -static const unsigned MaxVectorWidth = 64; - static cl::opt<unsigned> ForceTargetNumScalarRegs( "force-target-num-scalar-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of scalar registers.")); @@ -218,27 +203,19 @@ class LoopVectorizationLegality; class LoopVectorizationCostModel; class LoopVectorizeHints; -/// Optimization analysis message produced during vectorization. Messages inform -/// the user why vectorization did not occur. -class Report { - std::string Message; - raw_string_ostream Out; - Instruction *Instr; - +/// \brief This modifies LoopAccessReport to initialize message with +/// loop-vectorizer-specific part. +class VectorizationReport : public LoopAccessReport { public: - Report(Instruction *I = nullptr) : Out(Message), Instr(I) { - Out << "loop not vectorized: "; - } - - template <typename A> Report &operator<<(const A &Value) { - Out << Value; - return *this; - } - - Instruction *getInstr() { return Instr; } - - std::string &str() { return Out.str(); } - operator Twine() { return Out.str(); } + VectorizationReport(Instruction *I = nullptr) + : LoopAccessReport("loop not vectorized: ", I) {} + + /// \brief This allows promotion of the loop-access analysis report into the + /// loop-vectorizer report. It modifies the message to add the + /// loop-vectorizer-specific part of the message. + explicit VectorizationReport(const LoopAccessReport &R) + : LoopAccessReport(Twine("loop not vectorized: ") + R.str(), + R.getInstr()) {} }; /// InnerLoopVectorizer vectorizes loops which contain only one basic @@ -293,13 +270,6 @@ protected: typedef DenseMap<std::pair<BasicBlock*, BasicBlock*>, VectorParts> EdgeMaskCache; - /// \brief Add code that checks at runtime if the accessed arrays overlap. - /// - /// Returns a pair of instructions where the first element is the first - /// instruction generated in possibly a sequence of instructions and the - /// second value is the final comparator value or NULL if no check is needed. - std::pair<Instruction *, Instruction *> addRuntimeCheck(Instruction *Loc); - /// \brief Add checks for strides that where assumed to be 1. /// /// Returns the last check instruction and the first check instruction in the @@ -355,10 +325,9 @@ protected: /// element. virtual Value *getBroadcastInstrs(Value *V); - /// This function adds 0, 1, 2 ... to each vector element, starting at zero. - /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...). - /// The sequence starts at StartIndex. 
- virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate); + /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...) + /// to each vector element of Val. The sequence starts at StartIndex. + virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step); /// When we go over instructions in the basic block we rely on previous /// values within the current basic block or on loop invariant values. @@ -479,7 +448,7 @@ private: bool IfPredicateStore = false) override; void vectorizeMemoryInstruction(Instruction *Instr) override; Value *getBroadcastInstrs(Value *V) override; - Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate) override; + Value *getStepVector(Value *Val, int StartIdx, Value *Step) override; Value *reverseVector(Value *Vec) override; }; @@ -574,17 +543,14 @@ static void propagateMetadata(SmallVectorImpl<Value *> &To, const Instruction *F /// induction variable and the different reduction variables. class LoopVectorizationLegality { public: - unsigned NumLoads; - unsigned NumStores; - unsigned NumPredStores; - LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, const DataLayout *DL, DominatorTree *DT, TargetLibraryInfo *TLI, - AliasAnalysis *AA, Function *F) - : NumLoads(0), NumStores(0), NumPredStores(0), TheLoop(L), SE(SE), DL(DL), - DT(DT), TLI(TLI), AA(AA), TheFunction(F), Induction(nullptr), - WidestIndTy(nullptr), HasFunNoNaNAttr(false), MaxSafeDepDistBytes(-1U) { - } + AliasAnalysis *AA, Function *F, + const TargetTransformInfo *TTI, + LoopAccessAnalysis *LAA) + : NumPredStores(0), TheLoop(L), SE(SE), DL(DL), + TLI(TLI), TheFunction(F), TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), + Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false) {} /// This enum represents the kinds of reductions that we support. enum ReductionKind { @@ -602,11 +568,9 @@ public: /// This enum represents the kinds of inductions that we support. enum InductionKind { - IK_NoInduction, ///< Not an induction variable. - IK_IntInduction, ///< Integer induction variable. Step = 1. - IK_ReverseIntInduction, ///< Reverse int induction variable. Step = -1. - IK_PtrInduction, ///< Pointer induction var. Step = sizeof(elem). - IK_ReversePtrInduction ///< Reverse ptr indvar. Step = - sizeof(elem). + IK_NoInduction, ///< Not an induction variable. + IK_IntInduction, ///< Integer induction variable. Step = C. + IK_PtrInduction ///< Pointer induction var. Step = C / sizeof(elem). }; // This enum represents the kind of minmax reduction. @@ -657,51 +621,69 @@ public: MinMaxReductionKind MinMaxKind; }; - /// This struct holds information about the memory runtime legality - /// check that a group of pointers do not overlap. - struct RuntimePointerCheck { - RuntimePointerCheck() : Need(false) {} - - /// Reset the state of the pointer runtime information. - void reset() { - Need = false; - Pointers.clear(); - Starts.clear(); - Ends.clear(); - IsWritePtr.clear(); - DependencySetId.clear(); - AliasSetId.clear(); + /// A struct for saving information about induction variables. 
+ struct InductionInfo { + InductionInfo(Value *Start, InductionKind K, ConstantInt *Step) + : StartValue(Start), IK(K), StepValue(Step) { + assert(IK != IK_NoInduction && "Not an induction"); + assert(StartValue && "StartValue is null"); + assert(StepValue && !StepValue->isZero() && "StepValue is zero"); + assert((IK != IK_PtrInduction || StartValue->getType()->isPointerTy()) && + "StartValue is not a pointer for pointer induction"); + assert((IK != IK_IntInduction || StartValue->getType()->isIntegerTy()) && + "StartValue is not an integer for integer induction"); + assert(StepValue->getType()->isIntegerTy() && + "StepValue is not an integer"); + } + InductionInfo() + : StartValue(nullptr), IK(IK_NoInduction), StepValue(nullptr) {} + + /// Get the consecutive direction. Returns: + /// 0 - unknown or non-consecutive. + /// 1 - consecutive and increasing. + /// -1 - consecutive and decreasing. + int getConsecutiveDirection() const { + if (StepValue && (StepValue->isOne() || StepValue->isMinusOne())) + return StepValue->getSExtValue(); + return 0; } - /// Insert a pointer and calculate the start and end SCEVs. - void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr, - unsigned DepSetId, unsigned ASId, ValueToValueMap &Strides); - - /// This flag indicates if we need to add the runtime check. - bool Need; - /// Holds the pointers that we need to check. - SmallVector<TrackingVH<Value>, 2> Pointers; - /// Holds the pointer value at the beginning of the loop. - SmallVector<const SCEV*, 2> Starts; - /// Holds the pointer value at the end of the loop. - SmallVector<const SCEV*, 2> Ends; - /// Holds the information if this pointer is used for writing to memory. - SmallVector<bool, 2> IsWritePtr; - /// Holds the id of the set of pointers that could be dependent because of a - /// shared underlying object. - SmallVector<unsigned, 2> DependencySetId; - /// Holds the id of the disjoint alias set to which this pointer belongs. - SmallVector<unsigned, 2> AliasSetId; - }; + /// Compute the transformed value of Index at offset StartValue using step + /// StepValue. + /// For integer induction, returns StartValue + Index * StepValue. + /// For pointer induction, returns StartValue[Index * StepValue]. + /// FIXME: The newly created binary instructions should contain nsw/nuw + /// flags, which can be found from the original scalar operations. + Value *transform(IRBuilder<> &B, Value *Index) const { + switch (IK) { + case IK_IntInduction: + assert(Index->getType() == StartValue->getType() && + "Index type does not match StartValue type"); + if (StepValue->isMinusOne()) + return B.CreateSub(StartValue, Index); + if (!StepValue->isOne()) + Index = B.CreateMul(Index, StepValue); + return B.CreateAdd(StartValue, Index); + + case IK_PtrInduction: + if (StepValue->isMinusOne()) + Index = B.CreateNeg(Index); + else if (!StepValue->isOne()) + Index = B.CreateMul(Index, StepValue); + return B.CreateGEP(StartValue, Index); + + case IK_NoInduction: + return nullptr; + } + llvm_unreachable("invalid enum"); + } - /// A struct for saving information about induction variables. - struct InductionInfo { - InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {} - InductionInfo() : StartValue(nullptr), IK(IK_NoInduction) {} /// Start value. TrackingVH<Value> StartValue; /// Induction kind. InductionKind IK; + /// Step value. 
+ ConstantInt *StepValue; }; /// ReductionList contains the reduction descriptors for all @@ -753,13 +735,19 @@ public: bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); } /// Returns the information that we collected about runtime memory check. - RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; } + const LoopAccessInfo::RuntimePointerCheck *getRuntimePointerCheck() const { + return LAI->getRuntimePointerCheck(); + } + + const LoopAccessInfo *getLAI() const { + return LAI; + } /// This function returns the identity element (or neutral element) for /// the operation K. static Constant *getReductionIdentity(ReductionKind K, Type *Tp); - unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; } + unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); } bool hasStride(Value *V) { return StrideSet.count(V); } bool mustCheckStrides() { return !StrideSet.empty(); } @@ -768,6 +756,30 @@ public: } SmallPtrSet<Value *, 8>::iterator strides_end() { return StrideSet.end(); } + /// Returns true if the target machine supports masked store operation + /// for the given \p DataType and kind of access to \p Ptr. + bool isLegalMaskedStore(Type *DataType, Value *Ptr) { + return TTI->isLegalMaskedStore(DataType, isConsecutivePtr(Ptr)); + } + /// Returns true if the target machine supports masked load operation + /// for the given \p DataType and kind of access to \p Ptr. + bool isLegalMaskedLoad(Type *DataType, Value *Ptr) { + return TTI->isLegalMaskedLoad(DataType, isConsecutivePtr(Ptr)); + } + /// Returns true if vector representation of the instruction \p I + /// requires mask. + bool isMaskRequired(const Instruction* I) { + return (MaskedOp.count(I) != 0); + } + unsigned getNumStores() const { + return LAI->getNumStores(); + } + unsigned getNumLoads() const { + return LAI->getNumLoads(); + } + unsigned getNumPredStores() const { + return NumPredStores; + } private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count @@ -806,40 +818,45 @@ private: /// pattern corresponding to a min(X, Y) or max(X, Y). static ReductionInstDesc isMinMaxSelectCmpPattern(Instruction *I, ReductionInstDesc &Prev); - /// Returns the induction kind of Phi. This function may return NoInduction - /// if the PHI is not an induction variable. - InductionKind isInductionVariable(PHINode *Phi); + /// Returns the induction kind of Phi and record the step. This function may + /// return NoInduction if the PHI is not an induction variable. + InductionKind isInductionVariable(PHINode *Phi, ConstantInt *&StepValue); /// \brief Collect memory access with loop invariant strides. /// /// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop /// invariant. - void collectStridedAcccess(Value *LoadOrStoreInst); + void collectStridedAccess(Value *LoadOrStoreInst); /// Report an analysis message to assist the user in diagnosing loops that are - /// not vectorized. - void emitAnalysis(Report &Message) { - DebugLoc DL = TheLoop->getStartLoc(); - if (Instruction *I = Message.getInstr()) - DL = I->getDebugLoc(); - emitOptimizationRemarkAnalysis(TheFunction->getContext(), DEBUG_TYPE, - *TheFunction, DL, Message.str()); + /// not vectorized. These are handled as LoopAccessReport rather than + /// VectorizationReport because the << operator of VectorizationReport returns + /// LoopAccessReport. 
+ void emitAnalysis(const LoopAccessReport &Message) { + LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, LV_NAME); } + unsigned NumPredStores; + /// The loop that we evaluate. Loop *TheLoop; /// Scev analysis. ScalarEvolution *SE; /// DataLayout analysis. const DataLayout *DL; - /// Dominators. - DominatorTree *DT; /// Target Library Info. TargetLibraryInfo *TLI; - /// Alias analysis. - AliasAnalysis *AA; /// Parent function Function *TheFunction; + /// Target Transform Info + const TargetTransformInfo *TTI; + /// Dominator Tree. + DominatorTree *DT; + // LoopAccess analysis. + LoopAccessAnalysis *LAA; + // And the loop-accesses info corresponding to this loop. This pointer is + // null until canVectorizeMemory sets it up. + const LoopAccessInfo *LAI; // --- vectorization state --- // @@ -861,16 +878,16 @@ private: /// This set holds the variables which are known to be uniform after /// vectorization. SmallPtrSet<Instruction*, 4> Uniforms; - /// We need to check that all of the pointers in this list are disjoint - /// at runtime. - RuntimePointerCheck PtrRtCheck; + /// Can we assume the absence of NaNs. bool HasFunNoNaNAttr; - unsigned MaxSafeDepDistBytes; - ValueToValueMap Strides; SmallPtrSet<Value *, 8> StrideSet; + + /// While vectorizing these instructions we have to generate a + /// call to the appropriate masked intrinsic + SmallPtrSet<const Instruction*, 8> MaskedOp; }; /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -886,11 +903,11 @@ public: LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const DataLayout *DL, const TargetLibraryInfo *TLI, - AssumptionTracker *AT, const Function *F, + AssumptionCache *AC, const Function *F, const LoopVectorizeHints *Hints) : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI), TheFunction(F), Hints(Hints) { - CodeMetrics::collectEphemeralValues(L, AT, EphValues); + CodeMetrics::collectEphemeralValues(L, AC, EphValues); } /// Information about vectorization costs @@ -951,13 +968,11 @@ private: bool isConsecutiveLoadOrStore(Instruction *I); /// Report an analysis message to assist the user in diagnosing loops that are - /// not vectorized. - void emitAnalysis(Report &Message) { - DebugLoc DL = TheLoop->getStartLoc(); - if (Instruction *I = Message.getInstr()) - DL = I->getDebugLoc(); - emitOptimizationRemarkAnalysis(TheFunction->getContext(), DEBUG_TYPE, - *TheFunction, DL, Message.str()); + /// not vectorized. These are handled as LoopAccessReport rather than + /// VectorizationReport because the << operator of VectorizationReport returns + /// LoopAccessReport. + void emitAnalysis(const LoopAccessReport &Message) { + LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, LV_NAME); } /// Values used only by @llvm.assume calls. 
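Aside (not part of the patch): the InductionInfo helpers introduced in the hunks above reduce to simple arithmetic. The doc comment on transform() states the intent — StartValue + Index * StepValue for integer inductions, StartValue[Index * StepValue] for pointer inductions — and the standalone C++ sketch below models just that. Function names, the int element type, and the sample values are illustrative assumptions, not taken from the patch.

#include <cassert>
#include <cstdint>

// Integer induction: transform(B, Index) computes StartValue + Index * StepValue.
int64_t intInductionAt(int64_t start, int64_t step, int64_t index) {
  return start + index * step;
}

// Pointer induction: transform(B, Index) emits a GEP, i.e. &start[index * step].
const int *ptrInductionAt(const int *start, int64_t step, int64_t index) {
  return start + index * step;
}

// getConsecutiveDirection(): +1 or -1 only for unit strides, 0 otherwise.
int consecutiveDirection(int64_t step) {
  if (step == 1 || step == -1)
    return static_cast<int>(step);
  return 0;
}

int main() {
  assert(intInductionAt(/*start=*/10, /*step=*/3, /*index=*/4) == 22);
  assert(consecutiveDirection(-1) == -1 && consecutiveDirection(3) == 0);
  static const int buf[8] = {};
  assert(ptrInductionAt(buf, /*step=*/2, /*index=*/3) == buf + 6);
  return 0;
}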
@@ -1010,7 +1025,7 @@ class LoopVectorizeHints { bool validate(unsigned Val) { switch (Kind) { case HK_WIDTH: - return isPowerOf2_32(Val) && Val <= MaxVectorWidth; + return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth; case HK_UNROLL: return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor; case HK_FORCE: @@ -1038,7 +1053,8 @@ public: }; LoopVectorizeHints(const Loop *L, bool DisableInterleaving) - : Width("vectorize.width", VectorizationFactor, HK_WIDTH), + : Width("vectorize.width", VectorizerParams::VectorizationFactor, + HK_WIDTH), Interleave("interleave.count", DisableInterleaving, HK_UNROLL), Force("vectorize.enable", FK_Undefined, HK_FORCE), TheLoop(L) { @@ -1046,8 +1062,8 @@ public: getHintsFromMetadata(); // force-vector-interleave overrides DisableInterleaving. - if (VectorizationInterleave.getNumOccurrences() > 0) - Interleave.Value = VectorizationInterleave; + if (VectorizerParams::isInterleaveForced()) + Interleave.Value = VectorizerParams::VectorizationInterleave; DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs() << "LV: Interleaving disabled by the pass manager\n"); @@ -1062,7 +1078,7 @@ public: /// Dumps all the hint information. std::string emitRemark() const { - Report R; + VectorizationReport R; if (Force.Value == LoopVectorizeHints::FK_Disabled) R << "vectorization is explicitly disabled"; else { @@ -1097,7 +1113,7 @@ private: for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { const MDString *S = nullptr; - SmallVector<Value*, 4> Args; + SmallVector<Metadata *, 4> Args; // The expected hint is either a MDString or a MDNode with the first // operand a MDString. @@ -1123,12 +1139,12 @@ private: } /// Checks string hint with one operand and set value if valid. - void setHint(StringRef Name, Value *Arg) { + void setHint(StringRef Name, Metadata *Arg) { if (!Name.startswith(Prefix())) return; Name = Name.substr(Prefix().size(), StringRef::npos); - const ConstantInt *C = dyn_cast<ConstantInt>(Arg); + const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg); if (!C) return; unsigned Val = C->getZExtValue(); @@ -1147,9 +1163,10 @@ private: /// Create a new hint from name / value pair. MDNode *createHintMetadata(StringRef Name, unsigned V) const { LLVMContext &Context = TheLoop->getHeader()->getContext(); - Value *Vals[] = {MDString::get(Context, Name), - ConstantInt::get(Type::getInt32Ty(Context), V)}; - return MDNode::get(Context, Vals); + Metadata *MDs[] = {MDString::get(Context, Name), + ConstantAsMetadata::get( + ConstantInt::get(Type::getInt32Ty(Context), V))}; + return MDNode::get(Context, MDs); } /// Matches metadata with hint name. @@ -1170,7 +1187,7 @@ private: return; // Reserve the first element to LoopID (see below). - SmallVector<Value*, 4> Vals(1); + SmallVector<Metadata *, 4> MDs(1); // If the loop already has metadata, then ignore the existing operands. MDNode *LoopID = TheLoop->getLoopID(); if (LoopID) { @@ -1178,25 +1195,21 @@ private: MDNode *Node = cast<MDNode>(LoopID->getOperand(i)); // If node in update list, ignore old value. if (!matchesHintMetadataName(Node, HintTypes)) - Vals.push_back(Node); + MDs.push_back(Node); } } // Now, add the missing hints. for (auto H : HintTypes) - Vals.push_back( - createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value)); + MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value)); // Replace current metadata node with new one. 
LLVMContext &Context = TheLoop->getHeader()->getContext(); - MDNode *NewLoopID = MDNode::get(Context, Vals); + MDNode *NewLoopID = MDNode::get(Context, MDs); // Set operand 0 to refer to the loop id itself. NewLoopID->replaceOperandWith(0, NewLoopID); TheLoop->setLoopID(NewLoopID); - if (LoopID) - LoopID->replaceAllUsesWith(NewLoopID); - LoopID = NewLoopID; } /// The loop these hints belong to. @@ -1248,7 +1261,8 @@ struct LoopVectorize : public FunctionPass { BlockFrequencyInfo *BFI; TargetLibraryInfo *TLI; AliasAnalysis *AA; - AssumptionTracker *AT; + AssumptionCache *AC; + LoopAccessAnalysis *LAA; bool DisableUnrolling; bool AlwaysVectorize; @@ -1258,13 +1272,15 @@ struct LoopVectorize : public FunctionPass { SE = &getAnalysis<ScalarEvolution>(); DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); DL = DLP ? &DLP->getDataLayout() : nullptr; - LI = &getAnalysis<LoopInfo>(); - TTI = &getAnalysis<TargetTransformInfo>(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); BFI = &getAnalysis<BlockFrequencyInfo>(); - TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); + auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); + TLI = TLIP ? &TLIP->getTLI() : nullptr; AA = &getAnalysis<AliasAnalysis>(); - AT = &getAnalysis<AssumptionTracker>(); + AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + LAA = &getAnalysis<LoopAccessAnalysis>(); // Compute some weights outside of the loop over the loops. Compute this // using a BranchProbability to re-use its scaling math. @@ -1375,7 +1391,7 @@ struct LoopVectorize : public FunctionPass { } // Check if it is legal to vectorize the loop. - LoopVectorizationLegality LVL(L, SE, DL, DT, TLI, AA, F); + LoopVectorizationLegality LVL(L, SE, DL, DT, TLI, AA, F, TTI, LAA); if (!LVL.canVectorize()) { DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); emitMissedWarning(F, L, Hints); @@ -1383,7 +1399,7 @@ struct LoopVectorize : public FunctionPass { } // Use the cost model. - LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI, AT, F, + LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI, AC, F, &Hints); // Check the function attributes to find out if this function should be @@ -1471,16 +1487,17 @@ struct LoopVectorize : public FunctionPass { } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionTracker>(); + AU.addRequired<AssumptionCacheTracker>(); AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); AU.addRequired<BlockFrequencyInfo>(); AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfo>(); + AU.addRequired<LoopInfoWrapperPass>(); AU.addRequired<ScalarEvolution>(); - AU.addRequired<TargetTransformInfo>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); AU.addRequired<AliasAnalysis>(); - AU.addPreserved<LoopInfo>(); + AU.addRequired<LoopAccessAnalysis>(); + AU.addPreserved<LoopInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<AliasAnalysis>(); } @@ -1494,65 +1511,6 @@ struct LoopVectorize : public FunctionPass { // LoopVectorizationCostModel. 
//===----------------------------------------------------------------------===// -static Value *stripIntegerCast(Value *V) { - if (CastInst *CI = dyn_cast<CastInst>(V)) - if (CI->getOperand(0)->getType()->isIntegerTy()) - return CI->getOperand(0); - return V; -} - -///\brief Replaces the symbolic stride in a pointer SCEV expression by one. -/// -/// If \p OrigPtr is not null, use it to look up the stride value instead of -/// \p Ptr. -static const SCEV *replaceSymbolicStrideSCEV(ScalarEvolution *SE, - ValueToValueMap &PtrToStride, - Value *Ptr, Value *OrigPtr = nullptr) { - - const SCEV *OrigSCEV = SE->getSCEV(Ptr); - - // If there is an entry in the map return the SCEV of the pointer with the - // symbolic stride replaced by one. - ValueToValueMap::iterator SI = PtrToStride.find(OrigPtr ? OrigPtr : Ptr); - if (SI != PtrToStride.end()) { - Value *StrideVal = SI->second; - - // Strip casts. - StrideVal = stripIntegerCast(StrideVal); - - // Replace symbolic stride by one. - Value *One = ConstantInt::get(StrideVal->getType(), 1); - ValueToValueMap RewriteMap; - RewriteMap[StrideVal] = One; - - const SCEV *ByOne = - SCEVParameterRewriter::rewrite(OrigSCEV, *SE, RewriteMap, true); - DEBUG(dbgs() << "LV: Replacing SCEV: " << *OrigSCEV << " by: " << *ByOne - << "\n"); - return ByOne; - } - - // Otherwise, just return the SCEV of the original pointer. - return SE->getSCEV(Ptr); -} - -void LoopVectorizationLegality::RuntimePointerCheck::insert( - ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr, unsigned DepSetId, - unsigned ASId, ValueToValueMap &Strides) { - // Get the stride replaced scev. - const SCEV *Sc = replaceSymbolicStrideSCEV(SE, Strides, Ptr); - const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc); - assert(AR && "Invalid addrec expression"); - const SCEV *Ex = SE->getBackedgeTakenCount(Lp); - const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE); - Pointers.push_back(Ptr); - Starts.push_back(AR->getStart()); - Ends.push_back(ScEnd); - IsWritePtr.push_back(WritePtr); - DependencySetId.push_back(DepSetId); - AliasSetId.push_back(ASId); -} - Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { // We need to place the broadcast of invariant variables outside the loop. Instruction *Instr = dyn_cast<Instruction>(V); @@ -1572,11 +1530,13 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { return Shuf; } -Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx, - bool Negate) { +Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, + Value *Step) { assert(Val->getType()->isVectorTy() && "Must be a vector"); assert(Val->getType()->getScalarType()->isIntegerTy() && "Elem must be an integer"); + assert(Step->getType() == Val->getType()->getScalarType() && + "Step has wrong type"); // Create the types. Type *ITy = Val->getType()->getScalarType(); VectorType *Ty = cast<VectorType>(Val->getType()); @@ -1584,15 +1544,18 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx, SmallVector<Constant*, 8> Indices; // Create a vector of consecutive numbers from zero to VF. - for (int i = 0; i < VLen; ++i) { - int64_t Idx = Negate ? (-i) : i; - Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx, Negate)); - } + for (int i = 0; i < VLen; ++i) + Indices.push_back(ConstantInt::get(ITy, StartIdx + i)); // Add the consecutive indices to the vector value. 
Constant *Cv = ConstantVector::get(Indices); assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); - return Builder.CreateAdd(Val, Cv, "induction"); + Step = Builder.CreateVectorSplat(VLen, Step); + assert(Step->getType() == Val->getType() && "Invalid step vec"); + // FIXME: The newly created binary instructions should contain nsw/nuw flags, + // which can be found from the original scalar operations. + Step = Builder.CreateMul(Cv, Step); + return Builder.CreateAdd(Val, Step, "induction"); } /// \brief Find the operand of the GEP that should be checked for consecutive @@ -1630,10 +1593,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr); if (Phi && Inductions.count(Phi)) { InductionInfo II = Inductions[Phi]; - if (IK_PtrInduction == II.IK) - return 1; - else if (IK_ReversePtrInduction == II.IK) - return -1; + return II.getConsecutiveDirection(); } GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr); @@ -1658,10 +1618,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { return 0; InductionInfo II = Inductions[Phi]; - if (IK_PtrInduction == II.IK) - return 1; - else if (IK_ReversePtrInduction == II.IK) - return -1; + return II.getConsecutiveDirection(); } unsigned InductionOperand = getGEPInductionOperand(DL, Gep); @@ -1711,7 +1668,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { } bool LoopVectorizationLegality::isUniform(Value *V) { - return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop)); + return LAI->isUniform(V); } InnerLoopVectorizer::VectorParts& @@ -1763,7 +1720,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy); unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF; - if (SI && Legal->blockNeedsPredication(SI->getParent())) + if (SI && Legal->blockNeedsPredication(SI->getParent()) && + !Legal->isMaskRequired(SI)) return scalarizeInstruction(Instr, true); if (ScalarAllocatedSize != VectorElementSize) @@ -1832,6 +1790,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { Ptr = Builder.CreateExtractElement(PtrVal[0], Zero); } + VectorParts Mask = createBlockInMask(Instr->getParent()); // Handle Stores: if (SI) { assert(!Legal->isUniform(SI->getPointerOperand()) && @@ -1840,7 +1799,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { // We don't want to update the value in the map as it might be used in // another expression. So don't use a reference type for "StoredVal". VectorParts StoredVal = getVectorValue(SI->getValueOperand()); - + for (unsigned Part = 0; Part < UF; ++Part) { // Calculate the pointer for the specific unroll-part. Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); @@ -1853,12 +1812,18 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { // wide store needs to start at the last vector element. 
PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); + Mask[Part] = reverseVector(Mask[Part]); } Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); - StoreInst *NewSI = - Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment); + + Instruction *NewSI; + if (Legal->isMaskRequired(SI)) + NewSI = Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment, + Mask[Part]); + else + NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment); propagateMetadata(NewSI, SI); } return; @@ -1873,14 +1838,21 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { if (Reverse) { // If the address is consecutive but reversed, then the - // wide store needs to start at the last vector element. + // wide load needs to start at the last vector element. PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); + Mask[Part] = reverseVector(Mask[Part]); } + Instruction* NewLI; Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); - LoadInst *NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load"); + if (Legal->isMaskRequired(LI)) + NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part], + UndefValue::get(DataTy), + "wide.masked.load"); + else + NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load"); propagateMetadata(NewLI, LI); Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI; } @@ -1958,7 +1930,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1)); CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); LoopVectorBody.push_back(CondBlock); - VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase()); + VectorLp->addBasicBlockToLoop(CondBlock, *LI); // Update Builder with newly created basic block. 
Builder.SetInsertPoint(InsertPt); } @@ -1987,7 +1959,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic if (IfPredicateStore) { BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); LoopVectorBody.push_back(NewIfBlock); - VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase()); + VectorLp->addBasicBlockToLoop(NewIfBlock, *LI); Builder.SetInsertPoint(InsertPt); Instruction *OldBr = IfBlock->getTerminator(); BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); @@ -2044,102 +2016,6 @@ InnerLoopVectorizer::addStrideCheck(Instruction *Loc) { return std::make_pair(FirstInst, TheCheck); } -std::pair<Instruction *, Instruction *> -InnerLoopVectorizer::addRuntimeCheck(Instruction *Loc) { - LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck = - Legal->getRuntimePointerCheck(); - - Instruction *tnullptr = nullptr; - if (!PtrRtCheck->Need) - return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr); - - unsigned NumPointers = PtrRtCheck->Pointers.size(); - SmallVector<TrackingVH<Value> , 2> Starts; - SmallVector<TrackingVH<Value> , 2> Ends; - - LLVMContext &Ctx = Loc->getContext(); - SCEVExpander Exp(*SE, "induction"); - Instruction *FirstInst = nullptr; - - for (unsigned i = 0; i < NumPointers; ++i) { - Value *Ptr = PtrRtCheck->Pointers[i]; - const SCEV *Sc = SE->getSCEV(Ptr); - - if (SE->isLoopInvariant(Sc, OrigLoop)) { - DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" << - *Ptr <<"\n"); - Starts.push_back(Ptr); - Ends.push_back(Ptr); - } else { - DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr << '\n'); - unsigned AS = Ptr->getType()->getPointerAddressSpace(); - - // Use this type for pointer arithmetic. - Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS); - - Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc); - Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc); - Starts.push_back(Start); - Ends.push_back(End); - } - } - - IRBuilder<> ChkBuilder(Loc); - // Our instructions might fold to a constant. - Value *MemoryRuntimeCheck = nullptr; - for (unsigned i = 0; i < NumPointers; ++i) { - for (unsigned j = i+1; j < NumPointers; ++j) { - // No need to check if two readonly pointers intersect. - if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j]) - continue; - - // Only need to check pointers between two different dependency sets. - if (PtrRtCheck->DependencySetId[i] == PtrRtCheck->DependencySetId[j]) - continue; - // Only need to check pointers in the same alias set. 
- if (PtrRtCheck->AliasSetId[i] != PtrRtCheck->AliasSetId[j]) - continue; - - unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace(); - unsigned AS1 = Starts[j]->getType()->getPointerAddressSpace(); - - assert((AS0 == Ends[j]->getType()->getPointerAddressSpace()) && - (AS1 == Ends[i]->getType()->getPointerAddressSpace()) && - "Trying to bounds check pointers with different address spaces"); - - Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0); - Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1); - - Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy0, "bc"); - Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy1, "bc"); - Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy1, "bc"); - Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy0, "bc"); - - Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0"); - FirstInst = getFirstInst(FirstInst, Cmp0, Loc); - Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1"); - FirstInst = getFirstInst(FirstInst, Cmp1, Loc); - Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict"); - FirstInst = getFirstInst(FirstInst, IsConflict, Loc); - if (MemoryRuntimeCheck) { - IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, - "conflict.rdx"); - FirstInst = getFirstInst(FirstInst, IsConflict, Loc); - } - MemoryRuntimeCheck = IsConflict; - } - } - - // We have to do this trickery because the IRBuilder might fold the check to a - // constant expression in which case there is no Instruction anchored in a - // the block. - Instruction *Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck, - ConstantInt::getTrue(Ctx)); - ChkBuilder.Insert(Check, "memcheck.conflict"); - FirstInst = getFirstInst(FirstInst, Check, Loc); - return std::make_pair(FirstInst, Check); -} - void InnerLoopVectorizer::createEmptyLoop() { /* In this function we generate a new loop. The new loop will contain @@ -2265,13 +2141,13 @@ void InnerLoopVectorizer::createEmptyLoop() { // before calling any utilities such as SCEV that require valid LoopInfo. if (ParentLoop) { ParentLoop->addChildLoop(Lp); - ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase()); - ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); - ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); + ParentLoop->addBasicBlockToLoop(ScalarPH, *LI); + ParentLoop->addBasicBlockToLoop(VectorPH, *LI); + ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI); } else { LI->addTopLevelLoop(Lp); } - Lp->addBasicBlockToLoop(VecBody, LI->getBase()); + Lp->addBasicBlockToLoop(VecBody, *LI); // Use this IR builder to create the loop instructions (Phi, Br, Cmp) // inside the loop. 
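Aside (not part of the patch): getStepVector(Val, StartIdx, Step), whose IRBuilder implementation appears a few hunks above, produces lane i = Val[i] + (StartIdx + i) * Step via a constant index vector, a splat of Step, a multiply, and an add. A scalar C++ model of the lane values, with std::vector standing in for the IR vector and names chosen here purely for illustration:

#include <cassert>
#include <cstdint>
#include <vector>

// Scalar model of getStepVector: each lane of val is offset by
// (startIdx + lane) * step, matching the splat-multiply-add sequence
// the patch builds with IRBuilder.
std::vector<int64_t> stepVector(const std::vector<int64_t> &val,
                                int64_t startIdx, int64_t step) {
  std::vector<int64_t> result(val.size());
  for (size_t i = 0; i < val.size(); ++i)
    result[i] = val[i] + (startIdx + static_cast<int64_t>(i)) * step;
  return result;
}

int main() {
  // A broadcast induction value of 100, unroll part starting at lane 4,
  // induction step of 2: lanes become 108, 110, 112, 114.
  std::vector<int64_t> broadcast(4, 100);
  std::vector<int64_t> lanes = stepVector(broadcast, /*startIdx=*/4, /*step=*/2);
  assert(lanes[0] == 108 && lanes[3] == 114);
  return 0;
}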
@@ -2326,7 +2202,7 @@ void InnerLoopVectorizer::createEmptyLoop() { BasicBlock *CheckBlock = LastBypassBlock->splitBasicBlock(PastOverflowCheck, "overflow.checked"); if (ParentLoop) - ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase()); + ParentLoop->addBasicBlockToLoop(CheckBlock, *LI); LoopBypassBlocks.push_back(CheckBlock); Instruction *OldTerm = LastBypassBlock->getTerminator(); BranchInst::Create(ScalarPH, CheckBlock, CheckBCOverflow, OldTerm); @@ -2346,7 +2222,7 @@ void InnerLoopVectorizer::createEmptyLoop() { BasicBlock *CheckBlock = LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck"); if (ParentLoop) - ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase()); + ParentLoop->addBasicBlockToLoop(CheckBlock, *LI); LoopBypassBlocks.push_back(CheckBlock); // Replace the branch into the memory check block with a conditional branch @@ -2364,13 +2240,13 @@ void InnerLoopVectorizer::createEmptyLoop() { // faster. Instruction *MemRuntimeCheck; std::tie(FirstCheckInst, MemRuntimeCheck) = - addRuntimeCheck(LastBypassBlock->getTerminator()); + Legal->getLAI()->addRuntimeCheck(LastBypassBlock->getTerminator()); if (MemRuntimeCheck) { // Create a new block containing the memory check. BasicBlock *CheckBlock = - LastBypassBlock->splitBasicBlock(MemRuntimeCheck, "vector.memcheck"); + LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.memcheck"); if (ParentLoop) - ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase()); + ParentLoop->addBasicBlockToLoop(CheckBlock, *LI); LoopBypassBlocks.push_back(CheckBlock); // Replace the branch into the memory check block with a conditional branch @@ -2461,33 +2337,13 @@ void InnerLoopVectorizer::createEmptyLoop() { Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, II.StartValue->getType(), "cast.crd"); - EndValue = BypassBuilder.CreateAdd(CRD, II.StartValue , "ind.end"); - break; - } - case LoopVectorizationLegality::IK_ReverseIntInduction: { - // Convert the CountRoundDown variable to the PHI size. - Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, - II.StartValue->getType(), - "cast.crd"); - // Handle reverse integer induction counter. - EndValue = BypassBuilder.CreateSub(II.StartValue, CRD, "rev.ind.end"); + EndValue = II.transform(BypassBuilder, CRD); + EndValue->setName("ind.end"); break; } case LoopVectorizationLegality::IK_PtrInduction: { - // For pointer induction variables, calculate the offset using - // the end index. - EndValue = BypassBuilder.CreateGEP(II.StartValue, CountRoundDown, - "ptr.ind.end"); - break; - } - case LoopVectorizationLegality::IK_ReversePtrInduction: { - // The value at the end of the loop for the reverse pointer is calculated - // by creating a GEP with a negative index starting from the start value. - Value *Zero = ConstantInt::get(CountRoundDown->getType(), 0); - Value *NegIdx = BypassBuilder.CreateSub(Zero, CountRoundDown, - "rev.ind.end"); - EndValue = BypassBuilder.CreateGEP(II.StartValue, NegIdx, - "rev.ptr.ind.end"); + EndValue = II.transform(BypassBuilder, CountRoundDown); + EndValue->setName("ptr.ind.end"); break; } }// end of case @@ -2835,9 +2691,6 @@ void InnerLoopVectorizer::vectorizeLoop() { } // Fix the vector-loop phi. - // We created the induction variable so we know that the - // preheader is the first entry. - BasicBlock *VecPreheader = Induction->getIncomingBlock(0); // Reductions do not have to start at zero. They can start with // any loop invariant values. 
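Aside (not part of the patch): with this change the memcheck block takes its condition from Legal->getLAI()->addRuntimeCheck() instead of the local addRuntimeCheck deleted earlier in the diff. The pairwise test that code expanded to is: two pointer ranges may conflict exactly when start0 <= end1 and start1 <= end0 (both unsigned), ORed over all candidate pairs. A minimal C++ model of one such pairwise predicate, with illustrative names and values:

#include <cassert>
#include <cstdint>

// Model of one pairwise runtime memory check: the [start, end] ranges of two
// pointers are flagged as a possible conflict exactly when they overlap,
// mirroring the two ULE comparisons ANDed together in the emitted IR.
bool mayConflict(uint64_t start0, uint64_t end0,
                 uint64_t start1, uint64_t end1) {
  return start0 <= end1 && start1 <= end0;
}

int main() {
  assert(mayConflict(0, 16, 8, 24));   // overlapping ranges -> conflict
  assert(!mayConflict(0, 16, 32, 48)); // disjoint ranges -> no conflict
  return 0;
}

As the deleted code shows, pairs of read-only pointers and pointers in the same dependence set are skipped before this comparison is ever emitted.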
@@ -2849,7 +2702,8 @@ void InnerLoopVectorizer::vectorizeLoop() { // Make sure to add the reduction stat value only to the // first unroll part. Value *StartVal = (part == 0) ? VectorStart : Identity; - cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, VecPreheader); + cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, + LoopVectorPreHeader); cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part], LoopVectorBody.back()); } @@ -3104,6 +2958,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, LoopVectorizationLegality::InductionInfo II = Legal->getInductionVars()->lookup(P); + // FIXME: The newly created binary instructions should contain nsw/nuw flags, + // which can be found from the original scalar operations. switch (II.IK) { case LoopVectorizationLegality::IK_NoInduction: llvm_unreachable("Unknown induction"); @@ -3121,80 +2977,42 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx, "normalized.idx"); NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy); - Broadcasted = Builder.CreateAdd(II.StartValue, NormalizedIdx, - "offset.idx"); + Broadcasted = II.transform(Builder, NormalizedIdx); + Broadcasted->setName("offset.idx"); } Broadcasted = getBroadcastInstrs(Broadcasted); // After broadcasting the induction variable we need to make the vector // consecutive by adding 0, 1, 2, etc. for (unsigned part = 0; part < UF; ++part) - Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false); + Entry[part] = getStepVector(Broadcasted, VF * part, II.StepValue); return; } - case LoopVectorizationLegality::IK_ReverseIntInduction: case LoopVectorizationLegality::IK_PtrInduction: - case LoopVectorizationLegality::IK_ReversePtrInduction: - // Handle reverse integer and pointer inductions. - Value *StartIdx = ExtendedIdx; - // This is the normalized GEP that starts counting at zero. - Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx, - "normalized.idx"); - - // Handle the reverse integer induction variable case. - if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) { - IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType()); - Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy, - "resize.norm.idx"); - Value *ReverseInd = Builder.CreateSub(II.StartValue, CNI, - "reverse.idx"); - - // This is a new value so do not hoist it out. - Value *Broadcasted = getBroadcastInstrs(ReverseInd); - // After broadcasting the induction variable we need to make the - // vector consecutive by adding ... -3, -2, -1, 0. - for (unsigned part = 0; part < UF; ++part) - Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part, - true); - return; - } - // Handle the pointer induction variable case. assert(P->getType()->isPointerTy() && "Unexpected type."); - - // Is this a reverse induction ptr or a consecutive induction ptr. - bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction == - II.IK); - + // This is the normalized GEP that starts counting at zero. + Value *NormalizedIdx = + Builder.CreateSub(Induction, ExtendedIdx, "normalized.idx"); // This is the vector of results. Notice that we don't generate // vector geps because scalar geps result in better code. for (unsigned part = 0; part < UF; ++part) { if (VF == 1) { - int EltIndex = (part) * (Reverse ? 
-1 : 1); + int EltIndex = part; Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex); - Value *GlobalIdx; - if (Reverse) - GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx"); - else - GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx"); - - Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, - "next.gep"); + Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx); + Value *SclrGep = II.transform(Builder, GlobalIdx); + SclrGep->setName("next.gep"); Entry[part] = SclrGep; continue; } Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); for (unsigned int i = 0; i < VF; ++i) { - int EltIndex = (i + part * VF) * (Reverse ? -1 : 1); + int EltIndex = i + part * VF; Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex); - Value *GlobalIdx; - if (!Reverse) - GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx"); - else - GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx"); - - Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, - "next.gep"); + Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx); + Value *SclrGep = II.transform(Builder, GlobalIdx); + SclrGep->setName("next.gep"); VecVal = Builder.CreateInsertElement(VecVal, SclrGep, Builder.getInt32(i), "insert.gep"); @@ -3214,7 +3032,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // Nothing to do for PHIs and BR, since we already took care of the // loop control flow instructions. continue; - case Instruction::PHI:{ + case Instruction::PHI: { // Vectorize PHINodes. widenPHIInstruction(it, Entry, UF, VF, PV); continue; @@ -3335,8 +3153,12 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction, CI->getType()); Value *Broadcasted = getBroadcastInstrs(ScalarCast); + LoopVectorizationLegality::InductionInfo II = + Legal->getInductionVars()->lookup(OldInduction); + Constant *Step = + ConstantInt::getSigned(CI->getType(), II.StepValue->getSExtValue()); for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part] = getConsecutiveVector(Broadcasted, VF * Part, false); + Entry[Part] = getStepVector(Broadcasted, VF * Part, Step); propagateMetadata(Entry, it); break; } @@ -3452,7 +3274,7 @@ static bool canIfConvertPHINodes(BasicBlock *BB) { bool LoopVectorizationLegality::canVectorizeWithIfConvert() { if (!EnableIfConversion) { - emitAnalysis(Report() << "if-conversion is disabled"); + emitAnalysis(VectorizationReport() << "if-conversion is disabled"); return false; } @@ -3485,7 +3307,7 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { // We don't support switch statements inside loops. if (!isa<BranchInst>(BB->getTerminator())) { - emitAnalysis(Report(BB->getTerminator()) + emitAnalysis(VectorizationReport(BB->getTerminator()) << "loop contains a switch statement"); return false; } @@ -3493,12 +3315,12 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { // We must be able to predicate all blocks that need to be predicated. 
if (blockNeedsPredication(BB)) { if (!blockCanBePredicated(BB, SafePointes)) { - emitAnalysis(Report(BB->getTerminator()) + emitAnalysis(VectorizationReport(BB->getTerminator()) << "control flow cannot be substituted for a select"); return false; } } else if (BB != Header && !canIfConvertPHINodes(BB)) { - emitAnalysis(Report(BB->getTerminator()) + emitAnalysis(VectorizationReport(BB->getTerminator()) << "control flow cannot be substituted for a select"); return false; } @@ -3513,27 +3335,40 @@ bool LoopVectorizationLegality::canVectorize() { // be canonicalized. if (!TheLoop->getLoopPreheader()) { emitAnalysis( - Report() << "loop control flow is not understood by vectorizer"); + VectorizationReport() << + "loop control flow is not understood by vectorizer"); return false; } // We can only vectorize innermost loops. - if (TheLoop->getSubLoopsVector().size()) { - emitAnalysis(Report() << "loop is not the innermost loop"); + if (!TheLoop->getSubLoopsVector().empty()) { + emitAnalysis(VectorizationReport() << "loop is not the innermost loop"); return false; } // We must have a single backedge. if (TheLoop->getNumBackEdges() != 1) { emitAnalysis( - Report() << "loop control flow is not understood by vectorizer"); + VectorizationReport() << + "loop control flow is not understood by vectorizer"); return false; } // We must have a single exiting block. if (!TheLoop->getExitingBlock()) { emitAnalysis( - Report() << "loop control flow is not understood by vectorizer"); + VectorizationReport() << + "loop control flow is not understood by vectorizer"); + return false; + } + + // We only handle bottom-tested loops, i.e. loop in which the condition is + // checked at the end of each iteration. With that we can assume that all + // instructions in the loop are executed the same number of times. + if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { + emitAnalysis( + VectorizationReport() << + "loop control flow is not understood by vectorizer"); return false; } @@ -3551,7 +3386,8 @@ bool LoopVectorizationLegality::canVectorize() { // ScalarEvolution needs to be able to find the exit count. const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop); if (ExitCount == SE->getCouldNotCompute()) { - emitAnalysis(Report() << "could not determine number of loop iterations"); + emitAnalysis(VectorizationReport() << + "could not determine number of loop iterations"); DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); return false; } @@ -3572,7 +3408,8 @@ bool LoopVectorizationLegality::canVectorize() { collectLoopUniforms(); DEBUG(dbgs() << "LV: We can vectorize this loop" << - (PtrRtCheck.Need ? " (with a runtime bound check)" : "") + (LAI->getRuntimePointerCheck()->Need ? " (with a runtime bound check)" : + "") <<"!\n"); // Okay! We can vectorize. At this point we don't have any other mem analysis @@ -3627,9 +3464,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Look for the attribute signaling the absence of NaNs. Function &F = *Header->getParent(); if (F.hasFnAttribute("no-nans-fp-math")) - HasFunNoNaNAttr = F.getAttributes().getAttribute( - AttributeSet::FunctionIndex, - "no-nans-fp-math").getValueAsString() == "true"; + HasFunNoNaNAttr = + F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true"; // For each block in the loop. 
for (Loop::block_iterator bb = TheLoop->block_begin(), @@ -3645,7 +3481,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() && !PhiTy->isPointerTy()) { - emitAnalysis(Report(it) + emitAnalysis(VectorizationReport(it) << "loop control flow is not understood by vectorizer"); DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); return false; @@ -3659,14 +3495,15 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // identified reduction value with an outside user. if (!hasOutsideLoopUser(TheLoop, it, AllowedExit)) continue; - emitAnalysis(Report(it) << "value could not be identified as " - "an induction or reduction variable"); + emitAnalysis(VectorizationReport(it) << + "value could not be identified as " + "an induction or reduction variable"); return false; } - // We only allow if-converted PHIs with more than two incoming values. + // We only allow if-converted PHIs with exactly two incoming values. if (Phi->getNumIncomingValues() != 2) { - emitAnalysis(Report(it) + emitAnalysis(VectorizationReport(it) << "control flow not understood by vectorizer"); DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); return false; @@ -3674,8 +3511,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // This is the value coming from the preheader. Value *StartValue = Phi->getIncomingValueForBlock(PreHeader); + ConstantInt *StepValue = nullptr; // Check if this is an induction variable. - InductionKind IK = isInductionVariable(Phi); + InductionKind IK = isInductionVariable(Phi, StepValue); if (IK_NoInduction != IK) { // Get the widest type. @@ -3685,7 +3523,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { WidestIndTy = getWiderType(*DL, PhiTy, WidestIndTy); // Int inductions are special because we only allow one IV. - if (IK == IK_IntInduction) { + if (IK == IK_IntInduction && StepValue->isOne()) { // Use the phi node with the widest type as induction. Use the last // one if there are multiple (no good reason for doing this other // than it is expedient). @@ -3694,13 +3532,14 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { } DEBUG(dbgs() << "LV: Found an induction variable.\n"); - Inductions[Phi] = InductionInfo(StartValue, IK); + Inductions[Phi] = InductionInfo(StartValue, IK, StepValue); // Until we explicitly handle the case of an induction variable with // an outside loop user we have to give up vectorizing this loop. if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) { - emitAnalysis(Report(it) << "use of induction value outside of the " - "loop is not handled by vectorizer"); + emitAnalysis(VectorizationReport(it) << + "use of induction value outside of the " + "loop is not handled by vectorizer"); return false; } @@ -3745,8 +3584,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { continue; } - emitAnalysis(Report(it) << "value that could not be identified as " - "reduction is used outside the loop"); + emitAnalysis(VectorizationReport(it) << + "value that could not be identified as " + "reduction is used outside the loop"); DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n"); return false; }// end of PHI handling @@ -3755,7 +3595,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // calls and we do handle certain intrinsic and libm functions. 
CallInst *CI = dyn_cast<CallInst>(it); if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI)) { - emitAnalysis(Report(it) << "call instruction cannot be vectorized"); + emitAnalysis(VectorizationReport(it) << + "call instruction cannot be vectorized"); DEBUG(dbgs() << "LV: Found a call site.\n"); return false; } @@ -3765,7 +3606,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (CI && hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) { if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) { - emitAnalysis(Report(it) + emitAnalysis(VectorizationReport(it) << "intrinsic instruction cannot be vectorized"); DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n"); return false; @@ -3776,7 +3617,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Also, we can't vectorize extractelement instructions. if ((!VectorType::isValidElementType(it->getType()) && !it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) { - emitAnalysis(Report(it) + emitAnalysis(VectorizationReport(it) << "instruction return type cannot be vectorized"); DEBUG(dbgs() << "LV: Found unvectorizable type.\n"); return false; @@ -3786,21 +3627,23 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (StoreInst *ST = dyn_cast<StoreInst>(it)) { Type *T = ST->getValueOperand()->getType(); if (!VectorType::isValidElementType(T)) { - emitAnalysis(Report(ST) << "store instruction cannot be vectorized"); + emitAnalysis(VectorizationReport(ST) << + "store instruction cannot be vectorized"); return false; } if (EnableMemAccessVersioning) - collectStridedAcccess(ST); + collectStridedAccess(ST); } if (EnableMemAccessVersioning) if (LoadInst *LI = dyn_cast<LoadInst>(it)) - collectStridedAcccess(LI); + collectStridedAccess(LI); // Reduction instructions are allowed to have exit users. // All other instructions must not have external users. if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) { - emitAnalysis(Report(it) << "value cannot be used outside the loop"); + emitAnalysis(VectorizationReport(it) << + "value cannot be used outside the loop"); return false; } @@ -3811,7 +3654,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (!Induction) { DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); if (Inductions.empty()) { - emitAnalysis(Report() + emitAnalysis(VectorizationReport() << "loop induction variable could not be identified"); return false; } @@ -3933,7 +3776,7 @@ static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, return Stride; } -void LoopVectorizationLegality::collectStridedAcccess(Value *MemAccess) { +void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) { Value *Ptr = nullptr; if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess)) Ptr = LI->getPointerOperand(); @@ -3971,7 +3814,7 @@ void LoopVectorizationLegality::collectLoopUniforms() { if (I->getType()->isPointerTy() && isConsecutivePtr(I)) Worklist.insert(Worklist.end(), I->op_begin(), I->op_end()); - while (Worklist.size()) { + while (!Worklist.empty()) { Instruction *I = dyn_cast<Instruction>(Worklist.back()); Worklist.pop_back(); @@ -3989,962 +3832,12 @@ void LoopVectorizationLegality::collectLoopUniforms() { } } -namespace { -/// \brief Analyses memory accesses in a loop. -/// -/// Checks whether run time pointer checks are needed and builds sets for data -/// dependence checking. -class AccessAnalysis { -public: - /// \brief Read or write access location. 
- typedef PointerIntPair<Value *, 1, bool> MemAccessInfo; - typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet; - - /// \brief Set of potential dependent memory accesses. - typedef EquivalenceClasses<MemAccessInfo> DepCandidates; - - AccessAnalysis(const DataLayout *Dl, AliasAnalysis *AA, DepCandidates &DA) : - DL(Dl), AST(*AA), DepCands(DA), IsRTCheckNeeded(false) {} - - /// \brief Register a load and whether it is only read from. - void addLoad(AliasAnalysis::Location &Loc, bool IsReadOnly) { - Value *Ptr = const_cast<Value*>(Loc.Ptr); - AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags); - Accesses.insert(MemAccessInfo(Ptr, false)); - if (IsReadOnly) - ReadOnlyPtr.insert(Ptr); - } - - /// \brief Register a store. - void addStore(AliasAnalysis::Location &Loc) { - Value *Ptr = const_cast<Value*>(Loc.Ptr); - AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags); - Accesses.insert(MemAccessInfo(Ptr, true)); - } - - /// \brief Check whether we can check the pointers at runtime for - /// non-intersection. - bool canCheckPtrAtRT(LoopVectorizationLegality::RuntimePointerCheck &RtCheck, - unsigned &NumComparisons, ScalarEvolution *SE, - Loop *TheLoop, ValueToValueMap &Strides, - bool ShouldCheckStride = false); - - /// \brief Goes over all memory accesses, checks whether a RT check is needed - /// and builds sets of dependent accesses. - void buildDependenceSets() { - processMemAccesses(); - } - - bool isRTCheckNeeded() { return IsRTCheckNeeded; } - - bool isDependencyCheckNeeded() { return !CheckDeps.empty(); } - void resetDepChecks() { CheckDeps.clear(); } - - MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; } - -private: - typedef SetVector<MemAccessInfo> PtrAccessSet; - - /// \brief Go over all memory access and check whether runtime pointer checks - /// are needed /// and build sets of dependency check candidates. - void processMemAccesses(); - - /// Set of all accesses. - PtrAccessSet Accesses; - - /// Set of accesses that need a further dependence check. - MemAccessInfoSet CheckDeps; - - /// Set of pointers that are read only. - SmallPtrSet<Value*, 16> ReadOnlyPtr; - - const DataLayout *DL; - - /// An alias set tracker to partition the access set by underlying object and - //intrinsic property (such as TBAA metadata). - AliasSetTracker AST; - - /// Sets of potentially dependent accesses - members of one set share an - /// underlying pointer. The set "CheckDeps" identfies which sets really need a - /// dependence check. - DepCandidates &DepCands; - - bool IsRTCheckNeeded; -}; - -} // end anonymous namespace - -/// \brief Check whether a pointer can participate in a runtime bounds check. -static bool hasComputableBounds(ScalarEvolution *SE, ValueToValueMap &Strides, - Value *Ptr) { - const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, Strides, Ptr); - const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev); - if (!AR) - return false; - - return AR->isAffine(); -} - -/// \brief Check the stride of the pointer and ensure that it does not wrap in -/// the address space. -static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr, - const Loop *Lp, ValueToValueMap &StridesMap); - -bool AccessAnalysis::canCheckPtrAtRT( - LoopVectorizationLegality::RuntimePointerCheck &RtCheck, - unsigned &NumComparisons, ScalarEvolution *SE, Loop *TheLoop, - ValueToValueMap &StridesMap, bool ShouldCheckStride) { - // Find pointers with computable bounds. We are going to use this information - // to place a runtime bound check. 
- bool CanDoRT = true; - - bool IsDepCheckNeeded = isDependencyCheckNeeded(); - NumComparisons = 0; - - // We assign a consecutive id to access from different alias sets. - // Accesses between different groups doesn't need to be checked. - unsigned ASId = 1; - for (auto &AS : AST) { - unsigned NumReadPtrChecks = 0; - unsigned NumWritePtrChecks = 0; - - // We assign consecutive id to access from different dependence sets. - // Accesses within the same set don't need a runtime check. - unsigned RunningDepId = 1; - DenseMap<Value *, unsigned> DepSetId; - - for (auto A : AS) { - Value *Ptr = A.getValue(); - bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true)); - MemAccessInfo Access(Ptr, IsWrite); - - if (IsWrite) - ++NumWritePtrChecks; - else - ++NumReadPtrChecks; - - if (hasComputableBounds(SE, StridesMap, Ptr) && - // When we run after a failing dependency check we have to make sure we - // don't have wrapping pointers. - (!ShouldCheckStride || - isStridedPtr(SE, DL, Ptr, TheLoop, StridesMap) == 1)) { - // The id of the dependence set. - unsigned DepId; - - if (IsDepCheckNeeded) { - Value *Leader = DepCands.getLeaderValue(Access).getPointer(); - unsigned &LeaderId = DepSetId[Leader]; - if (!LeaderId) - LeaderId = RunningDepId++; - DepId = LeaderId; - } else - // Each access has its own dependence set. - DepId = RunningDepId++; - - RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap); - - DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr << '\n'); - } else { - CanDoRT = false; - } - } - - if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2) - NumComparisons += 0; // Only one dependence set. - else { - NumComparisons += (NumWritePtrChecks * (NumReadPtrChecks + - NumWritePtrChecks - 1)); - } - - ++ASId; - } - - // If the pointers that we would use for the bounds comparison have different - // address spaces, assume the values aren't directly comparable, so we can't - // use them for the runtime check. We also have to assume they could - // overlap. In the future there should be metadata for whether address spaces - // are disjoint. - unsigned NumPointers = RtCheck.Pointers.size(); - for (unsigned i = 0; i < NumPointers; ++i) { - for (unsigned j = i + 1; j < NumPointers; ++j) { - // Only need to check pointers between two different dependency sets. - if (RtCheck.DependencySetId[i] == RtCheck.DependencySetId[j]) - continue; - // Only need to check pointers in the same alias set. - if (RtCheck.AliasSetId[i] != RtCheck.AliasSetId[j]) - continue; - - Value *PtrI = RtCheck.Pointers[i]; - Value *PtrJ = RtCheck.Pointers[j]; - - unsigned ASi = PtrI->getType()->getPointerAddressSpace(); - unsigned ASj = PtrJ->getType()->getPointerAddressSpace(); - if (ASi != ASj) { - DEBUG(dbgs() << "LV: Runtime check would require comparison between" - " different address spaces\n"); - return false; - } - } - } - - return CanDoRT; -} - -void AccessAnalysis::processMemAccesses() { - // We process the set twice: first we process read-write pointers, last we - // process read-only pointers. This allows us to skip dependence tests for - // read-only pointers. - - DEBUG(dbgs() << "LV: Processing memory accesses...\n"); - DEBUG(dbgs() << " AST: "; AST.dump()); - DEBUG(dbgs() << "LV: Accesses:\n"); - DEBUG({ - for (auto A : Accesses) - dbgs() << "\t" << *A.getPointer() << " (" << - (A.getInt() ? "write" : (ReadOnlyPtr.count(A.getPointer()) ? 
- "read-only" : "read")) << ")\n"; - }); - - // The AliasSetTracker has nicely partitioned our pointers by metadata - // compatibility and potential for underlying-object overlap. As a result, we - // only need to check for potential pointer dependencies within each alias - // set. - for (auto &AS : AST) { - // Note that both the alias-set tracker and the alias sets themselves used - // linked lists internally and so the iteration order here is deterministic - // (matching the original instruction order within each set). - - bool SetHasWrite = false; - - // Map of pointers to last access encountered. - typedef DenseMap<Value*, MemAccessInfo> UnderlyingObjToAccessMap; - UnderlyingObjToAccessMap ObjToLastAccess; - - // Set of access to check after all writes have been processed. - PtrAccessSet DeferredAccesses; - - // Iterate over each alias set twice, once to process read/write pointers, - // and then to process read-only pointers. - for (int SetIteration = 0; SetIteration < 2; ++SetIteration) { - bool UseDeferred = SetIteration > 0; - PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses; - - for (auto A : AS) { - Value *Ptr = A.getValue(); - bool IsWrite = S.count(MemAccessInfo(Ptr, true)); - - // If we're using the deferred access set, then it contains only reads. - bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite; - if (UseDeferred && !IsReadOnlyPtr) - continue; - // Otherwise, the pointer must be in the PtrAccessSet, either as a read - // or a write. - assert(((IsReadOnlyPtr && UseDeferred) || IsWrite || - S.count(MemAccessInfo(Ptr, false))) && - "Alias-set pointer not in the access set?"); - - MemAccessInfo Access(Ptr, IsWrite); - DepCands.insert(Access); - - // Memorize read-only pointers for later processing and skip them in the - // first round (they need to be checked after we have seen all write - // pointers). Note: we also mark pointer that are not consecutive as - // "read-only" pointers (so that we check "a[b[i]] +="). Hence, we need - // the second check for "!IsWrite". - if (!UseDeferred && IsReadOnlyPtr) { - DeferredAccesses.insert(Access); - continue; - } - - // If this is a write - check other reads and writes for conflicts. If - // this is a read only check other writes for conflicts (but only if - // there is no other write to the ptr - this is an optimization to - // catch "a[i] = a[i] + " without having to do a dependence check). - if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) { - CheckDeps.insert(Access); - IsRTCheckNeeded = true; - } - - if (IsWrite) - SetHasWrite = true; - - // Create sets of pointers connected by a shared alias set and - // underlying object. - typedef SmallVector<Value *, 16> ValueVector; - ValueVector TempObjects; - GetUnderlyingObjects(Ptr, TempObjects, DL); - for (Value *UnderlyingObj : TempObjects) { - UnderlyingObjToAccessMap::iterator Prev = - ObjToLastAccess.find(UnderlyingObj); - if (Prev != ObjToLastAccess.end()) - DepCands.unionSets(Access, Prev->second); - - ObjToLastAccess[UnderlyingObj] = Access; - } - } - } - } -} - -namespace { -/// \brief Checks memory dependences among accesses to the same underlying -/// object to determine whether there vectorization is legal or not (and at -/// which vectorization factor). -/// -/// This class works under the assumption that we already checked that memory -/// locations with different underlying pointers are "must-not alias". -/// We use the ScalarEvolution framework to symbolically evalutate access -/// functions pairs. 
Since we currently don't restructure the loop we can rely -/// on the program order of memory accesses to determine their safety. -/// At the moment we will only deem accesses as safe for: -/// * A negative constant distance assuming program order. -/// -/// Safe: tmp = a[i + 1]; OR a[i + 1] = x; -/// a[i] = tmp; y = a[i]; -/// -/// The latter case is safe because later checks guarantuee that there can't -/// be a cycle through a phi node (that is, we check that "x" and "y" is not -/// the same variable: a header phi can only be an induction or a reduction, a -/// reduction can't have a memory sink, an induction can't have a memory -/// source). This is important and must not be violated (or we have to -/// resort to checking for cycles through memory). -/// -/// * A positive constant distance assuming program order that is bigger -/// than the biggest memory access. -/// -/// tmp = a[i] OR b[i] = x -/// a[i+2] = tmp y = b[i+2]; -/// -/// Safe distance: 2 x sizeof(a[0]), and 2 x sizeof(b[0]), respectively. -/// -/// * Zero distances and all accesses have the same size. -/// -class MemoryDepChecker { -public: - typedef PointerIntPair<Value *, 1, bool> MemAccessInfo; - typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet; - - MemoryDepChecker(ScalarEvolution *Se, const DataLayout *Dl, const Loop *L) - : SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0), - ShouldRetryWithRuntimeCheck(false) {} - - /// \brief Register the location (instructions are given increasing numbers) - /// of a write access. - void addAccess(StoreInst *SI) { - Value *Ptr = SI->getPointerOperand(); - Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx); - InstMap.push_back(SI); - ++AccessIdx; - } - - /// \brief Register the location (instructions are given increasing numbers) - /// of a write access. - void addAccess(LoadInst *LI) { - Value *Ptr = LI->getPointerOperand(); - Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx); - InstMap.push_back(LI); - ++AccessIdx; - } - - /// \brief Check whether the dependencies between the accesses are safe. - /// - /// Only checks sets with elements in \p CheckDeps. - bool areDepsSafe(AccessAnalysis::DepCandidates &AccessSets, - MemAccessInfoSet &CheckDeps, ValueToValueMap &Strides); - - /// \brief The maximum number of bytes of a vector register we can vectorize - /// the accesses safely with. - unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; } - - /// \brief In same cases when the dependency check fails we can still - /// vectorize the loop with a dynamic array access check. - bool shouldRetryWithRuntimeCheck() { return ShouldRetryWithRuntimeCheck; } - -private: - ScalarEvolution *SE; - const DataLayout *DL; - const Loop *InnermostLoop; - - /// \brief Maps access locations (ptr, read/write) to program order. - DenseMap<MemAccessInfo, std::vector<unsigned> > Accesses; - - /// \brief Memory access instructions in program order. - SmallVector<Instruction *, 16> InstMap; - - /// \brief The program order index to be used for the next instruction. - unsigned AccessIdx; - - // We can access this many bytes in parallel safely. - unsigned MaxSafeDepDistBytes; - - /// \brief If we see a non-constant dependence distance we can still try to - /// vectorize this loop with runtime checks. - bool ShouldRetryWithRuntimeCheck; - - /// \brief Check whether there is a plausible dependence between the two - /// accesses. - /// - /// Access \p A must happen before \p B in program order. The two indices - /// identify the index into the program order map. 
- /// - /// This function checks whether there is a plausible dependence (or the - /// absence of such can't be proved) between the two accesses. If there is a - /// plausible dependence but the dependence distance is bigger than one - /// element access it records this distance in \p MaxSafeDepDistBytes (if this - /// distance is smaller than any other distance encountered so far). - /// Otherwise, this function returns true signaling a possible dependence. - bool isDependent(const MemAccessInfo &A, unsigned AIdx, - const MemAccessInfo &B, unsigned BIdx, - ValueToValueMap &Strides); - - /// \brief Check whether the data dependence could prevent store-load - /// forwarding. - bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize); -}; - -} // end anonymous namespace - -static bool isInBoundsGep(Value *Ptr) { - if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) - return GEP->isInBounds(); - return false; -} - -/// \brief Check whether the access through \p Ptr has a constant stride. -static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr, - const Loop *Lp, ValueToValueMap &StridesMap) { - const Type *Ty = Ptr->getType(); - assert(Ty->isPointerTy() && "Unexpected non-ptr"); - - // Make sure that the pointer does not point to aggregate types. - const PointerType *PtrTy = cast<PointerType>(Ty); - if (PtrTy->getElementType()->isAggregateType()) { - DEBUG(dbgs() << "LV: Bad stride - Not a pointer to a scalar type" << *Ptr << - "\n"); - return 0; - } - - const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, StridesMap, Ptr); - - const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev); - if (!AR) { - DEBUG(dbgs() << "LV: Bad stride - Not an AddRecExpr pointer " - << *Ptr << " SCEV: " << *PtrScev << "\n"); - return 0; - } - - // The accesss function must stride over the innermost loop. - if (Lp != AR->getLoop()) { - DEBUG(dbgs() << "LV: Bad stride - Not striding over innermost loop " << - *Ptr << " SCEV: " << *PtrScev << "\n"); - } - - // The address calculation must not wrap. Otherwise, a dependence could be - // inverted. - // An inbounds getelementptr that is a AddRec with a unit stride - // cannot wrap per definition. The unit stride requirement is checked later. - // An getelementptr without an inbounds attribute and unit stride would have - // to access the pointer value "0" which is undefined behavior in address - // space 0, therefore we can also vectorize this case. - bool IsInBoundsGEP = isInBoundsGep(Ptr); - bool IsNoWrapAddRec = AR->getNoWrapFlags(SCEV::NoWrapMask); - bool IsInAddressSpaceZero = PtrTy->getAddressSpace() == 0; - if (!IsNoWrapAddRec && !IsInBoundsGEP && !IsInAddressSpaceZero) { - DEBUG(dbgs() << "LV: Bad stride - Pointer may wrap in the address space " - << *Ptr << " SCEV: " << *PtrScev << "\n"); - return 0; - } - - // Check the step is constant. - const SCEV *Step = AR->getStepRecurrence(*SE); - - // Calculate the pointer stride and check if it is consecutive. - const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); - if (!C) { - DEBUG(dbgs() << "LV: Bad stride - Not a constant strided " << *Ptr << - " SCEV: " << *PtrScev << "\n"); - return 0; - } - - int64_t Size = DL->getTypeAllocSize(PtrTy->getElementType()); - const APInt &APStepVal = C->getValue()->getValue(); - - // Huge step value - give up. - if (APStepVal.getBitWidth() > 64) - return 0; - - int64_t StepVal = APStepVal.getSExtValue(); - - // Strided access. 
- int64_t Stride = StepVal / Size; - int64_t Rem = StepVal % Size; - if (Rem) - return 0; - - // If the SCEV could wrap but we have an inbounds gep with a unit stride we - // know we can't "wrap around the address space". In case of address space - // zero we know that this won't happen without triggering undefined behavior. - if (!IsNoWrapAddRec && (IsInBoundsGEP || IsInAddressSpaceZero) && - Stride != 1 && Stride != -1) - return 0; - - return Stride; -} - -bool MemoryDepChecker::couldPreventStoreLoadForward(unsigned Distance, - unsigned TypeByteSize) { - // If loads occur at a distance that is not a multiple of a feasible vector - // factor store-load forwarding does not take place. - // Positive dependences might cause troubles because vectorizing them might - // prevent store-load forwarding making vectorized code run a lot slower. - // a[i] = a[i-3] ^ a[i-8]; - // The stores to a[i:i+1] don't align with the stores to a[i-3:i-2] and - // hence on your typical architecture store-load forwarding does not take - // place. Vectorizing in such cases does not make sense. - // Store-load forwarding distance. - const unsigned NumCyclesForStoreLoadThroughMemory = 8*TypeByteSize; - // Maximum vector factor. - unsigned MaxVFWithoutSLForwardIssues = MaxVectorWidth*TypeByteSize; - if(MaxSafeDepDistBytes < MaxVFWithoutSLForwardIssues) - MaxVFWithoutSLForwardIssues = MaxSafeDepDistBytes; - - for (unsigned vf = 2*TypeByteSize; vf <= MaxVFWithoutSLForwardIssues; - vf *= 2) { - if (Distance % vf && Distance / vf < NumCyclesForStoreLoadThroughMemory) { - MaxVFWithoutSLForwardIssues = (vf >>=1); - break; - } - } - - if (MaxVFWithoutSLForwardIssues< 2*TypeByteSize) { - DEBUG(dbgs() << "LV: Distance " << Distance << - " that could cause a store-load forwarding conflict\n"); - return true; - } - - if (MaxVFWithoutSLForwardIssues < MaxSafeDepDistBytes && - MaxVFWithoutSLForwardIssues != MaxVectorWidth*TypeByteSize) - MaxSafeDepDistBytes = MaxVFWithoutSLForwardIssues; - return false; -} - -bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, - const MemAccessInfo &B, unsigned BIdx, - ValueToValueMap &Strides) { - assert (AIdx < BIdx && "Must pass arguments in program order"); - - Value *APtr = A.getPointer(); - Value *BPtr = B.getPointer(); - bool AIsWrite = A.getInt(); - bool BIsWrite = B.getInt(); - - // Two reads are independent. - if (!AIsWrite && !BIsWrite) - return false; - - // We cannot check pointers in different address spaces. - if (APtr->getType()->getPointerAddressSpace() != - BPtr->getType()->getPointerAddressSpace()) - return true; - - const SCEV *AScev = replaceSymbolicStrideSCEV(SE, Strides, APtr); - const SCEV *BScev = replaceSymbolicStrideSCEV(SE, Strides, BPtr); - - int StrideAPtr = isStridedPtr(SE, DL, APtr, InnermostLoop, Strides); - int StrideBPtr = isStridedPtr(SE, DL, BPtr, InnermostLoop, Strides); - - const SCEV *Src = AScev; - const SCEV *Sink = BScev; - - // If the induction step is negative we have to invert source and sink of the - // dependence. 
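[Editor's note] couldPreventStoreLoadForward above walks candidate vector widths and rejects any width whose accesses would straddle the forwarding distance. A self-contained model of that loop; the 64-byte cap and the helper name are assumptions for illustration, not values queried from a real target:

#include <cstdio>

// Returns the largest vector width (in bytes) that still allows store-load
// forwarding for a dependence of DistanceBytes, or 0 if no width of at
// least two elements works. MaxVectorBytes is an illustrative cap.
static unsigned maxWidthWithoutForwardingStall(unsigned DistanceBytes,
                                               unsigned TypeByteSize,
                                               unsigned MaxVectorBytes = 64) {
  // Heuristic forwarding window, as in the pass: 8 * element size.
  const unsigned ForwardingWindow = 8 * TypeByteSize;

  unsigned MaxSafe = MaxVectorBytes;
  for (unsigned VFBytes = 2 * TypeByteSize; VFBytes <= MaxVectorBytes;
       VFBytes *= 2) {
    // A distance that is not a multiple of the vector width, but still
    // inside the forwarding window, defeats store-load forwarding.
    if (DistanceBytes % VFBytes && DistanceBytes / VFBytes < ForwardingWindow) {
      MaxSafe = VFBytes / 2;
      break;
    }
  }
  return MaxSafe < 2 * TypeByteSize ? 0 : MaxSafe;
}

int main() {
  // Distance of 12 bytes (a[i] = a[i - 3] with 4-byte elements): unsafe.
  printf("distance 12: max safe width %u bytes\n",
         maxWidthWithoutForwardingStall(12, 4));
  // A distance of 32 bytes is a multiple of every candidate width.
  printf("distance 32: max safe width %u bytes\n",
         maxWidthWithoutForwardingStall(32, 4));
  return 0;
}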
- if (StrideAPtr < 0) { - //Src = BScev; - //Sink = AScev; - std::swap(APtr, BPtr); - std::swap(Src, Sink); - std::swap(AIsWrite, BIsWrite); - std::swap(AIdx, BIdx); - std::swap(StrideAPtr, StrideBPtr); - } - - const SCEV *Dist = SE->getMinusSCEV(Sink, Src); - - DEBUG(dbgs() << "LV: Src Scev: " << *Src << "Sink Scev: " << *Sink - << "(Induction step: " << StrideAPtr << ")\n"); - DEBUG(dbgs() << "LV: Distance for " << *InstMap[AIdx] << " to " - << *InstMap[BIdx] << ": " << *Dist << "\n"); - - // Need consecutive accesses. We don't want to vectorize - // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in - // the address space. - if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr){ - DEBUG(dbgs() << "Non-consecutive pointer access\n"); - return true; - } - - const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist); - if (!C) { - DEBUG(dbgs() << "LV: Dependence because of non-constant distance\n"); - ShouldRetryWithRuntimeCheck = true; - return true; - } - - Type *ATy = APtr->getType()->getPointerElementType(); - Type *BTy = BPtr->getType()->getPointerElementType(); - unsigned TypeByteSize = DL->getTypeAllocSize(ATy); - - // Negative distances are not plausible dependencies. - const APInt &Val = C->getValue()->getValue(); - if (Val.isNegative()) { - bool IsTrueDataDependence = (AIsWrite && !BIsWrite); - if (IsTrueDataDependence && - (couldPreventStoreLoadForward(Val.abs().getZExtValue(), TypeByteSize) || - ATy != BTy)) - return true; - - DEBUG(dbgs() << "LV: Dependence is negative: NoDep\n"); - return false; - } - - // Write to the same location with the same size. - // Could be improved to assert type sizes are the same (i32 == float, etc). - if (Val == 0) { - if (ATy == BTy) - return false; - DEBUG(dbgs() << "LV: Zero dependence difference but different types\n"); - return true; - } - - assert(Val.isStrictlyPositive() && "Expect a positive value"); - - // Positive distance bigger than max vectorization factor. - if (ATy != BTy) { - DEBUG(dbgs() << - "LV: ReadWrite-Write positive dependency with different types\n"); - return false; - } - - unsigned Distance = (unsigned) Val.getZExtValue(); - - // Bail out early if passed-in parameters make vectorization not feasible. - unsigned ForcedFactor = VectorizationFactor ? VectorizationFactor : 1; - unsigned ForcedUnroll = VectorizationInterleave ? VectorizationInterleave : 1; - - // The distance must be bigger than the size needed for a vectorized version - // of the operation and the size of the vectorized operation must not be - // bigger than the currrent maximum size. - if (Distance < 2*TypeByteSize || - 2*TypeByteSize > MaxSafeDepDistBytes || - Distance < TypeByteSize * ForcedUnroll * ForcedFactor) { - DEBUG(dbgs() << "LV: Failure because of Positive distance " - << Val.getSExtValue() << '\n'); - return true; - } - - MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ? - Distance : MaxSafeDepDistBytes; - - bool IsTrueDataDependence = (!AIsWrite && BIsWrite); - if (IsTrueDataDependence && - couldPreventStoreLoadForward(Distance, TypeByteSize)) - return true; - - DEBUG(dbgs() << "LV: Positive distance " << Val.getSExtValue() << - " with max VF = " << MaxSafeDepDistBytes / TypeByteSize << '\n'); - - return false; -} - -bool MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets, - MemAccessInfoSet &CheckDeps, - ValueToValueMap &Strides) { - - MaxSafeDepDistBytes = -1U; - while (!CheckDeps.empty()) { - MemAccessInfo CurAccess = *CheckDeps.begin(); - - // Get the relevant memory access set. 
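[Editor's note] The distance classification in isDependent reduces to a few integer comparisons once the SCEV distance is a constant. A simplified model of the verdict, ignoring the store-load-forwarding refinement and the running MaxSafeDepDistBytes clamp handled above; VF and UF here play the role of the forced vectorization and interleave factors:

#include <cassert>
#include <cstdint>

// DistanceBytes is signed: sink address minus source address.
static bool distanceAllowsVectorization(int64_t DistanceBytes,
                                        unsigned TypeByteSize,
                                        unsigned VF, unsigned UF,
                                        bool SameElementType) {
  // Negative distance: the later access touches earlier memory; with the
  // cycle checks done elsewhere this is not a blocking dependence.
  if (DistanceBytes < 0)
    return true;

  // Zero distance: same location, safe only when the element types match.
  if (DistanceBytes == 0)
    return SameElementType;

  // Positive distance: the vectorized access (VF * UF elements) must fit
  // entirely below the dependence distance.
  uint64_t Needed = uint64_t(TypeByteSize) * VF * UF;
  return uint64_t(DistanceBytes) >= Needed &&
         uint64_t(DistanceBytes) >= 2 * uint64_t(TypeByteSize);
}

int main() {
  // i32 accesses, VF = 4, no interleaving: a[i + 2] = a[i] is too close...
  assert(!distanceAllowsVectorization(8, 4, 4, 1, true));
  // ...but a[i + 8] = a[i] leaves room for a 4-wide vector.
  assert(distanceAllowsVectorization(32, 4, 4, 1, true));
  return 0;
}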
- EquivalenceClasses<MemAccessInfo>::iterator I = - AccessSets.findValue(AccessSets.getLeaderValue(CurAccess)); - - // Check accesses within this set. - EquivalenceClasses<MemAccessInfo>::member_iterator AI, AE; - AI = AccessSets.member_begin(I), AE = AccessSets.member_end(); - - // Check every access pair. - while (AI != AE) { - CheckDeps.erase(*AI); - EquivalenceClasses<MemAccessInfo>::member_iterator OI = std::next(AI); - while (OI != AE) { - // Check every accessing instruction pair in program order. - for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(), - I1E = Accesses[*AI].end(); I1 != I1E; ++I1) - for (std::vector<unsigned>::iterator I2 = Accesses[*OI].begin(), - I2E = Accesses[*OI].end(); I2 != I2E; ++I2) { - if (*I1 < *I2 && isDependent(*AI, *I1, *OI, *I2, Strides)) - return false; - if (*I2 < *I1 && isDependent(*OI, *I2, *AI, *I1, Strides)) - return false; - } - ++OI; - } - AI++; - } - } - return true; -} - bool LoopVectorizationLegality::canVectorizeMemory() { - - typedef SmallVector<Value*, 16> ValueVector; - typedef SmallPtrSet<Value*, 16> ValueSet; - - // Holds the Load and Store *instructions*. - ValueVector Loads; - ValueVector Stores; - - // Holds all the different accesses in the loop. - unsigned NumReads = 0; - unsigned NumReadWrites = 0; - - PtrRtCheck.Pointers.clear(); - PtrRtCheck.Need = false; - - const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); - MemoryDepChecker DepChecker(SE, DL, TheLoop); - - // For each block. - for (Loop::block_iterator bb = TheLoop->block_begin(), - be = TheLoop->block_end(); bb != be; ++bb) { - - // Scan the BB and collect legal loads and stores. - for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; - ++it) { - - // If this is a load, save it. If this instruction can read from memory - // but is not a load, then we quit. Notice that we don't handle function - // calls that read or write. - if (it->mayReadFromMemory()) { - // Many math library functions read the rounding mode. We will only - // vectorize a loop if it contains known function calls that don't set - // the flag. Therefore, it is safe to ignore this read from memory. - CallInst *Call = dyn_cast<CallInst>(it); - if (Call && getIntrinsicIDForCall(Call, TLI)) - continue; - - LoadInst *Ld = dyn_cast<LoadInst>(it); - if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) { - emitAnalysis(Report(Ld) - << "read with atomic ordering or volatile read"); - DEBUG(dbgs() << "LV: Found a non-simple load.\n"); - return false; - } - NumLoads++; - Loads.push_back(Ld); - DepChecker.addAccess(Ld); - continue; - } - - // Save 'store' instructions. Abort if other instructions write to memory. - if (it->mayWriteToMemory()) { - StoreInst *St = dyn_cast<StoreInst>(it); - if (!St) { - emitAnalysis(Report(it) << "instruction cannot be vectorized"); - return false; - } - if (!St->isSimple() && !IsAnnotatedParallel) { - emitAnalysis(Report(St) - << "write with atomic ordering or volatile write"); - DEBUG(dbgs() << "LV: Found a non-simple store.\n"); - return false; - } - NumStores++; - Stores.push_back(St); - DepChecker.addAccess(St); - } - } // Next instr. - } // Next block. - - // Now we have two lists that hold the loads and the stores. - // Next, we find the pointers that they use. - - // Check if we see any stores. If there are no stores, then we don't - // care if the pointers are *restrict*. 
- if (!Stores.size()) { - DEBUG(dbgs() << "LV: Found a read-only loop!\n"); - return true; - } - - AccessAnalysis::DepCandidates DependentAccesses; - AccessAnalysis Accesses(DL, AA, DependentAccesses); - - // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects - // multiple times on the same object. If the ptr is accessed twice, once - // for read and once for write, it will only appear once (on the write - // list). This is okay, since we are going to check for conflicts between - // writes and between reads and writes, but not between reads and reads. - ValueSet Seen; - - ValueVector::iterator I, IE; - for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) { - StoreInst *ST = cast<StoreInst>(*I); - Value* Ptr = ST->getPointerOperand(); - - if (isUniform(Ptr)) { - emitAnalysis( - Report(ST) - << "write to a loop invariant address could not be vectorized"); - DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n"); - return false; - } - - // If we did *not* see this pointer before, insert it to the read-write - // list. At this phase it is only a 'write' list. - if (Seen.insert(Ptr).second) { - ++NumReadWrites; - - AliasAnalysis::Location Loc = AA->getLocation(ST); - // The TBAA metadata could have a control dependency on the predication - // condition, so we cannot rely on it when determining whether or not we - // need runtime pointer checks. - if (blockNeedsPredication(ST->getParent())) - Loc.AATags.TBAA = nullptr; - - Accesses.addStore(Loc); - } - } - - if (IsAnnotatedParallel) { - DEBUG(dbgs() - << "LV: A loop annotated parallel, ignore memory dependency " - << "checks.\n"); - return true; - } - - for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) { - LoadInst *LD = cast<LoadInst>(*I); - Value* Ptr = LD->getPointerOperand(); - // If we did *not* see this pointer before, insert it to the - // read list. If we *did* see it before, then it is already in - // the read-write list. This allows us to vectorize expressions - // such as A[i] += x; Because the address of A[i] is a read-write - // pointer. This only works if the index of A[i] is consecutive. - // If the address of i is unknown (for example A[B[i]]) then we may - // read a few words, modify, and write a few words, and some of the - // words may be written to the same address. - bool IsReadOnlyPtr = false; - if (Seen.insert(Ptr).second || - !isStridedPtr(SE, DL, Ptr, TheLoop, Strides)) { - ++NumReads; - IsReadOnlyPtr = true; - } - - AliasAnalysis::Location Loc = AA->getLocation(LD); - // The TBAA metadata could have a control dependency on the predication - // condition, so we cannot rely on it when determining whether or not we - // need runtime pointer checks. - if (blockNeedsPredication(LD->getParent())) - Loc.AATags.TBAA = nullptr; - - Accesses.addLoad(Loc, IsReadOnlyPtr); - } - - // If we write (or read-write) to a single destination and there are no - // other reads in this loop then is it safe to vectorize. - if (NumReadWrites == 1 && NumReads == 0) { - DEBUG(dbgs() << "LV: Found a write-only loop!\n"); - return true; - } - - // Build dependence sets and check whether we need a runtime pointer bounds - // check. - Accesses.buildDependenceSets(); - bool NeedRTCheck = Accesses.isRTCheckNeeded(); - - // Find pointers with computable bounds. We are going to use this information - // to place a runtime bound check. 
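[Editor's note] The Seen set above is what lets the analysis count each pointer once: the first sighting of a stored-to pointer puts it on the read-write list, and a loaded pointer only counts as a new read-only pointer if it was never stored to or if its stride is unknown (so gathers such as a[b[i]] += x are still checked). A small model of that bookkeeping, with string names standing in for pointer operands and a purely illustrative Access struct:

#include <cstdio>
#include <set>
#include <string>
#include <vector>

// Hypothetical flattened description of one memory access in the loop.
struct Access {
  std::string Ptr;
  bool IsStore;
  bool IsStrided; // would the stride check report a known stride?
};

int main() {
  // a[i] += b[c[i]] : a is read and written, b is read through a gather.
  std::vector<Access> Accesses = {
      {"a", /*IsStore=*/false, /*IsStrided=*/true},
      {"b", /*IsStore=*/false, /*IsStrided=*/false},
      {"a", /*IsStore=*/true, /*IsStrided=*/true},
  };

  std::set<std::string> Seen;
  unsigned NumReadWrites = 0, NumReads = 0;

  // Stores first: the first sighting of a pointer lands on the read-write
  // list, exactly like the Seen.insert(Ptr).second test above.
  for (const Access &A : Accesses)
    if (A.IsStore && Seen.insert(A.Ptr).second)
      ++NumReadWrites;

  // Loads afterwards: a pointer is a new read-only pointer if it was not
  // seen as a store, or if its stride is unknown.
  for (const Access &A : Accesses)
    if (!A.IsStore && (Seen.insert(A.Ptr).second || !A.IsStrided))
      ++NumReads;

  printf("read-write pointers: %u, read-only pointers: %u\n",
         NumReadWrites, NumReads); // prints 1 and 1 for this loop
  return 0;
}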
- unsigned NumComparisons = 0; - bool CanDoRT = false; - if (NeedRTCheck) - CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, TheLoop, - Strides); - - DEBUG(dbgs() << "LV: We need to do " << NumComparisons << - " pointer comparisons.\n"); - - // If we only have one set of dependences to check pointers among we don't - // need a runtime check. - if (NumComparisons == 0 && NeedRTCheck) - NeedRTCheck = false; - - // Check that we did not collect too many pointers or found an unsizeable - // pointer. - if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) { - PtrRtCheck.reset(); - CanDoRT = false; - } - - if (CanDoRT) { - DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n"); - } - - if (NeedRTCheck && !CanDoRT) { - emitAnalysis(Report() << "cannot identify array bounds"); - DEBUG(dbgs() << "LV: We can't vectorize because we can't find " << - "the array bounds.\n"); - PtrRtCheck.reset(); - return false; - } - - PtrRtCheck.Need = NeedRTCheck; - - bool CanVecMem = true; - if (Accesses.isDependencyCheckNeeded()) { - DEBUG(dbgs() << "LV: Checking memory dependencies\n"); - CanVecMem = DepChecker.areDepsSafe( - DependentAccesses, Accesses.getDependenciesToCheck(), Strides); - MaxSafeDepDistBytes = DepChecker.getMaxSafeDepDistBytes(); - - if (!CanVecMem && DepChecker.shouldRetryWithRuntimeCheck()) { - DEBUG(dbgs() << "LV: Retrying with memory checks\n"); - NeedRTCheck = true; - - // Clear the dependency checks. We assume they are not needed. - Accesses.resetDepChecks(); - - PtrRtCheck.reset(); - PtrRtCheck.Need = true; - - CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, - TheLoop, Strides, true); - // Check that we did not collect too many pointers or found an unsizeable - // pointer. - if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) { - if (!CanDoRT && NumComparisons > 0) - emitAnalysis(Report() - << "cannot check memory dependencies at runtime"); - else - emitAnalysis(Report() - << NumComparisons << " exceeds limit of " - << RuntimeMemoryCheckThreshold - << " dependent memory operations checked at runtime"); - DEBUG(dbgs() << "LV: Can't vectorize with memory checks\n"); - PtrRtCheck.reset(); - return false; - } - - CanVecMem = true; - } - } - - if (!CanVecMem) - emitAnalysis(Report() << "unsafe dependent memory operations in loop"); - - DEBUG(dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") << - " need a runtime memory check.\n"); - - return CanVecMem; + LAI = &LAA->getInfo(TheLoop, Strides); + auto &OptionalReport = LAI->getReport(); + if (OptionalReport) + emitAnalysis(VectorizationReport(*OptionalReport)); + return LAI->canVectorizeMemory(); } static bool hasMultipleUsesOf(Instruction *I, @@ -5236,7 +4129,8 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I, } LoopVectorizationLegality::InductionKind -LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { +LoopVectorizationLegality::isInductionVariable(PHINode *Phi, + ConstantInt *&StepValue) { Type *PhiTy = Phi->getType(); // We only handle integer and pointer inductions variables. if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy()) @@ -5249,22 +4143,19 @@ LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n"); return IK_NoInduction; } - const SCEV *Step = AR->getStepRecurrence(*SE); - - // Integer inductions need to have a stride of one. 
- if (PhiTy->isIntegerTy()) { - if (Step->isOne()) - return IK_IntInduction; - if (Step->isAllOnesValue()) - return IK_ReverseIntInduction; - return IK_NoInduction; - } + const SCEV *Step = AR->getStepRecurrence(*SE); // Calculate the pointer stride and check if it is consecutive. const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); if (!C) return IK_NoInduction; + ConstantInt *CV = C->getValue(); + if (PhiTy->isIntegerTy()) { + StepValue = CV; + return IK_IntInduction; + } + assert(PhiTy->isPointerTy() && "The PHI must be a pointer"); Type *PointerElementType = PhiTy->getPointerElementType(); // The pointer stride cannot be determined if the pointer element type is not @@ -5272,13 +4163,12 @@ LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { if (!PointerElementType->isSized()) return IK_NoInduction; - uint64_t Size = DL->getTypeAllocSize(PointerElementType); - if (C->getValue()->equalsInt(Size)) - return IK_PtrInduction; - else if (C->getValue()->equalsInt(0 - Size)) - return IK_ReversePtrInduction; - - return IK_NoInduction; + int64_t Size = static_cast<int64_t>(DL->getTypeAllocSize(PointerElementType)); + int64_t CVSize = CV->getSExtValue(); + if (CVSize % Size) + return IK_NoInduction; + StepValue = ConstantInt::getSigned(CV->getType(), CVSize / Size); + return IK_PtrInduction; } bool LoopVectorizationLegality::isInductionVariable(const Value *V) { @@ -5291,21 +4181,32 @@ bool LoopVectorizationLegality::isInductionVariable(const Value *V) { } bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { - assert(TheLoop->contains(BB) && "Unknown block used"); - - // Blocks that do not dominate the latch need predication. - BasicBlock* Latch = TheLoop->getLoopLatch(); - return !DT->dominates(BB, Latch); + return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT); } bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs) { + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + // Check that we don't have a constant expression that can trap as operand. + for (Instruction::op_iterator OI = it->op_begin(), OE = it->op_end(); + OI != OE; ++OI) { + if (Constant *C = dyn_cast<Constant>(*OI)) + if (C->canTrap()) + return false; + } // We might be able to hoist the load. if (it->mayReadFromMemory()) { LoadInst *LI = dyn_cast<LoadInst>(it); - if (!LI || !SafePtrs.count(LI->getPointerOperand())) + if (!LI) return false; + if (!SafePtrs.count(LI->getPointerOperand())) { + if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand())) { + MaskedOp.insert(LI); + continue; + } + return false; + } } // We don't predicate stores at the moment. @@ -5313,22 +4214,30 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB, StoreInst *SI = dyn_cast<StoreInst>(it); // We only support predication of stores in basic blocks with one // predecessor. - if (!SI || ++NumPredStores > NumberOfStoresToPredicate || - !SafePtrs.count(SI->getPointerOperand()) || - !SI->getParent()->getSinglePredecessor()) + if (!SI) + return false; + + bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0); + bool isSinglePredecessor = SI->getParent()->getSinglePredecessor(); + + if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr || + !isSinglePredecessor) { + // Build a masked store if it is legal for the target, otherwise scalarize + // the block. 
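[Editor's note] The rewritten isInductionVariable reports an explicit step through the new StepValue out-parameter; for pointer PHIs the byte step from SCEV is normalized to a whole number of elements, which also folds the old reverse and unit-stride pointer kinds into a single IK_PtrInduction. A sketch of that normalization, with plain integers standing in for ConstantInt and SCEVConstant:

#include <cassert>
#include <cstdint>

// Steps that are not a whole number of elements are rejected.
static bool normalizePointerStep(int64_t StepInBytes, int64_t ElementSize,
                                 int64_t &StepInElements) {
  if (ElementSize == 0 || StepInBytes % ElementSize != 0)
    return false; // not a pointer induction we can describe
  StepInElements = StepInBytes / ElementSize;
  return true;
}

int main() {
  int64_t Step = 0;

  // p += 1 over an i32 array: byte step 4, element step 1.
  assert(normalizePointerStep(4, 4, Step) && Step == 1);

  // p -= 2 over an i64 array: byte step -16, element step -2.
  assert(normalizePointerStep(-16, 8, Step) && Step == -2);

  // A 6-byte step over i32 elements is not a whole element stride.
  assert(!normalizePointerStep(6, 4, Step));
  return 0;
}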
+ bool isLegalMaskedOp = + isLegalMaskedStore(SI->getValueOperand()->getType(), + SI->getPointerOperand()); + if (isLegalMaskedOp) { + --NumPredStores; + MaskedOp.insert(SI); + continue; + } return false; + } } if (it->mayThrow()) return false; - // Check that we don't have a constant expression that can trap as operand. - for (Instruction::op_iterator OI = it->op_begin(), OE = it->op_end(); - OI != OE; ++OI) { - if (Constant *C = dyn_cast<Constant>(*OI)) - if (C->canTrap()) - return false; - } - // The instructions below can trap. switch (it->getOpcode()) { default: continue; @@ -5336,7 +4245,7 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB, case Instruction::SDiv: case Instruction::URem: case Instruction::SRem: - return false; + return false; } } @@ -5348,13 +4257,17 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { // Width 1 means no vectorize VectorizationFactor Factor = { 1U, 0U }; if (OptForSize && Legal->getRuntimePointerCheck()->Need) { - emitAnalysis(Report() << "runtime pointer checks needed. Enable vectorization of this loop with '#pragma clang loop vectorize(enable)' when compiling with -Os"); + emitAnalysis(VectorizationReport() << + "runtime pointer checks needed. Enable vectorization of this " + "loop with '#pragma clang loop vectorize(enable)' when " + "compiling with -Os"); DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n"); return Factor; } - if (!EnableCondStoresVectorization && Legal->NumPredStores) { - emitAnalysis(Report() << "store that is conditionally executed prevents vectorization"); + if (!EnableCondStoresVectorization && Legal->getNumPredStores()) { + emitAnalysis(VectorizationReport() << + "store that is conditionally executed prevents vectorization"); DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n"); return Factor; } @@ -5380,7 +4293,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { MaxVectorSize = 1; } - assert(MaxVectorSize <= 32 && "Did not expect to pack so many elements" + assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements" " into one vector!"); unsigned VF = MaxVectorSize; @@ -5389,7 +4302,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { if (OptForSize) { // If we are unable to calculate the trip count then don't try to vectorize. if (TC < 2) { - emitAnalysis(Report() << "unable to calculate the loop count due to complex control flow"); + emitAnalysis + (VectorizationReport() << + "unable to calculate the loop count due to complex control flow"); DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n"); return Factor; } @@ -5403,10 +4318,11 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { // If the trip count that we found modulo the vectorization factor is not // zero then we require a tail. if (VF < 2) { - emitAnalysis(Report() << "cannot optimize for size and vectorize at the " - "same time. Enable vectorization of this loop " - "with '#pragma clang loop vectorize(enable)' " - "when compiling with -Os"); + emitAnalysis(VectorizationReport() << + "cannot optimize for size and vectorize at the " + "same time. Enable vectorization of this loop " + "with '#pragma clang loop vectorize(enable)' " + "when compiling with -Os"); DEBUG(dbgs() << "LV: Aborting. 
A tail loop is required in Os.\n"); return Factor; } @@ -5619,8 +4535,10 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, // Unroll until store/load ports (estimated by max unroll factor) are // saturated. - unsigned StoresUF = UF / (Legal->NumStores ? Legal->NumStores : 1); - unsigned LoadsUF = UF / (Legal->NumLoads ? Legal->NumLoads : 1); + unsigned NumStores = Legal->getNumStores(); + unsigned NumLoads = Legal->getNumLoads(); + unsigned StoresUF = UF / (NumStores ? NumStores : 1); + unsigned LoadsUF = UF / (NumLoads ? NumLoads : 1); // If we have a scalar reduction (vector reductions are already dealt with // by this point), we can increase the critical path length if the loop @@ -6008,7 +4926,11 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { // Wide load/stores. unsigned Cost = TTI.getAddressComputationCost(VectorTy); - Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); + if (Legal->isMaskRequired(I)) + Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, + AS); + else + Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); if (Reverse) Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, @@ -6081,15 +5003,16 @@ Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) { char LoopVectorize::ID = 0; static const char lv_name[] = "Loop Vectorization"; INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) -INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) -INITIALIZE_PASS_DEPENDENCY(AssumptionTracker) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis) INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) namespace llvm { @@ -6186,7 +5109,7 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, ConstantInt::get(Cond[Part]->getType(), 1)); CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); LoopVectorBody.push_back(CondBlock); - VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase()); + VectorLp->addBasicBlockToLoop(CondBlock, *LI); // Update Builder with newly created basic block. Builder.SetInsertPoint(InsertPt); } @@ -6212,7 +5135,7 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, if (IfPredicateStore) { BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); LoopVectorBody.push_back(NewIfBlock); - VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase()); + VectorLp->addBasicBlockToLoop(NewIfBlock, *LI); Builder.SetInsertPoint(InsertPt); Instruction *OldBr = IfBlock->getTerminator(); BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); @@ -6237,11 +5160,10 @@ Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } -Value *InnerLoopUnroller::getConsecutiveVector(Value* Val, int StartIdx, - bool Negate) { +Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step) { // When unrolling and the VF is 1, we only need to add a simple scalar. 
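[Editor's note] Taken together, the predication changes above let a conditional load or store that cannot use the old scalarization path survive as a masked operation when the target supports one, and the cost model then charges the masked-memory-op cost for it. A rough model of the store-side decision; the booleans stand in for the SafePtrs lookup, the single-predecessor test, the store budget and the isLegalMaskedStore() target hook, no real API is called:

#include <cstdio>

enum class StoreDecision { Predicate, MaskedStore, GiveUp };

static StoreDecision classifyConditionalStore(bool PtrIsSafe,
                                              bool SinglePredecessor,
                                              bool UnderStoreBudget,
                                              bool TargetHasMaskedStore) {
  if (PtrIsSafe && SinglePredecessor && UnderStoreBudget)
    return StoreDecision::Predicate;   // the pre-existing path
  if (TargetHasMaskedStore)
    return StoreDecision::MaskedStore; // new: emit a masked store instead
  return StoreDecision::GiveUp;        // block cannot be predicated
}

int main() {
  printf("unsafe ptr, masked stores legal -> %d\n",
         (int)classifyConditionalStore(false, true, true, true));
  printf("unsafe ptr, no masked stores    -> %d\n",
         (int)classifyConditionalStore(false, true, true, false));
  return 0;
}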
Type *ITy = Val->getType(); assert(!ITy->isVectorTy() && "Val must be a scalar"); - Constant *C = ConstantInt::get(ITy, StartIdx, Negate); - return Builder.CreateAdd(Val, C, "induction"); + Constant *C = ConstantInt::get(ITy, StartIdx); + return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); } diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 44bfea1..baf9741 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -19,9 +19,10 @@ #include "llvm/ADT/MapVector.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/AssumptionTracker.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolution.h" @@ -74,6 +75,27 @@ static const unsigned MinVecRegSize = 128; static const unsigned RecursionMaxDepth = 12; +// Limit the number of alias checks. The limit is chosen so that +// it has no negative effect on the llvm benchmarks. +static const unsigned AliasedCheckLimit = 10; + +// Another limit for the alias checks: The maximum distance between load/store +// instructions where alias checks are done. +// This limit is useful for very large basic blocks. +static const unsigned MaxMemDepDistance = 160; + +/// \brief Predicate for the element types that the SLP vectorizer supports. +/// +/// The most important thing to filter here are types which are invalid in LLVM +/// vectors. We also filter target specific types which have absolutely no +/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just +/// avoids spending time checking the cost model and realizing that they will +/// be inevitably scalarized. +static bool isValidElementType(Type *Ty) { + return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() && + !Ty->isPPC_FP128Ty(); +} + /// \returns the parent basic block if all of the instructions in \p VL /// are in the same block or null otherwise. static BasicBlock *getSameBlock(ArrayRef<Value *> VL) { @@ -207,6 +229,8 @@ static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) { MD = MDNode::getMostGenericTBAA(MD, IMD); break; case LLVMContext::MD_alias_scope: + MD = MDNode::getMostGenericAliasScope(MD, IMD); + break; case LLVMContext::MD_noalias: MD = MDNode::intersect(MD, IMD); break; @@ -263,104 +287,6 @@ static bool CanReuseExtract(ArrayRef<Value *> VL) { return true; } -static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, - SmallVectorImpl<Value *> &Left, - SmallVectorImpl<Value *> &Right) { - - SmallVector<Value *, 16> OrigLeft, OrigRight; - - bool AllSameOpcodeLeft = true; - bool AllSameOpcodeRight = true; - for (unsigned i = 0, e = VL.size(); i != e; ++i) { - Instruction *I = cast<Instruction>(VL[i]); - Value *V0 = I->getOperand(0); - Value *V1 = I->getOperand(1); - - OrigLeft.push_back(V0); - OrigRight.push_back(V1); - - Instruction *I0 = dyn_cast<Instruction>(V0); - Instruction *I1 = dyn_cast<Instruction>(V1); - - // Check whether all operands on one side have the same opcode. In this case - // we want to preserve the original order and not make things worse by - // reordering. 
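[Editor's note] For the scalar (VF == 1) path of getStepVector shown above, the IR that gets built is simply an add of Val and StartIdx * Step. A tiny arithmetic model, using plain integers instead of IR values:

#include <cassert>

static long long scalarStep(long long Val, long long StartIdx, long long Step) {
  return Val + StartIdx * Step;
}

int main() {
  // Unrolled by 4 with a step of 2: parts 0..3 of the same induction
  // start at 10, 12, 14 and 16.
  for (long long Part = 0; Part < 4; ++Part)
    assert(scalarStep(10, Part, 2) == 10 + 2 * Part);
  return 0;
}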
- AllSameOpcodeLeft = I0; - AllSameOpcodeRight = I1; - - if (i && AllSameOpcodeLeft) { - if(Instruction *P0 = dyn_cast<Instruction>(OrigLeft[i-1])) { - if(P0->getOpcode() != I0->getOpcode()) - AllSameOpcodeLeft = false; - } else - AllSameOpcodeLeft = false; - } - if (i && AllSameOpcodeRight) { - if(Instruction *P1 = dyn_cast<Instruction>(OrigRight[i-1])) { - if(P1->getOpcode() != I1->getOpcode()) - AllSameOpcodeRight = false; - } else - AllSameOpcodeRight = false; - } - - // Sort two opcodes. In the code below we try to preserve the ability to use - // broadcast of values instead of individual inserts. - // vl1 = load - // vl2 = phi - // vr1 = load - // vr2 = vr2 - // = vl1 x vr1 - // = vl2 x vr2 - // If we just sorted according to opcode we would leave the first line in - // tact but we would swap vl2 with vr2 because opcode(phi) > opcode(load). - // = vl1 x vr1 - // = vr2 x vl2 - // Because vr2 and vr1 are from the same load we loose the opportunity of a - // broadcast for the packed right side in the backend: we have [vr1, vl2] - // instead of [vr1, vr2=vr1]. - if (I0 && I1) { - if(!i && I0->getOpcode() > I1->getOpcode()) { - Left.push_back(I1); - Right.push_back(I0); - } else if (i && I0->getOpcode() > I1->getOpcode() && Right[i-1] != I1) { - // Try not to destroy a broad cast for no apparent benefit. - Left.push_back(I1); - Right.push_back(I0); - } else if (i && I0->getOpcode() == I1->getOpcode() && Right[i-1] == I0) { - // Try preserve broadcasts. - Left.push_back(I1); - Right.push_back(I0); - } else if (i && I0->getOpcode() == I1->getOpcode() && Left[i-1] == I1) { - // Try preserve broadcasts. - Left.push_back(I1); - Right.push_back(I0); - } else { - Left.push_back(I0); - Right.push_back(I1); - } - continue; - } - // One opcode, put the instruction on the right. - if (I0) { - Left.push_back(V1); - Right.push_back(I0); - continue; - } - Left.push_back(V0); - Right.push_back(V1); - } - - bool LeftBroadcast = isSplat(Left); - bool RightBroadcast = isSplat(Right); - - // Don't reorder if the operands where good to begin with. - if (!(LeftBroadcast || RightBroadcast) && - (AllSameOpcodeRight || AllSameOpcodeLeft)) { - Left = OrigLeft; - Right = OrigRight; - } -} - /// \returns True if in-tree use also needs extract. This refers to /// possible scalar operand in vectorized instruction. static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, @@ -388,6 +314,26 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, } } +/// \returns the AA location that is being access by the instruction. +static AliasAnalysis::Location getLocation(Instruction *I, AliasAnalysis *AA) { + if (StoreInst *SI = dyn_cast<StoreInst>(I)) + return AA->getLocation(SI); + if (LoadInst *LI = dyn_cast<LoadInst>(I)) + return AA->getLocation(LI); + return AliasAnalysis::Location(); +} + +/// \returns True if the instruction is not a volatile or atomic load/store. +static bool isSimple(Instruction *I) { + if (LoadInst *LI = dyn_cast<LoadInst>(I)) + return LI->isSimple(); + if (StoreInst *SI = dyn_cast<StoreInst>(I)) + return SI->isSimple(); + if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) + return !MI->isVolatile(); + return true; +} + /// Bottom Up SLP Vectorizer. 
class BoUpSLP { public: @@ -398,11 +344,11 @@ public: BoUpSLP(Function *Func, ScalarEvolution *Se, const DataLayout *Dl, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa, - LoopInfo *Li, DominatorTree *Dt, AssumptionTracker *AT) - : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), - F(Func), SE(Se), DL(Dl), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), + LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC) + : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func), + SE(Se), DL(Dl), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), Builder(Se->getContext()) { - CodeMetrics::collectEphemeralValues(F, AT, EphValues); + CodeMetrics::collectEphemeralValues(F, AC, EphValues); } /// \brief Vectorize the tree that starts with the elements in \p VL. @@ -494,6 +440,16 @@ private: /// be beneficial even the tree height is tiny. bool isFullyVectorizableTinyTree(); + /// \reorder commutative operands in alt shuffle if they result in + /// vectorized code. + void reorderAltShuffleOperands(ArrayRef<Value *> VL, + SmallVectorImpl<Value *> &Left, + SmallVectorImpl<Value *> &Right); + /// \reorder commutative operands to get better probability of + /// generating vectorized code. + void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, + SmallVectorImpl<Value *> &Left, + SmallVectorImpl<Value *> &Right); struct TreeEntry { TreeEntry() : Scalars(), VectorizedValue(nullptr), NeedToGather(0) {} @@ -555,6 +511,52 @@ private: }; typedef SmallVector<ExternalUser, 16> UserList; + /// Checks if two instructions may access the same memory. + /// + /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it + /// is invariant in the calling loop. + bool isAliased(const AliasAnalysis::Location &Loc1, Instruction *Inst1, + Instruction *Inst2) { + + // First check if the result is already in the cache. + AliasCacheKey key = std::make_pair(Inst1, Inst2); + Optional<bool> &result = AliasCache[key]; + if (result.hasValue()) { + return result.getValue(); + } + AliasAnalysis::Location Loc2 = getLocation(Inst2, AA); + bool aliased = true; + if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) { + // Do the alias check. + aliased = AA->alias(Loc1, Loc2); + } + // Store the result in the cache. + result = aliased; + return aliased; + } + + typedef std::pair<Instruction *, Instruction *> AliasCacheKey; + + /// Cache for alias results. + /// TODO: consider moving this to the AliasAnalysis itself. + DenseMap<AliasCacheKey, Optional<bool>> AliasCache; + + /// Removes an instruction from its block and eventually deletes it. + /// It's like Instruction::eraseFromParent() except that the actual deletion + /// is delayed until BoUpSLP is destructed. + /// This is required to ensure that there are no incorrect collisions in the + /// AliasCache, which can happen if a new instruction is allocated at the + /// same address as a previously deleted instruction. + void eraseInstruction(Instruction *I) { + I->removeFromParent(); + I->dropAllReferences(); + DeletedInstructions.push_back(std::unique_ptr<Instruction>(I)); + } + + /// Temporary store for deleted instructions. Instructions will be deleted + /// eventually when the BoUpSLP is destructed. + SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions; + /// A list of values that need to extracted out of the tree. /// This list holds pairs of (Internal Scalar : External User). UserList ExternalUses; @@ -791,7 +793,7 @@ private: /// Checks if a bundle of instructions can be scheduled, i.e. 
has no /// cyclic dependencies. This is only a dry-run, no instructions are /// actually moved at this stage. - bool tryScheduleBundle(ArrayRef<Value *> VL, AliasAnalysis *AA); + bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP); /// Un-bundles a group of instructions. void cancelScheduling(ArrayRef<Value *> VL); @@ -808,7 +810,7 @@ private: /// Updates the dependency information of a bundle and of all instructions/ /// bundles which depend on the original bundle. void calculateDependencies(ScheduleData *SD, bool InsertInReadyList, - AliasAnalysis *AA); + BoUpSLP *SLP); /// Sets all instruction in the scheduling region to un-scheduled. void resetSchedule(); @@ -857,7 +859,7 @@ private: }; /// Attaches the BlockScheduling structures to basic blocks. - DenseMap<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules; + MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules; /// Performs the "real" scheduling. Done before vectorization is actually /// performed in a basic block. @@ -1031,11 +1033,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { } } - // If any of the scalars appears in the table OR it is marked as a value that - // needs to stat scalar then we need to gather the scalars. + // If any of the scalars is marked as a value that needs to stay scalar then + // we need to gather the scalars. for (unsigned i = 0, e = VL.size(); i != e; ++i) { - if (ScalarToTreeEntry.count(VL[i]) || MustGather.count(VL[i])) { - DEBUG(dbgs() << "SLP: Gathering due to gathered scalar. \n"); + if (MustGather.count(VL[i])) { + DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); newTreeEntry(VL, false); return; } @@ -1069,7 +1071,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { } BlockScheduling &BS = *BSRef.get(); - if (!BS.tryScheduleBundle(VL, AA)) { + if (!BS.tryScheduleBundle(VL, this)) { DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); BS.cancelScheduling(VL); newTreeEntry(VL, false); @@ -1158,7 +1160,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { Type *SrcTy = VL0->getOperand(0)->getType(); for (unsigned i = 0; i < VL.size(); ++i) { Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType(); - if (Ty != SrcTy || Ty->isAggregateType() || Ty->isVectorTy()) { + if (Ty != SrcTy || !isValidElementType(Ty)) { BS.cancelScheduling(VL); newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); @@ -1381,6 +1383,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { } newTreeEntry(VL, true); DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); + + // Reorder operands if reordering would enable vectorization. + if (isa<BinaryOperator>(VL0)) { + ValueList Left, Right; + reorderAltShuffleOperands(VL, Left, Right); + buildTree_rec(Left, Depth + 1); + buildTree_rec(Right, Depth + 1); + return; + } + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. @@ -1704,7 +1716,7 @@ int BoUpSLP::getTreeCost() { // We only vectorize tiny trees if it is fully vectorizable. 
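[Editor's note] The new alias cache above memoizes each instruction-pair query as an Optional<bool>, so "never asked" stays distinct from "asked, no alias"; the delayed deletion in eraseInstruction exists precisely so a recycled instruction address cannot collide with a stale cache key. A minimal model of that memoization, using std::map and std::optional as stand-ins for DenseMap and llvm::Optional; expensiveAliasQuery is a placeholder, not the real AliasAnalysis call:

#include <cstdio>
#include <map>
#include <optional>
#include <utility>

struct Instr {}; // stand-in for llvm::Instruction

static bool expensiveAliasQuery(const Instr *, const Instr *) {
  puts("  (running the expensive alias query)");
  return true; // conservatively assume aliasing in this stand-in
}

using AliasCacheKey = std::pair<const Instr *, const Instr *>;

static bool isAliasedCached(std::map<AliasCacheKey, std::optional<bool>> &Cache,
                            const Instr *A, const Instr *B) {
  std::optional<bool> &Slot = Cache[{A, B}]; // creates an empty slot if new
  if (Slot.has_value())
    return *Slot;                            // cache hit: no second query
  bool Aliased = expensiveAliasQuery(A, B);
  Slot = Aliased;                            // remember the verdict
  return Aliased;
}

int main() {
  std::map<AliasCacheKey, std::optional<bool>> Cache;
  Instr S, L;
  printf("first query:  %d\n", isAliasedCached(Cache, &S, &L));
  printf("second query: %d\n", isAliasedCached(Cache, &S, &L)); // cached
  return 0;
}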
if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) { - if (!VectorizableTree.size()) { + if (VectorizableTree.empty()) { assert(!ExternalUses.size() && "We should not have any external users"); } return INT_MAX; @@ -1818,6 +1830,195 @@ bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) { return X == PtrSCEVB; } +// Reorder commutative operations in alternate shuffle if the resulting vectors +// are consecutive loads. This would allow us to vectorize the tree. +// If we have something like- +// load a[0] - load b[0] +// load b[1] + load a[1] +// load a[2] - load b[2] +// load a[3] + load b[3] +// Reordering the second load b[1] load a[1] would allow us to vectorize this +// code. +void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL, + SmallVectorImpl<Value *> &Left, + SmallVectorImpl<Value *> &Right) { + + // Push left and right operands of binary operation into Left and Right + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + Left.push_back(cast<Instruction>(VL[i])->getOperand(0)); + Right.push_back(cast<Instruction>(VL[i])->getOperand(1)); + } + + // Reorder if we have a commutative operation and consecutive access + // are on either side of the alternate instructions. + for (unsigned j = 0; j < VL.size() - 1; ++j) { + if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) { + if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) { + Instruction *VL1 = cast<Instruction>(VL[j]); + Instruction *VL2 = cast<Instruction>(VL[j + 1]); + if (isConsecutiveAccess(L, L1) && VL1->isCommutative()) { + std::swap(Left[j], Right[j]); + continue; + } else if (isConsecutiveAccess(L, L1) && VL2->isCommutative()) { + std::swap(Left[j + 1], Right[j + 1]); + continue; + } + // else unchanged + } + } + if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) { + if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) { + Instruction *VL1 = cast<Instruction>(VL[j]); + Instruction *VL2 = cast<Instruction>(VL[j + 1]); + if (isConsecutiveAccess(L, L1) && VL1->isCommutative()) { + std::swap(Left[j], Right[j]); + continue; + } else if (isConsecutiveAccess(L, L1) && VL2->isCommutative()) { + std::swap(Left[j + 1], Right[j + 1]); + continue; + } + // else unchanged + } + } + } +} + +void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, + SmallVectorImpl<Value *> &Left, + SmallVectorImpl<Value *> &Right) { + + SmallVector<Value *, 16> OrigLeft, OrigRight; + + bool AllSameOpcodeLeft = true; + bool AllSameOpcodeRight = true; + for (unsigned i = 0, e = VL.size(); i != e; ++i) { + Instruction *I = cast<Instruction>(VL[i]); + Value *VLeft = I->getOperand(0); + Value *VRight = I->getOperand(1); + + OrigLeft.push_back(VLeft); + OrigRight.push_back(VRight); + + Instruction *ILeft = dyn_cast<Instruction>(VLeft); + Instruction *IRight = dyn_cast<Instruction>(VRight); + + // Check whether all operands on one side have the same opcode. In this case + // we want to preserve the original order and not make things worse by + // reordering. + if (i && AllSameOpcodeLeft && ILeft) { + if (Instruction *PLeft = dyn_cast<Instruction>(OrigLeft[i - 1])) { + if (PLeft->getOpcode() != ILeft->getOpcode()) + AllSameOpcodeLeft = false; + } else + AllSameOpcodeLeft = false; + } + if (i && AllSameOpcodeRight && IRight) { + if (Instruction *PRight = dyn_cast<Instruction>(OrigRight[i - 1])) { + if (PRight->getOpcode() != IRight->getOpcode()) + AllSameOpcodeRight = false; + } else + AllSameOpcodeRight = false; + } + + // Sort two opcodes. 
In the code below we try to preserve the ability to use + // broadcast of values instead of individual inserts. + // vl1 = load + // vl2 = phi + // vr1 = load + // vr2 = vr1 + // = vl1 x vr1 + // = vl2 x vr2 + // If we just sorted according to opcode we would leave the first line + // intact but we would swap vl2 with vr2 because opcode(phi) > opcode(load). + // = vl1 x vr1 + // = vr2 x vl2 + // Because vr2 and vr1 are from the same load we lose the opportunity of a + // broadcast for the packed right side in the backend: we have [vr1, vl2] + // instead of [vr1, vr2=vr1]. + if (ILeft && IRight) { + if (!i && ILeft->getOpcode() > IRight->getOpcode()) { + Left.push_back(IRight); + Right.push_back(ILeft); + } else if (i && ILeft->getOpcode() > IRight->getOpcode() && + Right[i - 1] != IRight) { + // Try not to destroy a broadcast for no apparent benefit. + Left.push_back(IRight); + Right.push_back(ILeft); + } else if (i && ILeft->getOpcode() == IRight->getOpcode() && + Right[i - 1] == ILeft) { + // Try to preserve broadcasts. + Left.push_back(IRight); + Right.push_back(ILeft); + } else if (i && ILeft->getOpcode() == IRight->getOpcode() && + Left[i - 1] == IRight) { + // Try to preserve broadcasts. + Left.push_back(IRight); + Right.push_back(ILeft); + } else { + Left.push_back(ILeft); + Right.push_back(IRight); + } + continue; + } + // One opcode, put the instruction on the right. + if (ILeft) { + Left.push_back(VRight); + Right.push_back(ILeft); + continue; + } + Left.push_back(VLeft); + Right.push_back(VRight); + } + + bool LeftBroadcast = isSplat(Left); + bool RightBroadcast = isSplat(Right); + + // If operands end up being broadcast, return this operand order. + if (LeftBroadcast || RightBroadcast) + return; + + // Don't reorder if the operands were good to begin with. + if (AllSameOpcodeRight || AllSameOpcodeLeft) { + Left = OrigLeft; + Right = OrigRight; + } + + // Finally, check if we can get a longer vectorizable chain by reordering + // without breaking the good operand order detected above. + // E.g., if we have something like: + // load a[0] load b[0] + // load b[1] load a[1] + // load a[2] load b[2] + // load a[3] load b[3] + // Reordering the second pair, load b[1] and load a[1], would allow us to vectorize + // this code and still retain the AllSameOpcode property. 
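The broadcast bookkeeping above exists because an operand vector whose lanes are all the same value can be materialized with a single broadcast rather than one insert per lane, so the operand sort tries not to separate such lanes. A small standalone sketch in the spirit of the isSplat() check used above; the names are invented and this is not LLVM code.

// True if every lane of the packed operand is the same scalar, i.e. the
// backend could use a broadcast instead of individual insertelement ops.
#include <cstdio>
#include <vector>

static bool isSplatSketch(const std::vector<const void *> &Lanes) {
  for (const void *V : Lanes)
    if (V != Lanes[0])
      return false;
  return !Lanes.empty();
}

int main() {
  int vr1 = 0, vl2 = 0;
  std::vector<const void *> Packed = {&vr1, &vr1};  // [vr1, vr2=vr1] -> splat
  std::vector<const void *> Mixed  = {&vr1, &vl2};  // [vr1, vl2]     -> gather
  std::printf("packed splat: %d, mixed splat: %d\n",
              isSplatSketch(Packed), isSplatSketch(Mixed));
}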
+ // FIXME: This load reordering might break AllSameOpcode in some rare cases + // such as- + // add a[0],c[0] load b[0] + // add a[1],c[2] load b[1] + // b[2] load b[2] + // add a[3],c[3] load b[3] + for (unsigned j = 0; j < VL.size() - 1; ++j) { + if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) { + if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) { + if (isConsecutiveAccess(L, L1)) { + std::swap(Left[j + 1], Right[j + 1]); + continue; + } + } + } + if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) { + if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) { + if (isConsecutiveAccess(L, L1)) { + std::swap(Left[j + 1], Right[j + 1]); + continue; + } + } + } + // else unchanged + } +} + void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) { Instruction *VL0 = cast<Instruction>(VL[0]); BasicBlock::iterator NextInst = VL0; @@ -2214,10 +2415,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } case Instruction::ShuffleVector: { ValueList LHSVL, RHSVL; - for (int i = 0, e = E->Scalars.size(); i < e; ++i) { - LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); - RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); - } + assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand"); + reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL); setInsertPointAfterBundle(E->Scalars); Value *LHS = vectorizeTree(LHSVL); @@ -2360,7 +2559,7 @@ Value *BoUpSLP::vectorizeTree() { Scalar->replaceAllUsesWith(Undef); } DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n"); - cast<Instruction>(Scalar)->eraseFromParent(); + eraseInstruction(cast<Instruction>(Scalar)); } } @@ -2442,7 +2641,7 @@ void BoUpSLP::optimizeGatherSequence() { if (In->isIdenticalTo(*v) && DT->dominates((*v)->getParent(), In->getParent())) { In->replaceAllUsesWith(*v); - In->eraseFromParent(); + eraseInstruction(In); In = nullptr; break; } @@ -2460,7 +2659,7 @@ void BoUpSLP::optimizeGatherSequence() { // Groups the instructions to a bundle (which is then a single scheduling entity) // and schedules instructions until the bundle gets ready. bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, - AliasAnalysis *AA) { + BoUpSLP *SLP) { if (isa<PHINode>(VL[0])) return true; @@ -2517,7 +2716,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block " << BB->getName() << "\n"); - calculateDependencies(Bundle, true, AA); + calculateDependencies(Bundle, true, SLP); // Now try to schedule the new bundle. As soon as the bundle is "ready" it // means that there are no cyclic dependencies and we can schedule it. @@ -2648,18 +2847,9 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, } } -/// \returns the AA location that is being access by the instruction. -static AliasAnalysis::Location getLocation(Instruction *I, AliasAnalysis *AA) { - if (StoreInst *SI = dyn_cast<StoreInst>(I)) - return AA->getLocation(SI); - if (LoadInst *LI = dyn_cast<LoadInst>(I)) - return AA->getLocation(LI); - return AliasAnalysis::Location(); -} - void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, bool InsertInReadyList, - AliasAnalysis *AA) { + BoUpSLP *SLP) { assert(SD->isSchedulingEntity()); SmallVector<ScheduleData *, 10> WorkList; @@ -2704,26 +2894,60 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, // Handle the memory dependencies. 
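The hunk that follows bounds the cost of this memory-dependency walk with two throttles: a cap on expensive alias queries (AliasedCheckLimit) and a cap on how far down the load/store chain the scan goes (MaxMemDepDistance). Here is a plain C++ model of that throttling with invented names and placeholder constants rather than the pass's real values.

// Model of the dependency throttling: alias queries are the expensive part,
// so once enough pairs have aliased we record conservative dependencies
// without asking again, and we stop scanning after a fixed distance.
#include <cstdio>
#include <functional>
#include <vector>

struct MemInstr { int Id; bool MayWrite; };

static const unsigned AliasedCheckLimitSketch = 10;  // placeholder values,
static const unsigned MaxMemDepDistanceSketch = 160; // not the real constants

std::vector<int> collectMemoryDeps(
    const MemInstr &Src, const std::vector<MemInstr> &Later,
    const std::function<bool(const MemInstr &, const MemInstr &)> &IsAliased) {
  std::vector<int> Deps;
  unsigned NumAliased = 0, DistToSrc = 1;
  for (const MemInstr &Dst : Later) {
    if (DistToSrc >= MaxMemDepDistanceSketch)
      break;                                  // give up on very large blocks
    if (Src.MayWrite || Dst.MayWrite) {
      // After enough genuinely aliased pairs, skip precise queries and record
      // a conservative dependency instead.
      if (NumAliased >= AliasedCheckLimitSketch || IsAliased(Src, Dst)) {
        ++NumAliased;
        Deps.push_back(Dst.Id);
      }
    }
    ++DistToSrc;
  }
  return Deps;
}

int main() {
  MemInstr Src{0, /*MayWrite=*/true};
  std::vector<MemInstr> Later;
  for (int i = 1; i <= 20; ++i)
    Later.push_back({i, i % 2 == 0});
  auto Deps = collectMemoryDeps(Src, Later,
      [](const MemInstr &, const MemInstr &) { return true; });  // worst case
  std::printf("dependencies recorded: %zu\n", Deps.size());
}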
ScheduleData *DepDest = BundleMember->NextLoadStore; if (DepDest) { - AliasAnalysis::Location SrcLoc = getLocation(BundleMember->Inst, AA); + Instruction *SrcInst = BundleMember->Inst; + AliasAnalysis::Location SrcLoc = getLocation(SrcInst, SLP->AA); bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory(); + unsigned numAliased = 0; + unsigned DistToSrc = 1; while (DepDest) { assert(isInSchedulingRegion(DepDest)); - if (SrcMayWrite || DepDest->Inst->mayWriteToMemory()) { - AliasAnalysis::Location DstLoc = getLocation(DepDest->Inst, AA); - if (!SrcLoc.Ptr || !DstLoc.Ptr || AA->alias(SrcLoc, DstLoc)) { - DepDest->MemoryDependencies.push_back(BundleMember); - BundleMember->Dependencies++; - ScheduleData *DestBundle = DepDest->FirstInBundle; - if (!DestBundle->IsScheduled) { - BundleMember->incrementUnscheduledDeps(1); - } - if (!DestBundle->hasValidDependencies()) { - WorkList.push_back(DestBundle); - } + + // We have two limits to reduce the complexity: + // 1) AliasedCheckLimit: It's a small limit to reduce calls to + // SLP->isAliased (which is the expensive part in this loop). + // 2) MaxMemDepDistance: It's for very large blocks and it aborts + // the whole loop (even if the loop is fast, it's quadratic). + // It's important for the loop break condition (see below) to + // check this limit even between two read-only instructions. + if (DistToSrc >= MaxMemDepDistance || + ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) && + (numAliased >= AliasedCheckLimit || + SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) { + + // We increment the counter only if the locations are aliased + // (instead of counting all alias checks). This gives a better + // balance between reduced runtime and accurate dependencies. + numAliased++; + + DepDest->MemoryDependencies.push_back(BundleMember); + BundleMember->Dependencies++; + ScheduleData *DestBundle = DepDest->FirstInBundle; + if (!DestBundle->IsScheduled) { + BundleMember->incrementUnscheduledDeps(1); + } + if (!DestBundle->hasValidDependencies()) { + WorkList.push_back(DestBundle); } } DepDest = DepDest->NextLoadStore; + + // Example, explaining the loop break condition: Let's assume our + // starting instruction is i0 and MaxMemDepDistance = 3. + // + // +--------v--v--v + // i0,i1,i2,i3,i4,i5,i6,i7,i8 + // +--------^--^--^ + // + // MaxMemDepDistance let us stop alias-checking at i3 and we add + // dependencies from i0 to i3,i4,.. (even if they are not aliased). + // Previously we already added dependencies from i3 to i6,i7,i8 + // (because of MaxMemDepDistance). As we added a dependency from + // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8 + // and we can abort this loop at i6. + if (DistToSrc >= 2 * MaxMemDepDistance) + break; + DistToSrc++; } } } @@ -2779,7 +3003,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { "scheduler and vectorizer have different opinion on what is a bundle"); SD->FirstInBundle->SchedulingPriority = Idx++; if (SD->isSchedulingEntity()) { - BS->calculateDependencies(SD, false, AA); + BS->calculateDependencies(SD, false, this); NumToSchedule++; } } @@ -2833,7 +3057,7 @@ struct SLPVectorizer : public FunctionPass { AliasAnalysis *AA; LoopInfo *LI; DominatorTree *DT; - AssumptionTracker *AT; + AssumptionCache *AC; bool runOnFunction(Function &F) override { if (skipOptnoneFunction(F)) @@ -2842,12 +3066,13 @@ struct SLPVectorizer : public FunctionPass { SE = &getAnalysis<ScalarEvolution>(); DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); DL = DLP ? 
&DLP->getDataLayout() : nullptr; - TTI = &getAnalysis<TargetTransformInfo>(); - TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); + TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); + TLI = TLIP ? &TLIP->getTLI() : nullptr; AA = &getAnalysis<AliasAnalysis>(); - LI = &getAnalysis<LoopInfo>(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - AT = &getAnalysis<AssumptionTracker>(); + AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); StoreRefs.clear(); bool Changed = false; @@ -2870,7 +3095,10 @@ struct SLPVectorizer : public FunctionPass { // Use the bottom up slp vectorizer to construct chains that start with // store instructions. - BoUpSLP R(&F, SE, DL, TTI, TLI, AA, LI, DT, AT); + BoUpSLP R(&F, SE, DL, TTI, TLI, AA, LI, DT, AC); + + // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to + // delete instructions. // Scan the blocks in the function in post order. for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()), @@ -2897,13 +3125,13 @@ struct SLPVectorizer : public FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { FunctionPass::getAnalysisUsage(AU); - AU.addRequired<AssumptionTracker>(); + AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<ScalarEvolution>(); AU.addRequired<AliasAnalysis>(); - AU.addRequired<TargetTransformInfo>(); - AU.addRequired<LoopInfo>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); - AU.addPreserved<LoopInfo>(); + AU.addPreserved<LoopInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); AU.setPreservesCFG(); } @@ -3078,7 +3306,7 @@ unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) { // Check that the pointer points to scalars. Type *Ty = SI->getValueOperand()->getType(); - if (Ty->isAggregateType() || Ty->isVectorTy()) + if (!isValidElementType(Ty)) continue; // Find the base pointer. @@ -3119,7 +3347,7 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, for (int i = 0, e = VL.size(); i < e; ++i) { Type *Ty = VL[i]->getType(); - if (Ty->isAggregateType() || Ty->isVectorTy()) + if (!isValidElementType(Ty)) return false; Instruction *Inst = dyn_cast<Instruction>(VL[i]); if (!Inst || Inst->getOpcode() != Opcode0) @@ -3339,7 +3567,7 @@ public: return false; Type *Ty = B->getType(); - if (Ty->isVectorTy()) + if (!isValidElementType(Ty)) return false; ReductionOpcode = B->getOpcode(); @@ -3502,11 +3730,10 @@ private: /// \brief Emit a horizontal reduction of the vectorized value. Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) { assert(VectorizedValue && "Need to have a vectorized tree node"); - Instruction *ValToReduce = dyn_cast<Instruction>(VectorizedValue); assert(isPowerOf2_32(ReduxWidth) && "We only handle power-of-two reductions for now"); - Value *TmpVec = ValToReduce; + Value *TmpVec = VectorizedValue; for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) { if (IsPairwiseReduction) { Value *LeftMask = @@ -3730,6 +3957,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // and the iterator may become invalid value. 
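Regarding the added note that instructions must be deleted through BoUpSLP::eraseInstruction(): the helper's body is not in the hunks quoted here, but the usual motivation for such a hook is that the scheduler and other tables keep raw Instruction pointers, so deletions need to be recorded or deferred rather than freeing the object immediately. A generic standalone sketch of that bookkeeping pattern follows; the types and names are invented and this is not the actual BoUpSLP implementation.

// Instead of deleting immediately, eraseInstruction() parks the pointer in a
// "deleted" set so stale table entries can be detected until pass teardown.
#include <cstdio>
#include <memory>
#include <unordered_set>
#include <vector>

struct Instr { int Id; };

class PassState {
  std::vector<std::unique_ptr<Instr>> Owned;
  std::unordered_set<const Instr *> Deleted;
public:
  Instr *create(int Id) {
    Owned.push_back(std::make_unique<Instr>(Instr{Id}));
    return Owned.back().get();
  }
  void eraseInstruction(Instr *I) { Deleted.insert(I); }  // defer, don't free
  bool isDeleted(const Instr *I) const { return Deleted.count(I) != 0; }
};

int main() {
  PassState S;
  Instr *A = S.create(1);
  Instr *B = S.create(2);
  S.eraseInstruction(A);
  // A lingering table entry can still be checked safely:
  std::printf("A deleted: %d, B deleted: %d\n", S.isDeleted(A), S.isDeleted(B));
}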
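Several hunks in this file replace the ad-hoc Ty->isAggregateType() || Ty->isVectorTy() tests with a single !isValidElementType(Ty); the helper's own definition is not part of the hunks quoted here. A plausible reconstruction is sketched below, reusing LLVM's existing element-type rule; it is explicitly a stand-in, not the verbatim helper.

// Hypothetical stand-in for the isValidElementType() referenced in the diff:
// accept only scalars that could legally be a vector element, which rejects
// aggregates and vectors in one place instead of repeating the old checks.
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
#include <cstdio>

static bool isValidElementTypeSketch(llvm::Type *Ty) {
  return llvm::VectorType::isValidElementType(Ty);
}

int main() {
  llvm::LLVMContext Ctx;
  llvm::Type *I32 = llvm::Type::getInt32Ty(Ctx);
  llvm::Type *Arr = llvm::ArrayType::get(I32, 4);   // aggregate type
  std::printf("i32: %d, [4 x i32]: %d\n",
              isValidElementTypeSketch(I32), isValidElementTypeSketch(Arr));
}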
it = BB->begin(); e = BB->end(); + break; } } } @@ -3786,8 +4014,8 @@ char SLPVectorizer::ID = 0; static const char lv_name[] = "SLP Vectorizer"; INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) -INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) -INITIALIZE_PASS_DEPENDENCY(AssumptionTracker) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false) diff --git a/lib/Transforms/Vectorize/Vectorize.cpp b/lib/Transforms/Vectorize/Vectorize.cpp index d459bcf..6e002fd 100644 --- a/lib/Transforms/Vectorize/Vectorize.cpp +++ b/lib/Transforms/Vectorize/Vectorize.cpp @@ -19,7 +19,7 @@ #include "llvm/Analysis/Passes.h" #include "llvm/IR/Verifier.h" #include "llvm/InitializePasses.h" -#include "llvm/PassManager.h" +#include "llvm/IR/LegacyPassManager.h" using namespace llvm;
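On the one-line break added to vectorizeChainsInBlock() above: once a chain has been vectorized the basic block has been rewritten, so the code refreshes the iterators it and e, and the break makes sure no further work is done with state computed from the old iteration. A tiny standalone illustration of that restart-the-scan pattern follows; the workload is invented and this is not LLVM code.

// Once the scan mutates the container it is walking, the safe move is to
// abandon the current pass over it and restart with fresh iterators.
#include <cstdio>
#include <list>

int main() {
  std::list<int> Block = {1, 2, 3, 4, 5, 6};
  bool Changed = true;
  while (Changed) {
    Changed = false;
    for (auto it = Block.begin(), e = Block.end(); it != e; ++it) {
      if (*it % 2 == 0) {      // "vectorize": here, just drop even entries
        Block.erase(it);       // mutation invalidates this iterator
        Changed = true;
        break;                 // restart the scan with fresh iterators
      }
    }
  }
  for (int V : Block)
    std::printf("%d ", V);     // prints: 1 3 5
  std::printf("\n");
}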