author | Stephen Hines <srhines@google.com> | 2014-02-11 20:01:10 -0800
committer | Stephen Hines <srhines@google.com> | 2014-02-11 20:01:10 -0800
commit | ce9904c6ea8fd669978a8eefb854b330eb9828ff
tree | 2418ee2e96ea220977c8fb74959192036ab5b133 /lib/Transforms
parent | c27b10b198c1d9e9b51f2303994313ec2778edd7
parent | dbb832b83351cec97b025b61c26536ef50c3181c
Merge remote-tracking branch 'upstream/release_34' into merge-20140211
Conflicts:
lib/Linker/LinkModules.cpp
lib/Support/Unix/Signals.inc
Change-Id: Ia54f291fa5dc828052d2412736e8495c1282aa64
Diffstat (limited to 'lib/Transforms')
85 files changed, 7933 insertions, 4577 deletions
diff --git a/lib/Transforms/Hello/Hello.cpp b/lib/Transforms/Hello/Hello.cpp index 9f2343b..9251783 100644 --- a/lib/Transforms/Hello/Hello.cpp +++ b/lib/Transforms/Hello/Hello.cpp @@ -52,7 +52,7 @@ namespace { return false; } - // We don't modify the program, so we preserve all analyses + // We don't modify the program, so we preserve all analyses. virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); } diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp index c42d506..df08091 100644 --- a/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -88,7 +88,7 @@ char ArgPromotion::ID = 0; INITIALIZE_PASS_BEGIN(ArgPromotion, "argpromotion", "Promote 'by reference' arguments to scalars", false, false) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) -INITIALIZE_AG_DEPENDENCY(CallGraph) +INITIALIZE_PASS_DEPENDENCY(CallGraph) INITIALIZE_PASS_END(ArgPromotion, "argpromotion", "Promote 'by reference' arguments to scalars", false, false) @@ -504,7 +504,9 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, // OriginalLoads - Keep track of a representative load instruction from the // original function so that we can tell the alias analysis implementation // what the new GEP/Load instructions we are inserting look like. - std::map<IndicesVector, LoadInst*> OriginalLoads; + // We need to keep the original loads for each argument and the elements + // of the argument that are accessed. + std::map<std::pair<Argument*, IndicesVector>, LoadInst*> OriginalLoads; // Attribute - Keep track of the parameter attributes for the arguments // that we are *not* promoting. For the ones that we do promote, the parameter @@ -569,7 +571,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, else // Take any load, we will use it only to update Alias Analysis OrigLoad = cast<LoadInst>(User->use_back()); - OriginalLoads[Indices] = OrigLoad; + OriginalLoads[std::make_pair(I, Indices)] = OrigLoad; } // Add a parameter to the function for each element passed in. @@ -676,7 +678,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, for (ScalarizeTable::iterator SI = ArgIndices.begin(), E = ArgIndices.end(); SI != E; ++SI) { Value *V = *AI; - LoadInst *OrigLoad = OriginalLoads[*SI]; + LoadInst *OrigLoad = OriginalLoads[std::make_pair(I, *SI)]; if (!SI->empty()) { Ops.reserve(SI->size()); Type *ElTy = V->getType(); diff --git a/lib/Transforms/IPO/ConstantMerge.cpp b/lib/Transforms/IPO/ConstantMerge.cpp index a7bf188..d94c0f4 100644 --- a/lib/Transforms/IPO/ConstantMerge.cpp +++ b/lib/Transforms/IPO/ConstantMerge.cpp @@ -93,9 +93,12 @@ bool ConstantMerge::hasKnownAlignment(GlobalVariable *GV) const { } unsigned ConstantMerge::getAlignment(GlobalVariable *GV) const { + unsigned Align = GV->getAlignment(); + if (Align) + return Align; if (TD) return TD->getPreferredAlignment(GV); - return GV->getAlignment(); + return 0; } bool ConstantMerge::runOnModule(Module &M) { @@ -210,9 +213,9 @@ bool ConstantMerge::runOnModule(Module &M) { // Bump the alignment if necessary. if (Replacements[i].first->getAlignment() || Replacements[i].second->getAlignment()) { - Replacements[i].second->setAlignment(std::max( - Replacements[i].first->getAlignment(), - Replacements[i].second->getAlignment())); + Replacements[i].second->setAlignment( + std::max(getAlignment(Replacements[i].first), + getAlignment(Replacements[i].second))); } // Eliminate any uses of the dead global. 
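Note on the ConstantMerge hunk above: getAlignment now gives an explicit alignment on the global priority over the DataLayout's preferred alignment, and runOnModule then takes the max over both merge candidates. A minimal sketch of that priority order, with hypothetical stand-ins for the GlobalVariable and DataLayout queries:

```cpp
#include <algorithm>
#include <cassert>

// Hypothetical stand-ins for GlobalVariable::getAlignment() and
// DataLayout::getPreferredAlignment().
struct Global {
  unsigned ExplicitAlign;  // 0 means "no explicit alignment"
  unsigned PreferredAlign; // what the target's DataLayout would choose
};

// Mirrors the patched ConstantMerge::getAlignment: an explicit alignment
// always wins; otherwise fall back to the preferred alignment if we have
// target data, else 0.
static unsigned getAlignment(const Global &GV, bool HaveTargetData) {
  if (GV.ExplicitAlign)
    return GV.ExplicitAlign;
  return HaveTargetData ? GV.PreferredAlign : 0;
}

int main() {
  Global Dead{8, 16}, Kept{0, 4};
  // As in the patched runOnModule hunk: the surviving global gets the
  // max of both candidates' effective alignments.
  unsigned Merged =
      std::max(getAlignment(Dead, true), getAlignment(Kept, true));
  assert(Merged == 8);
}
```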
diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp index 6ee6162..911c14e 100644 --- a/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -357,6 +357,19 @@ bool DAE::RemoveDeadArgumentsFromCallers(Function &Fn) if (Fn.hasLocalLinkage() && !Fn.getFunctionType()->isVarArg()) return false; + // If a function seen at compile time is not necessarily the one linked to + // the binary being built, it is illegal to change the actual arguments + // passed to it. These functions can be captured by isWeakForLinker(). + // *NOTE* that mayBeOverridden() is insufficient for this purpose as it + // doesn't include linkage types like AvailableExternallyLinkage and + // LinkOnceODRLinkage. Take link_odr* as an example, it indicates a set of + // *EQUIVALENT* globals that can be merged at link-time. However, the + // semantic of *EQUIVALENT*-functions includes parameters. Changing + // parameters breaks this assumption. + // + if (Fn.isWeakForLinker()) + return false; + if (Fn.use_empty()) return false; diff --git a/lib/Transforms/IPO/ExtractGV.cpp b/lib/Transforms/IPO/ExtractGV.cpp index fa3d72d..50fb3e6 100644 --- a/lib/Transforms/IPO/ExtractGV.cpp +++ b/lib/Transforms/IPO/ExtractGV.cpp @@ -21,6 +21,38 @@ #include <algorithm> using namespace llvm; +/// Make sure GV is visible from both modules. Delete is true if it is +/// being deleted from this module. +/// This also makes sure GV cannot be dropped so that references from +/// the split module remain valid. +static void makeVisible(GlobalValue &GV, bool Delete) { + bool Local = GV.hasLocalLinkage(); + if (Local) + GV.setVisibility(GlobalValue::HiddenVisibility); + + if (Local || Delete) { + GV.setLinkage(GlobalValue::ExternalLinkage); + return; + } + + if (!GV.hasLinkOnceLinkage()) { + assert(!GV.isDiscardableIfUnused()); + return; + } + + // Map linkonce* to weak* so that llvm doesn't drop this GV. + switch(GV.getLinkage()) { + default: + llvm_unreachable("Unexpected linkage"); + case GlobalValue::LinkOnceAnyLinkage: + GV.setLinkage(GlobalValue::WeakAnyLinkage); + return; + case GlobalValue::LinkOnceODRLinkage: + GV.setLinkage(GlobalValue::WeakODRLinkage); + return; + } +} + namespace { /// @brief A pass to extract specific functions and their dependencies. 
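The DeadArgumentElimination hunk above adds the isWeakForLinker() bail-out because a weak-for-linker function (linkonce_odr and friends) may be replaced at link time by an "equivalent" definition from another translation unit, and that equivalence is defined over the original parameter list. A condensed sketch of the early-out order after this patch (LLVM headers assumed; not a verbatim excerpt):

```cpp
#include "llvm/IR/Function.h"

// Condensed from RemoveDeadArgumentsFromCallers' early-outs after this
// patch: local non-vararg functions are handled by a different path,
// weak-for-linker functions must keep their signature, and unused
// functions have no callers to rewrite.
static bool canRewriteCallers(const llvm::Function &Fn) {
  if (Fn.hasLocalLinkage() && !Fn.getFunctionType()->isVarArg())
    return false;
  if (Fn.isWeakForLinker()) // link-time replaceable: signature is ABI
    return false;
  return !Fn.use_empty();
}
```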
class GVExtractorPass : public ModulePass { @@ -60,12 +92,7 @@ namespace { continue; } - bool Local = I->isDiscardableIfUnused(); - if (Local) - I->setVisibility(GlobalValue::HiddenVisibility); - - if (Local || Delete) - I->setLinkage(GlobalValue::ExternalLinkage); + makeVisible(*I, Delete); if (Delete) I->setInitializer(0); @@ -80,12 +107,7 @@ namespace { continue; } - bool Local = I->isDiscardableIfUnused(); - if (Local) - I->setVisibility(GlobalValue::HiddenVisibility); - - if (Local || Delete) - I->setLinkage(GlobalValue::ExternalLinkage); + makeVisible(*I, Delete); if (Delete) I->deleteBody(); @@ -97,12 +119,10 @@ namespace { Module::alias_iterator CurI = I; ++I; - if (CurI->isDiscardableIfUnused()) { - CurI->setVisibility(GlobalValue::HiddenVisibility); - CurI->setLinkage(GlobalValue::ExternalLinkage); - } + bool Delete = deleteStuff == (bool)Named.count(CurI); + makeVisible(*CurI, Delete); - if (deleteStuff == (bool)Named.count(CurI)) { + if (Delete) { Type *Ty = CurI->getType()->getElementType(); CurI->removeFromParent(); diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp index 1366883..60e5f06 100644 --- a/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/lib/Transforms/IPO/FunctionAttrs.cpp @@ -136,7 +136,8 @@ namespace { char FunctionAttrs::ID = 0; INITIALIZE_PASS_BEGIN(FunctionAttrs, "functionattrs", "Deduce function attributes", false, false) -INITIALIZE_AG_DEPENDENCY(CallGraph) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(CallGraph) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) INITIALIZE_PASS_END(FunctionAttrs, "functionattrs", "Deduce function attributes", false, false) @@ -366,6 +367,7 @@ namespace { } } assert(Found && "Capturing call-site captured nothing?"); + (void)Found; return false; } diff --git a/lib/Transforms/IPO/GlobalDCE.cpp b/lib/Transforms/IPO/GlobalDCE.cpp index 201f320..901295d 100644 --- a/lib/Transforms/IPO/GlobalDCE.cpp +++ b/lib/Transforms/IPO/GlobalDCE.cpp @@ -179,6 +179,9 @@ void GlobalDCE::GlobalIsNeeded(GlobalValue *G) { // any globals used will be marked as needed. 
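The `(void)Found;` added in the FunctionAttrs hunk is the standard idiom for a variable that exists only to be asserted on: in NDEBUG builds the assert expands to nothing and the variable would otherwise draw an unused-variable warning. A standalone illustration:

```cpp
#include <cassert>

static bool captureSearch() { return true; } // stand-in for the real loop

void example() {
  bool Found = captureSearch();
  assert(Found && "Capturing call-site captured nothing?");
  (void)Found; // still "used" when NDEBUG compiles the assert away
}

int main() { example(); }
```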
Function *F = cast<Function>(G); + if (F->hasPrefixData()) + MarkUsedGlobalsAsNeeded(F->getPrefixData()); + for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) for (User::op_iterator U = I->op_begin(), E = I->op_end(); U != E; ++U) diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp index 64cd515..2ea89a1 100644 --- a/lib/Transforms/IPO/GlobalOpt.cpp +++ b/lib/Transforms/IPO/GlobalOpt.cpp @@ -37,7 +37,9 @@ #include "llvm/Support/GetElementPtrTypeIterator.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/ValueHandle.h" #include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/GlobalStatus.h" #include "llvm/Transforms/Utils/ModuleUtils.h" #include <algorithm> using namespace llvm; @@ -60,7 +62,6 @@ STATISTIC(NumAliasesRemoved, "Number of global aliases eliminated"); STATISTIC(NumCXXDtorsRemoved, "Number of global C++ destructors removed"); namespace { - struct GlobalStatus; struct GlobalOpt : public ModulePass { virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<TargetLibraryInfo>(); @@ -80,7 +81,6 @@ namespace { bool OptimizeGlobalCtorsList(GlobalVariable *&GCL); bool ProcessGlobal(GlobalVariable *GV,Module::global_iterator &GVI); bool ProcessInternalGlobal(GlobalVariable *GV,Module::global_iterator &GVI, - const SmallPtrSet<const PHINode*, 16> &PHIUsers, const GlobalStatus &GS); bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn); @@ -98,209 +98,6 @@ INITIALIZE_PASS_END(GlobalOpt, "globalopt", ModulePass *llvm::createGlobalOptimizerPass() { return new GlobalOpt(); } -namespace { - -/// GlobalStatus - As we analyze each global, keep track of some information -/// about it. If we find out that the address of the global is taken, none of -/// this info will be accurate. -struct GlobalStatus { - /// isCompared - True if the global's address is used in a comparison. - bool isCompared; - - /// isLoaded - True if the global is ever loaded. If the global isn't ever - /// loaded it can be deleted. - bool isLoaded; - - /// StoredType - Keep track of what stores to the global look like. - /// - enum StoredType { - /// NotStored - There is no store to this global. It can thus be marked - /// constant. - NotStored, - - /// isInitializerStored - This global is stored to, but the only thing - /// stored is the constant it was initialized with. This is only tracked - /// for scalar globals. - isInitializerStored, - - /// isStoredOnce - This global is stored to, but only its initializer and - /// one other value is ever stored to it. If this global isStoredOnce, we - /// track the value stored to it in StoredOnceValue below. This is only - /// tracked for scalar globals. - isStoredOnce, - - /// isStored - This global is stored to by multiple values or something else - /// that we cannot track. - isStored - } StoredType; - - /// StoredOnceValue - If only one value (besides the initializer constant) is - /// ever stored to this global, keep track of what value it is. - Value *StoredOnceValue; - - /// AccessingFunction/HasMultipleAccessingFunctions - These start out - /// null/false. When the first accessing function is noticed, it is recorded. - /// When a second different accessing function is noticed, - /// HasMultipleAccessingFunctions is set to true. 
- const Function *AccessingFunction; - bool HasMultipleAccessingFunctions; - - /// HasNonInstructionUser - Set to true if this global has a user that is not - /// an instruction (e.g. a constant expr or GV initializer). - bool HasNonInstructionUser; - - /// AtomicOrdering - Set to the strongest atomic ordering requirement. - AtomicOrdering Ordering; - - GlobalStatus() : isCompared(false), isLoaded(false), StoredType(NotStored), - StoredOnceValue(0), AccessingFunction(0), - HasMultipleAccessingFunctions(false), - HasNonInstructionUser(false), Ordering(NotAtomic) {} -}; - -} - -/// StrongerOrdering - Return the stronger of the two ordering. If the two -/// orderings are acquire and release, then return AcquireRelease. -/// -static AtomicOrdering StrongerOrdering(AtomicOrdering X, AtomicOrdering Y) { - if (X == Acquire && Y == Release) return AcquireRelease; - if (Y == Acquire && X == Release) return AcquireRelease; - return (AtomicOrdering)std::max(X, Y); -} - -/// SafeToDestroyConstant - It is safe to destroy a constant iff it is only used -/// by constants itself. Note that constants cannot be cyclic, so this test is -/// pretty easy to implement recursively. -/// -static bool SafeToDestroyConstant(const Constant *C) { - if (isa<GlobalValue>(C)) return false; - - for (Value::const_use_iterator UI = C->use_begin(), E = C->use_end(); UI != E; - ++UI) - if (const Constant *CU = dyn_cast<Constant>(*UI)) { - if (!SafeToDestroyConstant(CU)) return false; - } else - return false; - return true; -} - - -/// AnalyzeGlobal - Look at all uses of the global and fill in the GlobalStatus -/// structure. If the global has its address taken, return true to indicate we -/// can't do anything with it. -/// -static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS, - SmallPtrSet<const PHINode*, 16> &PHIUsers) { - for (Value::const_use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; - ++UI) { - const User *U = *UI; - if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) { - GS.HasNonInstructionUser = true; - - // If the result of the constantexpr isn't pointer type, then we won't - // know to expect it in various places. Just reject early. - if (!isa<PointerType>(CE->getType())) return true; - - if (AnalyzeGlobal(CE, GS, PHIUsers)) return true; - } else if (const Instruction *I = dyn_cast<Instruction>(U)) { - if (!GS.HasMultipleAccessingFunctions) { - const Function *F = I->getParent()->getParent(); - if (GS.AccessingFunction == 0) - GS.AccessingFunction = F; - else if (GS.AccessingFunction != F) - GS.HasMultipleAccessingFunctions = true; - } - if (const LoadInst *LI = dyn_cast<LoadInst>(I)) { - GS.isLoaded = true; - // Don't hack on volatile loads. - if (LI->isVolatile()) return true; - GS.Ordering = StrongerOrdering(GS.Ordering, LI->getOrdering()); - } else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) { - // Don't allow a store OF the address, only stores TO the address. - if (SI->getOperand(0) == V) return true; - - // Don't hack on volatile stores. - if (SI->isVolatile()) return true; - - GS.Ordering = StrongerOrdering(GS.Ordering, SI->getOrdering()); - - // If this is a direct store to the global (i.e., the global is a scalar - // value, not an aggregate), keep more specific information about - // stores. 
- if (GS.StoredType != GlobalStatus::isStored) { - if (const GlobalVariable *GV = dyn_cast<GlobalVariable>( - SI->getOperand(1))) { - Value *StoredVal = SI->getOperand(0); - - if (Constant *C = dyn_cast<Constant>(StoredVal)) { - if (C->isThreadDependent()) { - // The stored value changes between threads; don't track it. - return true; - } - } - - if (StoredVal == GV->getInitializer()) { - if (GS.StoredType < GlobalStatus::isInitializerStored) - GS.StoredType = GlobalStatus::isInitializerStored; - } else if (isa<LoadInst>(StoredVal) && - cast<LoadInst>(StoredVal)->getOperand(0) == GV) { - if (GS.StoredType < GlobalStatus::isInitializerStored) - GS.StoredType = GlobalStatus::isInitializerStored; - } else if (GS.StoredType < GlobalStatus::isStoredOnce) { - GS.StoredType = GlobalStatus::isStoredOnce; - GS.StoredOnceValue = StoredVal; - } else if (GS.StoredType == GlobalStatus::isStoredOnce && - GS.StoredOnceValue == StoredVal) { - // noop. - } else { - GS.StoredType = GlobalStatus::isStored; - } - } else { - GS.StoredType = GlobalStatus::isStored; - } - } - } else if (isa<BitCastInst>(I)) { - if (AnalyzeGlobal(I, GS, PHIUsers)) return true; - } else if (isa<GetElementPtrInst>(I)) { - if (AnalyzeGlobal(I, GS, PHIUsers)) return true; - } else if (isa<SelectInst>(I)) { - if (AnalyzeGlobal(I, GS, PHIUsers)) return true; - } else if (const PHINode *PN = dyn_cast<PHINode>(I)) { - // PHI nodes we can check just like select or GEP instructions, but we - // have to be careful about infinite recursion. - if (PHIUsers.insert(PN)) // Not already visited. - if (AnalyzeGlobal(I, GS, PHIUsers)) return true; - } else if (isa<CmpInst>(I)) { - GS.isCompared = true; - } else if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(I)) { - if (MTI->isVolatile()) return true; - if (MTI->getArgOperand(0) == V) - GS.StoredType = GlobalStatus::isStored; - if (MTI->getArgOperand(1) == V) - GS.isLoaded = true; - } else if (const MemSetInst *MSI = dyn_cast<MemSetInst>(I)) { - assert(MSI->getArgOperand(0) == V && "Memset only takes one pointer!"); - if (MSI->isVolatile()) return true; - GS.StoredType = GlobalStatus::isStored; - } else { - return true; // Any other non-load instruction might take address! - } - } else if (const Constant *C = dyn_cast<Constant>(U)) { - GS.HasNonInstructionUser = true; - // We might have a dead and dangling constant hanging off of here. - if (!SafeToDestroyConstant(C)) - return true; - } else { - GS.HasNonInstructionUser = true; - // Otherwise must be some other user. - return true; - } - } - - return false; -} - /// isLeakCheckerRoot - Is this global variable possibly used by a leak checker /// as a root? If so, we might not really want to eliminate the stores to it. static bool isLeakCheckerRoot(GlobalVariable *GV) { @@ -434,7 +231,7 @@ static bool CleanupPointerRootUsers(GlobalVariable *GV, Changed = true; } } else if (Constant *C = dyn_cast<Constant>(U)) { - if (SafeToDestroyConstant(C)) { + if (isSafeToDestroyConstant(C)) { C->destroyConstant(); // This could have invalidated UI, start over from scratch. Dead.clear(); @@ -471,9 +268,17 @@ static bool CleanupPointerRootUsers(GlobalVariable *GV, static bool CleanupConstantGlobalUsers(Value *V, Constant *Init, DataLayout *TD, TargetLibraryInfo *TLI) { bool Changed = false; - SmallVector<User*, 8> WorkList(V->use_begin(), V->use_end()); + // Note that we need to use a weak value handle for the worklist items. 
When + // we delete a constant array, we may also be holding pointer to one of its + // elements (or an element of one of its elements if we're dealing with an + // array of arrays) in the worklist. + SmallVector<WeakVH, 8> WorkList(V->use_begin(), V->use_end()); while (!WorkList.empty()) { - User *U = WorkList.pop_back_val(); + Value *UV = WorkList.pop_back_val(); + if (!UV) + continue; + + User *U = cast<User>(UV); if (LoadInst *LI = dyn_cast<LoadInst>(U)) { if (Init) { @@ -534,7 +339,7 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init, } else if (Constant *C = dyn_cast<Constant>(U)) { // If we have a chain of dead constantexprs or other things dangling from // us, and if they are all dead, nuke them without remorse. - if (SafeToDestroyConstant(C)) { + if (isSafeToDestroyConstant(C)) { C->destroyConstant(); CleanupConstantGlobalUsers(V, Init, TD, TLI); return true; @@ -549,7 +354,7 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init, static bool isSafeSROAElementUse(Value *V) { // We might have a dead and dangling constant hanging off of here. if (Constant *C = dyn_cast<Constant>(V)) - return SafeToDestroyConstant(C); + return isSafeToDestroyConstant(C); Instruction *I = dyn_cast<Instruction>(V); if (!I) return false; @@ -1373,8 +1178,7 @@ static Value *GetHeapSROAValue(Value *V, unsigned FieldNo, } else if (PHINode *PN = dyn_cast<PHINode>(V)) { // PN's type is pointer to struct. Make a new PHI of pointer to struct // field. - StructType *ST = - cast<StructType>(cast<PointerType>(PN->getType())->getElementType()); + StructType *ST = cast<StructType>(PN->getType()->getPointerElementType()); PHINode *NewPN = PHINode::Create(PointerType::getUnqual(ST->getElementType(FieldNo)), @@ -1505,7 +1309,7 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI, unsigned TypeSize = TD->getTypeAllocSize(FieldTy); if (StructType *ST = dyn_cast<StructType>(FieldTy)) TypeSize = TD->getStructLayout(ST)->getSizeInBytes(); - Type *IntPtrTy = TD->getIntPtrType(CI->getContext()); + Type *IntPtrTy = TD->getIntPtrType(CI->getType()); Value *NMI = CallInst::CreateMalloc(CI, IntPtrTy, FieldTy, ConstantInt::get(IntPtrTy, TypeSize), NElems, 0, @@ -1735,7 +1539,7 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, // If this is a fixed size array, transform the Malloc to be an alloc of // structs. 
malloc [100 x struct],1 -> malloc struct, 100 if (ArrayType *AT = dyn_cast<ArrayType>(getMallocAllocatedType(CI, TLI))) { - Type *IntPtrTy = TD->getIntPtrType(CI->getContext()); + Type *IntPtrTy = TD->getIntPtrType(CI->getType()); unsigned TypeSize = TD->getStructLayout(AllocSTy)->getSizeInBytes(); Value *AllocSize = ConstantInt::get(IntPtrTy, TypeSize); Value *NumElements = ConstantInt::get(IntPtrTy, AT->getNumElements()); @@ -1917,13 +1721,12 @@ bool GlobalOpt::ProcessGlobal(GlobalVariable *GV, if (!GV->hasLocalLinkage()) return false; - SmallPtrSet<const PHINode*, 16> PHIUsers; GlobalStatus GS; - if (AnalyzeGlobal(GV, GS, PHIUsers)) + if (GlobalStatus::analyzeGlobal(GV, GS)) return false; - if (!GS.isCompared && !GV->hasUnnamedAddr()) { + if (!GS.IsCompared && !GV->hasUnnamedAddr()) { GV->setUnnamedAddr(true); NumUnnamed++; } @@ -1931,14 +1734,13 @@ bool GlobalOpt::ProcessGlobal(GlobalVariable *GV, if (GV->isConstant() || !GV->hasInitializer()) return false; - return ProcessInternalGlobal(GV, GVI, PHIUsers, GS); + return ProcessInternalGlobal(GV, GVI, GS); } /// ProcessInternalGlobal - Analyze the specified global variable and optimize /// it if possible. If we make a change, return true. bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, Module::global_iterator &GVI, - const SmallPtrSet<const PHINode*, 16> &PHIUsers, const GlobalStatus &GS) { // If this is a first class global and has only one accessing function // and this function is main (which we know is not recursive), we replace @@ -1971,7 +1773,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, // If the global is never loaded (but may be stored to), it is dead. // Delete it now. - if (!GS.isLoaded) { + if (!GS.IsLoaded) { DEBUG(dbgs() << "GLOBAL NEVER LOADED: " << *GV); bool Changed; @@ -1992,7 +1794,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, } return Changed; - } else if (GS.StoredType <= GlobalStatus::isInitializerStored) { + } else if (GS.StoredType <= GlobalStatus::InitializerStored) { DEBUG(dbgs() << "MARKING CONSTANT: " << *GV << "\n"); GV->setConstant(true); @@ -2015,7 +1817,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, GVI = FirstNewGV; // Don't skip the newly produced globals! return true; } - } else if (GS.StoredType == GlobalStatus::isStoredOnce) { + } else if (GS.StoredType == GlobalStatus::StoredOnce) { // If the initial value for the global was an undef value, and if only // one other value was stored into it, we can just change the // initializer to be the stored value, then delete all stores to the @@ -2048,11 +1850,14 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, // Otherwise, if the global was not a boolean, we can shrink it to be a // boolean. - if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue)) - if (TryToShrinkGlobalToBoolean(GV, SOVConstant)) { - ++NumShrunkToBool; - return true; + if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue)) { + if (GS.Ordering == NotAtomic) { + if (TryToShrinkGlobalToBoolean(GV, SOVConstant)) { + ++NumShrunkToBool; + return true; + } } + } } return false; @@ -2210,8 +2015,7 @@ static GlobalVariable *InstallGlobalCtors(GlobalVariable *GCL, CSVals[1] = 0; StructType *StructTy = - cast <StructType>( - cast<ArrayType>(GCL->getType()->getElementType())->getElementType()); + cast<StructType>(GCL->getType()->getElementType()->getArrayElementType()); // Create the new init list. 
std::vector<Constant*> CAList; @@ -3041,14 +2845,8 @@ bool GlobalOpt::OptimizeGlobalCtorsList(GlobalVariable *&GCL) { return true; } -static int compareNames(const void *A, const void *B) { - const GlobalValue *VA = *reinterpret_cast<GlobalValue* const*>(A); - const GlobalValue *VB = *reinterpret_cast<GlobalValue* const*>(B); - if (VA->getName() < VB->getName()) - return -1; - if (VB->getName() < VA->getName()) - return 1; - return 0; +static int compareNames(Constant *const *A, Constant *const *B) { + return (*A)->getName().compare((*B)->getName()); } static void setUsedInitializer(GlobalVariable &V, diff --git a/lib/Transforms/IPO/InlineAlways.cpp b/lib/Transforms/IPO/InlineAlways.cpp index a0095da..437597e 100644 --- a/lib/Transforms/IPO/InlineAlways.cpp +++ b/lib/Transforms/IPO/InlineAlways.cpp @@ -63,7 +63,7 @@ public: char AlwaysInliner::ID = 0; INITIALIZE_PASS_BEGIN(AlwaysInliner, "always-inline", "Inliner for always_inline functions", false, false) -INITIALIZE_AG_DEPENDENCY(CallGraph) +INITIALIZE_PASS_DEPENDENCY(CallGraph) INITIALIZE_PASS_DEPENDENCY(InlineCostAnalysis) INITIALIZE_PASS_END(AlwaysInliner, "always-inline", "Inliner for always_inline functions", false, false) diff --git a/lib/Transforms/IPO/InlineSimple.cpp b/lib/Transforms/IPO/InlineSimple.cpp index a4f7026..57379a3 100644 --- a/lib/Transforms/IPO/InlineSimple.cpp +++ b/lib/Transforms/IPO/InlineSimple.cpp @@ -28,7 +28,7 @@ using namespace llvm; namespace { -/// \brief Actaul inliner pass implementation. +/// \brief Actual inliner pass implementation. /// /// The common implementation of the inlining logic is shared between this /// inliner pass and the always inliner pass. The two passes use different cost @@ -61,7 +61,7 @@ public: char SimpleInliner::ID = 0; INITIALIZE_PASS_BEGIN(SimpleInliner, "inline", "Function Integration/Inlining", false, false) -INITIALIZE_AG_DEPENDENCY(CallGraph) +INITIALIZE_PASS_DEPENDENCY(CallGraph) INITIALIZE_PASS_DEPENDENCY(InlineCostAnalysis) INITIALIZE_PASS_END(SimpleInliner, "inline", "Function Integration/Inlining", false, false) diff --git a/lib/Transforms/IPO/Internalize.cpp b/lib/Transforms/IPO/Internalize.cpp index d56a06f..64e2ced 100644 --- a/lib/Transforms/IPO/Internalize.cpp +++ b/lib/Transforms/IPO/Internalize.cpp @@ -11,6 +11,12 @@ // If the function or variable is not in the list of external names given to // the pass it is marked as internal. // +// This transformation would not be legal in a regular compilation, but it gets +// extra information from the linker about what is safe. +// +// For example: Internalizing a function with external linkage. Only if we are +// told it is only used from within this module, it is safe to do it. 
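The compareNames rewrite in the GlobalOpt hunk above works because array_pod_sort takes a typed comparator of shape `int(const T *, const T *)`; giving compareNames the real `Constant *const *` parameter type drops the reinterpret_casts and delegates the ordering to StringRef::compare. A hedged usage sketch (the setUsedInitializer call site is paraphrased, not quoted):

```cpp
#include "llvm/ADT/STLExtras.h" // array_pod_sort
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"

using namespace llvm;

static int compareNames(Constant *const *A, Constant *const *B) {
  return (*A)->getName().compare((*B)->getName());
}

// Paraphrase of sorting the llvm.used members by name: the typed
// comparator matches array_pod_sort's expected signature directly.
static void sortByName(SmallVectorImpl<Constant *> &UsedArray) {
  array_pod_sort(UsedArray.begin(), UsedArray.end(), compareNames);
}
```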
+// //===----------------------------------------------------------------------===// #define DEBUG_TYPE "internalize" @@ -23,6 +29,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/GlobalStatus.h" #include "llvm/Transforms/Utils/ModuleUtils.h" #include <fstream> #include <set> @@ -50,10 +57,8 @@ namespace { public: static char ID; // Pass identification, replacement for typeid explicit InternalizePass(); - explicit InternalizePass(ArrayRef<const char *> exportList); + explicit InternalizePass(ArrayRef<const char *> ExportList); void LoadFile(const char *Filename); - void ClearExportList(); - void AddToExportList(const std::string &val); virtual bool runOnModule(Module &M); virtual void getAnalysisUsage(AnalysisUsage &AU) const { @@ -72,15 +77,14 @@ InternalizePass::InternalizePass() initializeInternalizePassPass(*PassRegistry::getPassRegistry()); if (!APIFile.empty()) // If a filename is specified, use it. LoadFile(APIFile.c_str()); - if (!APIList.empty()) // If a list is specified, use it as well. - ExternalNames.insert(APIList.begin(), APIList.end()); + ExternalNames.insert(APIList.begin(), APIList.end()); } -InternalizePass::InternalizePass(ArrayRef<const char *> exportList) +InternalizePass::InternalizePass(ArrayRef<const char *> ExportList) : ModulePass(ID){ initializeInternalizePassPass(*PassRegistry::getPassRegistry()); - for(ArrayRef<const char *>::const_iterator itr = exportList.begin(); - itr != exportList.end(); itr++) { + for(ArrayRef<const char *>::const_iterator itr = ExportList.begin(); + itr != ExportList.end(); itr++) { ExternalNames.insert(*itr); } } @@ -101,12 +105,25 @@ void InternalizePass::LoadFile(const char *Filename) { } } -void InternalizePass::ClearExportList() { - ExternalNames.clear(); -} +static bool shouldInternalize(const GlobalValue &GV, + const std::set<std::string> &ExternalNames) { + // Function must be defined here + if (GV.isDeclaration()) + return false; + + // Available externally is really just a "declaration with a body". + if (GV.hasAvailableExternallyLinkage()) + return false; -void InternalizePass::AddToExportList(const std::string &val) { - ExternalNames.insert(val); + // Already has internal linkage + if (GV.hasLocalLinkage()) + return false; + + // Marked to keep external? + if (ExternalNames.count(GV.getName())) + return false; + + return true; } bool InternalizePass::runOnModule(Module &M) { @@ -114,11 +131,6 @@ bool InternalizePass::runOnModule(Module &M) { CallGraphNode *ExternalNode = CG ? CG->getExternalCallingNode() : 0; bool Changed = false; - // Never internalize functions which code-gen might insert. - // FIXME: We should probably add this (and the __stack_chk_guard) via some - // type of call-back in CodeGen. - ExternalNames.insert("__stack_chk_fail"); - SmallPtrSet<GlobalValue *, 8> Used; collectUsedGlobalVariables(M, Used, false); @@ -139,19 +151,20 @@ bool InternalizePass::runOnModule(Module &M) { // Mark all functions not in the api as internal. // FIXME: maybe use private linkage? - for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) - if (!I->isDeclaration() && // Function must be defined here - // Available externally is really just a "declaration with a body". - !I->hasAvailableExternallyLinkage() && - !I->hasLocalLinkage() && // Can't already have internal linkage - !ExternalNames.count(I->getName())) {// Not marked to keep external? 
- I->setLinkage(GlobalValue::InternalLinkage); + for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { + if (!shouldInternalize(*I, ExternalNames)) + continue; + + I->setLinkage(GlobalValue::InternalLinkage); + + if (ExternalNode) // Remove a callgraph edge from the external node to this function. - if (ExternalNode) ExternalNode->removeOneAbstractEdgeTo((*CG)[I]); - Changed = true; - ++NumFunctions; - DEBUG(dbgs() << "Internalizing func " << I->getName() << "\n"); - } + ExternalNode->removeOneAbstractEdgeTo((*CG)[I]); + + Changed = true; + ++NumFunctions; + DEBUG(dbgs() << "Internalizing func " << I->getName() << "\n"); + } // Never internalize the llvm.used symbol. It is used to implement // attribute((used)). @@ -166,35 +179,36 @@ bool InternalizePass::runOnModule(Module &M) { ExternalNames.insert("llvm.global.annotations"); // Never internalize symbols code-gen inserts. + // FIXME: We should probably add this (and the __stack_chk_guard) via some + // type of call-back in CodeGen. + ExternalNames.insert("__stack_chk_fail"); ExternalNames.insert("__stack_chk_guard"); // Mark all global variables with initializers that are not in the api as // internal as well. // FIXME: maybe use private linkage? for (Module::global_iterator I = M.global_begin(), E = M.global_end(); - I != E; ++I) - if (!I->isDeclaration() && !I->hasLocalLinkage() && - // Available externally is really just a "declaration with a body". - !I->hasAvailableExternallyLinkage() && - !ExternalNames.count(I->getName())) { - I->setLinkage(GlobalValue::InternalLinkage); - Changed = true; - ++NumGlobals; - DEBUG(dbgs() << "Internalized gvar " << I->getName() << "\n"); - } + I != E; ++I) { + if (!shouldInternalize(*I, ExternalNames)) + continue; + + I->setLinkage(GlobalValue::InternalLinkage); + Changed = true; + ++NumGlobals; + DEBUG(dbgs() << "Internalized gvar " << I->getName() << "\n"); + } // Mark all aliases that are not in the api as internal as well. for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end(); - I != E; ++I) - if (!I->isDeclaration() && !I->hasInternalLinkage() && - // Available externally is really just a "declaration with a body". - !I->hasAvailableExternallyLinkage() && - !ExternalNames.count(I->getName())) { - I->setLinkage(GlobalValue::InternalLinkage); - Changed = true; - ++NumAliases; - DEBUG(dbgs() << "Internalized alias " << I->getName() << "\n"); - } + I != E; ++I) { + if (!shouldInternalize(*I, ExternalNames)) + continue; + + I->setLinkage(GlobalValue::InternalLinkage); + Changed = true; + ++NumAliases; + DEBUG(dbgs() << "Internalized alias " << I->getName() << "\n"); + } return Changed; } @@ -203,6 +217,6 @@ ModulePass *llvm::createInternalizePass() { return new InternalizePass(); } -ModulePass *llvm::createInternalizePass(ArrayRef<const char *> el) { - return new InternalizePass(el); +ModulePass *llvm::createInternalizePass(ArrayRef<const char *> ExportList) { + return new InternalizePass(ExportList); } diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp index 4ce749c..3861421 100644 --- a/lib/Transforms/IPO/MergeFunctions.cpp +++ b/lib/Transforms/IPO/MergeFunctions.cpp @@ -210,16 +210,20 @@ private: // Any two pointers in the same address space are equivalent, intptr_t and // pointers are equivalent. Otherwise, standard type equivalence rules apply. 
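The Internalize rewrite above replaces three copies of the same four-condition filter with a single shouldInternalize predicate queried by each loop. The patch keeps the three loops explicit; purely as an illustration of the factoring (not what the commit does), they could even share a helper like this:

```cpp
#include "llvm/IR/GlobalValue.h"
#include <set>
#include <string>

// Declared in the hunk above: the shared early-out predicate.
static bool shouldInternalize(const llvm::GlobalValue &GV,
                              const std::set<std::string> &ExternalNames);

// Hypothetical further consolidation: one loop body usable over the
// module's function, global-variable, or alias range.
template <typename Iterator>
static bool internalizeRange(Iterator B, Iterator E,
                             const std::set<std::string> &ExternalNames) {
  bool Changed = false;
  for (Iterator I = B; I != E; ++I) {
    if (!shouldInternalize(*I, ExternalNames))
      continue;
    I->setLinkage(llvm::GlobalValue::InternalLinkage);
    Changed = true;
  }
  return Changed;
}
```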
bool FunctionComparator::isEquivalentType(Type *Ty1, Type *Ty2) const { + + PointerType *PTy1 = dyn_cast<PointerType>(Ty1); + PointerType *PTy2 = dyn_cast<PointerType>(Ty2); + + if (TD) { + if (PTy1 && PTy1->getAddressSpace() == 0) Ty1 = TD->getIntPtrType(Ty1); + if (PTy2 && PTy2->getAddressSpace() == 0) Ty2 = TD->getIntPtrType(Ty2); + } + if (Ty1 == Ty2) return true; - if (Ty1->getTypeID() != Ty2->getTypeID()) { - if (TD) { - LLVMContext &Ctx = Ty1->getContext(); - if (isa<PointerType>(Ty1) && Ty2 == TD->getIntPtrType(Ctx)) return true; - if (isa<PointerType>(Ty2) && Ty1 == TD->getIntPtrType(Ctx)) return true; - } + + if (Ty1->getTypeID() != Ty2->getTypeID()) return false; - } switch (Ty1->getTypeID()) { default: @@ -241,8 +245,7 @@ bool FunctionComparator::isEquivalentType(Type *Ty1, Type *Ty2) const { return true; case Type::PointerTyID: { - PointerType *PTy1 = cast<PointerType>(Ty1); - PointerType *PTy2 = cast<PointerType>(Ty2); + assert(PTy1 && PTy2 && "Both types must be pointers here."); return PTy1->getAddressSpace() == PTy2->getAddressSpace(); } @@ -352,14 +355,19 @@ bool FunctionComparator::isEquivalentOperation(const Instruction *I1, // Determine whether two GEP operations perform the same underlying arithmetic. bool FunctionComparator::isEquivalentGEP(const GEPOperator *GEP1, const GEPOperator *GEP2) { - // When we have target data, we can reduce the GEP down to the value in bytes - // added to the address. - unsigned BitWidth = TD ? TD->getPointerSizeInBits() : 1; - APInt Offset1(BitWidth, 0), Offset2(BitWidth, 0); - if (TD && - GEP1->accumulateConstantOffset(*TD, Offset1) && - GEP2->accumulateConstantOffset(*TD, Offset2)) { - return Offset1 == Offset2; + unsigned AS = GEP1->getPointerAddressSpace(); + if (AS != GEP2->getPointerAddressSpace()) + return false; + + if (TD) { + // When we have target data, we can reduce the GEP down to the value in bytes + // added to the address. + unsigned BitWidth = TD ? TD->getPointerSizeInBits(AS) : 1; + APInt Offset1(BitWidth, 0), Offset2(BitWidth, 0); + if (GEP1->accumulateConstantOffset(*TD, Offset1) && + GEP2->accumulateConstantOffset(*TD, Offset2)) { + return Offset1 == Offset2; + } } if (GEP1->getPointerOperand()->getType() != @@ -713,6 +721,19 @@ void MergeFunctions::writeThunkOrAlias(Function *F, Function *G) { writeThunk(F, G); } +// Helper for writeThunk, +// Selects proper bitcast operation, +// but a bit simplier then CastInst::getCastOpcode. +static Value* createCast(IRBuilder<false> &Builder, Value *V, Type *DestTy) { + Type *SrcTy = V->getType(); + if (SrcTy->isIntegerTy() && DestTy->isPointerTy()) + return Builder.CreateIntToPtr(V, DestTy); + else if (SrcTy->isPointerTy() && DestTy->isIntegerTy()) + return Builder.CreatePtrToInt(V, DestTy); + else + return Builder.CreateBitCast(V, DestTy); +} + // Replace G with a simple tail call to bitcast(F). Also replace direct uses // of G with bitcast(F). Deletes G. 
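The isEquivalentType rewrite at the top of the MergeFunctions hunk changes the comparison from special-casing to canonicalize-then-compare: with target data, any address-space-0 pointer is first rewritten to the target's intptr type, after which plain equality and the TypeID switch do the rest. A toy model of that shape (Kind is a hypothetical stand-in for llvm::Type, and it conflates all integer widths, which the real code does not):

```cpp
#include <cassert>

enum class Kind { Int, PtrAS0, PtrAS1, Float };

// Hypothetical stand-in for "Ty = TD->getIntPtrType(Ty)": collapse AS0
// pointers to the target's pointer-sized integer before comparing.
static Kind canonicalize(Kind K, bool HaveTargetData) {
  if (HaveTargetData && K == Kind::PtrAS0)
    return Kind::Int;
  return K;
}

static bool isEquivalentKind(Kind A, Kind B, bool HaveTargetData) {
  A = canonicalize(A, HaveTargetData);
  B = canonicalize(B, HaveTargetData);
  return A == B;
}

int main() {
  assert(isEquivalentKind(Kind::PtrAS0, Kind::Int, true));    // ptr ~ intptr
  assert(!isEquivalentKind(Kind::PtrAS0, Kind::PtrAS1, true)); // different AS
  assert(!isEquivalentKind(Kind::PtrAS0, Kind::Int, false));  // no TD: distinct
}
```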
void MergeFunctions::writeThunk(Function *F, Function *G) { @@ -738,7 +759,7 @@ void MergeFunctions::writeThunk(Function *F, Function *G) { FunctionType *FFTy = F->getFunctionType(); for (Function::arg_iterator AI = NewG->arg_begin(), AE = NewG->arg_end(); AI != AE; ++AI) { - Args.push_back(Builder.CreateBitCast(AI, FFTy->getParamType(i))); + Args.push_back(createCast(Builder, (Value*)AI, FFTy->getParamType(i))); ++i; } @@ -748,13 +769,7 @@ void MergeFunctions::writeThunk(Function *F, Function *G) { if (NewG->getReturnType()->isVoidTy()) { Builder.CreateRetVoid(); } else { - Type *RetTy = NewG->getReturnType(); - if (CI->getType()->isIntegerTy() && RetTy->isPointerTy()) - Builder.CreateRet(Builder.CreateIntToPtr(CI, RetTy)); - else if (CI->getType()->isPointerTy() && RetTy->isIntegerTy()) - Builder.CreateRet(Builder.CreatePtrToInt(CI, RetTy)); - else - Builder.CreateRet(Builder.CreateBitCast(CI, RetTy)); + Builder.CreateRet(createCast(Builder, CI, NewG->getReturnType())); } NewG->copyAttributesFrom(G); @@ -829,6 +844,18 @@ bool MergeFunctions::insert(ComparableFunction &NewF) { const ComparableFunction &OldF = *Result.first; + // Don't merge tiny functions, since it can just end up making the function + // larger. + // FIXME: Should still merge them if they are unnamed_addr and produce an + // alias. + if (NewF.getFunc()->size() == 1) { + if (NewF.getFunc()->front().size() <= 2) { + DEBUG(dbgs() << NewF.getFunc()->getName() + << " is to small to bother merging\n"); + return false; + } + } + // Never thunk a strong function to a weak function. assert(!OldF.getFunc()->mayBeOverridden() || NewF.getFunc()->mayBeOverridden()); diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index a6b3f4e..24c5018 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -29,20 +29,20 @@ using namespace llvm; static cl::opt<bool> -RunLoopVectorization("vectorize-loops", +RunLoopVectorization("vectorize-loops", cl::Hidden, cl::desc("Run the Loop vectorization passes")); static cl::opt<bool> -LateVectorization("late-vectorize", cl::init(false), cl::Hidden, +LateVectorization("late-vectorize", cl::init(true), cl::Hidden, cl::desc("Run the vectorization pasess late in the pass " "pipeline (after the inliner)")); static cl::opt<bool> -RunSLPVectorization("vectorize-slp", +RunSLPVectorization("vectorize-slp", cl::Hidden, cl::desc("Run the SLP vectorization passes")); static cl::opt<bool> -RunBBVectorization("vectorize-slp-aggressive", +RunBBVectorization("vectorize-slp-aggressive", cl::Hidden, cl::desc("Run the BB vectorization passes")); static cl::opt<bool> @@ -54,6 +54,10 @@ static cl::opt<bool> UseNewSROA("use-new-sroa", cl::init(true), cl::Hidden, cl::desc("Enable the new, experimental SROA pass")); +static cl::opt<bool> +RunLoopRerolling("reroll-loops", cl::Hidden, + cl::desc("Run the loop rerolling pass")); + PassManagerBuilder::PassManagerBuilder() { OptLevel = 2; SizeLevel = 0; @@ -65,6 +69,7 @@ PassManagerBuilder::PassManagerBuilder() { SLPVectorize = RunSLPVectorization; LoopVectorize = RunLoopVectorization; LateVectorize = LateVectorization; + RerollLoops = RunLoopRerolling; } PassManagerBuilder::~PassManagerBuilder() { @@ -195,8 +200,8 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { MPM.add(createLoopIdiomPass()); // Recognize idioms like memset. 
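On the PassManagerBuilder hunk: the vectorization flags gain cl::Hidden (dropped from the default -help listing), and the new RunLoopRerolling option shows the minimal form of such a flag; with no cl::init, a cl::opt<bool> is value-initialized, so the pass stays off unless -reroll-loops is passed. A standalone sketch of the same shape:

```cpp
#include "llvm/Support/CommandLine.h"

using namespace llvm;

// Same shape as the RunLoopRerolling flag in the hunk: hidden from the
// default -help listing, and false unless -reroll-loops is given (no
// cl::init means the bool is value-initialized).
static cl::opt<bool>
RerollFlag("reroll-loops", cl::Hidden,
           cl::desc("Run the loop rerolling pass"));

bool shouldRerollLoops() { return RerollFlag; }
```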
MPM.add(createLoopDeletionPass()); // Delete dead loops - if (!LateVectorize && LoopVectorize && OptLevel > 1 && SizeLevel < 2) - MPM.add(createLoopVectorizePass()); + if (!LateVectorize && LoopVectorize) + MPM.add(createLoopVectorizePass(DisableUnrollLoops)); if (!DisableUnrollLoops) MPM.add(createLoopUnrollPass()); // Unroll small loops @@ -216,22 +221,22 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { addExtensionsToPM(EP_ScalarOptimizerLate, MPM); - if (!LateVectorize) { - if (SLPVectorize) - MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. - - if (BBVectorize) { - MPM.add(createBBVectorizePass()); - MPM.add(createInstructionCombiningPass()); - if (OptLevel > 1 && UseGVNAfterVectorization) - MPM.add(createGVNPass()); // Remove redundancies - else - MPM.add(createEarlyCSEPass()); // Catch trivial redundancies - - // BBVectorize may have significantly shortened a loop body; unroll again. - if (!DisableUnrollLoops) - MPM.add(createLoopUnrollPass()); - } + if (RerollLoops) + MPM.add(createLoopRerollPass()); + if (SLPVectorize) + MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. + + if (BBVectorize) { + MPM.add(createBBVectorizePass()); + MPM.add(createInstructionCombiningPass()); + if (OptLevel > 1 && UseGVNAfterVectorization) + MPM.add(createGVNPass()); // Remove redundancies + else + MPM.add(createEarlyCSEPass()); // Catch trivial redundancies + + // BBVectorize may have significantly shortened a loop body; unroll again. + if (!DisableUnrollLoops) + MPM.add(createLoopUnrollPass()); } MPM.add(createAggressiveDCEPass()); // Delete dead instructions @@ -241,7 +246,7 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { // As an experimental mode, run any vectorization passes in a separate // pipeline from the CGSCC pass manager that runs iteratively with the // inliner. - if (LateVectorize) { + if (LateVectorize && LoopVectorize) { // FIXME: This is a HACK! The inliner pass above implicitly creates a CGSCC // pass manager that we are specifically trying to avoid. To prevent this // we must insert a no-op module pass to reset the pass manager. @@ -249,35 +254,9 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { // Add the various vectorization passes and relevant cleanup passes for // them since we are no longer in the middle of the main scalar pipeline. - if (LoopVectorize && OptLevel > 1 && SizeLevel < 2) { - MPM.add(createLoopVectorizePass()); - - if (!DisableUnrollLoops) - MPM.add(createLoopUnrollPass()); // Unroll small loops - - // FIXME: Is this necessary/useful? Should we also do SimplifyCFG? - MPM.add(createInstructionCombiningPass()); - } - - if (SLPVectorize) { - MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. - - // FIXME: Is this necessary/useful? Should we also do SimplifyCFG? - MPM.add(createInstructionCombiningPass()); - } - - if (BBVectorize) { - MPM.add(createBBVectorizePass()); - MPM.add(createInstructionCombiningPass()); - if (OptLevel > 1 && UseGVNAfterVectorization) - MPM.add(createGVNPass()); // Remove redundancies - else - MPM.add(createEarlyCSEPass()); // Catch trivial redundancies - - // BBVectorize may have significantly shortened a loop body; unroll again. 
- if (!DisableUnrollLoops) - MPM.add(createLoopUnrollPass()); - } + MPM.add(createLoopVectorizePass(DisableUnrollLoops)); + MPM.add(createInstructionCombiningPass()); + MPM.add(createCFGSimplificationPass()); } if (!DisableUnitAtATime) { @@ -304,11 +283,8 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM, // Now that composite has been compiled, scan through the module, looking // for a main function. If main is defined, mark all other functions // internal. - if (Internalize) { - std::vector<const char*> E; - E.push_back("main"); - PM.add(createInternalizePass(E)); - } + if (Internalize) + PM.add(createInternalizePass("main")); // Propagate constants at call sites into the functions they call. This // opens opportunities for globalopt (and inlining) by substituting function @@ -349,6 +325,7 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM, // The IPO passes may leave cruft around. Clean up after them. PM.add(createInstructionCombiningPass()); PM.add(createJumpThreadingPass()); + // Break up allocas if (UseNewSROA) PM.add(createSROAPass()); @@ -362,6 +339,7 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM, PM.add(createLICMPass()); // Hoist loop invariants. PM.add(createGVNPass(DisableGVNLoadPRE)); // Remove redundancies. PM.add(createMemCpyOptPass()); // Remove dead memcpys. + // Nuke dead stores. PM.add(createDeadStoreEliminationPass()); diff --git a/lib/Transforms/IPO/PruneEH.cpp b/lib/Transforms/IPO/PruneEH.cpp index 89529de..b160913 100644 --- a/lib/Transforms/IPO/PruneEH.cpp +++ b/lib/Transforms/IPO/PruneEH.cpp @@ -51,7 +51,7 @@ namespace { char PruneEH::ID = 0; INITIALIZE_PASS_BEGIN(PruneEH, "prune-eh", "Remove unused exception handling info", false, false) -INITIALIZE_AG_DEPENDENCY(CallGraph) +INITIALIZE_PASS_DEPENDENCY(CallGraph) INITIALIZE_PASS_END(PruneEH, "prune-eh", "Remove unused exception handling info", false, false) diff --git a/lib/Transforms/IPO/StripSymbols.cpp b/lib/Transforms/IPO/StripSymbols.cpp index 2791106..c4f5cfc 100644 --- a/lib/Transforms/IPO/StripSymbols.cpp +++ b/lib/Transforms/IPO/StripSymbols.cpp @@ -9,7 +9,7 @@ // // The StripSymbols transformation implements code stripping. Specifically, it // can delete: -// +// // * names for virtual registers // * symbols for internal globals and functions // * debug information @@ -39,7 +39,7 @@ namespace { bool OnlyDebugInfo; public: static char ID; // Pass identification, replacement for typeid - explicit StripSymbols(bool ODI = false) + explicit StripSymbols(bool ODI = false) : ModulePass(ID), OnlyDebugInfo(ODI) { initializeStripSymbolsPass(*PassRegistry::getPassRegistry()); } @@ -144,7 +144,7 @@ static void RemoveDeadConstant(Constant *C) { assert(C->use_empty() && "Constant is not dead!"); SmallPtrSet<Constant*, 4> Operands; for (unsigned i = 0, e = C->getNumOperands(); i != e; ++i) - if (OnlyUsedBy(C->getOperand(i), C)) + if (OnlyUsedBy(C->getOperand(i), C)) Operands.insert(cast<Constant>(C->getOperand(i))); if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) { if (!GV->hasLocalLinkage()) return; // Don't delete non static globals. 
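The populateLTOPassManager change above (`createInternalizePass("main")`) leans on ArrayRef's implicit one-element constructor: a single `const char *` binds as a one-entry export list, which is what lets the old three-line std::vector setup collapse into one call. A minimal demonstration of the conversion:

```cpp
#include "llvm/ADT/ArrayRef.h"
#include <cassert>

using namespace llvm;

static size_t countExports(ArrayRef<const char *> ExportList) {
  return ExportList.size();
}

int main() {
  const char *Single = "main";
  const char *Several[] = {"main", "_start"};
  assert(countExports(Single) == 1);  // ArrayRef(const T &OneElt)
  assert(countExports(Several) == 2); // ArrayRef(const T (&Arr)[N])
}
```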
@@ -182,7 +182,7 @@ static void StripTypeNames(Module &M, bool PreserveDbgInfo) { for (unsigned i = 0, e = StructTypes.size(); i != e; ++i) { StructType *STy = StructTypes[i]; if (STy->isLiteral() || STy->getName().empty()) continue; - + if (PreserveDbgInfo && STy->getName().startswith("llvm.dbg")) continue; @@ -199,7 +199,7 @@ static void findUsedValues(GlobalVariable *LLVMUsed, ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer()); for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i) - if (GlobalValue *GV = + if (GlobalValue *GV = dyn_cast<GlobalValue>(Inits->getOperand(i)->stripPointerCasts())) UsedValues.insert(GV); } @@ -217,71 +217,20 @@ static bool StripSymbolNames(Module &M, bool PreserveDbgInfo) { if (!PreserveDbgInfo || !I->getName().startswith("llvm.dbg")) I->setName(""); // Internal symbols can't participate in linkage } - + for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { if (I->hasLocalLinkage() && llvmUsedValues.count(I) == 0) if (!PreserveDbgInfo || !I->getName().startswith("llvm.dbg")) I->setName(""); // Internal symbols can't participate in linkage StripSymtab(I->getValueSymbolTable(), PreserveDbgInfo); } - + // Remove all names from types. StripTypeNames(M, PreserveDbgInfo); return true; } -// StripDebugInfo - Strip debug info in the module if it exists. -// To do this, we remove llvm.dbg.func.start, llvm.dbg.stoppoint, and -// llvm.dbg.region.end calls, and any globals they point to if now dead. -static bool StripDebugInfo(Module &M) { - - bool Changed = false; - - // Remove all of the calls to the debugger intrinsics, and remove them from - // the module. - if (Function *Declare = M.getFunction("llvm.dbg.declare")) { - while (!Declare->use_empty()) { - CallInst *CI = cast<CallInst>(Declare->use_back()); - CI->eraseFromParent(); - } - Declare->eraseFromParent(); - Changed = true; - } - - if (Function *DbgVal = M.getFunction("llvm.dbg.value")) { - while (!DbgVal->use_empty()) { - CallInst *CI = cast<CallInst>(DbgVal->use_back()); - CI->eraseFromParent(); - } - DbgVal->eraseFromParent(); - Changed = true; - } - - for (Module::named_metadata_iterator NMI = M.named_metadata_begin(), - NME = M.named_metadata_end(); NMI != NME;) { - NamedMDNode *NMD = NMI; - ++NMI; - if (NMD->getName().startswith("llvm.dbg.")) { - NMD->eraseFromParent(); - Changed = true; - } - } - - for (Module::iterator MI = M.begin(), ME = M.end(); MI != ME; ++MI) - for (Function::iterator FI = MI->begin(), FE = MI->end(); FI != FE; - ++FI) - for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; - ++BI) { - if (!BI->getDebugLoc().isUnknown()) { - Changed = true; - BI->setDebugLoc(DebugLoc()); - } - } - - return Changed; -} - bool StripSymbols::runOnModule(Module &M) { bool Changed = false; Changed |= StripDebugInfo(M); @@ -307,13 +256,13 @@ bool StripDebugDeclare::runOnModule(Module &M) { assert(CI->use_empty() && "llvm.dbg intrinsic should have void result"); CI->eraseFromParent(); if (Arg1->use_empty()) { - if (Constant *C = dyn_cast<Constant>(Arg1)) + if (Constant *C = dyn_cast<Constant>(Arg1)) DeadConstants.push_back(C); - else + else RecursivelyDeleteTriviallyDeadInstructions(Arg1); } if (Arg2->use_empty()) - if (Constant *C = dyn_cast<Constant>(Arg2)) + if (Constant *C = dyn_cast<Constant>(Arg2)) DeadConstants.push_back(C); } Declare->eraseFromParent(); @@ -332,76 +281,107 @@ bool StripDebugDeclare::runOnModule(Module &M) { return true; } +/// Remove any debug info for global variables/functions in the given module for +/// which said global 
variable/function no longer exists (i.e. is null). +/// +/// Debugging information is encoded in llvm IR using metadata. This is designed +/// such a way that debug info for symbols preserved even if symbols are +/// optimized away by the optimizer. This special pass removes debug info for +/// such symbols. bool StripDeadDebugInfo::runOnModule(Module &M) { bool Changed = false; - // Debugging infomration is encoded in llvm IR using metadata. This is designed - // such a way that debug info for symbols preserved even if symbols are - // optimized away by the optimizer. This special pass removes debug info for - // such symbols. - - // llvm.dbg.gv keeps track of debug info for global variables. - if (NamedMDNode *NMD = M.getNamedMetadata("llvm.dbg.gv")) { - SmallVector<MDNode *, 8> MDs; - for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) - if (NMD->getOperand(i)) { - assert(DIGlobalVariable(NMD->getOperand(i)).isGlobalVariable() && - "A MDNode in llvm.dbg.gv should be a DIGlobalVariable."); - MDs.push_back(NMD->getOperand(i)); - } - else - Changed = true; - NMD->eraseFromParent(); - NMD = NULL; - - for (SmallVectorImpl<MDNode *>::iterator I = MDs.begin(), - E = MDs.end(); I != E; ++I) { - GlobalVariable *GV = DIGlobalVariable(*I).getGlobal(); - if (GV && M.getGlobalVariable(GV->getName(), true)) { - if (!NMD) - NMD = M.getOrInsertNamedMetadata("llvm.dbg.gv"); - NMD->addOperand(*I); - } + LLVMContext &C = M.getContext(); + + // Find all debug info in F. This is actually overkill in terms of what we + // want to do, but we want to try and be as resilient as possible in the face + // of potential debug info changes by using the formal interfaces given to us + // as much as possible. + DebugInfoFinder F; + F.processModule(M); + + // For each compile unit, find the live set of global variables/functions and + // replace the current list of potentially dead global variables/functions + // with the live list. + SmallVector<Value *, 64> LiveGlobalVariables; + SmallVector<Value *, 64> LiveSubprograms; + DenseSet<const MDNode *> VisitedSet; + + for (DebugInfoFinder::iterator CI = F.compile_unit_begin(), + CE = F.compile_unit_end(); CI != CE; ++CI) { + // Create our compile unit. + DICompileUnit DIC(*CI); + assert(DIC.Verify() && "DIC must verify as a DICompileUnit."); + + // Create our live subprogram list. + DIArray SPs = DIC.getSubprograms(); + bool SubprogramChange = false; + for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i) { + DISubprogram DISP(SPs.getElement(i)); + assert(DISP.Verify() && "DISP must verify as a DISubprogram."); + + // Make sure we visit each subprogram only once. + if (!VisitedSet.insert(DISP).second) + continue; + + // If the function referenced by DISP is not null, the function is live. + if (DISP.getFunction()) + LiveSubprograms.push_back(DISP); else - Changed = true; + SubprogramChange = true; } - } - // llvm.dbg.sp keeps track of debug info for subprograms. - if (NamedMDNode *NMD = M.getNamedMetadata("llvm.dbg.sp")) { - SmallVector<MDNode *, 8> MDs; - for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) - if (NMD->getOperand(i)) { - assert(DISubprogram(NMD->getOperand(i)).isSubprogram() && - "A MDNode in llvm.dbg.sp should be a DISubprogram."); - MDs.push_back(NMD->getOperand(i)); - } + // Create our live global variable list. 
+ DIArray GVs = DIC.getGlobalVariables(); + bool GlobalVariableChange = false; + for (unsigned i = 0, e = GVs.getNumElements(); i != e; ++i) { + DIGlobalVariable DIG(GVs.getElement(i)); + assert(DIG.Verify() && "DIG must verify as DIGlobalVariable."); + + // Make sure we only visit each global variable only once. + if (!VisitedSet.insert(DIG).second) + continue; + + // If the global variable referenced by DIG is not null, the global + // variable is live. + if (DIG.getGlobal()) + LiveGlobalVariables.push_back(DIG); else - Changed = true; - NMD->eraseFromParent(); - NMD = NULL; - - for (SmallVectorImpl<MDNode *>::iterator I = MDs.begin(), - E = MDs.end(); I != E; ++I) { - bool FnIsLive = false; - if (Function *F = DISubprogram(*I).getFunction()) - if (M.getFunction(F->getName())) - FnIsLive = true; - if (FnIsLive) { - if (!NMD) - NMD = M.getOrInsertNamedMetadata("llvm.dbg.sp"); - NMD->addOperand(*I); - } else { - // Remove llvm.dbg.lv.fnname named mdnode which may have been used - // to hold debug info for dead function's local variables. - StringRef FName = DISubprogram(*I).getLinkageName(); - if (FName.empty()) - FName = DISubprogram(*I).getName(); - if (NamedMDNode *LVNMD = M.getNamedMetadata( - "llvm.dbg.lv." + Function::getRealLinkageName(FName))) - LVNMD->eraseFromParent(); - } + GlobalVariableChange = true; + } + + // If we found dead subprograms or global variables, replace the current + // subprogram list/global variable list with our new live subprogram/global + // variable list. + if (SubprogramChange) { + // Make sure that 9 is still the index of the subprograms. This is to make + // sure that an assert is hit if the location of the subprogram array + // changes. This is just to make sure that this is updated if such an + // event occurs. + assert(DIC->getNumOperands() >= 10 && + SPs == DIC->getOperand(9) && + "DICompileUnits is expected to store Subprograms in operand " + "9."); + DIC->replaceOperandWith(9, MDNode::get(C, LiveSubprograms)); + Changed = true; } + + if (GlobalVariableChange) { + // Make sure that 10 is still the index of global variables. This is to + // make sure that an assert is hit if the location of the subprogram array + // changes. This is just to make sure that this index is updated if such + // an event occurs. + assert(DIC->getNumOperands() >= 11 && + GVs == DIC->getOperand(10) && + "DICompileUnits is expected to store Global Variables in operand " + "10."); + DIC->replaceOperandWith(10, MDNode::get(C, LiveGlobalVariables)); + Changed = true; + } + + // Reset lists for the next iteration. 
+ LiveSubprograms.clear(); + LiveGlobalVariables.clear(); } return Changed; diff --git a/lib/Transforms/InstCombine/InstCombine.h b/lib/Transforms/InstCombine/InstCombine.h index b3084cc..a5eddc2 100644 --- a/lib/Transforms/InstCombine/InstCombine.h +++ b/lib/Transforms/InstCombine/InstCombine.h @@ -158,8 +158,8 @@ public: ConstantInt *DivRHS); Instruction *FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *DivI, ConstantInt *DivRHS); - Instruction *FoldICmpAddOpCst(ICmpInst &ICI, Value *X, ConstantInt *CI, - ICmpInst::Predicate Pred, Value *TheAdd); + Instruction *FoldICmpAddOpCst(Instruction &ICI, Value *X, ConstantInt *CI, + ICmpInst::Predicate Pred); Instruction *FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, ICmpInst::Predicate Cond, Instruction &I); Instruction *FoldShiftByConstant(Value *Op0, ConstantInt *Op1, @@ -178,6 +178,7 @@ public: Instruction *visitPtrToInt(PtrToIntInst &CI); Instruction *visitIntToPtr(IntToPtrInst &CI); Instruction *visitBitCast(BitCastInst &CI); + Instruction *visitAddrSpaceCast(AddrSpaceCastInst &CI); Instruction *FoldSelectOpOp(SelectInst &SI, Instruction *TI, Instruction *FI); Instruction *FoldSelectIntoOp(SelectInst &SI, Value*, Value*); @@ -212,8 +213,8 @@ private: bool ShouldChangeType(Type *From, Type *To) const; Value *dyn_castNegVal(Value *V) const; Value *dyn_castFNegVal(Value *V, bool NoSignedZero=false) const; - Type *FindElementAtOffset(Type *Ty, int64_t Offset, - SmallVectorImpl<Value*> &NewIndices); + Type *FindElementAtOffset(Type *PtrTy, int64_t Offset, + SmallVectorImpl<Value*> &NewIndices); Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI); /// ShouldOptimizeCast - Return true if the cast from "V to Ty" actually @@ -271,7 +272,7 @@ public: if (&I == V) V = UndefValue::get(I.getType()); - DEBUG(errs() << "IC: Replacing " << I << "\n" + DEBUG(dbgs() << "IC: Replacing " << I << "\n" " with " << *V << '\n'); I.replaceAllUsesWith(V); @@ -283,7 +284,7 @@ public: // instruction. Instead, visit methods should return the value returned by // this function. Instruction *EraseInstFromFunction(Instruction &I) { - DEBUG(errs() << "IC: ERASE " << I << '\n'); + DEBUG(dbgs() << "IC: ERASE " << I << '\n'); assert(I.use_empty() && "Cannot erase instruction that is used!"); // Make sure that we reprocess all operands now that we reduced their diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index b474bd8..88bb69b 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -488,6 +488,26 @@ static unsigned getTypeOfMaskedICmp(Value* A, Value* B, Value* C, return result; } +/// Convert an analysis of a masked ICmp into its equivalent if all boolean +/// operations had the opposite sense. Since each "NotXXX" flag (recording !=) +/// is adjacent to the corresponding normal flag (recording ==), this just +/// involves swapping those bits over. +static unsigned conjugateICmpMask(unsigned Mask) { + unsigned NewMask; + NewMask = (Mask & (FoldMskICmp_AMask_AllOnes | FoldMskICmp_BMask_AllOnes | + FoldMskICmp_Mask_AllZeroes | FoldMskICmp_AMask_Mixed | + FoldMskICmp_BMask_Mixed)) + << 1; + + NewMask |= + (Mask & (FoldMskICmp_AMask_NotAllOnes | FoldMskICmp_BMask_NotAllOnes | + FoldMskICmp_Mask_NotAllZeroes | FoldMskICmp_AMask_NotMixed | + FoldMskICmp_BMask_NotMixed)) + >> 1; + + return NewMask; +} + /// decomposeBitTestICmp - Decompose an icmp into the form ((X & Y) pred Z) /// if possible. The returned predicate is either == or !=. 
Returns false if
/// decomposition fails.
@@ -548,14 +568,22 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,
     L21 = L22 = L1 = 0;
   } else {
     // Look for ANDs in the LHS icmp.
-    if (match(L1, m_And(m_Value(L11), m_Value(L12)))) {
-      if (!match(L2, m_And(m_Value(L21), m_Value(L22))))
-        L21 = L22 = 0;
-    } else {
-      if (!match(L2, m_And(m_Value(L11), m_Value(L12))))
-        return 0;
-      std::swap(L1, L2);
+    if (!L1->getType()->isIntegerTy()) {
+      // You can icmp pointers, for example. They really aren't masks.
+      L11 = L12 = 0;
+    } else if (!match(L1, m_And(m_Value(L11), m_Value(L12)))) {
+      // Any icmp can be viewed as being trivially masked; if it allows us to
+      // remove one, it's worth it.
+      L11 = L1;
+      L12 = Constant::getAllOnesValue(L1->getType());
+    }
+
+    if (!L2->getType()->isIntegerTy()) {
+      // You can icmp pointers, for example. They really aren't masks.
       L21 = L22 = 0;
+    } else if (!match(L2, m_And(m_Value(L21), m_Value(L22)))) {
+      L21 = L2;
+      L22 = Constant::getAllOnesValue(L2->getType());
     }
   }
@@ -576,7 +604,14 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,
       return 0;
     }
     E = R2; R1 = 0; ok = true;
-  } else if (match(R1, m_And(m_Value(R11), m_Value(R12)))) {
+  } else if (R1->getType()->isIntegerTy()) {
+    if (!match(R1, m_And(m_Value(R11), m_Value(R12)))) {
+      // As before, model no mask as a trivial mask if it'll let us do an
+      // optimisation.
+      R11 = R1;
+      R12 = Constant::getAllOnesValue(R1->getType());
+    }
+
     if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
       A = R11; D = R12; E = R2; ok = true;
     } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
@@ -589,7 +624,12 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,
     return 0;

   // Look for ANDs on the right side of the RHS icmp.
-  if (!ok && match(R2, m_And(m_Value(R11), m_Value(R12)))) {
+  if (!ok && R2->getType()->isIntegerTy()) {
+    if (!match(R2, m_And(m_Value(R11), m_Value(R12)))) {
+      R11 = R2;
+      R12 = Constant::getAllOnesValue(R2->getType());
+    }
+
     if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
       A = R11; D = R12; E = R1; ok = true;
     } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
@@ -618,8 +658,7 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,
 /// foldLogOpOfMaskedICmps:
 /// try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E)
 /// into a single (icmp(A & X) ==/!= Y)
-static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS,
-                                     ICmpInst::Predicate NEWCC,
+static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
                                      llvm::InstCombiner::BuilderTy* Builder) {
   Value *A = 0, *B = 0, *C = 0, *D = 0, *E = 0;
   ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate();
@@ -629,8 +668,24 @@ static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS,
   assert(ICmpInst::isEquality(LHSCC) && ICmpInst::isEquality(RHSCC) &&
          "foldLogOpOfMaskedICmpsHelper must return an equality predicate.");
-  if (NEWCC == ICmpInst::ICMP_NE)
-    mask >>= 1; // treat "Not"-states as normal states
+  // In full generality:
+  //     (icmp (A & B) Op C) | (icmp (A & D) Op E)
+  // ==  ![ (icmp (A & B) !Op C) & (icmp (A & D) !Op E) ]
+  //
+  // If the latter can be converted into (icmp (A & X) Op Y) then the former is
+  // equivalent to (icmp (A & X) !Op Y).
+  //
+  // Therefore, we can pretend for the rest of this function that we're dealing
+  // with the conjunction, provided we flip the sense of any comparisons (both
+  // input and output).

+  // In most cases we're going to produce an EQ for the "&&" case.
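The conjugation argument in the comment above can be checked in isolation before the code resumes below. The following is a minimal standalone sketch of ours, not part of the patch: the flag names and bit positions are hypothetical stand-ins for the FoldMskICmp_* enum, assuming only the layout described earlier (each "Not" flag adjacent to its positive counterpart).

    #include <cassert>

    // Hypothetical stand-ins for two FoldMskICmp_* flag pairs; the only
    // property assumed is the adjacent even/odd bit layout.
    enum {
      AMask_AllOnes     = 1 << 0, // records (A & B) == B
      AMask_NotAllOnes  = 1 << 1, // records (A & B) != B
      Mask_AllZeroes    = 1 << 2, // records (A & B) == 0
      Mask_NotAllZeroes = 1 << 3  // records (A & B) != 0
    };

    // The same bit swap conjugateICmpMask performs: negating every comparison
    // exchanges each "==" fact with its adjacent "!=" fact.
    static unsigned conjugate(unsigned Mask) {
      const unsigned EqBits  = AMask_AllOnes | Mask_AllZeroes;
      const unsigned NeqBits = AMask_NotAllOnes | Mask_NotAllZeroes;
      return ((Mask & EqBits) << 1) | ((Mask & NeqBits) >> 1);
    }

    int main() {
      assert(conjugate(AMask_AllOnes) == AMask_NotAllOnes);
      assert(conjugate(Mask_NotAllZeroes | AMask_AllOnes) ==
             (Mask_AllZeroes | AMask_NotAllOnes));
      // Conjugation is an involution: flipping the sense twice is a no-op,
      // which is what lets the function reuse the "&&" analysis for "||".
      for (unsigned M = 0; M < 16; ++M)
        assert(conjugate(conjugate(M)) == M);
      return 0;
    }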
+  ICmpInst::Predicate NEWCC = IsAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE;
+  if (!IsAnd) {
+    // Convert the masking analysis into its equivalent with negated
+    // comparisons.
+    mask = conjugateICmpMask(mask);
+  }

   if (mask & FoldMskICmp_Mask_AllZeroes) {
     // (icmp eq (A & B), 0) & (icmp eq (A & D), 0)
@@ -657,6 +712,40 @@ static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS,
     Value* newAnd = Builder->CreateAnd(A, newAnd1);
     return Builder->CreateICmp(NEWCC, newAnd, A);
   }
+
+  // Remaining cases assume at least that B and D are constant, and depend on
+  // their actual values. This isn't strictly necessary, just a "handle the
+  // easy cases for now" decision.
+  ConstantInt *BCst = dyn_cast<ConstantInt>(B);
+  if (BCst == 0) return 0;
+  ConstantInt *DCst = dyn_cast<ConstantInt>(D);
+  if (DCst == 0) return 0;
+
+  if (mask & (FoldMskICmp_Mask_NotAllZeroes | FoldMskICmp_BMask_NotAllOnes)) {
+    // (icmp ne (A & B), 0) & (icmp ne (A & D), 0) and
+    // (icmp ne (A & B), B) & (icmp ne (A & D), D)
+    //     -> (icmp ne (A & B), 0) or (icmp ne (A & D), 0)
+    // Only valid if one of the masks is a superset of the other (check "B&D" is
+    // the same as either B or D).
+    APInt NewMask = BCst->getValue() & DCst->getValue();
+
+    if (NewMask == BCst->getValue())
+      return LHS;
+    else if (NewMask == DCst->getValue())
+      return RHS;
+  }
+  if (mask & FoldMskICmp_AMask_NotAllOnes) {
+    // (icmp ne (A & B), B) & (icmp ne (A & D), D)
+    //     -> (icmp ne (A & B), A) or (icmp ne (A & D), A)
+    // Only valid if one of the masks is a superset of the other (check "B|D" is
+    // the same as either B or D).
+    APInt NewMask = BCst->getValue() | DCst->getValue();
+
+    if (NewMask == BCst->getValue())
+      return LHS;
+    else if (NewMask == DCst->getValue())
+      return RHS;
+  }
   if (mask & FoldMskICmp_BMask_Mixed) {
     // (icmp eq (A & B), C) & (icmp eq (A & D), E)
     // We already know that B & C == C && D & E == E.
@@ -665,14 +754,9 @@ static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS,
     // contradict, then we can transform to
     //   -> (icmp eq (A & (B|D)), (C|E))
     // Currently, we only handle the case of B, C, D, and E being constant.
-    ConstantInt *BCst = dyn_cast<ConstantInt>(B);
-    if (BCst == 0) return 0;
-    ConstantInt *DCst = dyn_cast<ConstantInt>(D);
-    if (DCst == 0) return 0;
     // we can't simply use C and E, because we might actually handle
     //   (icmp ne (A & B), B) & (icmp eq (A & D), D)
     // with B and D, having a single bit set
-
     ConstantInt *CCst = dyn_cast<ConstantInt>(C);
     if (CCst == 0) return 0;
     if (LHSCC != NEWCC)
@@ -715,7 +799,7 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
   }

   // handle (roughly):  (icmp eq (A & B), C) & (icmp eq (A & D), E)
-  if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, ICmpInst::ICMP_EQ, Builder))
+  if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, true, Builder))
     return V;

   // This only handles icmp of constants: (icmp1 A, C1) & (icmp2 B, C2).
@@ -849,10 +933,15 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
       case ICmpInst::ICMP_SGT:       // (X != 13 & X s> 15) -> X s> 15
         return RHS;
       case ICmpInst::ICMP_NE:
+        // Special case to get the ordering right when the values wrap around
+        // zero.
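The wrap-around case handled by the added lines just below, (X != 0 & X != -1), can be verified exhaustively at i8 width. A small self-contained check of ours, not part of the patch:

    #include <cassert>
    #include <cstdint>

    int main() {
      // After swapping the constants, LHSCst == SubOne(RHSCst) holds for the
      // pair {0, -1}, so the generic rewrite applies:
      //   (X != 0 && X != -1)  ->  (X + 1) u> 1
      for (int V = 0; V < 256; ++V) {
        uint8_t X = static_cast<uint8_t>(V);
        bool Original = (X != 0x00) && (X != 0xFF);    // X != 0 && X != -1
        bool Folded = static_cast<uint8_t>(X + 1) > 1; // unsigned compare
        assert(Original == Folded);
      }
      return 0;
    }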
+        if (LHSCst->getValue() == 0 && RHSCst->getValue().isAllOnesValue())
+          std::swap(LHSCst, RHSCst);
         if (LHSCst == SubOne(RHSCst)){// (X != 13 & X != 14) -> X-13 >u 1
           Constant *AddCST = ConstantExpr::getNeg(LHSCst);
           Value *Add = Builder->CreateAdd(Val, AddCST, Val->getName()+".off");
-          return Builder->CreateICmpUGT(Add, ConstantInt::get(Add->getType(), 1));
+          return Builder->CreateICmpUGT(Add, ConstantInt::get(Add->getType(), 1),
+                                        Val->getName()+".cmp");
         }
         break;                        // (X != 13 & X != 15) -> no change
       }
@@ -1454,10 +1543,60 @@ static Instruction *MatchSelectFromAndOr(Value *A, Value *B,
   return 0;
 }

+/// IsOneHotValue - Returns true for "one-hot" values (values where at most
+/// one bit can be set).
+static bool IsOneHotValue(Value *V) {
+  // Match 1<<K.
+  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V))
+    if (BO->getOpcode() == Instruction::Shl) {
+      ConstantInt *One = dyn_cast<ConstantInt>(BO->getOperand(0));
+      return One && One->isOne();
+    }
+
+  // Check for power of two integer constants.
+  if (ConstantInt *K = dyn_cast<ConstantInt>(V))
+    return K->getValue().isPowerOf2();
+
+  return false;
+}
+
 /// FoldOrOfICmps - Fold (icmp)|(icmp) if possible.
 Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
   ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate();

+  // Fold (iszero(A & K1) | iszero(A & K2)) ->  (A & (K1 | K2)) != (K1 | K2)
+  // if K1 and K2 are one-bit masks.
+  ConstantInt *LHSCst = dyn_cast<ConstantInt>(LHS->getOperand(1));
+  ConstantInt *RHSCst = dyn_cast<ConstantInt>(RHS->getOperand(1));
+
+  if (LHS->getPredicate() == ICmpInst::ICMP_EQ && LHSCst && LHSCst->isZero() &&
+      RHS->getPredicate() == ICmpInst::ICMP_EQ && RHSCst && RHSCst->isZero()) {
+
+    BinaryOperator *LAnd = dyn_cast<BinaryOperator>(LHS->getOperand(0));
+    BinaryOperator *RAnd = dyn_cast<BinaryOperator>(RHS->getOperand(0));
+    if (LAnd && RAnd && LAnd->hasOneUse() && RHS->hasOneUse() &&
+        LAnd->getOpcode() == Instruction::And &&
+        RAnd->getOpcode() == Instruction::And) {
+
+      Value *Mask = 0;
+      Value *Masked = 0;
+      if (LAnd->getOperand(0) == RAnd->getOperand(0) &&
+          IsOneHotValue(LAnd->getOperand(1)) &&
+          IsOneHotValue(RAnd->getOperand(1))) {
+        Mask = Builder->CreateOr(LAnd->getOperand(1), RAnd->getOperand(1));
+        Masked = Builder->CreateAnd(LAnd->getOperand(0), Mask);
+      } else if (LAnd->getOperand(1) == RAnd->getOperand(1) &&
+                 IsOneHotValue(LAnd->getOperand(0)) &&
+                 IsOneHotValue(RAnd->getOperand(0))) {
+        Mask = Builder->CreateOr(LAnd->getOperand(0), RAnd->getOperand(0));
+        Masked = Builder->CreateAnd(LAnd->getOperand(1), Mask);
+      }
+
+      if (Masked)
+        return Builder->CreateICmp(ICmpInst::ICMP_NE, Masked, Mask);
+    }
+  }
+
   // (icmp1 A, B) | (icmp2 A, B) --> (icmp3 A, B)
   if (PredicatesFoldable(LHSCC, RHSCC)) {
     if (LHS->getOperand(0) == RHS->getOperand(1) &&
@@ -1474,13 +1613,10 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) {

   // handle (roughly):
   // (icmp ne (A & B), C) | (icmp ne (A & D), E)
-  if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, ICmpInst::ICMP_NE, Builder))
+  if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, false, Builder))
     return V;

   Value *Val = LHS->getOperand(0), *Val2 = RHS->getOperand(0);
-  ConstantInt *LHSCst = dyn_cast<ConstantInt>(LHS->getOperand(1));
-  ConstantInt *RHSCst = dyn_cast<ConstantInt>(RHS->getOperand(1));
-
   if (LHS->hasOneUse() || RHS->hasOneUse()) {
     // (icmp eq B, 0) | (icmp ult A, B) -> (icmp ule A, B-1)
     // (icmp eq B, 0) | (icmp ugt B, A) -> (icmp ule A, B-1)
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp
b/lib/Transforms/InstCombine/InstCombineCalls.cpp index 9f74fd6..0cd7b14 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -999,20 +999,15 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { // Check to see if we are changing the return type... if (OldRetTy != NewRetTy) { - if (Callee->isDeclaration() && - // Conversion is ok if changing from one pointer type to another or from - // a pointer to an integer of the same size. - !((OldRetTy->isPointerTy() || !TD || - OldRetTy == TD->getIntPtrType(Caller->getContext())) && - (NewRetTy->isPointerTy() || !TD || - NewRetTy == TD->getIntPtrType(Caller->getContext())))) - return false; // Cannot transform this return value. + if (!CastInst::isBitCastable(NewRetTy, OldRetTy)) { + if (Callee->isDeclaration()) + return false; // Cannot transform this return value. - if (!Caller->use_empty() && - // void -> non-void is handled specially - !NewRetTy->isVoidTy() && - !CastInst::isBitCastable(NewRetTy, OldRetTy)) + if (!Caller->use_empty() && + // void -> non-void is handled specially + !NewRetTy->isVoidTy()) return false; // Cannot transform this return value. + } if (!CallerPAL.isEmpty() && !Caller->use_empty()) { AttrBuilder RAttrs(CallerPAL, AttributeSet::ReturnIndex); @@ -1045,9 +1040,8 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { Type *ParamTy = FT->getParamType(i); Type *ActTy = (*AI)->getType(); - if (!CastInst::isBitCastable(ActTy, ParamTy)) { + if (!CastInst::isBitCastable(ActTy, ParamTy)) return false; // Cannot transform this parameter value. - } if (AttrBuilder(CallerPAL.getParamAttributes(i + 1), i + 1). hasAttributes(AttributeFuncs:: @@ -1063,21 +1057,11 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { if (ParamPTy == 0 || !ParamPTy->getElementType()->isSized() || TD == 0) return false; - Type *CurElTy = cast<PointerType>(ActTy)->getElementType(); + Type *CurElTy = ActTy->getPointerElementType(); if (TD->getTypeAllocSize(CurElTy) != TD->getTypeAllocSize(ParamPTy->getElementType())) return false; } - - // Converting from one pointer type to another or between a pointer and an - // integer of the same size is safe even if we do not have a body. 
- bool isConvertible = ActTy == ParamTy || - (TD && ((ParamTy->isPointerTy() || - ParamTy == TD->getIntPtrType(Caller->getContext())) && - (ActTy->isPointerTy() || - ActTy == TD->getIntPtrType(Caller->getContext())))); - if (Callee->isDeclaration() && !isConvertible) - return false; } if (Callee->isDeclaration()) { diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp index 361acdd..72377dc 100644 --- a/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -1229,6 +1229,19 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { } } + // (fptrunc (select cond, R1, Cst)) --> + // (select cond, (fptrunc R1), (fptrunc Cst)) + SelectInst *SI = dyn_cast<SelectInst>(CI.getOperand(0)); + if (SI && + (isa<ConstantFP>(SI->getOperand(1)) || + isa<ConstantFP>(SI->getOperand(2)))) { + Value *LHSTrunc = Builder->CreateFPTrunc(SI->getOperand(1), + CI.getType()); + Value *RHSTrunc = Builder->CreateFPTrunc(SI->getOperand(2), + CI.getType()); + return SelectInst::Create(SI->getOperand(0), LHSTrunc, RHSTrunc); + } + IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI.getOperand(0)); if (II) { switch (II->getIntrinsicID()) { @@ -1249,9 +1262,14 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { } // Fold (fptrunc (sqrt (fpext x))) -> (sqrtf x) + // Note that we restrict this transformation based on + // TLI->has(LibFunc::sqrtf), even for the sqrt intrinsic, because + // TLI->has(LibFunc::sqrtf) is sufficient to guarantee that the + // single-precision intrinsic can be expanded in the backend. CallInst *Call = dyn_cast<CallInst>(CI.getOperand(0)); if (Call && Call->getCalledFunction() && TLI->has(LibFunc::sqrtf) && - Call->getCalledFunction()->getName() == TLI->getName(LibFunc::sqrt) && + (Call->getCalledFunction()->getName() == TLI->getName(LibFunc::sqrt) || + Call->getCalledFunction()->getIntrinsicID() == Intrinsic::sqrt) && Call->getNumArgOperands() == 1 && Call->hasOneUse()) { CastInst *Arg = dyn_cast<CastInst>(Call->getArgOperand(0)); @@ -1262,11 +1280,11 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { Arg->getOperand(0)->getType()->isFloatTy()) { Function *Callee = Call->getCalledFunction(); Module *M = CI.getParent()->getParent()->getParent(); - Constant *SqrtfFunc = M->getOrInsertFunction("sqrtf", - Callee->getAttributes(), - Builder->getFloatTy(), - Builder->getFloatTy(), - NULL); + Constant *SqrtfFunc = (Callee->getIntrinsicID() == Intrinsic::sqrt) ? + Intrinsic::getDeclaration(M, Intrinsic::sqrt, Builder->getFloatTy()) : + M->getOrInsertFunction("sqrtf", Callee->getAttributes(), + Builder->getFloatTy(), Builder->getFloatTy(), + NULL); CallInst *ret = CallInst::Create(SqrtfFunc, Arg->getOperand(0), "sqrtfcall"); ret->setAttributes(Callee->getAttributes()); @@ -1338,14 +1356,18 @@ Instruction *InstCombiner::visitIntToPtr(IntToPtrInst &CI) { // If the source integer type is not the intptr_t type for this target, do a // trunc or zext to the intptr_t type, then inttoptr of it. This allows the // cast to be exposed to other transforms. - if (TD && CI.getOperand(0)->getType()->getScalarSizeInBits() != - TD->getPointerSizeInBits()) { - Type *Ty = TD->getIntPtrType(CI.getContext()); - if (CI.getType()->isVectorTy()) // Handle vectors of pointers. 
-    Ty = VectorType::get(Ty, CI.getType()->getVectorNumElements());
-
-    Value *P = Builder->CreateZExtOrTrunc(CI.getOperand(0), Ty);
-    return new IntToPtrInst(P, CI.getType());
+
+  if (TD) {
+    unsigned AS = CI.getAddressSpace();
+    if (CI.getOperand(0)->getType()->getScalarSizeInBits() !=
+        TD->getPointerSizeInBits(AS)) {
+      Type *Ty = TD->getIntPtrType(CI.getContext(), AS);
+      if (CI.getType()->isVectorTy()) // Handle vectors of pointers.
+        Ty = VectorType::get(Ty, CI.getType()->getVectorNumElements());
+
+      Value *P = Builder->CreateZExtOrTrunc(CI.getOperand(0), Ty);
+      return new IntToPtrInst(P, CI.getType());
+    }
   }

   if (Instruction *I = commonCastTransforms(CI))
@@ -1370,25 +1392,32 @@ Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) {
     return &CI;
   }

+  if (!TD)
+    return commonCastTransforms(CI);
+
   // If the GEP has a single use, and the base pointer is a bitcast, and the
   // GEP computes a constant offset, see if we can convert these three
   // instructions into fewer.  This typically happens with unions and other
   // non-type-safe code.
-  APInt Offset(TD ? TD->getPointerSizeInBits() : 1, 0);
-  if (TD && GEP->hasOneUse() && isa<BitCastInst>(GEP->getOperand(0)) &&
+  unsigned AS = GEP->getPointerAddressSpace();
+  unsigned OffsetBits = TD->getPointerSizeInBits(AS);
+  APInt Offset(OffsetBits, 0);
+  BitCastInst *BCI = dyn_cast<BitCastInst>(GEP->getOperand(0));
+  if (GEP->hasOneUse() &&
+      BCI &&
       GEP->accumulateConstantOffset(*TD, Offset)) {
     // Get the base pointer input of the bitcast.
-    Value *OrigBase = cast<BitCastInst>(GEP->getOperand(0))->getOperand(0);
-    Type *GEPIdxTy =
-      cast<PointerType>(OrigBase->getType())->getElementType();
+    Value *OrigBase = BCI->getOperand(0);
     SmallVector<Value*, 8> NewIndices;
-    if (FindElementAtOffset(GEPIdxTy, Offset.getSExtValue(), NewIndices)) {
+    if (FindElementAtOffset(OrigBase->getType(),
+                            Offset.getSExtValue(),
+                            NewIndices)) {
       // If we were able to index down into an element, create the GEP
       // and bitcast the result.  This eliminates one bitcast, potentially
       // two.
       Value *NGEP = cast<GEPOperator>(GEP)->isInBounds() ?
-        Builder->CreateInBoundsGEP(OrigBase, NewIndices) :
-        Builder->CreateGEP(OrigBase, NewIndices);
+                    Builder->CreateInBoundsGEP(OrigBase, NewIndices) :
+                    Builder->CreateGEP(OrigBase, NewIndices);
       NGEP->takeName(GEP);

       if (isa<BitCastInst>(CI))
@@ -1406,16 +1435,22 @@ Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) {
   // If the destination integer type is not the intptr_t type for this target,
   // do a ptrtoint to intptr_t then do a trunc or zext.  This allows the cast
   // to be exposed to other transforms.
-  if (TD && CI.getType()->getScalarSizeInBits() != TD->getPointerSizeInBits()) {
-    Type *Ty = TD->getIntPtrType(CI.getContext());
-    if (CI.getType()->isVectorTy()) // Handle vectors of pointers.
-      Ty = VectorType::get(Ty, CI.getType()->getVectorNumElements());
-    Value *P = Builder->CreatePtrToInt(CI.getOperand(0), Ty);
-    return CastInst::CreateIntegerCast(P, CI.getType(), /*isSigned=*/false);
-  }
+  if (!TD)
+    return commonPointerCastTransforms(CI);
+
+  Type *Ty = CI.getType();
+  unsigned AS = CI.getPointerAddressSpace();
+
+  if (Ty->getScalarSizeInBits() == TD->getPointerSizeInBits(AS))
+    return commonPointerCastTransforms(CI);

-  return commonPointerCastTransforms(CI);
+  Type *PtrTy = TD->getIntPtrType(CI.getContext(), AS);
+  if (Ty->isVectorTy()) // Handle vectors of pointers.
+ PtrTy = VectorType::get(PtrTy, Ty->getVectorNumElements()); + + Value *P = Builder->CreatePtrToInt(CI.getOperand(0), PtrTy); + return CastInst::CreateIntegerCast(P, Ty, /*isSigned=*/false); } /// OptimizeVectorResize - This input value (which is known to have vector type) @@ -1488,12 +1523,17 @@ static unsigned getTypeSizeIndex(unsigned Value, Type *Ty) { /// insertions into the vector. See the example in the comment for /// OptimizeIntegerToVectorInsertions for the pattern this handles. /// The type of V is always a non-zero multiple of VecEltTy's size. +/// Shift is the number of bits between the lsb of V and the lsb of +/// the vector. /// /// This returns false if the pattern can't be matched or true if it can, /// filling in Elements with the elements found here. -static bool CollectInsertionElements(Value *V, unsigned ElementIndex, +static bool CollectInsertionElements(Value *V, unsigned Shift, SmallVectorImpl<Value*> &Elements, - Type *VecEltTy) { + Type *VecEltTy, InstCombiner &IC) { + assert(isMultipleOfTypeSize(Shift, VecEltTy) && + "Shift should be a multiple of the element type size"); + // Undef values never contribute useful bits to the result. if (isa<UndefValue>(V)) return true; @@ -1505,8 +1545,12 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex, if (C->isNullValue()) return true; + unsigned ElementIndex = getTypeSizeIndex(Shift, VecEltTy); + if (IC.getDataLayout()->isBigEndian()) + ElementIndex = Elements.size() - ElementIndex - 1; + // Fail if multiple elements are inserted into this slot. - if (ElementIndex >= Elements.size() || Elements[ElementIndex] != 0) + if (Elements[ElementIndex] != 0) return false; Elements[ElementIndex] = V; @@ -1522,7 +1566,7 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex, // it to the right type so it gets properly inserted. if (NumElts == 1) return CollectInsertionElements(ConstantExpr::getBitCast(C, VecEltTy), - ElementIndex, Elements, VecEltTy); + Shift, Elements, VecEltTy, IC); // Okay, this is a constant that covers multiple elements. Slice it up into // pieces and insert each element-sized piece into the vector. @@ -1533,10 +1577,11 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex, Type *ElementIntTy = IntegerType::get(C->getContext(), ElementSize); for (unsigned i = 0; i != NumElts; ++i) { + unsigned ShiftI = Shift+i*ElementSize; Constant *Piece = ConstantExpr::getLShr(C, ConstantInt::get(C->getType(), - i*ElementSize)); + ShiftI)); Piece = ConstantExpr::getTrunc(Piece, ElementIntTy); - if (!CollectInsertionElements(Piece, ElementIndex+i, Elements, VecEltTy)) + if (!CollectInsertionElements(Piece, ShiftI, Elements, VecEltTy, IC)) return false; } return true; @@ -1549,29 +1594,28 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex, switch (I->getOpcode()) { default: return false; // Unhandled case. 
  case Instruction::BitCast:
-    return CollectInsertionElements(I->getOperand(0), ElementIndex,
-                                    Elements, VecEltTy);
+    return CollectInsertionElements(I->getOperand(0), Shift,
+                                    Elements, VecEltTy, IC);
  case Instruction::ZExt:
    if (!isMultipleOfTypeSize(
                          I->getOperand(0)->getType()->getPrimitiveSizeInBits(),
                          VecEltTy))
      return false;
-    return CollectInsertionElements(I->getOperand(0), ElementIndex,
-                                    Elements, VecEltTy);
+    return CollectInsertionElements(I->getOperand(0), Shift,
+                                    Elements, VecEltTy, IC);
  case Instruction::Or:
-    return CollectInsertionElements(I->getOperand(0), ElementIndex,
-                                    Elements, VecEltTy) &&
-           CollectInsertionElements(I->getOperand(1), ElementIndex,
-                                    Elements, VecEltTy);
+    return CollectInsertionElements(I->getOperand(0), Shift,
+                                    Elements, VecEltTy, IC) &&
+           CollectInsertionElements(I->getOperand(1), Shift,
+                                    Elements, VecEltTy, IC);
  case Instruction::Shl: {
    // Must be shifting by a constant that is a multiple of the element size.
    ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1));
    if (CI == 0) return false;
-    if (!isMultipleOfTypeSize(CI->getZExtValue(), VecEltTy)) return false;
-    unsigned IndexShift = getTypeSizeIndex(CI->getZExtValue(), VecEltTy);
-
-    return CollectInsertionElements(I->getOperand(0), ElementIndex+IndexShift,
-                                    Elements, VecEltTy);
+    Shift += CI->getZExtValue();
+    if (!isMultipleOfTypeSize(Shift, VecEltTy)) return false;
+    return CollectInsertionElements(I->getOperand(0), Shift,
+                                    Elements, VecEltTy, IC);
  }
@@ -1594,12 +1638,15 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex,
/// Into two insertelements that do "buildvector{%inc, %inc5}".
static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI,
                                                InstCombiner &IC) {
+  // We need to know the target byte order to perform this optimization.
+  if (!IC.getDataLayout()) return 0;
+
  VectorType *DestVecTy = cast<VectorType>(CI.getType());
  Value *IntInput = CI.getOperand(0);

  SmallVector<Value*, 8> Elements(DestVecTy->getNumElements());
  if (!CollectInsertionElements(IntInput, 0, Elements,
-                                DestVecTy->getElementType()))
+                                DestVecTy->getElementType(), IC))
    return 0;

  // If we succeeded, we know that all of the elements are specified by Elements
@@ -1785,10 +1832,9 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
      // Okay, we have (bitcast (shuffle ..)).  Check to see if this is
      // a bitcast to a vector with the same # elts.
      if (SVI->hasOneUse() && DestTy->isVectorTy() &&
-          cast<VectorType>(DestTy)->getNumElements() ==
-          SVI->getType()->getNumElements() &&
+          DestTy->getVectorNumElements() == SVI->getType()->getNumElements() &&
          SVI->getType()->getNumElements() ==
-          cast<VectorType>(SVI->getOperand(0)->getType())->getNumElements()) {
+          SVI->getOperand(0)->getType()->getVectorNumElements()) {
        BitCastInst *Tmp;
        // If either of the operands is a cast from CI.getType(), then
        // evaluating the shuffle in the casted destination's type will allow
@@ -1810,3 +1856,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
    return commonPointerCastTransforms(CI);
  return commonCastTransforms(CI);
}
+
+Instruction *InstCombiner::visitAddrSpaceCast(AddrSpaceCastInst &CI) {
+  return commonCastTransforms(CI);
+}
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index c0225ae..9bb65ef 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -227,7 +227,8 @@ Instruction *InstCombiner::
FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
                             CmpInst &ICI, ConstantInt *AndCst) {
  // We need TD information to know the pointer size unless this is inbounds.
-  if (!GEP->isInBounds() && TD == 0) return 0;
+  if (!GEP->isInBounds() && TD == 0)
+    return 0;

  Constant *Init = GV->getInitializer();
  if (!isa<ConstantArray>(Init) && !isa<ConstantDataArray>(Init))
@@ -393,9 +394,12 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
  // If the index is larger than the pointer size of the target, truncate the
  // index down like the GEP would do implicitly.  We don't have to do this for
  // an inbounds GEP because the index can't be out of range.
-  if (!GEP->isInBounds() &&
-      Idx->getType()->getPrimitiveSizeInBits() > TD->getPointerSizeInBits())
-    Idx = Builder->CreateTrunc(Idx, TD->getIntPtrType(Idx->getContext()));
+  if (!GEP->isInBounds()) {
+    Type *IntPtrTy = TD->getIntPtrType(GEP->getType());
+    unsigned PtrSize = IntPtrTy->getIntegerBitWidth();
+    if (Idx->getType()->getPrimitiveSizeInBits() > PtrSize)
+      Idx = Builder->CreateTrunc(Idx, IntPtrTy);
+  }

  // If the comparison is only true for one or two elements, emit direct
  // comparisons.
@@ -562,16 +566,18 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) {
    }
  }

+
+
  // Okay, we know we have a single variable index, which must be a
  // pointer/array/vector index.  If there is no offset, life is simple, return
  // the index.
-  unsigned IntPtrWidth = TD.getPointerSizeInBits();
+  Type *IntPtrTy = TD.getIntPtrType(GEP->getOperand(0)->getType());
+  unsigned IntPtrWidth = IntPtrTy->getIntegerBitWidth();
  if (Offset == 0) {
    // Cast to intptr_t in case a truncation occurs.  If an extension is
    // needed, we don't need to bother extending: the extension won't affect
    // where the computation crosses zero.
    if (VariableIdx->getType()->getPrimitiveSizeInBits() > IntPtrWidth) {
-      Type *IntPtrTy = TD.getIntPtrType(VariableIdx->getContext());
      VariableIdx = IC.Builder->CreateTrunc(VariableIdx, IntPtrTy);
    }
    return VariableIdx;
@@ -593,7 +599,6 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) {
    return 0;

  // Okay, we can do this evaluation.  Start by converting the index to intptr.
-  Type *IntPtrTy = TD.getIntPtrType(VariableIdx->getContext());
  if (VariableIdx->getType() != IntPtrTy)
    VariableIdx = IC.Builder->CreateIntCast(VariableIdx, IntPtrTy,
                                            true /*Signed*/);
@@ -737,10 +742,9 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
}

/// FoldICmpAddOpCst - Fold "icmp pred (X+CI), X".
-Instruction *InstCombiner::FoldICmpAddOpCst(ICmpInst &ICI,
+Instruction *InstCombiner::FoldICmpAddOpCst(Instruction &ICI,
                                            Value *X, ConstantInt *CI,
-                                            ICmpInst::Predicate Pred,
-                                            Value *TheAdd) {
+                                            ICmpInst::Predicate Pred) {
  // If we have X+0, exit early (simplifying logic below) and let it get folded
  // elsewhere.   icmp X+0, X  -> icmp X, X
  if (CI->isZero()) {
@@ -1194,11 +1198,16 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
          Type *AndTy = AndCST->getType();          // Type of the and.

          // We can fold this as long as we can't shift unknown bits
-          // into the mask.  This can only happen with signed shift
-          // rights, as they sign-extend.
+          // into the mask.  This can happen with signed shift
+          // rights, as they sign-extend.  With logical shifts,
+          // we must still make sure the comparison is not signed
+          // because we are effectively changing the
+          // position of the sign bit (PR17827).
+          // TODO: We can relax these constraints a bit more.
          if (ShAmt) {
-            bool CanFold = Shift->isLogicalShift();
-            if (!CanFold) {
+            bool CanFold = false;
+            unsigned ShiftOpcode = Shift->getOpcode();
+            if (ShiftOpcode == Instruction::AShr) {
              // To test for the bad case of the signed shr, see if any
              // of the bits shifted in could be tested after the mask.
              uint32_t TyBits = Ty->getPrimitiveSizeInBits();
@@ -1208,6 +1217,9 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
              if ((APInt::getHighBitsSet(BitWidth, BitWidth-ShAmtVal) &
                   AndCST->getValue()) == 0)
                CanFold = true;
+            } else if (ShiftOpcode == Instruction::Shl ||
+                       ShiftOpcode == Instruction::LShr) {
+              CanFold = !ICI.isSigned();
            }

            if (CanFold) {
@@ -1781,8 +1793,7 @@ Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) {
  // Turn icmp (ptrtoint x), (ptrtoint/c) into a compare of the input if the
  // integer type is the same size as the pointer type.
  if (TD && LHSCI->getOpcode() == Instruction::PtrToInt &&
-      TD->getPointerSizeInBits() ==
-         cast<IntegerType>(DestTy)->getBitWidth()) {
+      TD->getPointerTypeSizeInBits(SrcTy) == DestTy->getIntegerBitWidth()) {
    Value *RHSOp = 0;
    if (Constant *RHSC = dyn_cast<Constant>(ICI.getOperand(1))) {
      RHSOp = ConstantExpr::getIntToPtr(RHSC, SrcTy);
@@ -2035,14 +2046,59 @@ static APInt DemandedBitsLHSMask(ICmpInst &I,

}

+/// \brief Check if the order of \p Op0 and \p Op1 as operands in an ICmpInst
+/// should be swapped.
+/// The decision is based on how many times these two operands are reused
+/// as subtract operands and their positions in those instructions.
+/// The rationale is that several architectures use the same instruction for
+/// both subtract and cmp, so it is better if the order of those operands
+/// matches.
+/// \return true if Op0 and Op1 should be swapped.
+static bool swapMayExposeCSEOpportunities(const Value * Op0,
+                                          const Value * Op1) {
+  // Filter out pointer values, as those cannot appear directly in a subtract.
+  // FIXME: we may want to go through inttoptrs or bitcasts.
+  if (Op0->getType()->isPointerTy())
+    return false;
+  // Count every use of both Op0 and Op1 in a subtract.
+  // Each time Op0 is the first operand, count -1: swapping is bad, the
+  // subtract already has the same layout as the compare.
+  // Each time Op0 is the second operand, count +1: swapping is good, the
+  // subtract has a different layout from the compare.
+  // At the end, if the benefit is greater than 0, Op0 should come second to
+  // expose more CSE opportunities.
+  int GlobalSwapBenefits = 0;
+  for (Value::const_use_iterator UI = Op0->use_begin(), UIEnd = Op0->use_end(); UI != UIEnd; ++UI) {
+    const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(*UI);
+    if (!BinOp || BinOp->getOpcode() != Instruction::Sub)
+      continue;
+    // If Op0 is the first argument, this is not beneficial to swap the
+    // arguments.
+    int LocalSwapBenefits = -1;
+    unsigned Op1Idx = 1;
+    if (BinOp->getOperand(Op1Idx) == Op0) {
+      Op1Idx = 0;
+      LocalSwapBenefits = 1;
+    }
+    if (BinOp->getOperand(Op1Idx) != Op1)
+      continue;
+    GlobalSwapBenefits += LocalSwapBenefits;
+  }
+  return GlobalSwapBenefits > 0;
+}
+
Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
  bool Changed = false;
  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+  unsigned Op0Cplxity = getComplexity(Op0);
+  unsigned Op1Cplxity = getComplexity(Op1);

  /// Orders the operands of the compare so that they are listed from most
  /// complex to least complex.  This puts constants before unary operators,
  /// before binary operators.
-  if (getComplexity(Op0) < getComplexity(Op1)) {
+  if (Op0Cplxity < Op1Cplxity ||
+        (Op0Cplxity == Op1Cplxity &&
+         swapMayExposeCSEOpportunities(Op0, Op1))) {
    I.swapOperands();
    std::swap(Op0, Op1);
    Changed = true;
@@ -2477,7 +2533,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
      case Instruction::IntToPtr:
        // icmp pred inttoptr(X), null -> icmp pred X, 0
        if (RHSC->isNullValue() && TD &&
-            TD->getIntPtrType(RHSC->getContext()) ==
+            TD->getIntPtrType(RHSC->getType()) ==
               LHSI->getOperand(0)->getType())
          return new ICmpInst(I.getPredicate(), LHSI->getOperand(0),
                        Constant::getNullValue(LHSI->getOperand(0)->getType()));
@@ -2900,6 +2956,24 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
                          Builder->CreateTrunc(B, A->getType()));
  }

+  // (A >> C) == (B >> C) --> (A^B) u< (1 << C)
+  // For lshr and ashr pairs.
+  if ((match(Op0, m_OneUse(m_LShr(m_Value(A), m_ConstantInt(Cst1)))) &&
+       match(Op1, m_OneUse(m_LShr(m_Value(B), m_Specific(Cst1))))) ||
+      (match(Op0, m_OneUse(m_AShr(m_Value(A), m_ConstantInt(Cst1)))) &&
+       match(Op1, m_OneUse(m_AShr(m_Value(B), m_Specific(Cst1)))))) {
+    unsigned TypeBits = Cst1->getBitWidth();
+    unsigned ShAmt = (unsigned)Cst1->getLimitedValue(TypeBits);
+    if (ShAmt < TypeBits && ShAmt != 0) {
+      ICmpInst::Predicate Pred = I.getPredicate() == ICmpInst::ICMP_NE
+                                     ? ICmpInst::ICMP_UGE
+                                     : ICmpInst::ICMP_ULT;
+      Value *Xor = Builder->CreateXor(A, B, I.getName() + ".unshifted");
+      APInt CmpVal = APInt::getOneBitSet(TypeBits, ShAmt);
+      return new ICmpInst(Pred, Xor, Builder->getInt(CmpVal));
+    }
+  }
+
  // Transform "icmp eq (trunc (lshr(X, cst1)), cst" to
  // "icmp (and X, mask), cst"
  uint64_t ShAmt = 0;
@@ -2930,20 +3004,15 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
    Value *X; ConstantInt *Cst;
    // icmp X+Cst, X
    if (match(Op0, m_Add(m_Value(X), m_ConstantInt(Cst))) && Op1 == X)
-      return FoldICmpAddOpCst(I, X, Cst, I.getPredicate(), Op0);
+      return FoldICmpAddOpCst(I, X, Cst, I.getPredicate());

    // icmp X, X+Cst
    if (match(Op1, m_Add(m_Value(X), m_ConstantInt(Cst))) && Op0 == X)
-      return FoldICmpAddOpCst(I, X, Cst, I.getSwappedPredicate(), Op1);
+      return FoldICmpAddOpCst(I, X, Cst, I.getSwappedPredicate());
  }
  return Changed ? &I : 0;
}

-
-
-
-
-
/// FoldFCmp_IntToFP_Cst - Fold  fcmp ([us]itofp x, cst) if possible.
///
Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,
diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index e2d7966..4c861b3 100644
--- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -154,7 +154,7 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
  // Ensure that the alloca array size argument has type intptr_t, so that
  // any casting is exposed early.
  if (TD) {
-    Type *IntPtrTy = TD->getIntPtrType(AI.getContext());
+    Type *IntPtrTy = TD->getIntPtrType(AI.getType());
    if (AI.getArraySize()->getType() != IntPtrTy) {
      Value *V = Builder->CreateIntCast(AI.getArraySize(),
                                        IntPtrTy, false);
@@ -180,12 +180,13 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
      // Now that I is pointing to the first non-allocation-inst in the block,
      // insert our getelementptr instruction...
      //
-      Value *NullIdx =Constant::getNullValue(Type::getInt32Ty(AI.getContext()));
-      Value *Idx[2];
-      Idx[0] = NullIdx;
-      Idx[1] = NullIdx;
+      Type *IdxTy = TD
+                    ? TD->getIntPtrType(AI.getType())
+                    : Type::getInt64Ty(AI.getContext());
+      Value *NullIdx = Constant::getNullValue(IdxTy);
+      Value *Idx[2] = { NullIdx, NullIdx };
      Instruction *GEP =
-        GetElementPtrInst::CreateInBounds(New, Idx, New->getName()+".sub");
+        GetElementPtrInst::CreateInBounds(New, Idx, New->getName() + ".sub");
      InsertNewInstBefore(GEP, *It);

      // Now make everything use the getelementptr instead of the original
@@ -262,9 +263,9 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
        for (unsigned i = 0, e = ToDelete.size(); i != e; ++i)
          EraseInstFromFunction(*ToDelete[i]);
        Constant *TheSrc = cast<Constant>(Copy->getSource());
-        Instruction *NewI
-          = ReplaceInstUsesWith(AI, ConstantExpr::getBitCast(TheSrc,
-                                                             AI.getType()));
+        Constant *Cast
+          = ConstantExpr::getPointerBitCastOrAddrSpaceCast(TheSrc, AI.getType());
+        Instruction *NewI = ReplaceInstUsesWith(AI, Cast);
        EraseInstFromFunction(*Copy);
        ++NumGlobalCopies;
        return NewI;
@@ -302,9 +303,11 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI,
    if (ArrayType *ASrcTy = dyn_cast<ArrayType>(SrcPTy))
      if (Constant *CSrc = dyn_cast<Constant>(CastOp))
        if (ASrcTy->getNumElements() != 0) {
-          Value *Idxs[2];
-          Idxs[0] = Constant::getNullValue(Type::getInt32Ty(LI.getContext()));
-          Idxs[1] = Idxs[0];
+          Type *IdxTy = TD
+                        ? TD->getIntPtrType(SrcTy)
+                        : Type::getInt64Ty(SrcTy->getContext());
+          Value *Idx = Constant::getNullValue(IdxTy);
+          Value *Idxs[2] = { Idx, Idx };
          CastOp = ConstantExpr::getGetElementPtr(CSrc, Idxs);
          SrcTy = cast<PointerType>(CastOp->getType());
          SrcPTy = SrcTy->getElementType();
@@ -315,7 +318,8 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI,
          SrcPTy->isVectorTy()) &&
        // Do not allow turning this into a load of an integer, which is then
        // cast to a pointer; this pessimizes pointer analysis a lot.
- (SrcPTy->isPointerTy() == LI.getType()->isPointerTy()) && + (SrcPTy->isPtrOrPtrVectorTy() == + LI.getType()->isPtrOrPtrVectorTy()) && IC.getDataLayout()->getTypeSizeInBits(SrcPTy) == IC.getDataLayout()->getTypeSizeInBits(DestPTy)) { diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index cc6a301..a759548 100644 --- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -374,9 +374,12 @@ Value *InstCombiner::foldFMulConst(Instruction *FMulOrDiv, ConstantFP *C, } else { if (C0) { // (C0 / X) * C => (C0 * C) / X - ConstantFP *F = cast<ConstantFP>(ConstantExpr::getFMul(C0, C)); - if (isNormalFp(F)) - R = BinaryOperator::CreateFDiv(F, Opnd1); + if (FMulOrDiv->hasOneUse()) { + // It would otherwise introduce another div. + ConstantFP *F = cast<ConstantFP>(ConstantExpr::getFMul(C0, C)); + if (isNormalFp(F)) + R = BinaryOperator::CreateFDiv(F, Opnd1); + } } else { // (X / C1) * C => X * (C/C1) if C/C1 is not a denormal ConstantFP *F = cast<ConstantFP>(ConstantExpr::getFDiv(C, C1)); @@ -460,10 +463,9 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { if (Swap && FAddSub->getOpcode() == Instruction::FSub) std::swap(M0, M1); - Value *R = (FAddSub->getOpcode() == Instruction::FAdd) ? - BinaryOperator::CreateFAdd(M0, M1) : - BinaryOperator::CreateFSub(M0, M1); - Instruction *RI = cast<Instruction>(R); + Instruction *RI = (FAddSub->getOpcode() == Instruction::FAdd) + ? BinaryOperator::CreateFAdd(M0, M1) + : BinaryOperator::CreateFSub(M0, M1); RI->copyFastMathFlags(&I); return RI; } @@ -490,13 +492,13 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { } // if pattern detected emit alternate sequence if (OpX && OpY) { + BuilderTy::FastMathFlagGuard Guard(*Builder); + Builder->SetFastMathFlags(Log2->getFastMathFlags()); Log2->setArgOperand(0, OpY); Value *FMulVal = Builder->CreateFMul(OpX, Log2); - Instruction *FMul = cast<Instruction>(FMulVal); - FMul->copyFastMathFlags(Log2); - Instruction *FSub = BinaryOperator::CreateFSub(FMulVal, OpX); - FSub->copyFastMathFlags(Log2); - return FSub; + Value *FSub = Builder->CreateFSub(FMulVal, OpX); + FSub->takeName(&I); + return ReplaceInstUsesWith(I, FSub); } } @@ -506,6 +508,9 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { for (int i = 0; i < 2; i++) { bool IgnoreZeroSign = I.hasNoSignedZeros(); if (BinaryOperator::isFNeg(Opnd0, IgnoreZeroSign)) { + BuilderTy::FastMathFlagGuard Guard(*Builder); + Builder->SetFastMathFlags(I.getFastMathFlags()); + Value *N0 = dyn_castFNegVal(Opnd0, IgnoreZeroSign); Value *N1 = dyn_castFNegVal(Opnd1, IgnoreZeroSign); @@ -516,13 +521,9 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { if (Opnd0->hasOneUse()) { // -X * Y => -(X*Y) (Promote negation as high as possible) Value *T = Builder->CreateFMul(N0, Opnd1); - cast<Instruction>(T)->setDebugLoc(I.getDebugLoc()); - Instruction *Neg = BinaryOperator::CreateFNeg(T); - if (I.getFastMathFlags().any()) { - cast<Instruction>(T)->copyFastMathFlags(&I); - Neg->copyFastMathFlags(&I); - } - return Neg; + Value *Neg = Builder->CreateFNeg(T); + Neg->takeName(&I); + return ReplaceInstUsesWith(I, Neg); } } @@ -545,13 +546,13 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { Y = Opnd0_0; if (Y) { - Instruction *T = cast<Instruction>(Builder->CreateFMul(Opnd1, Opnd1)); - T->copyFastMathFlags(&I); - T->setDebugLoc(I.getDebugLoc()); + BuilderTy::FastMathFlagGuard Guard(*Builder); + 
Builder->SetFastMathFlags(I.getFastMathFlags()); + Value *T = Builder->CreateFMul(Opnd1, Opnd1); - Instruction *R = BinaryOperator::CreateFMul(T, Y); - R->copyFastMathFlags(&I); - return R; + Value *R = Builder->CreateFMul(T, Y); + R->takeName(&I); + return ReplaceInstUsesWith(I, R); } } } diff --git a/lib/Transforms/InstCombine/InstCombinePHI.cpp b/lib/Transforms/InstCombine/InstCombinePHI.cpp index bd14e81..4c6d0c4 100644 --- a/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -604,8 +604,6 @@ namespace llvm { LHS.Width == RHS.Width; } }; - template <> - struct isPodLike<LoweredPHIRecord> { static const bool value = true; }; } @@ -688,10 +686,10 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) { // extracted out of it. First, sort the users by their offset and size. array_pod_sort(PHIUsers.begin(), PHIUsers.end()); - DEBUG(errs() << "SLICING UP PHI: " << FirstPhi << '\n'; - for (unsigned i = 1, e = PHIsToSlice.size(); i != e; ++i) - errs() << "AND USER PHI #" << i << ": " << *PHIsToSlice[i] <<'\n'; - ); + DEBUG(dbgs() << "SLICING UP PHI: " << FirstPhi << '\n'; + for (unsigned i = 1, e = PHIsToSlice.size(); i != e; ++i) + dbgs() << "AND USER PHI #" << i << ": " << *PHIsToSlice[i] << '\n'; + ); // PredValues - This is a temporary used when rewriting PHI nodes. It is // hoisted out here to avoid construction/destruction thrashing. @@ -772,7 +770,7 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) { } PredValues.clear(); - DEBUG(errs() << " Made element PHI for offset " << Offset << ": " + DEBUG(dbgs() << " Made element PHI for offset " << Offset << ": " << *EltPHI << '\n'); ExtractedVals[LoweredPHIRecord(PN, Offset, Ty)] = EltPHI; } @@ -792,7 +790,7 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) { // PHINode simplification // Instruction *InstCombiner::visitPHINode(PHINode &PN) { - if (Value *V = SimplifyInstruction(&PN, TD)) + if (Value *V = SimplifyInstruction(&PN, TD, TLI)) return ReplaceInstUsesWith(PN, V); // If all PHI operands are the same operation, pull them through the PHI, diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp index 7581dbe..283bec2 100644 --- a/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -367,7 +367,7 @@ static Value *foldSelectICmpAndOr(const SelectInst &SI, Value *TrueVal, Value *FalseVal, InstCombiner::BuilderTy *Builder) { const ICmpInst *IC = dyn_cast<ICmpInst>(SI.getCondition()); - if (!IC || !IC->isEquality()) + if (!IC || !IC->isEquality() || !SI.getType()->isIntegerTy()) return 0; Value *CmpLHS = IC->getOperand(0); diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index a7bfe09..c831ddd 100644 --- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -808,7 +808,6 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // TODO: Could compute known zero/one bits based on the input. 
      break;
    }
-  case Intrinsic::x86_sse42_crc32_64_8:
  case Intrinsic::x86_sse42_crc32_64_64:
    KnownZero = APInt::getHighBitsSet(64, 32);
    return 0;
@@ -845,21 +844,26 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
Value *InstCombiner::SimplifyShrShlDemandedBits(Instruction *Shr,
  Instruction *Shl, APInt DemandedMask, APInt &KnownZero, APInt &KnownOne) {

-  unsigned ShlAmt = cast<ConstantInt>(Shl->getOperand(1))->getZExtValue();
-  unsigned ShrAmt = cast<ConstantInt>(Shr->getOperand(1))->getZExtValue();
+  const APInt &ShlOp1 = cast<ConstantInt>(Shl->getOperand(1))->getValue();
+  const APInt &ShrOp1 = cast<ConstantInt>(Shr->getOperand(1))->getValue();
+  if (!ShlOp1 || !ShrOp1)
+    return 0; // Noop.
+
+  Value *VarX = Shr->getOperand(0);
+  Type *Ty = VarX->getType();
+  unsigned BitWidth = Ty->getIntegerBitWidth();
+  if (ShlOp1.uge(BitWidth) || ShrOp1.uge(BitWidth))
+    return 0; // Undef.
+
+  unsigned ShlAmt = ShlOp1.getZExtValue();
+  unsigned ShrAmt = ShrOp1.getZExtValue();

  KnownOne.clearAllBits();
  KnownZero = APInt::getBitsSet(KnownZero.getBitWidth(), 0, ShlAmt-1);
  KnownZero &= DemandedMask;

-  if (ShlAmt == 0 || ShrAmt == 0)
-    return 0;
-
-  Value *VarX = Shr->getOperand(0);
-  Type *Ty = VarX->getType();
-
-  APInt BitMask1(APInt::getAllOnesValue(Ty->getIntegerBitWidth()));
-  APInt BitMask2(APInt::getAllOnesValue(Ty->getIntegerBitWidth()));
+  APInt BitMask1(APInt::getAllOnesValue(BitWidth));
+  APInt BitMask2(APInt::getAllOnesValue(BitWidth));

  bool isLshr = (Shr->getOpcode() == Instruction::LShr);
  BitMask1 = isLshr ? (BitMask1.lshr(ShrAmt) << ShlAmt) :
diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index f3de6e2..1e72410 100644
--- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -106,8 +106,8 @@ static Value *FindScalarElement(Value *V, unsigned EltNo) {
}

// If we have a PHI node with a vector type that has only 2 uses: feed
-// itself and be an operand of extractelemnt at a constant location,
-// try to replace the PHI of the vector type with a PHI of a scalar type
+// itself and be an operand of extractelement at a constant location,
+// try to replace the PHI of the vector type with a PHI of a scalar type.
Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) {
  // Verify that the PHI node has exactly 2 uses. Otherwise return NULL.
  if (!PN->hasNUses(2))
@@ -282,6 +282,38 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
      Worklist.AddValue(EE);
      return CastInst::Create(CI->getOpcode(), EE, EI.getType());
    }
+  } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
+    if (SI->hasOneUse()) {
+      // TODO: For a select on vectors, it might be useful to do this if it
+      // has multiple extractelement uses. For vector select, that seems to
+      // fight the vectorizer.
+
+      // If we are extracting an element from a vector select or a select on
+      // vectors, create a select on the scalars extracted from the vector
+      // arguments.
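The rewrite performed by the lines that follow depends on extraction commuting with selection. A scalar analogue of ours, with plain arrays standing in for IR vectors (not part of the patch):

    #include <cassert>

    int main() {
      int V1[4] = {1, 2, 3, 4};
      int V2[4] = {5, 6, 7, 8};
      // extractelement (select C, V1, V2), Idx
      //   == select C, (extractelement V1, Idx), (extractelement V2, Idx)
      for (int Cond = 0; Cond <= 1; ++Cond)
        for (unsigned Idx = 0; Idx != 4; ++Idx) {
          int SelectThenExtract = (Cond ? V1 : V2)[Idx];
          int ExtractThenSelect = Cond ? V1[Idx] : V2[Idx];
          assert(SelectThenExtract == ExtractThenSelect);
        }
      return 0;
    }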
+ Value *TrueVal = SI->getTrueValue(); + Value *FalseVal = SI->getFalseValue(); + + Value *Cond = SI->getCondition(); + if (Cond->getType()->isVectorTy()) { + Cond = Builder->CreateExtractElement(Cond, + EI.getIndexOperand(), + Cond->getName() + ".elt"); + } + + Value *V1Elem + = Builder->CreateExtractElement(TrueVal, + EI.getIndexOperand(), + TrueVal->getName() + ".elt"); + + Value *V2Elem + = Builder->CreateExtractElement(FalseVal, + EI.getIndexOperand(), + FalseVal->getName() + ".elt"); + return SelectInst::Create(Cond, + V1Elem, + V2Elem, + SI->getName() + ".elt"); + } } } return 0; @@ -294,7 +326,7 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, SmallVectorImpl<Constant*> &Mask) { assert(V->getType() == LHS->getType() && V->getType() == RHS->getType() && "Invalid CollectSingleShuffleElements"); - unsigned NumElts = cast<VectorType>(V->getType())->getNumElements(); + unsigned NumElts = V->getType()->getVectorNumElements(); if (isa<UndefValue>(V)) { Mask.assign(NumElts, UndefValue::get(Type::getInt32Ty(V->getContext()))); diff --git a/lib/Transforms/InstCombine/InstCombineWorklist.h b/lib/Transforms/InstCombine/InstCombineWorklist.h index 19959c0..f84db27 100644 --- a/lib/Transforms/InstCombine/InstCombineWorklist.h +++ b/lib/Transforms/InstCombine/InstCombineWorklist.h @@ -37,7 +37,7 @@ public: /// in it. void Add(Instruction *I) { if (WorklistMap.insert(std::make_pair(I, Worklist.size())).second) { - DEBUG(errs() << "IC: ADD: " << *I << '\n'); + DEBUG(dbgs() << "IC: ADD: " << *I << '\n'); Worklist.push_back(I); } } @@ -54,7 +54,7 @@ public: assert(Worklist.empty() && "Worklist must be empty to add initial group"); Worklist.reserve(NumEntries+16); WorklistMap.resize(NumEntries); - DEBUG(errs() << "IC: ADDING: " << NumEntries << " instrs to worklist\n"); + DEBUG(dbgs() << "IC: ADDING: " << NumEntries << " instrs to worklist\n"); for (unsigned Idx = 0; NumEntries; --NumEntries) { Instruction *I = List[NumEntries-1]; WorklistMap.insert(std::make_pair(I, Idx++)); @@ -74,8 +74,7 @@ public: } Instruction *RemoveOne() { - Instruction *I = Worklist.back(); - Worklist.pop_back(); + Instruction *I = Worklist.pop_back_val(); WorklistMap.erase(I); return I; } diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index b34ae21..191a101 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -699,7 +699,10 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { Value *TrueVInPred = TrueV->DoPHITranslation(PhiTransBB, ThisBB); Value *FalseVInPred = FalseV->DoPHITranslation(PhiTransBB, ThisBB); Value *InV = 0; - if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i))) + // Beware of ConstantExpr: it may eventually evaluate to getNullValue, + // even if currently isNullValue gives false. + Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i)); + if (InC && !isa<ConstantExpr>(InC)) InV = InC->isNullValue() ? FalseVInPred : TrueVInPred; else InV = Builder->CreateSelect(PN->getIncomingValue(i), @@ -755,19 +758,25 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { return ReplaceInstUsesWith(I, NewPN); } -/// FindElementAtOffset - Given a type and a constant offset, determine whether -/// or not there is a sequence of GEP indices into the type that will land us at -/// the specified offset. If so, fill them into NewIndices and return the -/// resultant element type, otherwise return null. 
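As context for the rewritten FindElementAtOffset below: the routine turns a byte offset into a chain of GEP indices over the pointed-to type. A rough standalone analogue of ours, assuming a typical layout with 4-byte ints and no padding in these structs (not part of the patch):

    #include <cassert>
    #include <cstddef>

    struct Inner { int A; int B; };    // B at byte offset 4
    struct Outer { Inner Elems[2]; };  // Elems[1].B at byte offset 12

    int main() {
      // Byte offset 12 into an Outer* decomposes into the GEP-style index
      // chain {0, 1, 1}: the pointee itself, array element 1, then field B.
      size_t Off = 0 * sizeof(Outer)   // leading GEP index 0 over the pointee
                 + 1 * sizeof(Inner)   // index 1 into the Elems array
                 + offsetof(Inner, B); // index 1 into Inner (field B)
      assert(Off == 12);
      return 0;
    }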
-Type *InstCombiner::FindElementAtOffset(Type *Ty, int64_t Offset, - SmallVectorImpl<Value*> &NewIndices) { - if (!TD) return 0; - if (!Ty->isSized()) return 0; +/// FindElementAtOffset - Given a pointer type and a constant offset, determine +/// whether or not there is a sequence of GEP indices into the pointed type that +/// will land us at the specified offset. If so, fill them into NewIndices and +/// return the resultant element type, otherwise return null. +Type *InstCombiner::FindElementAtOffset(Type *PtrTy, int64_t Offset, + SmallVectorImpl<Value*> &NewIndices) { + assert(PtrTy->isPtrOrPtrVectorTy()); + + if (!TD) + return 0; + + Type *Ty = PtrTy->getPointerElementType(); + if (!Ty->isSized()) + return 0; // Start with the index over the outer type. Note that the type size // might be zero (even if the offset isn't zero) if the indexed type // is something like [0 x {int, int}] - Type *IntPtrTy = TD->getIntPtrType(Ty->getContext()); + Type *IntPtrTy = TD->getIntPtrType(PtrTy); int64_t FirstIdx = 0; if (int64_t TySize = TD->getTypeAllocSize(Ty)) { FirstIdx = Offset/TySize; @@ -1176,6 +1185,22 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { GetElementPtrInst::Create(Src->getOperand(0), Indices, GEP.getName()); } + // Canonicalize (gep i8* X, -(ptrtoint Y)) to (sub (ptrtoint X), (ptrtoint Y)) + // The GEP pattern is emitted by the SCEV expander for certain kinds of + // pointer arithmetic. + if (TD && GEP.getNumIndices() == 1 && + match(GEP.getOperand(1), m_Neg(m_PtrToInt(m_Value())))) { + unsigned AS = GEP.getPointerAddressSpace(); + if (GEP.getType() == Builder->getInt8PtrTy(AS) && + GEP.getOperand(1)->getType()->getScalarSizeInBits() == + TD->getPointerSizeInBits(AS)) { + Operator *Index = cast<Operator>(GEP.getOperand(1)); + Value *PtrToInt = Builder->CreatePtrToInt(PtrOp, Index->getType()); + Value *NewSub = Builder->CreateSub(PtrToInt, Index->getOperand(1)); + return CastInst::Create(Instruction::IntToPtr, NewSub, GEP.getType()); + } + } + // Handle gep(bitcast x) and gep(gep x, 0, 0, 0). Value *StrippedPtr = PtrOp->stripPointerCasts(); PointerType *StrippedPtrTy = dyn_cast<PointerType>(StrippedPtr->getType()); @@ -1231,13 +1256,12 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // %t = getelementptr i32* bitcast ([2 x i32]* %str to i32*), i32 %V // into: %t1 = getelementptr [2 x i32]* %str, i32 0, i32 %V; bitcast Type *SrcElTy = StrippedPtrTy->getElementType(); - Type *ResElTy=cast<PointerType>(PtrOp->getType())->getElementType(); + Type *ResElTy = PtrOp->getType()->getPointerElementType(); if (TD && SrcElTy->isArrayTy() && - TD->getTypeAllocSize(cast<ArrayType>(SrcElTy)->getElementType()) == + TD->getTypeAllocSize(SrcElTy->getArrayElementType()) == TD->getTypeAllocSize(ResElTy)) { - Value *Idx[2]; - Idx[0] = Constant::getNullValue(Type::getInt32Ty(GEP.getContext())); - Idx[1] = GEP.getOperand(1); + Type *IdxType = TD->getIntPtrType(GEP.getType()); + Value *Idx[2] = { Constant::getNullValue(IdxType), GEP.getOperand(1) }; Value *NewGEP = GEP.isInBounds() ? Builder->CreateInBoundsGEP(StrippedPtr, Idx, GEP.getName()) : Builder->CreateGEP(StrippedPtr, Idx, GEP.getName()); @@ -1261,7 +1285,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // Earlier transforms ensure that the index has type IntPtrType, which // considerably simplifies the logic by eliminating implicit casts. 
- assert(Idx->getType() == TD->getIntPtrType(GEP.getContext()) && + assert(Idx->getType() == TD->getIntPtrType(GEP.getType()) && "Index not cast to pointer width?"); bool NSW; @@ -1287,8 +1311,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // Check that changing to the array element type amounts to dividing the // index by a scale factor. uint64_t ResSize = TD->getTypeAllocSize(ResElTy); - uint64_t ArrayEltSize = - TD->getTypeAllocSize(cast<ArrayType>(SrcElTy)->getElementType()); + uint64_t ArrayEltSize + = TD->getTypeAllocSize(SrcElTy->getArrayElementType()); if (ResSize && ArrayEltSize % ResSize == 0) { Value *Idx = GEP.getOperand(1); unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits(); @@ -1296,7 +1320,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // Earlier transforms ensure that the index has type IntPtrType, which // considerably simplifies the logic by eliminating implicit casts. - assert(Idx->getType() == TD->getIntPtrType(GEP.getContext()) && + assert(Idx->getType() == TD->getIntPtrType(GEP.getType()) && "Index not cast to pointer width?"); bool NSW; @@ -1304,9 +1328,11 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // Successfully decomposed Idx as NewIdx * Scale, form a new GEP. // If the multiplication NewIdx * Scale may overflow then the new // GEP may not be "inbounds". - Value *Off[2]; - Off[0] = Constant::getNullValue(Type::getInt32Ty(GEP.getContext())); - Off[1] = NewIdx; + Value *Off[2] = { + Constant::getNullValue(TD->getIntPtrType(GEP.getType())), + NewIdx + }; + Value *NewGEP = GEP.isInBounds() && NSW ? Builder->CreateInBoundsGEP(StrippedPtr, Off, GEP.getName()) : Builder->CreateGEP(StrippedPtr, Off, GEP.getName()); @@ -1318,15 +1344,20 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { } } + if (!TD) + return 0; + /// See if we can simplify: /// X = bitcast A* to B* /// Y = gep X, <...constant indices...> /// into a gep of the original struct. This is important for SROA and alias /// analysis of unions. If "A" is also a bitcast, wait for A/X to be merged. if (BitCastInst *BCI = dyn_cast<BitCastInst>(PtrOp)) { - APInt Offset(TD ? TD->getPointerSizeInBits() : 1, 0); - if (TD && - !isa<BitCastInst>(BCI->getOperand(0)) && + Value *Operand = BCI->getOperand(0); + PointerType *OpType = cast<PointerType>(Operand->getType()); + unsigned OffsetBits = TD->getPointerTypeSizeInBits(OpType); + APInt Offset(OffsetBits, 0); + if (!isa<BitCastInst>(Operand) && GEP.accumulateConstantOffset(*TD, Offset) && StrippedPtrTy->getAddressSpace() == GEP.getPointerAddressSpace()) { @@ -1335,8 +1366,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { if (!Offset) { // If the bitcast is of an allocation, and the allocation will be // converted to match the type of the cast, don't touch this. - if (isa<AllocaInst>(BCI->getOperand(0)) || - isAllocationFn(BCI->getOperand(0), TLI)) { + if (isa<AllocaInst>(Operand) || isAllocationFn(Operand, TLI)) { // See if the bitcast simplifies, if so, don't nuke this GEP yet. if (Instruction *I = visitBitCast(*BCI)) { if (I != BCI) { @@ -1347,19 +1377,17 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { return &GEP; } } - return new BitCastInst(BCI->getOperand(0), GEP.getType()); + return new BitCastInst(Operand, GEP.getType()); } // Otherwise, if the offset is non-zero, we need to find out if there is a // field at Offset in 'A's type. 
If so, we can pull the cast through the // GEP. SmallVector<Value*, 8> NewIndices; - Type *InTy = - cast<PointerType>(BCI->getOperand(0)->getType())->getElementType(); - if (FindElementAtOffset(InTy, Offset.getSExtValue(), NewIndices)) { + if (FindElementAtOffset(OpType, Offset.getSExtValue(), NewIndices)) { Value *NGEP = GEP.isInBounds() ? - Builder->CreateInBoundsGEP(BCI->getOperand(0), NewIndices) : - Builder->CreateGEP(BCI->getOperand(0), NewIndices); + Builder->CreateInBoundsGEP(Operand, NewIndices) : + Builder->CreateGEP(Operand, NewIndices); if (NGEP->getType() == GEP.getType()) return ReplaceInstUsesWith(GEP, NGEP); @@ -1372,8 +1400,6 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { return 0; } - - static bool isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakVH> &Users, const TargetLibraryInfo *TLI) { @@ -2209,7 +2235,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, // DCE instruction if trivially dead. if (isInstructionTriviallyDead(Inst, TLI)) { ++NumDeadInst; - DEBUG(errs() << "IC: DCE: " << *Inst << '\n'); + DEBUG(dbgs() << "IC: DCE: " << *Inst << '\n'); Inst->eraseFromParent(); continue; } @@ -2217,7 +2243,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, // ConstantProp instruction if trivially constant. if (!Inst->use_empty() && isa<Constant>(Inst->getOperand(0))) if (Constant *C = ConstantFoldInstruction(Inst, TD, TLI)) { - DEBUG(errs() << "IC: ConstFold to: " << *C << " from: " + DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *Inst << '\n'); Inst->replaceAllUsesWith(C); ++NumConstProp; @@ -2293,7 +2319,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { MadeIRChange = false; - DEBUG(errs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on " + DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on " << F.getName() << "\n"); { @@ -2338,7 +2364,7 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { // Check to see if we can DCE the instruction. if (isInstructionTriviallyDead(I, TLI)) { - DEBUG(errs() << "IC: DCE: " << *I << '\n'); + DEBUG(dbgs() << "IC: DCE: " << *I << '\n'); EraseInstFromFunction(*I); ++NumDeadInst; MadeIRChange = true; @@ -2348,7 +2374,7 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { // Instruction isn't dead, see if we can constant propagate it. if (!I->use_empty() && isa<Constant>(I->getOperand(0))) if (Constant *C = ConstantFoldInstruction(I, TD, TLI)) { - DEBUG(errs() << "IC: ConstFold to: " << *C << " from: " << *I << '\n'); + DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *I << '\n'); // Add operands to the worklist. ReplaceInstUsesWith(*I, C); @@ -2396,13 +2422,13 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { std::string OrigI; #endif DEBUG(raw_string_ostream SS(OrigI); I->print(SS); OrigI = SS.str();); - DEBUG(errs() << "IC: Visiting: " << OrigI << '\n'); + DEBUG(dbgs() << "IC: Visiting: " << OrigI << '\n'); if (Instruction *Result = visit(*I)) { ++NumCombined; // Should we replace the old instruction with a new one? 
if (Result != I) { - DEBUG(errs() << "IC: Old = " << *I << '\n' + DEBUG(dbgs() << "IC: Old = " << *I << '\n' << " New = " << *Result << '\n'); if (!I->getDebugLoc().isUnknown()) @@ -2431,7 +2457,7 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { EraseInstFromFunction(*I); } else { #ifndef NDEBUG - DEBUG(errs() << "IC: Mod = " << OrigI << '\n' + DEBUG(dbgs() << "IC: Mod = " << OrigI << '\n' << " New = " << *I << '\n'); #endif diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index d77e20b..d731ec5 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -23,6 +23,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Triple.h" #include "llvm/DIBuilder.h" @@ -59,6 +60,7 @@ static const uint64_t kDefaultShort64bitShadowOffset = 0x7FFF8000; // < 2G. static const uint64_t kPPC64_ShadowOffset64 = 1ULL << 41; static const uint64_t kMIPS32_ShadowOffset32 = 0x0aaa8000; +static const size_t kMinStackMallocSize = 1 << 6; // 64B static const size_t kMaxStackMallocSize = 1 << 16; // 64K static const uintptr_t kCurrentStackFrameMagic = 0x41B58AB3; static const uintptr_t kRetiredStackFrameMagic = 0x45E0360E; @@ -75,21 +77,30 @@ static const char *const kAsanUnregisterGlobalsName = static const char *const kAsanPoisonGlobalsName = "__asan_before_dynamic_init"; static const char *const kAsanUnpoisonGlobalsName = "__asan_after_dynamic_init"; static const char *const kAsanInitName = "__asan_init_v3"; +static const char *const kAsanCovName = "__sanitizer_cov"; static const char *const kAsanHandleNoReturnName = "__asan_handle_no_return"; static const char *const kAsanMappingOffsetName = "__asan_mapping_offset"; static const char *const kAsanMappingScaleName = "__asan_mapping_scale"; -static const char *const kAsanStackMallocName = "__asan_stack_malloc"; -static const char *const kAsanStackFreeName = "__asan_stack_free"; +static const int kMaxAsanStackMallocSizeClass = 10; +static const char *const kAsanStackMallocNameTemplate = "__asan_stack_malloc_"; +static const char *const kAsanStackFreeNameTemplate = "__asan_stack_free_"; static const char *const kAsanGenPrefix = "__asan_gen_"; static const char *const kAsanPoisonStackMemoryName = "__asan_poison_stack_memory"; static const char *const kAsanUnpoisonStackMemoryName = "__asan_unpoison_stack_memory"; +static const char *const kAsanOptionDetectUAR = + "__asan_option_detect_stack_use_after_return"; + +// These constants must match the definitions in the run-time library. static const int kAsanStackLeftRedzoneMagic = 0xf1; static const int kAsanStackMidRedzoneMagic = 0xf2; static const int kAsanStackRightRedzoneMagic = 0xf3; static const int kAsanStackPartialRedzoneMagic = 0xf4; +#ifndef NDEBUG +static const int kAsanStackAfterReturnMagic = 0xf5; +#endif // Access sizes are powers of two: 1, 2, 4, 8, 16. static const size_t kNumberOfAccessSizes = 5; @@ -124,6 +135,8 @@ static cl::opt<bool> ClUseAfterReturn("asan-use-after-return", // This flag may need to be replaced with -f[no]asan-globals.
static cl::opt<bool> ClGlobals("asan-globals", cl::desc("Handle global objects"), cl::Hidden, cl::init(true)); +static cl::opt<bool> ClCoverage("asan-coverage", + cl::desc("ASan coverage"), cl::Hidden, cl::init(false)); static cl::opt<bool> ClInitializers("asan-initialization-order", cl::desc("Handle C++ initializer order"), cl::Hidden, cl::init(false)); static cl::opt<bool> ClMemIntrin("asan-memintrin", @@ -184,6 +197,13 @@ static cl::opt<int> ClDebugMin("asan-debug-min", cl::desc("Debug min inst"), static cl::opt<int> ClDebugMax("asan-debug-max", cl::desc("Debug max inst"), cl::Hidden, cl::init(-1)); +STATISTIC(NumInstrumentedReads, "Number of instrumented reads"); +STATISTIC(NumInstrumentedWrites, "Number of instrumented writes"); +STATISTIC(NumOptimizedAccessesToGlobalArray, + "Number of optimized accesses to global arrays"); +STATISTIC(NumOptimizedAccessesToGlobalVar, + "Number of optimized accesses to global vars"); + namespace { /// A set of dynamically initialized globals extracted from metadata. class SetOfDynamicallyInitializedGlobals { @@ -306,6 +326,8 @@ struct AddressSanitizer : public FunctionPass { bool ShouldInstrumentGlobal(GlobalVariable *G); bool LooksLikeCodeInBug11395(Instruction *I); void FindDynamicInitializers(Module &M); + bool GlobalIsLinkerInitialized(GlobalVariable *G); + bool InjectCoverage(Function &F); bool CheckInitOrder; bool CheckUseAfterReturn; @@ -321,6 +343,7 @@ struct AddressSanitizer : public FunctionPass { Function *AsanCtorFunction; Function *AsanInitFunction; Function *AsanHandleNoReturnFunc; + Function *AsanCovFunction; OwningPtr<SpecialCaseList> BL; // This array is indexed by AccessIsWrite and log2(AccessSize). Function *AsanErrorCallback[2][kNumberOfAccessSizes]; @@ -396,12 +419,14 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { uint64_t TotalStackSize; unsigned StackAlignment; - Function *AsanStackMallocFunc, *AsanStackFreeFunc; + Function *AsanStackMallocFunc[kMaxAsanStackMallocSizeClass + 1], + *AsanStackFreeFunc[kMaxAsanStackMallocSizeClass + 1]; Function *AsanPoisonStackMemoryFunc, *AsanUnpoisonStackMemoryFunc; // Stores a place and arguments of poisoning/unpoisoning call for alloca. struct AllocaPoisonCall { IntrinsicInst *InsBefore; + AllocaInst *AI; uint64_t Size; bool DoPoison; }; @@ -480,7 +505,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { AllocaInst *AI = findAllocaForValue(II.getArgOperand(1)); if (!AI) return; bool DoPoison = (ID == Intrinsic::lifetime_end); - AllocaPoisonCall APC = {&II, SizeValue, DoPoison}; + AllocaPoisonCall APC = {&II, AI, SizeValue, DoPoison}; AllocaPoisonCallVec.push_back(APC); } @@ -488,7 +513,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { void initializeCallbacks(Module &M); // Check if we want (and can) handle this alloca.
- bool isInterestingAlloca(AllocaInst &AI) { + bool isInterestingAlloca(AllocaInst &AI) const { return (!AI.isArrayAllocation() && AI.isStaticAlloca() && AI.getAlignment() <= RedzoneSize() && @@ -498,24 +523,27 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { size_t RedzoneSize() const { return RedzoneSizeForScale(Mapping.Scale); } - uint64_t getAllocaSizeInBytes(AllocaInst *AI) { + uint64_t getAllocaSizeInBytes(AllocaInst *AI) const { Type *Ty = AI->getAllocatedType(); uint64_t SizeInBytes = ASan.TD->getTypeAllocSize(Ty); return SizeInBytes; } - uint64_t getAlignedSize(uint64_t SizeInBytes) { + uint64_t getAlignedSize(uint64_t SizeInBytes) const { size_t RZ = RedzoneSize(); return ((SizeInBytes + RZ - 1) / RZ) * RZ; } - uint64_t getAlignedAllocaSize(AllocaInst *AI) { + uint64_t getAlignedAllocaSize(AllocaInst *AI) const { uint64_t SizeInBytes = getAllocaSizeInBytes(AI); return getAlignedSize(SizeInBytes); } /// Finds alloca where the value comes from. AllocaInst *findAllocaForValue(Value *V); - void poisonRedZones(const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> IRB, + void poisonRedZones(const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> &IRB, Value *ShadowBase, bool DoPoison); - void poisonAlloca(Value *V, uint64_t Size, IRBuilder<> IRB, bool DoPoison); + void poisonAlloca(Value *V, uint64_t Size, IRBuilder<> &IRB, bool DoPoison); + + void SetShadowToStackAfterReturnInlined(IRBuilder<> &IRB, Value *ShadowBase, + int Size); }; } // namespace @@ -642,6 +670,13 @@ static Value *isInterestingMemoryAccess(Instruction *I, bool *IsWrite) { return NULL; } +bool AddressSanitizer::GlobalIsLinkerInitialized(GlobalVariable *G) { + // If a global variable does not have dynamic initialization, we don't + // have to instrument it. However, if a global does not have an initializer + // at all, we assume it has a dynamic initializer (in another TU). + return G->hasInitializer() && !DynamicallyInitializedGlobals.Contains(G); +} + void AddressSanitizer::instrumentMop(Instruction *I) { bool IsWrite = false; Value *Addr = isInterestingMemoryAccess(I, &IsWrite); if (!Addr) return; if (GlobalVariable *G = dyn_cast<GlobalVariable>(Addr)) { // If initialization order checking is disabled, a simple access to a // dynamically initialized global is always valid. - if (!CheckInitOrder) - return; - // If a global variable does not have dynamic initialization we don't - // have to instrument it. However, if a global does not have initailizer - // at all, we assume it has dynamic initializer (in other TU). - if (G->hasInitializer() && !DynamicallyInitializedGlobals.Contains(G)) + if (!CheckInitOrder || GlobalIsLinkerInitialized(G)) { + NumOptimizedAccessesToGlobalVar++; return; + } + } + ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr); + if (CE && CE->isGEPWithNoNotionalOverIndexing()) { + if (GlobalVariable *G = dyn_cast<GlobalVariable>(CE->getOperand(0))) { + if (CE->getOperand(1)->isNullValue() && GlobalIsLinkerInitialized(G)) { + NumOptimizedAccessesToGlobalArray++; + return; + } + } } } @@ -668,6 +709,11 @@ void AddressSanitizer::instrumentMop(Instruction *I) { assert((TypeSize % 8) == 0); + if (IsWrite) + NumInstrumentedWrites++; + else + NumInstrumentedReads++; + // Instrument a 1-, 2-, 4-, 8-, or 16-byte access with one check.
if (TypeSize == 8 || TypeSize == 16 || TypeSize == 32 || TypeSize == 64 || TypeSize == 128) @@ -883,7 +929,7 @@ bool AddressSanitizerModule::runOnModule(Module &M) { TD = getAnalysisIfAvailable<DataLayout>(); if (!TD) return false; - BL.reset(new SpecialCaseList(BlacklistFile)); + BL.reset(SpecialCaseList::createOrDie(BlacklistFile)); if (BL->isIn(M)) return false; C = &(M.getContext()); int LongSize = TD->getPointerSizeInBits(); @@ -914,8 +960,7 @@ bool AddressSanitizerModule::runOnModule(Module &M) { StructType *GlobalStructTy = StructType::get(IntptrTy, IntptrTy, IntptrTy, IntptrTy, IntptrTy, IntptrTy, NULL); - SmallVector<Constant *, 16> Initializers(n), DynamicInit; - + SmallVector<Constant *, 16> Initializers(n); Function *CtorFunc = M.getFunction(kAsanModuleCtorName); assert(CtorFunc); @@ -1046,6 +1091,8 @@ void AddressSanitizer::initializeCallbacks(Module &M) { AsanHandleNoReturnFunc = checkInterfaceFunction(M.getOrInsertFunction( kAsanHandleNoReturnName, IRB.getVoidTy(), NULL)); + AsanCovFunction = checkInterfaceFunction(M.getOrInsertFunction( + kAsanCovName, IRB.getVoidTy(), IntptrTy, NULL)); // We insert an empty inline asm after __asan_report* to avoid callback merge. EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false), StringRef(""), StringRef(""), @@ -1076,7 +1123,7 @@ bool AddressSanitizer::doInitialization(Module &M) { if (!TD) return false; - BL.reset(new SpecialCaseList(BlacklistFile)); + BL.reset(SpecialCaseList::createOrDie(BlacklistFile)); DynamicallyInitializedGlobals.Init(M); C = &(M.getContext()); @@ -1117,6 +1164,47 @@ bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) { return false; } +// Poor man's coverage that works with ASan. +// We create a Guard boolean variable with the same linkage +// as the function and inject this code into the entry block: +// if (!*Guard) { +// __sanitizer_cov(&F); +// *Guard = 1; +// } +// The accesses to Guard are atomic. The rest of the logic is +// in __sanitizer_cov (it's fine to call it more than once). +// +// This coverage implementation provides very limited data: +// it only tells if a given function was ever executed. +// No counters, no per-basic-block or per-edge data. +// But for many use cases this is what we need and the added slowdown +// is negligible. This simple implementation will probably be obsoleted +// by the upcoming Clang-based coverage implementation. +// By having it here and now we hope to +// a) get the functionality to users earlier and +// b) collect usage statistics to help improve Clang coverage design. +bool AddressSanitizer::InjectCoverage(Function &F) { + if (!ClCoverage) return false; + IRBuilder<> IRB(F.getEntryBlock().getFirstInsertionPt()); + Type *Int8Ty = IRB.getInt8Ty(); + GlobalVariable *Guard = new GlobalVariable( + *F.getParent(), Int8Ty, false, GlobalValue::PrivateLinkage, + Constant::getNullValue(Int8Ty), "__asan_gen_cov_" + F.getName()); + LoadInst *Load = IRB.CreateLoad(Guard); + Load->setAtomic(Monotonic); + Load->setAlignment(1); + Value *Cmp = IRB.CreateICmpEQ(Constant::getNullValue(Int8Ty), Load); + Instruction *Ins = SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), false); + IRB.SetInsertPoint(Ins); + // We pass &F to __sanitizer_cov. We could avoid this and rely on + // GET_CALLER_PC, but having the PC of the first instruction is just nice.
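+  // A rough sketch of the IR this injects at the entry of a function @foo
+  // (the value and block names here are illustrative, not what the builder
+  // will actually pick):
+  //   %guard = load atomic i8* @__asan_gen_cov_foo monotonic, align 1
+  //   %is_zero = icmp eq i8 0, %guard
+  //   br i1 %is_zero, label %do_cov, label %cont
+  // do_cov:
+  //   call void @__sanitizer_cov(i64 ptrtoint (...* @foo to i64))
+  //   store atomic i8 1, i8* @__asan_gen_cov_foo monotonic, align 1
+  //   br label %cont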
+ IRB.CreateCall(AsanCovFunction, IRB.CreatePointerCast(&F, IntptrTy)); + StoreInst *Store = IRB.CreateStore(ConstantInt::get(Int8Ty, 1), Guard); + Store->setAtomic(Monotonic); + Store->setAlignment(1); + return true; +} + bool AddressSanitizer::runOnFunction(Function &F) { if (BL->isIn(F)) return false; if (&F == AsanCtorFunction) return false; @@ -1212,6 +1300,10 @@ bool AddressSanitizer::runOnFunction(Function &F) { } bool res = NumInstrumented > 0 || ChangedStack || !NoReturnCalls.empty(); + + if (InjectCoverage(F)) + res = true; + DEBUG(dbgs() << "ASAN done instrumenting: " << res << " " << F << "\n"); if (ClKeepUninstrumented) { @@ -1271,11 +1363,15 @@ bool AddressSanitizer::LooksLikeCodeInBug11395(Instruction *I) { void FunctionStackPoisoner::initializeCallbacks(Module &M) { IRBuilder<> IRB(*C); - AsanStackMallocFunc = checkInterfaceFunction(M.getOrInsertFunction( - kAsanStackMallocName, IntptrTy, IntptrTy, IntptrTy, NULL)); - AsanStackFreeFunc = checkInterfaceFunction(M.getOrInsertFunction( - kAsanStackFreeName, IRB.getVoidTy(), - IntptrTy, IntptrTy, IntptrTy, NULL)); + for (int i = 0; i <= kMaxAsanStackMallocSizeClass; i++) { + std::string Suffix = itostr(i); + AsanStackMallocFunc[i] = checkInterfaceFunction( + M.getOrInsertFunction(kAsanStackMallocNameTemplate + Suffix, IntptrTy, + IntptrTy, IntptrTy, NULL)); + AsanStackFreeFunc[i] = checkInterfaceFunction(M.getOrInsertFunction( + kAsanStackFreeNameTemplate + Suffix, IRB.getVoidTy(), IntptrTy, + IntptrTy, IntptrTy, NULL)); + } AsanPoisonStackMemoryFunc = checkInterfaceFunction(M.getOrInsertFunction( kAsanPoisonStackMemoryName, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL)); AsanUnpoisonStackMemoryFunc = checkInterfaceFunction(M.getOrInsertFunction( @@ -1283,7 +1379,7 @@ void FunctionStackPoisoner::initializeCallbacks(Module &M) { } void FunctionStackPoisoner::poisonRedZones( - const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> IRB, Value *ShadowBase, + const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> &IRB, Value *ShadowBase, bool DoPoison) { size_t ShadowRZSize = RedzoneSize() >> Mapping.Scale; assert(ShadowRZSize >= 1 && ShadowRZSize <= 4); @@ -1344,12 +1440,40 @@ void FunctionStackPoisoner::poisonRedZones( } } +// The fake stack allocator (asan_fake_stack.h) has 11 size classes, one for +// each power of 2 from kMinStackMallocSize up to kMaxStackMallocSize +// (class indices 0 through kMaxAsanStackMallocSizeClass). +static int StackMallocSizeClass(uint64_t LocalStackSize) { + assert(LocalStackSize <= kMaxStackMallocSize); + uint64_t MaxSize = kMinStackMallocSize; + for (int i = 0; ; i++, MaxSize *= 2) + if (LocalStackSize <= MaxSize) + return i; + llvm_unreachable("impossible LocalStackSize"); +} + +// Set Size bytes starting from ShadowBase to kAsanStackAfterReturnMagic. +// We cannot use the memset intrinsic because it may end up calling the actual +// memset. Size is a multiple of 8. +// Currently this generates 8-byte stores on x86_64; it may be better to +// generate wider stores.
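+// For example, a call with Size == 16 emits two 8-byte stores of
+// 0xf5f5f5f5f5f5f5f5, one at ShadowBase and one at ShadowBase + 8.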
+void FunctionStackPoisoner::SetShadowToStackAfterReturnInlined( + IRBuilder<> &IRB, Value *ShadowBase, int Size) { + assert(!(Size % 8)); + assert(kAsanStackAfterReturnMagic == 0xf5); + for (int i = 0; i < Size; i += 8) { + Value *p = IRB.CreateAdd(ShadowBase, ConstantInt::get(IntptrTy, i)); + IRB.CreateStore(ConstantInt::get(IRB.getInt64Ty(), 0xf5f5f5f5f5f5f5f5ULL), + IRB.CreateIntToPtr(p, IRB.getInt64Ty()->getPointerTo())); + } +} + void FunctionStackPoisoner::poisonStack() { uint64_t LocalStackSize = TotalStackSize + (AllocaVec.size() + 1) * RedzoneSize(); bool DoStackMalloc = ASan.CheckUseAfterReturn && LocalStackSize <= kMaxStackMallocSize; + int StackMallocIdx = -1; assert(AllocaVec.size() > 0); Instruction *InsBefore = AllocaVec[0]; @@ -1367,8 +1491,28 @@ void FunctionStackPoisoner::poisonStack() { Value *LocalStackBase = OrigStackBase; if (DoStackMalloc) { - LocalStackBase = IRB.CreateCall2(AsanStackMallocFunc, + // LocalStackBase = OrigStackBase + // if (__asan_option_detect_stack_use_after_return) + // LocalStackBase = __asan_stack_malloc_N(LocalStackSize, OrigStackBase); + StackMallocIdx = StackMallocSizeClass(LocalStackSize); + assert(StackMallocIdx <= kMaxAsanStackMallocSizeClass); + Constant *OptionDetectUAR = F.getParent()->getOrInsertGlobal( + kAsanOptionDetectUAR, IRB.getInt32Ty()); + Value *Cmp = IRB.CreateICmpNE(IRB.CreateLoad(OptionDetectUAR), + Constant::getNullValue(IRB.getInt32Ty())); + Instruction *Term = + SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), false); + BasicBlock *CmpBlock = cast<Instruction>(Cmp)->getParent(); + IRBuilder<> IRBIf(Term); + LocalStackBase = IRBIf.CreateCall2( + AsanStackMallocFunc[StackMallocIdx], ConstantInt::get(IntptrTy, LocalStackSize), OrigStackBase); + BasicBlock *SetBlock = cast<Instruction>(LocalStackBase)->getParent(); + IRB.SetInsertPoint(InsBefore); + PHINode *Phi = IRB.CreatePHI(IntptrTy, 2); + Phi->addIncoming(OrigStackBase, CmpBlock); + Phi->addIncoming(LocalStackBase, SetBlock); + LocalStackBase = Phi; } // This string will be parsed by the run-time (DescribeAddressIfStack). @@ -1380,11 +1524,10 @@ void FunctionStackPoisoner::poisonStack() { bool HavePoisonedAllocas = false; for (size_t i = 0, n = AllocaPoisonCallVec.size(); i < n; i++) { const AllocaPoisonCall &APC = AllocaPoisonCallVec[i]; - IntrinsicInst *II = APC.InsBefore; - AllocaInst *AI = findAllocaForValue(II->getArgOperand(1)); - assert(AI); - IRBuilder<> IRB(II); - poisonAlloca(AI, APC.Size, IRB, APC.DoPoison); + assert(APC.InsBefore); + assert(APC.AI); + IRBuilder<> IRB(APC.InsBefore); + poisonAlloca(APC.AI, APC.Size, IRB, APC.DoPoison); HavePoisonedAllocas |= APC.DoPoison; } @@ -1442,10 +1585,35 @@ void FunctionStackPoisoner::poisonStack() { // Unpoison the stack. poisonRedZones(AllocaVec, IRBRet, ShadowBase, false); if (DoStackMalloc) { + assert(StackMallocIdx >= 0); // In use-after-return mode, mark the whole stack frame unaddressable. - IRBRet.CreateCall3(AsanStackFreeFunc, LocalStackBase, - ConstantInt::get(IntptrTy, LocalStackSize), - OrigStackBase); + if (StackMallocIdx <= 4) { + // For small sizes inline the whole thing: + // if LocalStackBase != OrigStackBase: + // memset(ShadowBase, kAsanStackAfterReturnMagic, ShadowSize); + // **SavedFlagPtr(LocalStackBase) = 0 + // FIXME: if LocalStackBase != OrigStackBase don't call poisonRedZones.
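+      // The word at LocalStackBase + ClassSize - LongSize/8 holds a pointer
+      // to the fake frame's 'busy' flag in the runtime; the code below loads
+      // it and stores 0 through it to release the frame. As a worked example,
+      // on a 64-bit target with StackMallocIdx == 4, ClassSize is
+      // kMinStackMallocSize << 4 == 1024, so the saved flag pointer is read
+      // from LocalStackBase + 1016.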
+ Value *Cmp = IRBRet.CreateICmpNE(LocalStackBase, OrigStackBase); + TerminatorInst *PoisonTerm = + SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), false); + IRBuilder<> IRBPoison(PoisonTerm); + int ClassSize = kMinStackMallocSize << StackMallocIdx; + SetShadowToStackAfterReturnInlined(IRBPoison, ShadowBase, + ClassSize >> Mapping.Scale); + Value *SavedFlagPtrPtr = IRBPoison.CreateAdd( + LocalStackBase, + ConstantInt::get(IntptrTy, ClassSize - ASan.LongSize / 8)); + Value *SavedFlagPtr = IRBPoison.CreateLoad( + IRBPoison.CreateIntToPtr(SavedFlagPtrPtr, IntptrPtrTy)); + IRBPoison.CreateStore( + Constant::getNullValue(IRBPoison.getInt8Ty()), + IRBPoison.CreateIntToPtr(SavedFlagPtr, IRBPoison.getInt8PtrTy())); + } else { + // For larger frames call __asan_stack_free_*. + IRBRet.CreateCall3(AsanStackFreeFunc[StackMallocIdx], LocalStackBase, + ConstantInt::get(IntptrTy, LocalStackSize), + OrigStackBase); + } } else if (HavePoisonedAllocas) { // If we poisoned some allocas in llvm.lifetime analysis, // unpoison whole stack frame now. @@ -1460,7 +1628,7 @@ void FunctionStackPoisoner::poisonStack() { } void FunctionStackPoisoner::poisonAlloca(Value *V, uint64_t Size, - IRBuilder<> IRB, bool DoPoison) { + IRBuilder<> &IRB, bool DoPoison) { // For now just insert the call to ASan runtime. Value *AddrArg = IRB.CreatePointerCast(V, IntptrTy); Value *SizeArg = ConstantInt::get(IntptrTy, Size); diff --git a/lib/Transforms/Instrumentation/BoundsChecking.cpp b/lib/Transforms/Instrumentation/BoundsChecking.cpp index b094d42..7a9f0f6 100644 --- a/lib/Transforms/Instrumentation/BoundsChecking.cpp +++ b/lib/Transforms/Instrumentation/BoundsChecking.cpp @@ -80,7 +80,7 @@ BasicBlock *BoundsChecking::getTrapBB() { return TrapBB; Function *Fn = Inst->getParent()->getParent(); - BasicBlock::iterator PrevInsertPoint = Builder->GetInsertPoint(); + IRBuilder<>::InsertPointGuard Guard(*Builder); TrapBB = BasicBlock::Create(Fn->getContext(), "trap", Fn); Builder->SetInsertPoint(TrapBB); @@ -91,7 +91,6 @@ BasicBlock *BoundsChecking::getTrapBB() { TrapCall->setDebugLoc(Inst->getDebugLoc()); Builder->CreateUnreachable(); - Builder->SetInsertPoint(PrevInsertPoint); return TrapBB; } @@ -173,7 +172,8 @@ bool BoundsChecking::runOnFunction(Function &F) { TrapBB = 0; BuilderTy TheBuilder(F.getContext(), TargetFolder(TD)); Builder = &TheBuilder; - ObjectSizeOffsetEvaluator TheObjSizeEval(TD, TLI, F.getContext()); + ObjectSizeOffsetEvaluator TheObjSizeEval(TD, TLI, F.getContext(), + /*RoundToAlign=*/true); ObjSizeEval = &TheObjSizeEval; // check HANDLE_MEMORY_INST in include/llvm/Instruction.def for memory diff --git a/lib/Transforms/Instrumentation/CMakeLists.txt b/lib/Transforms/Instrumentation/CMakeLists.txt index 5e34863..3563593 100644 --- a/lib/Transforms/Instrumentation/CMakeLists.txt +++ b/lib/Transforms/Instrumentation/CMakeLists.txt @@ -1,14 +1,11 @@ add_llvm_library(LLVMInstrumentation AddressSanitizer.cpp BoundsChecking.cpp + DataFlowSanitizer.cpp DebugIR.cpp - EdgeProfiling.cpp GCOVProfiling.cpp MemorySanitizer.cpp Instrumentation.cpp - OptimalEdgeProfiling.cpp - PathProfiling.cpp - ProfilingUtils.cpp ThreadSanitizer.cpp ) diff --git a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp new file mode 100644 index 0000000..9b9e725 --- /dev/null +++ b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -0,0 +1,1397 @@ +//===-- DataFlowSanitizer.cpp - dynamic data flow analysis ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file 
is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file is a part of DataFlowSanitizer, a generalised dynamic data flow +/// analysis. +/// +/// Unlike other Sanitizer tools, this tool is not designed to detect a specific +/// class of bugs on its own. Instead, it provides a generic dynamic data flow +/// analysis framework to be used by clients to help detect application-specific +/// issues within their own code. +/// +/// The analysis is based on automatic propagation of data flow labels (also +/// known as taint labels) through a program as it performs computation. Each +/// byte of application memory is backed by two bytes of shadow memory which +/// hold the label. On Linux/x86_64, memory is laid out as follows: +/// +/// +--------------------+ 0x800000000000 (top of memory) +/// | application memory | +/// +--------------------+ 0x700000008000 (kAppAddr) +/// | | +/// | unused | +/// | | +/// +--------------------+ 0x200200000000 (kUnusedAddr) +/// | union table | +/// +--------------------+ 0x200000000000 (kUnionTableAddr) +/// | shadow memory | +/// +--------------------+ 0x000000010000 (kShadowAddr) +/// | reserved by kernel | +/// +--------------------+ 0x000000000000 +/// +/// To derive a shadow memory address from an application memory address, +/// bits 44-46 are cleared to bring the address into the range +/// [0x000000008000,0x100000000000). Then the address is shifted left by 1 to +/// account for the double byte representation of shadow labels and move the +/// address into the shadow memory range. See the function +/// DataFlowSanitizer::getShadowAddress below. +/// +/// For more information, please refer to the design document: +/// http://clang.llvm.org/docs/DataFlowSanitizerDesign.html + +#include "llvm/Transforms/Instrumentation.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/InstVisitor.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/SpecialCaseList.h" +#include <iterator> + +using namespace llvm; + +// The -dfsan-preserve-alignment flag controls whether this pass assumes that +// alignment requirements provided by the input IR are correct. For example, +// if the input IR contains a load with alignment 8, this flag will cause +// the shadow load to have alignment 16. This flag is disabled by default as +// we have unfortunately encountered too much code (including Clang itself; +// see PR14291) which performs misaligned access. +static cl::opt<bool> ClPreserveAlignment( + "dfsan-preserve-alignment", + cl::desc("respect alignment requirements provided by input IR"), cl::Hidden, + cl::init(false)); + +// The ABI list file controls how shadow parameters are passed. The pass treats +// every function labelled "uninstrumented" in the ABI list file as conforming +// to the "native" (i.e. unsanitized) ABI. 
Unless the ABI list contains +// additional annotations for those functions, a call to one of those functions +// will produce a warning message, as the labelling behaviour of the function is +// unknown. The other supported annotations are "functional" and "discard", +// which are described below under DataFlowSanitizer::WrapperKind. +static cl::opt<std::string> ClABIListFile( + "dfsan-abilist", + cl::desc("File listing native ABI functions and how the pass treats them"), + cl::Hidden); + +// Controls whether the pass uses IA_Args or IA_TLS as the ABI for instrumented +// functions (see DataFlowSanitizer::InstrumentedABI below). +static cl::opt<bool> ClArgsABI( + "dfsan-args-abi", + cl::desc("Use the argument ABI rather than the TLS ABI"), + cl::Hidden); + +static cl::opt<bool> ClDebugNonzeroLabels( + "dfsan-debug-nonzero-labels", + cl::desc("Insert calls to __dfsan_nonzero_label on observing a parameter, " + "load or return with a nonzero label"), + cl::Hidden); + +namespace { + +class DataFlowSanitizer : public ModulePass { + friend struct DFSanFunction; + friend class DFSanVisitor; + + enum { + ShadowWidth = 16 + }; + + /// Which ABI should be used for instrumented functions? + enum InstrumentedABI { + /// Argument and return value labels are passed through additional + /// arguments and by modifying the return type. + IA_Args, + + /// Argument and return value labels are passed through TLS variables + /// __dfsan_arg_tls and __dfsan_retval_tls. + IA_TLS + }; + + /// How should calls to uninstrumented functions be handled? + enum WrapperKind { + /// This function is present in an uninstrumented form but we don't know + /// how it should be handled. Print a warning and call the function anyway. + /// Don't label the return value. + WK_Warning, + + /// This function does not write to (user-accessible) memory, and its return + /// value is unlabelled. + WK_Discard, + + /// This function does not write to (user-accessible) memory, and the label + /// of its return value is the union of the label of its arguments. + WK_Functional, + + /// Instead of calling the function, a custom wrapper __dfsw_F is called, + /// where F is the name of the function. This function may wrap the + /// original function or provide its own implementation. This is similar to + /// the IA_Args ABI, except that IA_Args uses a struct return type to + /// pass the return value shadow in a register, while WK_Custom uses an + /// extra pointer argument to return the shadow. This allows the wrapped + /// form of the function type to be expressed in C. 
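+    /// For example, for a hypothetical function "int f(int)", the custom
+    /// wrapper would be declared in C roughly as:
+    ///   int __dfsw_f(int n, dfsan_label n_label, dfsan_label *ret_label);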
+ WK_Custom + }; + + DataLayout *DL; + Module *Mod; + LLVMContext *Ctx; + IntegerType *ShadowTy; + PointerType *ShadowPtrTy; + IntegerType *IntptrTy; + ConstantInt *ZeroShadow; + ConstantInt *ShadowPtrMask; + ConstantInt *ShadowPtrMul; + Constant *ArgTLS; + Constant *RetvalTLS; + void *(*GetArgTLSPtr)(); + void *(*GetRetvalTLSPtr)(); + Constant *GetArgTLS; + Constant *GetRetvalTLS; + FunctionType *DFSanUnionFnTy; + FunctionType *DFSanUnionLoadFnTy; + FunctionType *DFSanUnimplementedFnTy; + FunctionType *DFSanSetLabelFnTy; + FunctionType *DFSanNonzeroLabelFnTy; + Constant *DFSanUnionFn; + Constant *DFSanUnionLoadFn; + Constant *DFSanUnimplementedFn; + Constant *DFSanSetLabelFn; + Constant *DFSanNonzeroLabelFn; + MDNode *ColdCallWeights; + OwningPtr<SpecialCaseList> ABIList; + DenseMap<Value *, Function *> UnwrappedFnMap; + AttributeSet ReadOnlyNoneAttrs; + + Value *getShadowAddress(Value *Addr, Instruction *Pos); + Value *combineShadows(Value *V1, Value *V2, Instruction *Pos); + bool isInstrumented(const Function *F); + bool isInstrumented(const GlobalAlias *GA); + FunctionType *getArgsFunctionType(FunctionType *T); + FunctionType *getTrampolineFunctionType(FunctionType *T); + FunctionType *getCustomFunctionType(FunctionType *T); + InstrumentedABI getInstrumentedABI(); + WrapperKind getWrapperKind(Function *F); + void addGlobalNamePrefix(GlobalValue *GV); + Function *buildWrapperFunction(Function *F, StringRef NewFName, + GlobalValue::LinkageTypes NewFLink, + FunctionType *NewFT); + Constant *getOrBuildTrampolineFunction(FunctionType *FT, StringRef FName); + + public: + DataFlowSanitizer(StringRef ABIListFile = StringRef(), + void *(*getArgTLS)() = 0, void *(*getRetValTLS)() = 0); + static char ID; + bool doInitialization(Module &M); + bool runOnModule(Module &M); +}; + +struct DFSanFunction { + DataFlowSanitizer &DFS; + Function *F; + DataFlowSanitizer::InstrumentedABI IA; + bool IsNativeABI; + Value *ArgTLSPtr; + Value *RetvalTLSPtr; + AllocaInst *LabelReturnAlloca; + DenseMap<Value *, Value *> ValShadowMap; + DenseMap<AllocaInst *, AllocaInst *> AllocaShadowMap; + std::vector<std::pair<PHINode *, PHINode *> > PHIFixups; + DenseSet<Instruction *> SkipInsts; + DenseSet<Value *> NonZeroChecks; + + DFSanFunction(DataFlowSanitizer &DFS, Function *F, bool IsNativeABI) + : DFS(DFS), F(F), IA(DFS.getInstrumentedABI()), + IsNativeABI(IsNativeABI), ArgTLSPtr(0), RetvalTLSPtr(0), + LabelReturnAlloca(0) {} + Value *getArgTLSPtr(); + Value *getArgTLS(unsigned Index, Instruction *Pos); + Value *getRetvalTLS(); + Value *getShadow(Value *V); + void setShadow(Instruction *I, Value *Shadow); + Value *combineOperandShadows(Instruction *Inst); + Value *loadShadow(Value *ShadowAddr, uint64_t Size, uint64_t Align, + Instruction *Pos); + void storeShadow(Value *Addr, uint64_t Size, uint64_t Align, Value *Shadow, + Instruction *Pos); +}; + +class DFSanVisitor : public InstVisitor<DFSanVisitor> { + public: + DFSanFunction &DFSF; + DFSanVisitor(DFSanFunction &DFSF) : DFSF(DFSF) {} + + void visitOperandShadowInst(Instruction &I); + + void visitBinaryOperator(BinaryOperator &BO); + void visitCastInst(CastInst &CI); + void visitCmpInst(CmpInst &CI); + void visitGetElementPtrInst(GetElementPtrInst &GEPI); + void visitLoadInst(LoadInst &LI); + void visitStoreInst(StoreInst &SI); + void visitReturnInst(ReturnInst &RI); + void visitCallSite(CallSite CS); + void visitPHINode(PHINode &PN); + void visitExtractElementInst(ExtractElementInst &I); + void visitInsertElementInst(InsertElementInst &I); + void 
visitShuffleVectorInst(ShuffleVectorInst &I); + void visitExtractValueInst(ExtractValueInst &I); + void visitInsertValueInst(InsertValueInst &I); + void visitAllocaInst(AllocaInst &I); + void visitSelectInst(SelectInst &I); + void visitMemSetInst(MemSetInst &I); + void visitMemTransferInst(MemTransferInst &I); +}; + +} + +char DataFlowSanitizer::ID; +INITIALIZE_PASS(DataFlowSanitizer, "dfsan", + "DataFlowSanitizer: dynamic data flow analysis.", false, false) + +ModulePass *llvm::createDataFlowSanitizerPass(StringRef ABIListFile, + void *(*getArgTLS)(), + void *(*getRetValTLS)()) { + return new DataFlowSanitizer(ABIListFile, getArgTLS, getRetValTLS); +} + +DataFlowSanitizer::DataFlowSanitizer(StringRef ABIListFile, + void *(*getArgTLS)(), + void *(*getRetValTLS)()) + : ModulePass(ID), GetArgTLSPtr(getArgTLS), GetRetvalTLSPtr(getRetValTLS), + ABIList(SpecialCaseList::createOrDie(ABIListFile.empty() ? ClABIListFile + : ABIListFile)) { +} + +FunctionType *DataFlowSanitizer::getArgsFunctionType(FunctionType *T) { + llvm::SmallVector<Type *, 4> ArgTypes; + std::copy(T->param_begin(), T->param_end(), std::back_inserter(ArgTypes)); + for (unsigned i = 0, e = T->getNumParams(); i != e; ++i) + ArgTypes.push_back(ShadowTy); + if (T->isVarArg()) + ArgTypes.push_back(ShadowPtrTy); + Type *RetType = T->getReturnType(); + if (!RetType->isVoidTy()) + RetType = StructType::get(RetType, ShadowTy, (Type *)0); + return FunctionType::get(RetType, ArgTypes, T->isVarArg()); +} + +FunctionType *DataFlowSanitizer::getTrampolineFunctionType(FunctionType *T) { + assert(!T->isVarArg()); + llvm::SmallVector<Type *, 4> ArgTypes; + ArgTypes.push_back(T->getPointerTo()); + std::copy(T->param_begin(), T->param_end(), std::back_inserter(ArgTypes)); + for (unsigned i = 0, e = T->getNumParams(); i != e; ++i) + ArgTypes.push_back(ShadowTy); + Type *RetType = T->getReturnType(); + if (!RetType->isVoidTy()) + ArgTypes.push_back(ShadowPtrTy); + return FunctionType::get(T->getReturnType(), ArgTypes, false); +} + +FunctionType *DataFlowSanitizer::getCustomFunctionType(FunctionType *T) { + assert(!T->isVarArg()); + llvm::SmallVector<Type *, 4> ArgTypes; + for (FunctionType::param_iterator i = T->param_begin(), e = T->param_end(); + i != e; ++i) { + FunctionType *FT; + if (isa<PointerType>(*i) && (FT = dyn_cast<FunctionType>(cast<PointerType>( + *i)->getElementType()))) { + ArgTypes.push_back(getTrampolineFunctionType(FT)->getPointerTo()); + ArgTypes.push_back(Type::getInt8PtrTy(*Ctx)); + } else { + ArgTypes.push_back(*i); + } + } + for (unsigned i = 0, e = T->getNumParams(); i != e; ++i) + ArgTypes.push_back(ShadowTy); + Type *RetType = T->getReturnType(); + if (!RetType->isVoidTy()) + ArgTypes.push_back(ShadowPtrTy); + return FunctionType::get(T->getReturnType(), ArgTypes, false); +} + +bool DataFlowSanitizer::doInitialization(Module &M) { + DL = getAnalysisIfAvailable<DataLayout>(); + if (!DL) + return false; + + Mod = &M; + Ctx = &M.getContext(); + ShadowTy = IntegerType::get(*Ctx, ShadowWidth); + ShadowPtrTy = PointerType::getUnqual(ShadowTy); + IntptrTy = DL->getIntPtrType(*Ctx); + ZeroShadow = ConstantInt::getSigned(ShadowTy, 0); + ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0x700000000000LL); + ShadowPtrMul = ConstantInt::getSigned(IntptrTy, ShadowWidth / 8); + + Type *DFSanUnionArgs[2] = { ShadowTy, ShadowTy }; + DFSanUnionFnTy = + FunctionType::get(ShadowTy, DFSanUnionArgs, /*isVarArg=*/ false); + Type *DFSanUnionLoadArgs[2] = { ShadowPtrTy, IntptrTy }; + DFSanUnionLoadFnTy = + FunctionType::get(ShadowTy, 
DFSanUnionLoadArgs, /*isVarArg=*/ false); + DFSanUnimplementedFnTy = FunctionType::get( + Type::getVoidTy(*Ctx), Type::getInt8PtrTy(*Ctx), /*isVarArg=*/false); + Type *DFSanSetLabelArgs[3] = { ShadowTy, Type::getInt8PtrTy(*Ctx), IntptrTy }; + DFSanSetLabelFnTy = FunctionType::get(Type::getVoidTy(*Ctx), + DFSanSetLabelArgs, /*isVarArg=*/false); + DFSanNonzeroLabelFnTy = FunctionType::get( + Type::getVoidTy(*Ctx), ArrayRef<Type *>(), /*isVarArg=*/false); + + if (GetArgTLSPtr) { + Type *ArgTLSTy = ArrayType::get(ShadowTy, 64); + ArgTLS = 0; + GetArgTLS = ConstantExpr::getIntToPtr( + ConstantInt::get(IntptrTy, uintptr_t(GetArgTLSPtr)), + PointerType::getUnqual( + FunctionType::get(PointerType::getUnqual(ArgTLSTy), (Type *)0))); + } + if (GetRetvalTLSPtr) { + RetvalTLS = 0; + GetRetvalTLS = ConstantExpr::getIntToPtr( + ConstantInt::get(IntptrTy, uintptr_t(GetRetvalTLSPtr)), + PointerType::getUnqual( + FunctionType::get(PointerType::getUnqual(ShadowTy), (Type *)0))); + } + + ColdCallWeights = MDBuilder(*Ctx).createBranchWeights(1, 1000); + return true; +} + +bool DataFlowSanitizer::isInstrumented(const Function *F) { + return !ABIList->isIn(*F, "uninstrumented"); +} + +bool DataFlowSanitizer::isInstrumented(const GlobalAlias *GA) { + return !ABIList->isIn(*GA, "uninstrumented"); +} + +DataFlowSanitizer::InstrumentedABI DataFlowSanitizer::getInstrumentedABI() { + return ClArgsABI ? IA_Args : IA_TLS; +} + +DataFlowSanitizer::WrapperKind DataFlowSanitizer::getWrapperKind(Function *F) { + if (ABIList->isIn(*F, "functional")) + return WK_Functional; + if (ABIList->isIn(*F, "discard")) + return WK_Discard; + if (ABIList->isIn(*F, "custom")) + return WK_Custom; + + return WK_Warning; +} + +void DataFlowSanitizer::addGlobalNamePrefix(GlobalValue *GV) { + std::string GVName = GV->getName(), Prefix = "dfs$"; + GV->setName(Prefix + GVName); + + // Try to change the name of the function in module inline asm. We only do + // this for specific asm directives, currently only ".symver", to try to avoid + // corrupting asm which happens to contain the symbol name as a substring. + // Note that the substitution for .symver assumes that the versioned symbol + // also has an instrumented name. 
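+  // For example, given a hypothetical versioned symbol "f":
+  //   .symver f,f@@VERS_1  becomes  .symver dfs$f,dfs$f@@VERS_1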
+ std::string Asm = GV->getParent()->getModuleInlineAsm(); + std::string SearchStr = ".symver " + GVName + ","; + size_t Pos = Asm.find(SearchStr); + if (Pos != std::string::npos) { + Asm.replace(Pos, SearchStr.size(), + ".symver " + Prefix + GVName + "," + Prefix); + GV->getParent()->setModuleInlineAsm(Asm); + } +} + +Function * +DataFlowSanitizer::buildWrapperFunction(Function *F, StringRef NewFName, + GlobalValue::LinkageTypes NewFLink, + FunctionType *NewFT) { + FunctionType *FT = F->getFunctionType(); + Function *NewF = Function::Create(NewFT, NewFLink, NewFName, + F->getParent()); + NewF->copyAttributesFrom(F); + NewF->removeAttributes( + AttributeSet::ReturnIndex, + AttributeFuncs::typeIncompatible(NewFT->getReturnType(), + AttributeSet::ReturnIndex)); + + BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", NewF); + std::vector<Value *> Args; + unsigned n = FT->getNumParams(); + for (Function::arg_iterator ai = NewF->arg_begin(); n != 0; ++ai, --n) + Args.push_back(&*ai); + CallInst *CI = CallInst::Create(F, Args, "", BB); + if (FT->getReturnType()->isVoidTy()) + ReturnInst::Create(*Ctx, BB); + else + ReturnInst::Create(*Ctx, CI, BB); + + return NewF; +} + +Constant *DataFlowSanitizer::getOrBuildTrampolineFunction(FunctionType *FT, + StringRef FName) { + FunctionType *FTT = getTrampolineFunctionType(FT); + Constant *C = Mod->getOrInsertFunction(FName, FTT); + Function *F = dyn_cast<Function>(C); + if (F && F->isDeclaration()) { + F->setLinkage(GlobalValue::LinkOnceODRLinkage); + BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", F); + std::vector<Value *> Args; + Function::arg_iterator AI = F->arg_begin(); ++AI; + for (unsigned N = FT->getNumParams(); N != 0; ++AI, --N) + Args.push_back(&*AI); + CallInst *CI = + CallInst::Create(&F->getArgumentList().front(), Args, "", BB); + ReturnInst *RI; + if (FT->getReturnType()->isVoidTy()) + RI = ReturnInst::Create(*Ctx, BB); + else + RI = ReturnInst::Create(*Ctx, CI, BB); + + DFSanFunction DFSF(*this, F, /*IsNativeABI=*/true); + Function::arg_iterator ValAI = F->arg_begin(), ShadowAI = AI; ++ValAI; + for (unsigned N = FT->getNumParams(); N != 0; ++ValAI, ++ShadowAI, --N) + DFSF.ValShadowMap[ValAI] = ShadowAI; + DFSanVisitor(DFSF).visitCallInst(*CI); + if (!FT->getReturnType()->isVoidTy()) + new StoreInst(DFSF.getShadow(RI->getReturnValue()), + &F->getArgumentList().back(), RI); + } + + return C; +} + +bool DataFlowSanitizer::runOnModule(Module &M) { + if (!DL) + return false; + + if (ABIList->isIn(M, "skip")) + return false; + + if (!GetArgTLSPtr) { + Type *ArgTLSTy = ArrayType::get(ShadowTy, 64); + ArgTLS = Mod->getOrInsertGlobal("__dfsan_arg_tls", ArgTLSTy); + if (GlobalVariable *G = dyn_cast<GlobalVariable>(ArgTLS)) + G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel); + } + if (!GetRetvalTLSPtr) { + RetvalTLS = Mod->getOrInsertGlobal("__dfsan_retval_tls", ShadowTy); + if (GlobalVariable *G = dyn_cast<GlobalVariable>(RetvalTLS)) + G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel); + } + + DFSanUnionFn = Mod->getOrInsertFunction("__dfsan_union", DFSanUnionFnTy); + if (Function *F = dyn_cast<Function>(DFSanUnionFn)) { + F->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone); + F->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt); + F->addAttribute(1, Attribute::ZExt); + F->addAttribute(2, Attribute::ZExt); + } + DFSanUnionLoadFn = + Mod->getOrInsertFunction("__dfsan_union_load", DFSanUnionLoadFnTy); + if (Function *F = dyn_cast<Function>(DFSanUnionLoadFn)) { + F->addAttribute(AttributeSet::ReturnIndex, 
Attribute::ZExt); + } + DFSanUnimplementedFn = + Mod->getOrInsertFunction("__dfsan_unimplemented", DFSanUnimplementedFnTy); + DFSanSetLabelFn = + Mod->getOrInsertFunction("__dfsan_set_label", DFSanSetLabelFnTy); + if (Function *F = dyn_cast<Function>(DFSanSetLabelFn)) { + F->addAttribute(1, Attribute::ZExt); + } + DFSanNonzeroLabelFn = + Mod->getOrInsertFunction("__dfsan_nonzero_label", DFSanNonzeroLabelFnTy); + + std::vector<Function *> FnsToInstrument; + llvm::SmallPtrSet<Function *, 2> FnsWithNativeABI; + for (Module::iterator i = M.begin(), e = M.end(); i != e; ++i) { + if (!i->isIntrinsic() && + i != DFSanUnionFn && + i != DFSanUnionLoadFn && + i != DFSanUnimplementedFn && + i != DFSanSetLabelFn && + i != DFSanNonzeroLabelFn) + FnsToInstrument.push_back(&*i); + } + + // Give function aliases prefixes when necessary, and build wrappers where the + // instrumentedness is inconsistent. + for (Module::alias_iterator i = M.alias_begin(), e = M.alias_end(); i != e;) { + GlobalAlias *GA = &*i; + ++i; + // Don't stop on weak. We assume people aren't playing games with the + // instrumentedness of overridden weak aliases. + if (Function *F = dyn_cast<Function>( + GA->resolveAliasedGlobal(/*stopOnWeak=*/false))) { + bool GAInst = isInstrumented(GA), FInst = isInstrumented(F); + if (GAInst && FInst) { + addGlobalNamePrefix(GA); + } else if (GAInst != FInst) { + // Non-instrumented alias of an instrumented function, or vice versa. + // Replace the alias with a native-ABI wrapper of the aliasee. The pass + // below will take care of instrumenting it. + Function *NewF = + buildWrapperFunction(F, "", GA->getLinkage(), F->getFunctionType()); + GA->replaceAllUsesWith(NewF); + NewF->takeName(GA); + GA->eraseFromParent(); + FnsToInstrument.push_back(NewF); + } + } + } + + AttrBuilder B; + B.addAttribute(Attribute::ReadOnly).addAttribute(Attribute::ReadNone); + ReadOnlyNoneAttrs = AttributeSet::get(*Ctx, AttributeSet::FunctionIndex, B); + + // First, change the ABI of every function in the module. ABI-listed + // functions keep their original ABI and get a wrapper function. + for (std::vector<Function *>::iterator i = FnsToInstrument.begin(), + e = FnsToInstrument.end(); + i != e; ++i) { + Function &F = **i; + FunctionType *FT = F.getFunctionType(); + + bool IsZeroArgsVoidRet = (FT->getNumParams() == 0 && !FT->isVarArg() && + FT->getReturnType()->isVoidTy()); + + if (isInstrumented(&F)) { + // Instrumented functions get a 'dfs$' prefix. This allows us to more + // easily identify cases of mismatching ABIs. 
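+      // For example, a hypothetical instrumented function "f" is renamed
+      // "dfs$f", and under the IA_Args ABI its type is also rewritten by
+      // getArgsFunctionType below to take one extra shadow argument per
+      // parameter and to return the value's shadow in a struct.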
+ if (getInstrumentedABI() == IA_Args && !IsZeroArgsVoidRet) { + FunctionType *NewFT = getArgsFunctionType(FT); + Function *NewF = Function::Create(NewFT, F.getLinkage(), "", &M); + NewF->copyAttributesFrom(&F); + NewF->removeAttributes( + AttributeSet::ReturnIndex, + AttributeFuncs::typeIncompatible(NewFT->getReturnType(), + AttributeSet::ReturnIndex)); + for (Function::arg_iterator FArg = F.arg_begin(), + NewFArg = NewF->arg_begin(), + FArgEnd = F.arg_end(); + FArg != FArgEnd; ++FArg, ++NewFArg) { + FArg->replaceAllUsesWith(NewFArg); + } + NewF->getBasicBlockList().splice(NewF->begin(), F.getBasicBlockList()); + + for (Function::use_iterator ui = F.use_begin(), ue = F.use_end(); + ui != ue;) { + BlockAddress *BA = dyn_cast<BlockAddress>(ui.getUse().getUser()); + ++ui; + if (BA) { + BA->replaceAllUsesWith( + BlockAddress::get(NewF, BA->getBasicBlock())); + delete BA; + } + } + F.replaceAllUsesWith( + ConstantExpr::getBitCast(NewF, PointerType::getUnqual(FT))); + NewF->takeName(&F); + F.eraseFromParent(); + *i = NewF; + addGlobalNamePrefix(NewF); + } else { + addGlobalNamePrefix(&F); + } + // Hopefully, nobody will try to indirectly call a vararg + // function... yet. + } else if (FT->isVarArg()) { + UnwrappedFnMap[&F] = &F; + *i = 0; + } else if (!IsZeroArgsVoidRet || getWrapperKind(&F) == WK_Custom) { + // Build a wrapper function for F. The wrapper simply calls F, and is + // added to FnsToInstrument so that any instrumentation according to its + // WrapperKind is done in the second pass below. + FunctionType *NewFT = getInstrumentedABI() == IA_Args + ? getArgsFunctionType(FT) + : FT; + Function *NewF = buildWrapperFunction( + &F, std::string("dfsw$") + std::string(F.getName()), + GlobalValue::LinkOnceODRLinkage, NewFT); + if (getInstrumentedABI() == IA_TLS) + NewF->removeAttributes(AttributeSet::FunctionIndex, ReadOnlyNoneAttrs); + + Value *WrappedFnCst = + ConstantExpr::getBitCast(NewF, PointerType::getUnqual(FT)); + F.replaceAllUsesWith(WrappedFnCst); + UnwrappedFnMap[WrappedFnCst] = &F; + *i = NewF; + + if (!F.isDeclaration()) { + // This function is probably defining an interposition of an + // uninstrumented function and hence needs to keep the original ABI. + // But any functions it may call need to use the instrumented ABI, so + // we instrument it in a mode which preserves the original ABI. + FnsWithNativeABI.insert(&F); + + // This code needs to rebuild the iterators, as they may be invalidated + // by the push_back, taking care that the new range does not include + // any functions added by this code. + size_t N = i - FnsToInstrument.begin(), + Count = e - FnsToInstrument.begin(); + FnsToInstrument.push_back(&F); + i = FnsToInstrument.begin() + N; + e = FnsToInstrument.begin() + Count; + } + } + } + + for (std::vector<Function *>::iterator i = FnsToInstrument.begin(), + e = FnsToInstrument.end(); + i != e; ++i) { + if (!*i || (*i)->isDeclaration()) + continue; + + removeUnreachableBlocks(**i); + + DFSanFunction DFSF(*this, *i, FnsWithNativeABI.count(*i)); + + // DFSanVisitor may create new basic blocks, which confuses df_iterator. + // Build a copy of the list before iterating over it. 
+ llvm::SmallVector<BasicBlock *, 4> BBList; + std::copy(df_begin(&(*i)->getEntryBlock()), df_end(&(*i)->getEntryBlock()), + std::back_inserter(BBList)); + + for (llvm::SmallVector<BasicBlock *, 4>::iterator i = BBList.begin(), + e = BBList.end(); + i != e; ++i) { + Instruction *Inst = &(*i)->front(); + while (1) { + // DFSanVisitor may split the current basic block, changing the current + // instruction's next pointer and moving the next instruction to the + // tail block from which we should continue. + Instruction *Next = Inst->getNextNode(); + // DFSanVisitor may delete Inst, so keep track of whether it was a + // terminator. + bool IsTerminator = isa<TerminatorInst>(Inst); + if (!DFSF.SkipInsts.count(Inst)) + DFSanVisitor(DFSF).visit(Inst); + if (IsTerminator) + break; + Inst = Next; + } + } + + // We will not necessarily be able to compute the shadow for every phi node + // until we have visited every block. Therefore, the code that handles phi + // nodes adds them to the PHIFixups list so that they can be properly + // handled here. + for (std::vector<std::pair<PHINode *, PHINode *> >::iterator + i = DFSF.PHIFixups.begin(), + e = DFSF.PHIFixups.end(); + i != e; ++i) { + for (unsigned val = 0, n = i->first->getNumIncomingValues(); val != n; + ++val) { + i->second->setIncomingValue( + val, DFSF.getShadow(i->first->getIncomingValue(val))); + } + } + + // -dfsan-debug-nonzero-labels will split the CFG in all kinds of crazy + // places (i.e. instructions in basic blocks we haven't even begun visiting + // yet). To make our life easier, do this work in a pass after the main + // instrumentation. + if (ClDebugNonzeroLabels) { + for (DenseSet<Value *>::iterator i = DFSF.NonZeroChecks.begin(), + e = DFSF.NonZeroChecks.end(); + i != e; ++i) { + Instruction *Pos; + if (Instruction *I = dyn_cast<Instruction>(*i)) + Pos = I->getNextNode(); + else + Pos = DFSF.F->getEntryBlock().begin(); + while (isa<PHINode>(Pos) || isa<AllocaInst>(Pos)) + Pos = Pos->getNextNode(); + IRBuilder<> IRB(Pos); + Instruction *NeInst = cast<Instruction>( + IRB.CreateICmpNE(*i, DFSF.DFS.ZeroShadow)); + BranchInst *BI = cast<BranchInst>(SplitBlockAndInsertIfThen( + NeInst, /*Unreachable=*/ false, ColdCallWeights)); + IRBuilder<> ThenIRB(BI); + ThenIRB.CreateCall(DFSF.DFS.DFSanNonzeroLabelFn); + } + } + } + + return false; +} + +Value *DFSanFunction::getArgTLSPtr() { + if (ArgTLSPtr) + return ArgTLSPtr; + if (DFS.ArgTLS) + return ArgTLSPtr = DFS.ArgTLS; + + IRBuilder<> IRB(F->getEntryBlock().begin()); + return ArgTLSPtr = IRB.CreateCall(DFS.GetArgTLS); +} + +Value *DFSanFunction::getRetvalTLS() { + if (RetvalTLSPtr) + return RetvalTLSPtr; + if (DFS.RetvalTLS) + return RetvalTLSPtr = DFS.RetvalTLS; + + IRBuilder<> IRB(F->getEntryBlock().begin()); + return RetvalTLSPtr = IRB.CreateCall(DFS.GetRetvalTLS); +} + +Value *DFSanFunction::getArgTLS(unsigned Idx, Instruction *Pos) { + IRBuilder<> IRB(Pos); + return IRB.CreateConstGEP2_64(getArgTLSPtr(), 0, Idx); +} + +Value *DFSanFunction::getShadow(Value *V) { + if (!isa<Argument>(V) && !isa<Instruction>(V)) + return DFS.ZeroShadow; + Value *&Shadow = ValShadowMap[V]; + if (!Shadow) { + if (Argument *A = dyn_cast<Argument>(V)) { + if (IsNativeABI) + return DFS.ZeroShadow; + switch (IA) { + case DataFlowSanitizer::IA_TLS: { + Value *ArgTLSPtr = getArgTLSPtr(); + Instruction *ArgTLSPos = + DFS.ArgTLS ? 
&*F->getEntryBlock().begin() + : cast<Instruction>(ArgTLSPtr)->getNextNode(); + IRBuilder<> IRB(ArgTLSPos); + Shadow = IRB.CreateLoad(getArgTLS(A->getArgNo(), ArgTLSPos)); + break; + } + case DataFlowSanitizer::IA_Args: { + unsigned ArgIdx = A->getArgNo() + F->getArgumentList().size() / 2; + Function::arg_iterator i = F->arg_begin(); + while (ArgIdx--) + ++i; + Shadow = i; + assert(Shadow->getType() == DFS.ShadowTy); + break; + } + } + NonZeroChecks.insert(Shadow); + } else { + Shadow = DFS.ZeroShadow; + } + } + return Shadow; +} + +void DFSanFunction::setShadow(Instruction *I, Value *Shadow) { + assert(!ValShadowMap.count(I)); + assert(Shadow->getType() == DFS.ShadowTy); + ValShadowMap[I] = Shadow; +} + +Value *DataFlowSanitizer::getShadowAddress(Value *Addr, Instruction *Pos) { + assert(Addr != RetvalTLS && "Reinstrumenting?"); + IRBuilder<> IRB(Pos); + return IRB.CreateIntToPtr( + IRB.CreateMul( + IRB.CreateAnd(IRB.CreatePtrToInt(Addr, IntptrTy), ShadowPtrMask), + ShadowPtrMul), + ShadowPtrTy); +} + +// Generates IR to compute the union of the two given shadows, inserting it +// before Pos. Returns the computed union Value. +Value *DataFlowSanitizer::combineShadows(Value *V1, Value *V2, + Instruction *Pos) { + if (V1 == ZeroShadow) + return V2; + if (V2 == ZeroShadow) + return V1; + if (V1 == V2) + return V1; + IRBuilder<> IRB(Pos); + BasicBlock *Head = Pos->getParent(); + Value *Ne = IRB.CreateICmpNE(V1, V2); + Instruction *NeInst = dyn_cast<Instruction>(Ne); + if (NeInst) { + BranchInst *BI = cast<BranchInst>(SplitBlockAndInsertIfThen( + NeInst, /*Unreachable=*/ false, ColdCallWeights)); + IRBuilder<> ThenIRB(BI); + CallInst *Call = ThenIRB.CreateCall2(DFSanUnionFn, V1, V2); + Call->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt); + Call->addAttribute(1, Attribute::ZExt); + Call->addAttribute(2, Attribute::ZExt); + + BasicBlock *Tail = BI->getSuccessor(0); + PHINode *Phi = PHINode::Create(ShadowTy, 2, "", Tail->begin()); + Phi->addIncoming(Call, Call->getParent()); + Phi->addIncoming(V1, Head); + Pos = Phi; + return Phi; + } else { + assert(0 && "todo"); + return 0; + } +} + +// A convenience function which folds the shadows of each of the operands +// of the provided instruction Inst, inserting the IR before Inst. Returns +// the computed union Value. +Value *DFSanFunction::combineOperandShadows(Instruction *Inst) { + if (Inst->getNumOperands() == 0) + return DFS.ZeroShadow; + + Value *Shadow = getShadow(Inst->getOperand(0)); + for (unsigned i = 1, n = Inst->getNumOperands(); i != n; ++i) { + Shadow = DFS.combineShadows(Shadow, getShadow(Inst->getOperand(i)), Inst); + } + return Shadow; +} + +void DFSanVisitor::visitOperandShadowInst(Instruction &I) { + Value *CombinedShadow = DFSF.combineOperandShadows(&I); + DFSF.setShadow(&I, CombinedShadow); +} + +// Generates IR to load shadow corresponding to bytes [Addr, Addr+Size), where +// Addr has alignment Align, and take the union of each of those shadows. 
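+// For example (an illustrative case), a 4-byte load with Align == 4 and the
+// 16-bit shadow width reads a single 64-bit shadow word at alignment 8 and
+// folds the four labels it contains into one.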
+Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align, + Instruction *Pos) { + if (AllocaInst *AI = dyn_cast<AllocaInst>(Addr)) { + llvm::DenseMap<AllocaInst *, AllocaInst *>::iterator i = + AllocaShadowMap.find(AI); + if (i != AllocaShadowMap.end()) { + IRBuilder<> IRB(Pos); + return IRB.CreateLoad(i->second); + } + } + + uint64_t ShadowAlign = Align * DFS.ShadowWidth / 8; + SmallVector<Value *, 2> Objs; + GetUnderlyingObjects(Addr, Objs, DFS.DL); + bool AllConstants = true; + for (SmallVector<Value *, 2>::iterator i = Objs.begin(), e = Objs.end(); + i != e; ++i) { + if (isa<Function>(*i) || isa<BlockAddress>(*i)) + continue; + if (isa<GlobalVariable>(*i) && cast<GlobalVariable>(*i)->isConstant()) + continue; + + AllConstants = false; + break; + } + if (AllConstants) + return DFS.ZeroShadow; + + Value *ShadowAddr = DFS.getShadowAddress(Addr, Pos); + switch (Size) { + case 0: + return DFS.ZeroShadow; + case 1: { + LoadInst *LI = new LoadInst(ShadowAddr, "", Pos); + LI->setAlignment(ShadowAlign); + return LI; + } + case 2: { + IRBuilder<> IRB(Pos); + Value *ShadowAddr1 = + IRB.CreateGEP(ShadowAddr, ConstantInt::get(DFS.IntptrTy, 1)); + return DFS.combineShadows(IRB.CreateAlignedLoad(ShadowAddr, ShadowAlign), + IRB.CreateAlignedLoad(ShadowAddr1, ShadowAlign), + Pos); + } + } + if (Size % (64 / DFS.ShadowWidth) == 0) { + // Fast path for the common case where each byte has identical shadow: load + // shadow 64 bits at a time, fall out to a __dfsan_union_load call if any + // shadow is non-equal. + BasicBlock *FallbackBB = BasicBlock::Create(*DFS.Ctx, "", F); + IRBuilder<> FallbackIRB(FallbackBB); + CallInst *FallbackCall = FallbackIRB.CreateCall2( + DFS.DFSanUnionLoadFn, ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)); + FallbackCall->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt); + + // Compare each of the shadows stored in the loaded 64 bits to each other, + // by computing (WideShadow rotl ShadowWidth) == WideShadow. + IRBuilder<> IRB(Pos); + Value *WideAddr = + IRB.CreateBitCast(ShadowAddr, Type::getInt64PtrTy(*DFS.Ctx)); + Value *WideShadow = IRB.CreateAlignedLoad(WideAddr, ShadowAlign); + Value *TruncShadow = IRB.CreateTrunc(WideShadow, DFS.ShadowTy); + Value *ShlShadow = IRB.CreateShl(WideShadow, DFS.ShadowWidth); + Value *ShrShadow = IRB.CreateLShr(WideShadow, 64 - DFS.ShadowWidth); + Value *RotShadow = IRB.CreateOr(ShlShadow, ShrShadow); + Value *ShadowsEq = IRB.CreateICmpEQ(WideShadow, RotShadow); + + BasicBlock *Head = Pos->getParent(); + BasicBlock *Tail = Head->splitBasicBlock(Pos); + // In the following code LastBr will refer to the previous basic block's + // conditional branch instruction, whose true successor is fixed up to point + // to the next block during the loop below or to the tail after the final + // iteration. 
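+  // Sketch of the CFG built below for, e.g., an 8-byte load (two 64-bit
+  // shadow words; the block names are illustrative):
+  //   head:     all labels in word0 equal?  yes -> next, no -> fallback
+  //   next:     word0 == word1?             yes -> tail, no -> fallback
+  //   fallback: %u = call i16 @__dfsan_union_load(...); br tail
+  //   tail:     phi [trunc(word0), next], [%u, fallback]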
+ BranchInst *LastBr = BranchInst::Create(FallbackBB, FallbackBB, ShadowsEq); + ReplaceInstWithInst(Head->getTerminator(), LastBr); + + for (uint64_t Ofs = 64 / DFS.ShadowWidth; Ofs != Size; + Ofs += 64 / DFS.ShadowWidth) { + BasicBlock *NextBB = BasicBlock::Create(*DFS.Ctx, "", F); + IRBuilder<> NextIRB(NextBB); + WideAddr = NextIRB.CreateGEP(WideAddr, ConstantInt::get(DFS.IntptrTy, 1)); + Value *NextWideShadow = NextIRB.CreateAlignedLoad(WideAddr, ShadowAlign); + ShadowsEq = NextIRB.CreateICmpEQ(WideShadow, NextWideShadow); + LastBr->setSuccessor(0, NextBB); + LastBr = NextIRB.CreateCondBr(ShadowsEq, FallbackBB, FallbackBB); + } + + LastBr->setSuccessor(0, Tail); + FallbackIRB.CreateBr(Tail); + PHINode *Shadow = PHINode::Create(DFS.ShadowTy, 2, "", &Tail->front()); + Shadow->addIncoming(FallbackCall, FallbackBB); + Shadow->addIncoming(TruncShadow, LastBr->getParent()); + return Shadow; + } + + IRBuilder<> IRB(Pos); + CallInst *FallbackCall = IRB.CreateCall2( + DFS.DFSanUnionLoadFn, ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)); + FallbackCall->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt); + return FallbackCall; +} + +void DFSanVisitor::visitLoadInst(LoadInst &LI) { + uint64_t Size = DFSF.DFS.DL->getTypeStoreSize(LI.getType()); + uint64_t Align; + if (ClPreserveAlignment) { + Align = LI.getAlignment(); + if (Align == 0) + Align = DFSF.DFS.DL->getABITypeAlignment(LI.getType()); + } else { + Align = 1; + } + IRBuilder<> IRB(&LI); + Value *LoadedShadow = + DFSF.loadShadow(LI.getPointerOperand(), Size, Align, &LI); + Value *PtrShadow = DFSF.getShadow(LI.getPointerOperand()); + Value *CombinedShadow = DFSF.DFS.combineShadows(LoadedShadow, PtrShadow, &LI); + if (CombinedShadow != DFSF.DFS.ZeroShadow) + DFSF.NonZeroChecks.insert(CombinedShadow); + + DFSF.setShadow(&LI, CombinedShadow); +} + +void DFSanFunction::storeShadow(Value *Addr, uint64_t Size, uint64_t Align, + Value *Shadow, Instruction *Pos) { + if (AllocaInst *AI = dyn_cast<AllocaInst>(Addr)) { + llvm::DenseMap<AllocaInst *, AllocaInst *>::iterator i = + AllocaShadowMap.find(AI); + if (i != AllocaShadowMap.end()) { + IRBuilder<> IRB(Pos); + IRB.CreateStore(Shadow, i->second); + return; + } + } + + uint64_t ShadowAlign = Align * DFS.ShadowWidth / 8; + IRBuilder<> IRB(Pos); + Value *ShadowAddr = DFS.getShadowAddress(Addr, Pos); + if (Shadow == DFS.ZeroShadow) { + IntegerType *ShadowTy = IntegerType::get(*DFS.Ctx, Size * DFS.ShadowWidth); + Value *ExtZeroShadow = ConstantInt::get(ShadowTy, 0); + Value *ExtShadowAddr = + IRB.CreateBitCast(ShadowAddr, PointerType::getUnqual(ShadowTy)); + IRB.CreateAlignedStore(ExtZeroShadow, ExtShadowAddr, ShadowAlign); + return; + } + + const unsigned ShadowVecSize = 128 / DFS.ShadowWidth; + uint64_t Offset = 0; + if (Size >= ShadowVecSize) { + VectorType *ShadowVecTy = VectorType::get(DFS.ShadowTy, ShadowVecSize); + Value *ShadowVec = UndefValue::get(ShadowVecTy); + for (unsigned i = 0; i != ShadowVecSize; ++i) { + ShadowVec = IRB.CreateInsertElement( + ShadowVec, Shadow, ConstantInt::get(Type::getInt32Ty(*DFS.Ctx), i)); + } + Value *ShadowVecAddr = + IRB.CreateBitCast(ShadowAddr, PointerType::getUnqual(ShadowVecTy)); + do { + Value *CurShadowVecAddr = IRB.CreateConstGEP1_32(ShadowVecAddr, Offset); + IRB.CreateAlignedStore(ShadowVec, CurShadowVecAddr, ShadowAlign); + Size -= ShadowVecSize; + ++Offset; + } while (Size >= ShadowVecSize); + Offset *= ShadowVecSize; + } + while (Size > 0) { + Value *CurShadowAddr = IRB.CreateConstGEP1_32(ShadowAddr, Offset); + IRB.CreateAlignedStore(Shadow, 
CurShadowAddr, ShadowAlign); + --Size; + ++Offset; + } +} + +void DFSanVisitor::visitStoreInst(StoreInst &SI) { + uint64_t Size = + DFSF.DFS.DL->getTypeStoreSize(SI.getValueOperand()->getType()); + uint64_t Align; + if (ClPreserveAlignment) { + Align = SI.getAlignment(); + if (Align == 0) + Align = DFSF.DFS.DL->getABITypeAlignment(SI.getValueOperand()->getType()); + } else { + Align = 1; + } + DFSF.storeShadow(SI.getPointerOperand(), Size, Align, + DFSF.getShadow(SI.getValueOperand()), &SI); +} + +void DFSanVisitor::visitBinaryOperator(BinaryOperator &BO) { + visitOperandShadowInst(BO); +} + +void DFSanVisitor::visitCastInst(CastInst &CI) { visitOperandShadowInst(CI); } + +void DFSanVisitor::visitCmpInst(CmpInst &CI) { visitOperandShadowInst(CI); } + +void DFSanVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) { + visitOperandShadowInst(GEPI); +} + +void DFSanVisitor::visitExtractElementInst(ExtractElementInst &I) { + visitOperandShadowInst(I); +} + +void DFSanVisitor::visitInsertElementInst(InsertElementInst &I) { + visitOperandShadowInst(I); +} + +void DFSanVisitor::visitShuffleVectorInst(ShuffleVectorInst &I) { + visitOperandShadowInst(I); +} + +void DFSanVisitor::visitExtractValueInst(ExtractValueInst &I) { + visitOperandShadowInst(I); +} + +void DFSanVisitor::visitInsertValueInst(InsertValueInst &I) { + visitOperandShadowInst(I); +} + +void DFSanVisitor::visitAllocaInst(AllocaInst &I) { + bool AllLoadsStores = true; + for (Instruction::use_iterator i = I.use_begin(), e = I.use_end(); i != e; + ++i) { + if (isa<LoadInst>(*i)) + continue; + + if (StoreInst *SI = dyn_cast<StoreInst>(*i)) { + if (SI->getPointerOperand() == &I) + continue; + } + + AllLoadsStores = false; + break; + } + if (AllLoadsStores) { + IRBuilder<> IRB(&I); + DFSF.AllocaShadowMap[&I] = IRB.CreateAlloca(DFSF.DFS.ShadowTy); + } + DFSF.setShadow(&I, DFSF.DFS.ZeroShadow); +} + +void DFSanVisitor::visitSelectInst(SelectInst &I) { + Value *CondShadow = DFSF.getShadow(I.getCondition()); + Value *TrueShadow = DFSF.getShadow(I.getTrueValue()); + Value *FalseShadow = DFSF.getShadow(I.getFalseValue()); + + if (isa<VectorType>(I.getCondition()->getType())) { + DFSF.setShadow( + &I, DFSF.DFS.combineShadows( + CondShadow, + DFSF.DFS.combineShadows(TrueShadow, FalseShadow, &I), &I)); + } else { + Value *ShadowSel; + if (TrueShadow == FalseShadow) { + ShadowSel = TrueShadow; + } else { + ShadowSel = + SelectInst::Create(I.getCondition(), TrueShadow, FalseShadow, "", &I); + } + DFSF.setShadow(&I, DFSF.DFS.combineShadows(CondShadow, ShadowSel, &I)); + } +} + +void DFSanVisitor::visitMemSetInst(MemSetInst &I) { + IRBuilder<> IRB(&I); + Value *ValShadow = DFSF.getShadow(I.getValue()); + IRB.CreateCall3( + DFSF.DFS.DFSanSetLabelFn, ValShadow, + IRB.CreateBitCast(I.getDest(), Type::getInt8PtrTy(*DFSF.DFS.Ctx)), + IRB.CreateZExtOrTrunc(I.getLength(), DFSF.DFS.IntptrTy)); +} + +void DFSanVisitor::visitMemTransferInst(MemTransferInst &I) { + IRBuilder<> IRB(&I); + Value *DestShadow = DFSF.DFS.getShadowAddress(I.getDest(), &I); + Value *SrcShadow = DFSF.DFS.getShadowAddress(I.getSource(), &I); + Value *LenShadow = IRB.CreateMul( + I.getLength(), + ConstantInt::get(I.getLength()->getType(), DFSF.DFS.ShadowWidth / 8)); + Value *AlignShadow; + if (ClPreserveAlignment) { + AlignShadow = IRB.CreateMul(I.getAlignmentCst(), + ConstantInt::get(I.getAlignmentCst()->getType(), + DFSF.DFS.ShadowWidth / 8)); + } else { + AlignShadow = ConstantInt::get(I.getAlignmentCst()->getType(), + DFSF.DFS.ShadowWidth / 8); + } + Type *Int8Ptr = 
Type::getInt8PtrTy(*DFSF.DFS.Ctx); + DestShadow = IRB.CreateBitCast(DestShadow, Int8Ptr); + SrcShadow = IRB.CreateBitCast(SrcShadow, Int8Ptr); + IRB.CreateCall5(I.getCalledValue(), DestShadow, SrcShadow, LenShadow, + AlignShadow, I.getVolatileCst()); +} + +void DFSanVisitor::visitReturnInst(ReturnInst &RI) { + if (!DFSF.IsNativeABI && RI.getReturnValue()) { + switch (DFSF.IA) { + case DataFlowSanitizer::IA_TLS: { + Value *S = DFSF.getShadow(RI.getReturnValue()); + IRBuilder<> IRB(&RI); + IRB.CreateStore(S, DFSF.getRetvalTLS()); + break; + } + case DataFlowSanitizer::IA_Args: { + IRBuilder<> IRB(&RI); + Type *RT = DFSF.F->getFunctionType()->getReturnType(); + Value *InsVal = + IRB.CreateInsertValue(UndefValue::get(RT), RI.getReturnValue(), 0); + Value *InsShadow = + IRB.CreateInsertValue(InsVal, DFSF.getShadow(RI.getReturnValue()), 1); + RI.setOperand(0, InsShadow); + break; + } + } + } +} + +void DFSanVisitor::visitCallSite(CallSite CS) { + Function *F = CS.getCalledFunction(); + if ((F && F->isIntrinsic()) || isa<InlineAsm>(CS.getCalledValue())) { + visitOperandShadowInst(*CS.getInstruction()); + return; + } + + IRBuilder<> IRB(CS.getInstruction()); + + DenseMap<Value *, Function *>::iterator i = + DFSF.DFS.UnwrappedFnMap.find(CS.getCalledValue()); + if (i != DFSF.DFS.UnwrappedFnMap.end()) { + Function *F = i->second; + switch (DFSF.DFS.getWrapperKind(F)) { + case DataFlowSanitizer::WK_Warning: { + CS.setCalledFunction(F); + IRB.CreateCall(DFSF.DFS.DFSanUnimplementedFn, + IRB.CreateGlobalStringPtr(F->getName())); + DFSF.setShadow(CS.getInstruction(), DFSF.DFS.ZeroShadow); + return; + } + case DataFlowSanitizer::WK_Discard: { + CS.setCalledFunction(F); + DFSF.setShadow(CS.getInstruction(), DFSF.DFS.ZeroShadow); + return; + } + case DataFlowSanitizer::WK_Functional: { + CS.setCalledFunction(F); + visitOperandShadowInst(*CS.getInstruction()); + return; + } + case DataFlowSanitizer::WK_Custom: { + // Don't try to handle invokes of custom functions, it's too complicated. + // Instead, invoke the dfsw$ wrapper, which will in turn call the __dfsw_ + // wrapper. + if (CallInst *CI = dyn_cast<CallInst>(CS.getInstruction())) { + FunctionType *FT = F->getFunctionType(); + FunctionType *CustomFT = DFSF.DFS.getCustomFunctionType(FT); + std::string CustomFName = "__dfsw_"; + CustomFName += F->getName(); + Constant *CustomF = + DFSF.DFS.Mod->getOrInsertFunction(CustomFName, CustomFT); + if (Function *CustomFn = dyn_cast<Function>(CustomF)) { + CustomFn->copyAttributesFrom(F); + + // Custom functions returning non-void will write to the return label. 
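+        // Custom wrappers receive the original arguments, then one shadow
+        // argument per parameter, and, for non-void functions, a trailing
+        // pointer through which they write the return label; hence they
+        // cannot keep readonly/readnone attributes.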
+ if (!FT->getReturnType()->isVoidTy()) { + CustomFn->removeAttributes(AttributeSet::FunctionIndex, + DFSF.DFS.ReadOnlyNoneAttrs); + } + } + + std::vector<Value *> Args; + + CallSite::arg_iterator i = CS.arg_begin(); + for (unsigned n = FT->getNumParams(); n != 0; ++i, --n) { + Type *T = (*i)->getType(); + FunctionType *ParamFT; + if (isa<PointerType>(T) && + (ParamFT = dyn_cast<FunctionType>( + cast<PointerType>(T)->getElementType()))) { + std::string TName = "dfst"; + TName += utostr(FT->getNumParams() - n); + TName += "$"; + TName += F->getName(); + Constant *T = DFSF.DFS.getOrBuildTrampolineFunction(ParamFT, TName); + Args.push_back(T); + Args.push_back( + IRB.CreateBitCast(*i, Type::getInt8PtrTy(*DFSF.DFS.Ctx))); + } else { + Args.push_back(*i); + } + } + + i = CS.arg_begin(); + for (unsigned n = FT->getNumParams(); n != 0; ++i, --n) + Args.push_back(DFSF.getShadow(*i)); + + if (!FT->getReturnType()->isVoidTy()) { + if (!DFSF.LabelReturnAlloca) { + DFSF.LabelReturnAlloca = + new AllocaInst(DFSF.DFS.ShadowTy, "labelreturn", + DFSF.F->getEntryBlock().begin()); + } + Args.push_back(DFSF.LabelReturnAlloca); + } + + CallInst *CustomCI = IRB.CreateCall(CustomF, Args); + CustomCI->setCallingConv(CI->getCallingConv()); + CustomCI->setAttributes(CI->getAttributes()); + + if (!FT->getReturnType()->isVoidTy()) { + LoadInst *LabelLoad = IRB.CreateLoad(DFSF.LabelReturnAlloca); + DFSF.setShadow(CustomCI, LabelLoad); + } + + CI->replaceAllUsesWith(CustomCI); + CI->eraseFromParent(); + return; + } + break; + } + } + } + + FunctionType *FT = cast<FunctionType>( + CS.getCalledValue()->getType()->getPointerElementType()); + if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_TLS) { + for (unsigned i = 0, n = FT->getNumParams(); i != n; ++i) { + IRB.CreateStore(DFSF.getShadow(CS.getArgument(i)), + DFSF.getArgTLS(i, CS.getInstruction())); + } + } + + Instruction *Next = 0; + if (!CS.getType()->isVoidTy()) { + if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) { + if (II->getNormalDest()->getSinglePredecessor()) { + Next = II->getNormalDest()->begin(); + } else { + BasicBlock *NewBB = + SplitEdge(II->getParent(), II->getNormalDest(), &DFSF.DFS); + Next = NewBB->begin(); + } + } else { + Next = CS->getNextNode(); + } + + if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_TLS) { + IRBuilder<> NextIRB(Next); + LoadInst *LI = NextIRB.CreateLoad(DFSF.getRetvalTLS()); + DFSF.SkipInsts.insert(LI); + DFSF.setShadow(CS.getInstruction(), LI); + DFSF.NonZeroChecks.insert(LI); + } + } + + // Do all instrumentation for IA_Args down here to defer tampering with the + // CFG in a way that SplitEdge may be able to detect. 
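+  // For illustration, assuming the 16-bit shadow type used by this pass:
+  //   %r = call i32 @f(i32 %x)
+  // becomes
+  //   %rs = call { i32, i16 } @f(i32 %x, i16 %xs)
+  // and %r and its shadow are recovered by the extractvalue instructions
+  // inserted below.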
+ if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_Args) { + FunctionType *NewFT = DFSF.DFS.getArgsFunctionType(FT); + Value *Func = + IRB.CreateBitCast(CS.getCalledValue(), PointerType::getUnqual(NewFT)); + std::vector<Value *> Args; + + CallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end(); + for (unsigned n = FT->getNumParams(); n != 0; ++i, --n) + Args.push_back(*i); + + i = CS.arg_begin(); + for (unsigned n = FT->getNumParams(); n != 0; ++i, --n) + Args.push_back(DFSF.getShadow(*i)); + + if (FT->isVarArg()) { + unsigned VarArgSize = CS.arg_size() - FT->getNumParams(); + ArrayType *VarArgArrayTy = ArrayType::get(DFSF.DFS.ShadowTy, VarArgSize); + AllocaInst *VarArgShadow = + new AllocaInst(VarArgArrayTy, "", DFSF.F->getEntryBlock().begin()); + Args.push_back(IRB.CreateConstGEP2_32(VarArgShadow, 0, 0)); + for (unsigned n = 0; i != e; ++i, ++n) { + IRB.CreateStore(DFSF.getShadow(*i), + IRB.CreateConstGEP2_32(VarArgShadow, 0, n)); + Args.push_back(*i); + } + } + + CallSite NewCS; + if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) { + NewCS = IRB.CreateInvoke(Func, II->getNormalDest(), II->getUnwindDest(), + Args); + } else { + NewCS = IRB.CreateCall(Func, Args); + } + NewCS.setCallingConv(CS.getCallingConv()); + NewCS.setAttributes(CS.getAttributes().removeAttributes( + *DFSF.DFS.Ctx, AttributeSet::ReturnIndex, + AttributeFuncs::typeIncompatible(NewCS.getInstruction()->getType(), + AttributeSet::ReturnIndex))); + + if (Next) { + ExtractValueInst *ExVal = + ExtractValueInst::Create(NewCS.getInstruction(), 0, "", Next); + DFSF.SkipInsts.insert(ExVal); + ExtractValueInst *ExShadow = + ExtractValueInst::Create(NewCS.getInstruction(), 1, "", Next); + DFSF.SkipInsts.insert(ExShadow); + DFSF.setShadow(ExVal, ExShadow); + DFSF.NonZeroChecks.insert(ExShadow); + + CS.getInstruction()->replaceAllUsesWith(ExVal); + } + + CS.getInstruction()->eraseFromParent(); + } +} + +void DFSanVisitor::visitPHINode(PHINode &PN) { + PHINode *ShadowPN = + PHINode::Create(DFSF.DFS.ShadowTy, PN.getNumIncomingValues(), "", &PN); + + // Give the shadow phi node valid predecessors to fool SplitEdge into working. 
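+  // The undef incoming shadows are placeholders: the real incoming shadows
+  // may not be available yet, so they are filled in via PHIFixups once the
+  // whole function has been visited.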
+ Value *UndefShadow = UndefValue::get(DFSF.DFS.ShadowTy); + for (PHINode::block_iterator i = PN.block_begin(), e = PN.block_end(); i != e; + ++i) { + ShadowPN->addIncoming(UndefShadow, *i); + } + + DFSF.PHIFixups.push_back(std::make_pair(&PN, ShadowPN)); + DFSF.setShadow(&PN, ShadowPN); +} diff --git a/lib/Transforms/Instrumentation/DebugIR.cpp b/lib/Transforms/Instrumentation/DebugIR.cpp index 651381d..f50a044 100644 --- a/lib/Transforms/Instrumentation/DebugIR.cpp +++ b/lib/Transforms/Instrumentation/DebugIR.cpp @@ -25,6 +25,7 @@ #include "llvm/InstVisitor.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Instruction.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Utils/Cloning.h" @@ -401,7 +402,7 @@ private: Type *PointeeTy = T->getPointerElementType(); if (!(N = getType(PointeeTy))) N = Builder.createPointerType( - getOrCreateType(PointeeTy), Layout.getPointerSizeInBits(), + getOrCreateType(PointeeTy), Layout.getPointerTypeSizeInBits(T), Layout.getPrefTypeAlignment(T), getTypeName(T)); } else if (T->isArrayTy()) { SmallVector<Value *, 1> Subrange; diff --git a/lib/Transforms/Instrumentation/EdgeProfiling.cpp b/lib/Transforms/Instrumentation/EdgeProfiling.cpp deleted file mode 100644 index a2459fb..0000000 --- a/lib/Transforms/Instrumentation/EdgeProfiling.cpp +++ /dev/null @@ -1,117 +0,0 @@ -//===- EdgeProfiling.cpp - Insert counters for edge profiling -------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This pass instruments the specified program with counters for edge profiling. -// Edge profiling can give a reasonable approximation of the hot paths through a -// program, and is used for a wide variety of program transformations. -// -// Note that this implementation is very naive. We insert a counter for *every* -// edge in the program, instead of using control flow information to prune the -// number of counters inserted. -// -//===----------------------------------------------------------------------===// -#define DEBUG_TYPE "insert-edge-profiling" - -#include "llvm/Transforms/Instrumentation.h" -#include "ProfilingUtils.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include <set> -using namespace llvm; - -STATISTIC(NumEdgesInserted, "The # of edges inserted."); - -namespace { - class EdgeProfiler : public ModulePass { - bool runOnModule(Module &M); - public: - static char ID; // Pass identification, replacement for typeid - EdgeProfiler() : ModulePass(ID) { - initializeEdgeProfilerPass(*PassRegistry::getPassRegistry()); - } - - virtual const char *getPassName() const { - return "Edge Profiler"; - } - }; -} - -char EdgeProfiler::ID = 0; -INITIALIZE_PASS(EdgeProfiler, "insert-edge-profiling", - "Insert instrumentation for edge profiling", false, false) - -ModulePass *llvm::createEdgeProfilerPass() { return new EdgeProfiler(); } - -bool EdgeProfiler::runOnModule(Module &M) { - Function *Main = M.getFunction("main"); - if (Main == 0) { - errs() << "WARNING: cannot insert edge profiling into a module" - << " with no main function!\n"; - return false; // No main, no instrumentation! 
- } - - std::set<BasicBlock*> BlocksToInstrument; - unsigned NumEdges = 0; - for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) { - if (F->isDeclaration()) continue; - // Reserve space for (0,entry) edge. - ++NumEdges; - for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { - // Keep track of which blocks need to be instrumented. We don't want to - // instrument blocks that are added as the result of breaking critical - // edges! - BlocksToInstrument.insert(BB); - NumEdges += BB->getTerminator()->getNumSuccessors(); - } - } - - Type *ATy = ArrayType::get(Type::getInt32Ty(M.getContext()), NumEdges); - GlobalVariable *Counters = - new GlobalVariable(M, ATy, false, GlobalValue::InternalLinkage, - Constant::getNullValue(ATy), "EdgeProfCounters"); - NumEdgesInserted = NumEdges; - - // Instrument all of the edges... - unsigned i = 0; - for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) { - if (F->isDeclaration()) continue; - // Create counter for (0,entry) edge. - IncrementCounterInBlock(&F->getEntryBlock(), i++, Counters); - for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) - if (BlocksToInstrument.count(BB)) { // Don't instrument inserted blocks - // Okay, we have to add a counter of each outgoing edge. If the - // outgoing edge is not critical don't split it, just insert the counter - // in the source or destination of the edge. - TerminatorInst *TI = BB->getTerminator(); - for (unsigned s = 0, e = TI->getNumSuccessors(); s != e; ++s) { - // If the edge is critical, split it. - SplitCriticalEdge(TI, s, this); - - // Okay, we are guaranteed that the edge is no longer critical. If we - // only have a single successor, insert the counter in this block, - // otherwise insert it in the successor block. - if (TI->getNumSuccessors() == 1) { - // Insert counter at the start of the block - IncrementCounterInBlock(BB, i++, Counters, false); - } else { - // Insert counter at the start of the block - IncrementCounterInBlock(TI->getSuccessor(s), i++, Counters); - } - } - } - } - - // Add the initialization call to main. - InsertProfilingInitCall(Main, "llvm_start_edge_profiling", Counters); - return true; -} - diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp index 4c2681f..206bffb 100644 --- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -17,7 +17,6 @@ #define DEBUG_TYPE "insert-gcov-profiling" #include "llvm/Transforms/Instrumentation.h" -#include "ProfilingUtils.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" @@ -103,6 +102,7 @@ namespace { Constant *getIncrementIndirectCounterFunc(); Constant *getEmitFunctionFunc(); Constant *getEmitArcsFunc(); + Constant *getSummaryInfoFunc(); Constant *getDeleteWriteoutFunctionListFunc(); Constant *getDeleteFlushFunctionListFunc(); Constant *getEndFileFunc(); @@ -519,15 +519,15 @@ bool GCOVProfiler::emitProfileArcs() { TerminatorInst *TI = BB->getTerminator(); int Successors = isa<ReturnInst>(TI) ? 
1 : TI->getNumSuccessors(); if (Successors) { - IRBuilder<> Builder(TI); - if (Successors == 1) { + IRBuilder<> Builder(BB->getFirstInsertionPt()); Value *Counter = Builder.CreateConstInBoundsGEP2_64(Counters, 0, Edge); Value *Count = Builder.CreateLoad(Counter); Count = Builder.CreateAdd(Count, Builder.getInt64(1)); Builder.CreateStore(Count, Counter); } else if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { + IRBuilder<> Builder(BI); Value *Sel = Builder.CreateSelect(BI->getCondition(), Builder.getInt64(Edge), Builder.getInt64(Edge + 1)); @@ -543,6 +543,7 @@ bool GCOVProfiler::emitProfileArcs() { for (int i = 0; i != Successors; ++i) ComplexEdgeSuccs.insert(TI->getSuccessor(i)); } + Edge += Successors; } } @@ -554,14 +555,13 @@ bool GCOVProfiler::emitProfileArcs() { GlobalVariable *EdgeState = getEdgeStateValue(); for (int i = 0, e = ComplexEdgePreds.size(); i != e; ++i) { - IRBuilder<> Builder(ComplexEdgePreds[i+1]->getTerminator()); + IRBuilder<> Builder(ComplexEdgePreds[i + 1]->getFirstInsertionPt()); Builder.CreateStore(Builder.getInt32(i), EdgeState); } + for (int i = 0, e = ComplexEdgeSuccs.size(); i != e; ++i) { - // call runtime to perform increment - BasicBlock::iterator InsertPt = - ComplexEdgeSuccs[i+1]->getFirstInsertionPt(); - IRBuilder<> Builder(InsertPt); + // Call runtime to perform increment. + IRBuilder<> Builder(ComplexEdgeSuccs[i+1]->getFirstInsertionPt()); Value *CounterPtrArray = Builder.CreateConstInBoundsGEP2_64(EdgeTable, 0, i * ComplexEdgePreds.size()); @@ -599,7 +599,7 @@ bool GCOVProfiler::emitProfileArcs() { }; FTy = FunctionType::get(Builder.getVoidTy(), Params, false); - // Inialize the environment and register the local writeout and flush + // Initialize the environment and register the local writeout and flush // functions. 
 Constant *GCOVInit = M->getOrInsertFunction("llvm_gcov_init", FTy);
 Builder.CreateCall2(GCOVInit, WriteoutF, FlushF);
@@ -701,6 +701,11 @@ Constant *GCOVProfiler::getEmitArcsFunc() {
   return M->getOrInsertFunction("llvm_gcda_emit_arcs", FTy);
 }
 
+Constant *GCOVProfiler::getSummaryInfoFunc() {
+  FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+  return M->getOrInsertFunction("llvm_gcda_summary_info", FTy);
+}
+
 Constant *GCOVProfiler::getDeleteWriteoutFunctionListFunc() {
   FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
   return M->getOrInsertFunction("llvm_delete_writeout_function_list", FTy);
@@ -747,6 +752,7 @@ Function *GCOVProfiler::insertCounterWriteout(
   Constant *StartFile = getStartFileFunc();
   Constant *EmitFunction = getEmitFunctionFunc();
   Constant *EmitArcs = getEmitArcsFunc();
+  Constant *SummaryInfo = getSummaryInfoFunc();
   Constant *EndFile = getEndFileFunc();
   NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu");
@@ -773,6 +779,7 @@ Function *GCOVProfiler::insertCounterWriteout(
                             Builder.getInt32(Arcs),
                             Builder.CreateConstGEP2_64(GV, 0, 0));
       }
+      Builder.CreateCall(SummaryInfo);
       Builder.CreateCall(EndFile);
     }
   }
diff --git a/lib/Transforms/Instrumentation/Instrumentation.cpp b/lib/Transforms/Instrumentation/Instrumentation.cpp
index 9f35396..b1bea38 100644
--- a/lib/Transforms/Instrumentation/Instrumentation.cpp
+++ b/lib/Transforms/Instrumentation/Instrumentation.cpp
@@ -24,12 +24,10 @@ void llvm::initializeInstrumentation(PassRegistry &Registry) {
   initializeAddressSanitizerPass(Registry);
   initializeAddressSanitizerModulePass(Registry);
   initializeBoundsCheckingPass(Registry);
-  initializeEdgeProfilerPass(Registry);
   initializeGCOVProfilerPass(Registry);
-  initializeOptimalEdgeProfilerPass(Registry);
-  initializePathProfilerPass(Registry);
   initializeMemorySanitizerPass(Registry);
   initializeThreadSanitizerPass(Registry);
+  initializeDataFlowSanitizerPass(Registry);
 }
 
 /// LLVMInitializeInstrumentation - C binding for
diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 0251f16..d547adc 100644
--- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -66,6 +66,31 @@
 /// avoids storing origin to memory when a fully initialized value is stored.
 /// This way it avoids needless overwriting of the origin of the 4-byte region
 /// on a short (i.e. 1-byte) clean store, and it is also good for performance.
+///
+/// Atomic handling.
+///
+/// Ideally, every atomic store of an application value should update the
+/// corresponding shadow location in an atomic way. Unfortunately, an atomic
+/// store to two disjoint locations cannot be done without a severe slowdown.
+///
+/// Therefore, we implement an approximation that may err on the safe side.
+/// In this implementation, every atomically accessed location in the program
+/// may only change from (partially) uninitialized to fully initialized, but
+/// not the other way around. We load the shadow _after_ the application load,
+/// and we store the shadow _before_ the app store. Also, we always store clean
+/// shadow (if the application store is atomic). This way, if the store-load
+/// pair constitutes a happens-before arc, the shadow store and load are
+/// correctly ordered such that the load will get either the value that was
+/// stored, or some later value (which is always clean).
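+///
+/// For example, if a release store of a value synchronizes with an acquire
+/// load of the same location in another thread, the shadow is stored before
+/// the application store and loaded after the application load, so the
+/// loading thread observes either that shadow or a later, clean one.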
+///
+/// This does not work very well with Compare-And-Swap (CAS) and
+/// Read-Modify-Write (RMW) operations. To follow the above logic, CAS and RMW
+/// must store the new shadow before the app operation, and load the shadow
+/// after the app operation. Computers don't work this way. The current
+/// implementation ignores the load aspect of CAS/RMW, always returning a clean
+/// value. It implements the store part as a simple atomic store by storing a
+/// clean shadow.
+
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "msan"
@@ -157,6 +182,18 @@ static cl::opt<std::string>  ClBlacklistFile("msan-blacklist",
        cl::desc("File containing the list of functions where MemorySanitizer "
                 "should not report bugs"), cl::Hidden);
 
+// Experimental. Wraps all indirect calls in the instrumented code with
+// a call to the given function. This is needed to assist the dynamic
+// helper tool (MSanDR) to regain control on transitions between instrumented
+// and non-instrumented code.
+static cl::opt<std::string> ClWrapIndirectCalls("msan-wrap-indirect-calls",
+       cl::desc("Wrap indirect calls with a given function"),
+       cl::Hidden);
+
+static cl::opt<bool> ClWrapIndirectCallsFast("msan-wrap-indirect-calls-fast",
+       cl::desc("Do not wrap indirect calls with target in the same module"),
+       cl::Hidden, cl::init(true));
+
 namespace {
 
 /// \brief An instrumentation pass implementing detection of uninitialized
@@ -168,12 +205,12 @@ class MemorySanitizer : public FunctionPass {
  public:
   MemorySanitizer(bool TrackOrigins = false,
                   StringRef BlacklistFile = StringRef())
-    : FunctionPass(ID),
-      TrackOrigins(TrackOrigins || ClTrackOrigins),
-      TD(0),
-      WarningFn(0),
-      BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile
-                                          : BlacklistFile) { }
+      : FunctionPass(ID),
+        TrackOrigins(TrackOrigins || ClTrackOrigins),
+        TD(0),
+        WarningFn(0),
+        BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile : BlacklistFile),
+        WrapIndirectCalls(!ClWrapIndirectCalls.empty()) {}
   const char *getPassName() const { return "MemorySanitizer"; }
   bool runOnFunction(Function &F);
   bool doInitialization(Module &M);
@@ -207,13 +244,16 @@ class MemorySanitizer : public FunctionPass {
   /// function.
   GlobalVariable *OriginTLS;
 
+  GlobalVariable *MsandrModuleStart;
+  GlobalVariable *MsandrModuleEnd;
+
   /// \brief The run-time callback to print a warning.
   Value *WarningFn;
   /// \brief Run-time helper that copies origin info for a memory range.
   Value *MsanCopyOriginFn;
   /// \brief Run-time helper that generates a new origin value for a stack
   /// allocation.
-  Value *MsanSetAllocaOriginFn;
+  Value *MsanSetAllocaOrigin4Fn;
   /// \brief Run-time helper that poisons stack on function entry.
   Value *MsanPoisonStackFn;
   /// \brief MSan runtime replacements for memmove, memcpy and memset.
@@ -236,6 +276,12 @@ class MemorySanitizer : public FunctionPass {
   /// \brief An empty volatile inline asm that prevents callback merge.
   InlineAsm *EmptyAsm;
 
+  bool WrapIndirectCalls;
+  /// \brief Run-time wrapper for indirect calls.
+  Value *IndirectCallWrapperFn;
+  // Argument and return type of IndirectCallWrapperFn: void (*f)(void).
+ Type *AnyFunctionPtrTy; + friend struct MemorySanitizerVisitor; friend struct VarArgAMD64Helper; }; @@ -281,9 +327,9 @@ void MemorySanitizer::initializeCallbacks(Module &M) { MsanCopyOriginFn = M.getOrInsertFunction( "__msan_copy_origin", IRB.getVoidTy(), IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy, NULL); - MsanSetAllocaOriginFn = M.getOrInsertFunction( - "__msan_set_alloca_origin", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy, - IRB.getInt8PtrTy(), NULL); + MsanSetAllocaOrigin4Fn = M.getOrInsertFunction( + "__msan_set_alloca_origin4", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy, + IRB.getInt8PtrTy(), IntptrTy, NULL); MsanPoisonStackFn = M.getOrInsertFunction( "__msan_poison_stack", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy, NULL); MemmoveFn = M.getOrInsertFunction( @@ -329,6 +375,24 @@ void MemorySanitizer::initializeCallbacks(Module &M) { EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false), StringRef(""), StringRef(""), /*hasSideEffects=*/true); + + if (WrapIndirectCalls) { + AnyFunctionPtrTy = + PointerType::getUnqual(FunctionType::get(IRB.getVoidTy(), false)); + IndirectCallWrapperFn = M.getOrInsertFunction( + ClWrapIndirectCalls, AnyFunctionPtrTy, AnyFunctionPtrTy, NULL); + } + + if (ClWrapIndirectCallsFast) { + MsandrModuleStart = new GlobalVariable( + M, IRB.getInt32Ty(), false, GlobalValue::ExternalLinkage, + 0, "__executable_start"); + MsandrModuleStart->setVisibility(GlobalVariable::HiddenVisibility); + MsandrModuleEnd = new GlobalVariable( + M, IRB.getInt32Ty(), false, GlobalValue::ExternalLinkage, + 0, "_end"); + MsandrModuleEnd->setVisibility(GlobalVariable::HiddenVisibility); + } } /// \brief Module-level initialization. @@ -338,7 +402,7 @@ bool MemorySanitizer::doInitialization(Module &M) { TD = getAnalysisIfAvailable<DataLayout>(); if (!TD) return false; - BL.reset(new SpecialCaseList(BlacklistFile)); + BL.reset(SpecialCaseList::createOrDie(BlacklistFile)); C = &(M.getContext()); unsigned PtrSize = TD->getPointerSizeInBits(/* AddressSpace */0); switch (PtrSize) { @@ -423,22 +487,27 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { MemorySanitizer &MS; SmallVector<PHINode *, 16> ShadowPHINodes, OriginPHINodes; ValueMap<Value*, Value*> ShadowMap, OriginMap; + OwningPtr<VarArgHelper> VAHelper; + + // The following flags disable parts of MSan instrumentation based on + // blacklist contents and command-line options. 
bool InsertChecks; bool LoadShadow; bool PoisonStack; bool PoisonUndef; - OwningPtr<VarArgHelper> VAHelper; + bool CheckReturnValue; struct ShadowOriginAndInsertPoint { - Instruction *Shadow; - Instruction *Origin; + Value *Shadow; + Value *Origin; Instruction *OrigIns; - ShadowOriginAndInsertPoint(Instruction *S, Instruction *O, Instruction *I) + ShadowOriginAndInsertPoint(Value *S, Value *O, Instruction *I) : Shadow(S), Origin(O), OrigIns(I) { } ShadowOriginAndInsertPoint() : Shadow(0), Origin(0), OrigIns(0) { } }; SmallVector<ShadowOriginAndInsertPoint, 16> InstrumentationList; SmallVector<Instruction*, 16> StoreList; + SmallVector<CallSite, 16> IndirectCallList; MemorySanitizerVisitor(Function &F, MemorySanitizer &MS) : F(F), MS(MS), VAHelper(CreateVarArgHelper(F, MS, *this)) { @@ -449,6 +518,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { LoadShadow = SanitizeFunction; PoisonStack = SanitizeFunction && ClPoisonStack; PoisonUndef = SanitizeFunction && ClPoisonUndef; + // FIXME: Consider using SpecialCaseList to specify a list of functions that + // must always return fully initialized values. For now, we hardcode "main". + CheckReturnValue = SanitizeFunction && (F.getName() == "main"); DEBUG(if (!InsertChecks) dbgs() << "MemorySanitizer is not inserting checks into '" @@ -462,7 +534,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { IRBuilder<> IRB(&I); Value *Val = I.getValueOperand(); Value *Addr = I.getPointerOperand(); - Value *Shadow = getShadow(Val); + Value *Shadow = I.isAtomic() ? getCleanShadow(Val) : getShadow(Val); Value *ShadowPtr = getShadowPtr(Addr, Shadow->getType(), IRB); StoreInst *NewSI = @@ -471,7 +543,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { (void)NewSI; if (ClCheckAccessAddress) - insertCheck(Addr, &I); + insertShadowCheck(Addr, &I); + + if (I.isAtomic()) + I.setOrdering(addReleaseOrdering(I.getOrdering())); if (MS.TrackOrigins) { unsigned Alignment = std::max(kMinOriginAlignment, I.getAlignment()); @@ -481,11 +556,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { } else { Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB); - Constant *Cst = dyn_cast_or_null<Constant>(ConvertedShadow); // TODO(eugenis): handle non-zero constant shadow by inserting an // unconditional check (can not simply fail compilation as this could // be in the dead code). - if (Cst) + if (isa<Constant>(ConvertedShadow)) continue; Value *Cmp = IRB.CreateICmpNE(ConvertedShadow, @@ -503,12 +577,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { void materializeChecks() { for (size_t i = 0, n = InstrumentationList.size(); i < n; i++) { - Instruction *Shadow = InstrumentationList[i].Shadow; + Value *Shadow = InstrumentationList[i].Shadow; Instruction *OrigIns = InstrumentationList[i].OrigIns; IRBuilder<> IRB(OrigIns); DEBUG(dbgs() << " SHAD0 : " << *Shadow << "\n"); Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB); DEBUG(dbgs() << " SHAD1 : " << *ConvertedShadow << "\n"); + // See the comment in materializeStores(). 
+ if (isa<Constant>(ConvertedShadow)) + continue; Value *Cmp = IRB.CreateICmpNE(ConvertedShadow, getCleanShadow(ConvertedShadow), "_mscmp"); Instruction *CheckTerm = @@ -518,7 +595,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { IRB.SetInsertPoint(CheckTerm); if (MS.TrackOrigins) { - Instruction *Origin = InstrumentationList[i].Origin; + Value *Origin = InstrumentationList[i].Origin; IRB.CreateStore(Origin ? (Value*)Origin : (Value*)IRB.getInt32(0), MS.OriginTLS); } @@ -530,6 +607,48 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { DEBUG(dbgs() << "DONE:\n" << F); } + void materializeIndirectCalls() { + for (size_t i = 0, n = IndirectCallList.size(); i < n; i++) { + CallSite CS = IndirectCallList[i]; + Instruction *I = CS.getInstruction(); + BasicBlock *B = I->getParent(); + IRBuilder<> IRB(I); + Value *Fn0 = CS.getCalledValue(); + Value *Fn = IRB.CreateBitCast(Fn0, MS.AnyFunctionPtrTy); + + if (ClWrapIndirectCallsFast) { + // Check that call target is inside this module limits. + Value *Start = + IRB.CreateBitCast(MS.MsandrModuleStart, MS.AnyFunctionPtrTy); + Value *End = IRB.CreateBitCast(MS.MsandrModuleEnd, MS.AnyFunctionPtrTy); + + Value *NotInThisModule = IRB.CreateOr(IRB.CreateICmpULT(Fn, Start), + IRB.CreateICmpUGE(Fn, End)); + + PHINode *NewFnPhi = + IRB.CreatePHI(Fn0->getType(), 2, "msandr.indirect_target"); + + Instruction *CheckTerm = SplitBlockAndInsertIfThen( + cast<Instruction>(NotInThisModule), + /* Unreachable */ false, MS.ColdCallWeights); + + IRB.SetInsertPoint(CheckTerm); + // Slow path: call wrapper function to possibly transform the call + // target. + Value *NewFn = IRB.CreateBitCast( + IRB.CreateCall(MS.IndirectCallWrapperFn, Fn), Fn0->getType()); + + NewFnPhi->addIncoming(Fn0, B); + NewFnPhi->addIncoming(NewFn, dyn_cast<Instruction>(NewFn)->getParent()); + CS.setCalledFunction(NewFnPhi); + } else { + Value *NewFn = IRB.CreateBitCast( + IRB.CreateCall(MS.IndirectCallWrapperFn, Fn), Fn0->getType()); + CS.setCalledFunction(NewFn); + } + } + } + /// \brief Add MemorySanitizer instrumentation to a function. bool runOnFunction() { MS.initializeCallbacks(*F.getParent()); @@ -572,6 +691,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // Insert shadow value checks. materializeChecks(); + // Wrap indirect calls. + materializeIndirectCalls(); + return true; } @@ -835,20 +957,63 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// \brief Remember the place where a shadow check should be inserted. /// /// This location will be later instrumented with a check that will print a - /// UMR warning in runtime if the value is not fully defined. - void insertCheck(Value *Val, Instruction *OrigIns) { - assert(Val); + /// UMR warning in runtime if the shadow value is not 0. + void insertShadowCheck(Value *Shadow, Value *Origin, Instruction *OrigIns) { + assert(Shadow); if (!InsertChecks) return; - Instruction *Shadow = dyn_cast_or_null<Instruction>(getShadow(Val)); - if (!Shadow) return; #ifndef NDEBUG Type *ShadowTy = Shadow->getType(); assert((isa<IntegerType>(ShadowTy) || isa<VectorType>(ShadowTy)) && "Can only insert checks for integer and vector shadow types"); #endif - Instruction *Origin = dyn_cast_or_null<Instruction>(getOrigin(Val)); InstrumentationList.push_back( - ShadowOriginAndInsertPoint(Shadow, Origin, OrigIns)); + ShadowOriginAndInsertPoint(Shadow, Origin, OrigIns)); + } + + /// \brief Remember the place where a shadow check should be inserted. 
+ /// + /// This location will be later instrumented with a check that will print a + /// UMR warning in runtime if the value is not fully defined. + void insertShadowCheck(Value *Val, Instruction *OrigIns) { + assert(Val); + Instruction *Shadow = dyn_cast_or_null<Instruction>(getShadow(Val)); + if (!Shadow) return; + Instruction *Origin = dyn_cast_or_null<Instruction>(getOrigin(Val)); + insertShadowCheck(Shadow, Origin, OrigIns); + } + + AtomicOrdering addReleaseOrdering(AtomicOrdering a) { + switch (a) { + case NotAtomic: + return NotAtomic; + case Unordered: + case Monotonic: + case Release: + return Release; + case Acquire: + case AcquireRelease: + return AcquireRelease; + case SequentiallyConsistent: + return SequentiallyConsistent; + } + llvm_unreachable("Unknown ordering"); + } + + AtomicOrdering addAcquireOrdering(AtomicOrdering a) { + switch (a) { + case NotAtomic: + return NotAtomic; + case Unordered: + case Monotonic: + case Acquire: + return Acquire; + case Release: + case AcquireRelease: + return AcquireRelease; + case SequentiallyConsistent: + return SequentiallyConsistent; + } + llvm_unreachable("Unknown ordering"); } // ------------------- Visitors. @@ -859,7 +1024,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// Optionally, checks that the load address is fully defined. void visitLoadInst(LoadInst &I) { assert(I.getType()->isSized() && "Load type must have size"); - IRBuilder<> IRB(&I); + IRBuilder<> IRB(I.getNextNode()); Type *ShadowTy = getShadowTy(&I); Value *Addr = I.getPointerOperand(); if (LoadShadow) { @@ -871,7 +1036,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { } if (ClCheckAccessAddress) - insertCheck(I.getPointerOperand(), &I); + insertShadowCheck(I.getPointerOperand(), &I); + + if (I.isAtomic()) + I.setOrdering(addAcquireOrdering(I.getOrdering())); if (MS.TrackOrigins) { if (LoadShadow) { @@ -892,9 +1060,40 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { StoreList.push_back(&I); } + void handleCASOrRMW(Instruction &I) { + assert(isa<AtomicRMWInst>(I) || isa<AtomicCmpXchgInst>(I)); + + IRBuilder<> IRB(&I); + Value *Addr = I.getOperand(0); + Value *ShadowPtr = getShadowPtr(Addr, I.getType(), IRB); + + if (ClCheckAccessAddress) + insertShadowCheck(Addr, &I); + + // Only test the conditional argument of cmpxchg instruction. + // The other argument can potentially be uninitialized, but we can not + // detect this situation reliably without possible false positives. + if (isa<AtomicCmpXchgInst>(I)) + insertShadowCheck(I.getOperand(1), &I); + + IRB.CreateStore(getCleanShadow(&I), ShadowPtr); + + setShadow(&I, getCleanShadow(&I)); + } + + void visitAtomicRMWInst(AtomicRMWInst &I) { + handleCASOrRMW(I); + I.setOrdering(addReleaseOrdering(I.getOrdering())); + } + + void visitAtomicCmpXchgInst(AtomicCmpXchgInst &I) { + handleCASOrRMW(I); + I.setOrdering(addReleaseOrdering(I.getOrdering())); + } + // Vector manipulation. 
   void visitExtractElementInst(ExtractElementInst &I) {
-    insertCheck(I.getOperand(1), &I);
+    insertShadowCheck(I.getOperand(1), &I);
     IRBuilder<> IRB(&I);
     setShadow(&I, IRB.CreateExtractElement(getShadow(&I, 0), I.getOperand(1),
               "_msprop"));
@@ -902,7 +1101,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   }
 
   void visitInsertElementInst(InsertElementInst &I) {
-    insertCheck(I.getOperand(2), &I);
+    insertShadowCheck(I.getOperand(2), &I);
     IRBuilder<> IRB(&I);
     setShadow(&I, IRB.CreateInsertElement(getShadow(&I, 0), getShadow(&I, 1),
               I.getOperand(2), "_msprop"));
@@ -910,7 +1109,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   }
 
   void visitShuffleVectorInst(ShuffleVectorInst &I) {
-    insertCheck(I.getOperand(2), &I);
+    insertShadowCheck(I.getOperand(2), &I);
     IRBuilder<> IRB(&I);
     setShadow(&I, IRB.CreateShuffleVector(getShadow(&I, 0), getShadow(&I, 1),
               I.getOperand(2), "_msprop"));
@@ -1109,18 +1308,19 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
 
   /// \brief Cast between two shadow types, extending or truncating as
   /// necessary.
-  Value *CreateShadowCast(IRBuilder<> &IRB, Value *V, Type *dstTy) {
+  Value *CreateShadowCast(IRBuilder<> &IRB, Value *V, Type *dstTy,
+                          bool Signed = false) {
     Type *srcTy = V->getType();
     if (dstTy->isIntegerTy() && srcTy->isIntegerTy())
-      return IRB.CreateIntCast(V, dstTy, false);
+      return IRB.CreateIntCast(V, dstTy, Signed);
     if (dstTy->isVectorTy() && srcTy->isVectorTy() &&
         dstTy->getVectorNumElements() == srcTy->getVectorNumElements())
-      return IRB.CreateIntCast(V, dstTy, false);
+      return IRB.CreateIntCast(V, dstTy, Signed);
     size_t srcSizeInBits = VectorOrPrimitiveTypeSizeInBits(srcTy);
     size_t dstSizeInBits = VectorOrPrimitiveTypeSizeInBits(dstTy);
     Value *V1 = IRB.CreateBitCast(V, Type::getIntNTy(*MS.C, srcSizeInBits));
     Value *V2 =
-      IRB.CreateIntCast(V1, Type::getIntNTy(*MS.C, dstSizeInBits), false);
+      IRB.CreateIntCast(V1, Type::getIntNTy(*MS.C, dstSizeInBits), Signed);
     return IRB.CreateBitCast(V2, dstTy);
     // TODO: handle struct types.
   }
@@ -1145,7 +1345,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   void handleDiv(Instruction &I) {
     IRBuilder<> IRB(&I);
     // Strict on the second argument.
-    insertCheck(I.getOperand(1), &I);
+    insertShadowCheck(I.getOperand(1), &I);
     setShadow(&I, getShadow(&I, 0));
     setOrigin(&I, getOrigin(&I, 0));
   }
@@ -1428,7 +1628,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     IRB.CreateAlignedStore(Shadow, ShadowPtr, 1);
 
     if (ClCheckAccessAddress)
-      insertCheck(Addr, &I);
+      insertShadowCheck(Addr, &I);
 
     // FIXME: use ClStoreCleanOrigin
     // FIXME: factor out common code from materializeStores
@@ -1455,9 +1655,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       setShadow(&I, getCleanShadow(&I));
     }
 
-    if (ClCheckAccessAddress)
-      insertCheck(Addr, &I);
+    insertShadowCheck(Addr, &I);
 
     if (MS.TrackOrigins) {
       if (LoadShadow)
@@ -1554,11 +1753,119 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     setOrigin(&I, getOrigin(Op));
   }
 
+  // \brief Instrument vector convert intrinsic.
+  //
+  // This function instruments intrinsics like cvtsi2ss:
+  // %Out = int_xxx_cvtyyy(%ConvertOp)
+  // or
+  // %Out = int_xxx_cvtyyy(%CopyOp, %ConvertOp)
+  // The intrinsic converts \p NumUsedElements elements of \p ConvertOp to the
+  // same number of \p Out elements, and (if it has 2 arguments) copies the
+  // rest of the elements from \p CopyOp.
+  // In most cases the conversion involves a floating-point value, which may
+  // trigger a hardware exception when not fully initialized. For this reason
+  // we require \p ConvertOp[0:NumUsedElements] to be fully initialized and
+  // trap otherwise.
+  // We copy the shadow of \p CopyOp[NumUsedElements:] to \p
+  // Out[NumUsedElements:]. This means that intrinsics without \p CopyOp always
+  // return a fully initialized value.
+  void handleVectorConvertIntrinsic(IntrinsicInst &I, int NumUsedElements) {
+    IRBuilder<> IRB(&I);
+    Value *CopyOp, *ConvertOp;
+
+    switch (I.getNumArgOperands()) {
+    case 2:
+      CopyOp = I.getArgOperand(0);
+      ConvertOp = I.getArgOperand(1);
+      break;
+    case 1:
+      ConvertOp = I.getArgOperand(0);
+      CopyOp = NULL;
+      break;
+    default:
+      llvm_unreachable("Cvt intrinsic with unsupported number of arguments.");
+    }
+
+    // The first *NumUsedElements* elements of ConvertOp are converted to the
+    // same number of output elements. The rest of the output is copied from
+    // CopyOp, or (if not available) filled with zeroes.
+    // Combine the shadows of the ConvertOp elements that are used in this
+    // operation, and insert a check.
+    // FIXME: consider propagating shadow of ConvertOp, at least in the case of
+    // int->any conversion.
+    Value *ConvertShadow = getShadow(ConvertOp);
+    Value *AggShadow = 0;
+    if (ConvertOp->getType()->isVectorTy()) {
+      AggShadow = IRB.CreateExtractElement(
+          ConvertShadow, ConstantInt::get(IRB.getInt32Ty(), 0));
+      for (int i = 1; i < NumUsedElements; ++i) {
+        Value *MoreShadow = IRB.CreateExtractElement(
+            ConvertShadow, ConstantInt::get(IRB.getInt32Ty(), i));
+        AggShadow = IRB.CreateOr(AggShadow, MoreShadow);
+      }
+    } else {
+      AggShadow = ConvertShadow;
+    }
+    assert(AggShadow->getType()->isIntegerTy());
+    insertShadowCheck(AggShadow, getOrigin(ConvertOp), &I);
+
+    // Build the result shadow by zero-filling the parts of CopyOp's shadow
+    // that come from ConvertOp.
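+    // E.g. for x86_sse_cvtsi2ss (CopyOp = <4 x float>, ConvertOp = i32,
+    // NumUsedElements = 1), the check above covers all of ConvertOp's shadow
+    // and the result shadow is CopyOp's shadow with element 0 zeroed.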
+ if (CopyOp) { + assert(CopyOp->getType() == I.getType()); + assert(CopyOp->getType()->isVectorTy()); + Value *ResultShadow = getShadow(CopyOp); + Type *EltTy = ResultShadow->getType()->getVectorElementType(); + for (int i = 0; i < NumUsedElements; ++i) { + ResultShadow = IRB.CreateInsertElement( + ResultShadow, ConstantInt::getNullValue(EltTy), + ConstantInt::get(IRB.getInt32Ty(), i)); + } + setShadow(&I, ResultShadow); + setOrigin(&I, getOrigin(CopyOp)); + } else { + setShadow(&I, getCleanShadow(&I)); + } + } + void visitIntrinsicInst(IntrinsicInst &I) { switch (I.getIntrinsicID()) { case llvm::Intrinsic::bswap: handleBswap(I); break; + case llvm::Intrinsic::x86_avx512_cvtsd2usi64: + case llvm::Intrinsic::x86_avx512_cvtsd2usi: + case llvm::Intrinsic::x86_avx512_cvtss2usi64: + case llvm::Intrinsic::x86_avx512_cvtss2usi: + case llvm::Intrinsic::x86_avx512_cvttss2usi64: + case llvm::Intrinsic::x86_avx512_cvttss2usi: + case llvm::Intrinsic::x86_avx512_cvttsd2usi64: + case llvm::Intrinsic::x86_avx512_cvttsd2usi: + case llvm::Intrinsic::x86_avx512_cvtusi2sd: + case llvm::Intrinsic::x86_avx512_cvtusi2ss: + case llvm::Intrinsic::x86_avx512_cvtusi642sd: + case llvm::Intrinsic::x86_avx512_cvtusi642ss: + case llvm::Intrinsic::x86_sse2_cvtsd2si64: + case llvm::Intrinsic::x86_sse2_cvtsd2si: + case llvm::Intrinsic::x86_sse2_cvtsd2ss: + case llvm::Intrinsic::x86_sse2_cvtsi2sd: + case llvm::Intrinsic::x86_sse2_cvtsi642sd: + case llvm::Intrinsic::x86_sse2_cvtss2sd: + case llvm::Intrinsic::x86_sse2_cvttsd2si64: + case llvm::Intrinsic::x86_sse2_cvttsd2si: + case llvm::Intrinsic::x86_sse_cvtsi2ss: + case llvm::Intrinsic::x86_sse_cvtsi642ss: + case llvm::Intrinsic::x86_sse_cvtss2si64: + case llvm::Intrinsic::x86_sse_cvtss2si: + case llvm::Intrinsic::x86_sse_cvttss2si64: + case llvm::Intrinsic::x86_sse_cvttss2si: + handleVectorConvertIntrinsic(I, 1); + break; + case llvm::Intrinsic::x86_sse2_cvtdq2pd: + case llvm::Intrinsic::x86_sse2_cvtps2pd: + case llvm::Intrinsic::x86_sse_cvtps2pi: + case llvm::Intrinsic::x86_sse_cvttps2pi: + handleVectorConvertIntrinsic(I, 2); + break; default: if (!handleUnknownIntrinsic(I)) visitInstruction(I); @@ -1604,6 +1911,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { } } IRBuilder<> IRB(&I); + + if (MS.WrapIndirectCalls && !CS.getCalledFunction()) + IndirectCallList.push_back(CS); + unsigned ArgOffset = 0; DEBUG(dbgs() << " CallSite: " << I << "\n"); for (CallSite::arg_iterator ArgIt = CS.arg_begin(), End = CS.arg_end(); @@ -1647,7 +1958,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { DEBUG(dbgs() << " done with call args\n"); FunctionType *FT = - cast<FunctionType>(CS.getCalledValue()->getType()-> getContainedType(0)); + cast<FunctionType>(CS.getCalledValue()->getType()->getContainedType(0)); if (FT->isVarArg()) { VAHelper->visitCallSite(CS, IRB); } @@ -1686,12 +1997,17 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { void visitReturnInst(ReturnInst &I) { IRBuilder<> IRB(&I); - if (Value *RetVal = I.getReturnValue()) { - // Set the shadow for the RetVal. 
+ Value *RetVal = I.getReturnValue(); + if (!RetVal) return; + Value *ShadowPtr = getShadowPtrForRetval(RetVal, IRB); + if (CheckReturnValue) { + insertShadowCheck(RetVal, &I); + Value *Shadow = getCleanShadow(RetVal); + IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment); + } else { Value *Shadow = getShadow(RetVal); - Value *ShadowPtr = getShadowPtrForRetval(RetVal, IRB); - DEBUG(dbgs() << "Return: " << *Shadow << "\n" << *ShadowPtr << "\n"); IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment); + // FIXME: make it conditional if ClStoreCleanOrigin==0 if (MS.TrackOrigins) IRB.CreateStore(getOrigin(RetVal), getOriginPtrForRetval(IRB)); } @@ -1734,18 +2050,34 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value *Descr = createPrivateNonConstGlobalForString(*F.getParent(), StackDescription.str()); - IRB.CreateCall3(MS.MsanSetAllocaOriginFn, + + IRB.CreateCall4(MS.MsanSetAllocaOrigin4Fn, IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), ConstantInt::get(MS.IntptrTy, Size), - IRB.CreatePointerCast(Descr, IRB.getInt8PtrTy())); + IRB.CreatePointerCast(Descr, IRB.getInt8PtrTy()), + IRB.CreatePointerCast(&F, MS.IntptrTy)); } } void visitSelectInst(SelectInst& I) { IRBuilder<> IRB(&I); - setShadow(&I, IRB.CreateSelect(I.getCondition(), - getShadow(I.getTrueValue()), getShadow(I.getFalseValue()), - "_msprop")); + // a = select b, c, d + Value *S = IRB.CreateSelect(I.getCondition(), getShadow(I.getTrueValue()), + getShadow(I.getFalseValue())); + if (I.getType()->isAggregateType()) { + // To avoid "sign extending" i1 to an arbitrary aggregate type, we just do + // an extra "select". This results in much more compact IR. + // Sa = select Sb, poisoned, (select b, Sc, Sd) + S = IRB.CreateSelect(getShadow(I.getCondition()), + getPoisonedShadow(getShadowTy(I.getType())), S, + "_msprop_select_agg"); + } else { + // Sa = (sext Sb) | (select b, Sc, Sd) + S = IRB.CreateOr(S, CreateShadowCast(IRB, getShadow(I.getCondition()), + S->getType(), true), + "_msprop_select"); + } + setShadow(&I, S); if (MS.TrackOrigins) { // Origins are always i32, so any vector conditions must be flattened. // FIXME: consider tracking vector origins for app vectors? 
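To make the select rule above concrete: for scalar values the shadow of "a = select b, c, d" is Sa = (sext Sb) | (select b, Sc, Sd), so a poisoned condition poisons every bit of the result. A minimal standalone C++ sketch of that rule, modeling shadows as plain integer bitmasks (an editorial illustration under those assumptions; the function name is hypothetical and not part of this patch):

#include <cstdint>

// Shadow of "a = select b, c, d": a poisoned condition (Sb != 0) taints every
// bit of the result via sign extension; otherwise the shadow of the chosen
// operand propagates unchanged.
uint32_t selectShadow(bool b, bool Sb, uint32_t Sc, uint32_t Sd) {
  uint32_t SextSb = Sb ? ~0u : 0u;  // sext of the i1 condition shadow
  return SextSb | (b ? Sc : Sd);    // (sext Sb) | (select b, Sc, Sd)
}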
@@ -1780,7 +2112,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value *ResShadow = IRB.CreateExtractValue(AggShadow, I.getIndices()); DEBUG(dbgs() << " ResShadow: " << *ResShadow << "\n"); setShadow(&I, ResShadow); - setOrigin(&I, getCleanOrigin()); + setOriginForNaryOp(I); } void visitInsertValueInst(InsertValueInst &I) { @@ -1793,7 +2125,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value *Res = IRB.CreateInsertValue(AggShadow, InsShadow, I.getIndices()); DEBUG(dbgs() << " Res: " << *Res << "\n"); setShadow(&I, Res); - setOrigin(&I, getCleanOrigin()); + setOriginForNaryOp(I); } void dumpInst(Instruction &I) { @@ -1816,7 +2148,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { dumpInst(I); DEBUG(dbgs() << "DEFAULT: " << I << "\n"); for (size_t i = 0, n = I.getNumOperands(); i < n; i++) - insertCheck(I.getOperand(i), &I); + insertShadowCheck(I.getOperand(i), &I); setShadow(&I, getCleanShadow(&I)); setOrigin(&I, getCleanOrigin()); } @@ -1970,8 +2302,7 @@ struct VarArgAMD64Helper : public VarArgHelper { Value *OverflowArgAreaPtr = IRB.CreateLoad(OverflowArgAreaPtrPtr); Value *OverflowArgAreaShadowPtr = MSV.getShadowPtr(OverflowArgAreaPtr, IRB.getInt8Ty(), IRB); - Value *SrcPtr = - getShadowPtrForVAArgument(VAArgTLSCopy, IRB, AMD64FpEndOffset); + Value *SrcPtr = IRB.CreateConstGEP1_32(VAArgTLSCopy, AMD64FpEndOffset); IRB.CreateMemCpy(OverflowArgAreaShadowPtr, SrcPtr, VAArgOverflowSize, 16); } } diff --git a/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp b/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp deleted file mode 100644 index b45aef6..0000000 --- a/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp +++ /dev/null @@ -1,225 +0,0 @@ -//===- OptimalEdgeProfiling.cpp - Insert counters for opt. edge profiling -===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This pass instruments the specified program with counters for edge profiling. -// Edge profiling can give a reasonable approximation of the hot paths through a -// program, and is used for a wide variety of program transformations. 
-// -//===----------------------------------------------------------------------===// -#define DEBUG_TYPE "insert-optimal-edge-profiling" -#include "llvm/Transforms/Instrumentation.h" -#include "MaximumSpanningTree.h" -#include "ProfilingUtils.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/Passes.h" -#include "llvm/Analysis/ProfileInfo.h" -#include "llvm/Analysis/ProfileInfoLoader.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -using namespace llvm; - -STATISTIC(NumEdgesInserted, "The # of edges inserted."); - -namespace { - class OptimalEdgeProfiler : public ModulePass { - bool runOnModule(Module &M); - public: - static char ID; // Pass identification, replacement for typeid - OptimalEdgeProfiler() : ModulePass(ID) { - initializeOptimalEdgeProfilerPass(*PassRegistry::getPassRegistry()); - } - - void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequiredID(ProfileEstimatorPassID); - AU.addRequired<ProfileInfo>(); - } - - virtual const char *getPassName() const { - return "Optimal Edge Profiler"; - } - }; -} - -char OptimalEdgeProfiler::ID = 0; -INITIALIZE_PASS_BEGIN(OptimalEdgeProfiler, "insert-optimal-edge-profiling", - "Insert optimal instrumentation for edge profiling", - false, false) -INITIALIZE_PASS_DEPENDENCY(ProfileEstimatorPass) -INITIALIZE_AG_DEPENDENCY(ProfileInfo) -INITIALIZE_PASS_END(OptimalEdgeProfiler, "insert-optimal-edge-profiling", - "Insert optimal instrumentation for edge profiling", - false, false) - -ModulePass *llvm::createOptimalEdgeProfilerPass() { - return new OptimalEdgeProfiler(); -} - -inline static void printEdgeCounter(ProfileInfo::Edge e, - BasicBlock* b, - unsigned i) { - DEBUG(dbgs() << "--Edge Counter for " << (e) << " in " \ - << ((b)?(b)->getName():"0") << " (# " << (i) << ")\n"); -} - -bool OptimalEdgeProfiler::runOnModule(Module &M) { - Function *Main = M.getFunction("main"); - if (Main == 0) { - errs() << "WARNING: cannot insert edge profiling into a module" - << " with no main function!\n"; - return false; // No main, no instrumentation! - } - - // NumEdges counts all the edges that may be instrumented. Later on its - // decided which edges to actually instrument, to achieve optimal profiling. - // For the entry block a virtual edge (0,entry) is reserved, for each block - // with no successors an edge (BB,0) is reserved. These edges are necessary - // to calculate a truly optimal maximum spanning tree and thus an optimal - // instrumentation. - unsigned NumEdges = 0; - - for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) { - if (F->isDeclaration()) continue; - // Reserve space for (0,entry) edge. - ++NumEdges; - for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { - // Keep track of which blocks need to be instrumented. We don't want to - // instrument blocks that are added as the result of breaking critical - // edges! - if (BB->getTerminator()->getNumSuccessors() == 0) { - // Reserve space for (BB,0) edge. - ++NumEdges; - } else { - NumEdges += BB->getTerminator()->getNumSuccessors(); - } - } - } - - // In the profiling output a counter for each edge is reserved, but only few - // are used. 
This is done to be able to read back in the profile without
-// calculating the maximum spanning tree again; instead each edge counter that
-// is not used is initialised with -1 to signal that this edge counter has to
-// be calculated from other edge counters on reading the profile info back
-// in.
-
-  Type *Int32 = Type::getInt32Ty(M.getContext());
-  ArrayType *ATy = ArrayType::get(Int32, NumEdges);
-  GlobalVariable *Counters =
-    new GlobalVariable(M, ATy, false, GlobalValue::InternalLinkage,
-                       Constant::getNullValue(ATy), "OptEdgeProfCounters");
-  NumEdgesInserted = 0;
-
-  std::vector<Constant*> Initializer(NumEdges);
-  Constant *Zero = ConstantInt::get(Int32, 0);
-  Constant *Uncounted = ConstantInt::get(Int32, ProfileInfoLoader::Uncounted);
-
-  // Instrument all of the edges not in the MST...
-  unsigned i = 0;
-  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
-    if (F->isDeclaration()) continue;
-    DEBUG(dbgs() << "Working on " << F->getName() << "\n");
-
-    // Calculate a Maximum Spanning Tree with the edge weights determined by
-    // ProfileEstimator. ProfileEstimator also assigns weights to the virtual
-    // edges (0,entry) and (BB,0) (for blocks with no successors), and these
-    // edges also participate in the maximum spanning tree calculation.
-    // The third parameter of MaximumSpanningTree() has the effect that the
-    // edges _not_ in the MST are returned instead of the actual MST.
-
-    ProfileInfo::EdgeWeights ECs =
-      getAnalysis<ProfileInfo>(*F).getEdgeWeights(F);
-    std::vector<ProfileInfo::EdgeWeight> EdgeVector(ECs.begin(), ECs.end());
-    MaximumSpanningTree<BasicBlock> MST(EdgeVector);
-    std::stable_sort(MST.begin(), MST.end());
-
-    // Check whether (0,entry) is in the MST. If it is not, instrument the
-    // edge (IncrementCounterInBlock()) and set the counter initially to
-    // zero; if the edge is in the MST the counter is initialised to -1.
-
-    BasicBlock *entry = &(F->getEntryBlock());
-    ProfileInfo::Edge edge = ProfileInfo::getEdge(0, entry);
-    if (!std::binary_search(MST.begin(), MST.end(), edge)) {
-      printEdgeCounter(edge, entry, i);
-      IncrementCounterInBlock(entry, i, Counters); ++NumEdgesInserted;
-      Initializer[i++] = (Zero);
-    } else {
-      Initializer[i++] = (Uncounted);
-    }
-
-    // InsertedBlocks contains all blocks that were inserted for splitting an
-    // edge; these blocks do not have to be instrumented.
-    DenseSet<BasicBlock*> InsertedBlocks;
-    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
-      // Skip blocks that were inserted by edge splitting; they do not have
-      // to be instrumented.
-      if (InsertedBlocks.count(BB)) continue;
-
-      // Okay, we have to add a counter for each outgoing edge not in the
-      // MST. If the outgoing edge is not critical don't split it, just
-      // insert the counter in the source or destination of the edge. Also,
-      // if the block has no successors, the virtual edge (BB,0) is processed.
-      TerminatorInst *TI = BB->getTerminator();
-      if (TI->getNumSuccessors() == 0) {
-        ProfileInfo::Edge edge = ProfileInfo::getEdge(BB, 0);
-        if (!std::binary_search(MST.begin(), MST.end(), edge)) {
-          printEdgeCounter(edge, BB, i);
-          IncrementCounterInBlock(BB, i, Counters); ++NumEdgesInserted;
-          Initializer[i++] = (Zero);
-        } else {
-          Initializer[i++] = (Uncounted);
-        }
-      }
-      for (unsigned s = 0, e = TI->getNumSuccessors(); s != e; ++s) {
-        BasicBlock *Succ = TI->getSuccessor(s);
-        ProfileInfo::Edge edge = ProfileInfo::getEdge(BB,Succ);
-        if (!std::binary_search(MST.begin(), MST.end(), edge)) {
-
-          // If the edge is critical, split it.
-          bool wasInserted = SplitCriticalEdge(TI, s, this);
-          Succ = TI->getSuccessor(s);
-          if (wasInserted)
-            InsertedBlocks.insert(Succ);
-
-          // Okay, we are guaranteed that the edge is no longer critical. If
-          // we only have a single successor, insert the counter in this
-          // block, otherwise insert it in the successor block.
-          if (TI->getNumSuccessors() == 1) {
-            // Insert counter at the start of the block
-            printEdgeCounter(edge, BB, i);
-            IncrementCounterInBlock(BB, i, Counters); ++NumEdgesInserted;
-          } else {
-            // Insert counter at the start of the successor block.
-            printEdgeCounter(edge, Succ, i);
-            IncrementCounterInBlock(Succ, i, Counters); ++NumEdgesInserted;
-          }
-          Initializer[i++] = (Zero);
-        } else {
-          Initializer[i++] = (Uncounted);
-        }
-      }
-    }
-  }
-
-  // Check that the number of edges counted at first matches the number of
-  // edges we considered for instrumentation.
-  assert(i == NumEdges && "the number of edges in counting array is wrong");
-
-  // Assign the now completely defined initialiser to the array.
-  Constant *init = ConstantArray::get(ATy, Initializer);
-  Counters->setInitializer(init);
-
-  // Add the initialization call to main.
-  InsertProfilingInitCall(Main, "llvm_start_opt_edge_profiling", Counters);
-  return true;
-}
-
diff --git a/lib/Transforms/Instrumentation/PathProfiling.cpp b/lib/Transforms/Instrumentation/PathProfiling.cpp
deleted file mode 100644
index 7de7326..0000000
--- a/lib/Transforms/Instrumentation/PathProfiling.cpp
+++ /dev/null
@@ -1,1424 +0,0 @@
-//===- PathProfiling.cpp - Inserts counters for path profiling ------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass instruments functions for Ball-Larus path profiling. Ball-Larus
-// profiling converts the CFG into a DAG by replacing backedges with edges
-// from entry to the start block and from the end block to exit. The paths
-// along the new DAG are enumerated, i.e. each path is given a path number.
-// Edges are instrumented to increment the path number register, such that the
-// path number register will equal the path number of the path taken at the
-// exit.
-//
-// This file defines classes for building a CFG for use with different stages
-// in the Ball-Larus path profiling instrumentation [Ball96]. The stages are:
-// formatting the llvm CFG into the Ball-Larus DAG, path numbering, finding a
-// spanning tree, and moving increments from the spanning tree to chords.
-//
-// Terms:
-// DAG - Directed Acyclic Graph.
-// Ball-Larus DAG - A CFG with an entry node, an exit node, and backedges
-//                  removed in the following manner. For every backedge
-//                  v->w, insert edge ENTRY->w and edge v->EXIT.
-// Path Number - The number corresponding to a specific path through a
-//               Ball-Larus DAG.
-// Spanning Tree - A subgraph, S, is a spanning tree if S covers all
-//                 vertices and is a tree.
-// Chord - An edge not in the spanning tree.
-//
-// [Ball96]
-// T. Ball and J. R. Larus. "Efficient Path Profiling."
-// International Symposium on Microarchitecture, pages 46-57, 1996.
-// http://portal.acm.org/citation.cfm?id=243857
-//
-// [Ball94]
-// Thomas Ball. "Efficiently Counting Program Events with Support for
-// On-line Queries."
-// ACM Transactions on Programming Languages and Systems, Vol 16, No 5,
-// September 1994, Pages 1399-1410.
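The numbering scheme described above can be made concrete with a small sketch: compute NumPaths bottom-up, and give each outgoing edge the number of paths already accounted for at its source. Summing the edge values along any entry-to-exit path then yields a distinct number in [0, NumPaths). This is a simplified illustration of the [Ball96] assignment, not the BallLarusDag code below:

#include <cstddef>
#include <utility>
#include <vector>

// Simplified DAG node; successor edges carry their Ball-Larus edge value.
struct Node {
  std::vector<std::pair<Node*, long> > Succs; // (target, edge value)
  long NumPaths;                              // 0 means "not yet computed"
  Node() : NumPaths(0) {}
};

// Counts ENTRY->EXIT paths from V and fills in the edge values.
long numberPaths(Node *V) {
  if (V->NumPaths) return V->NumPaths;          // memoized
  if (V->Succs.empty()) return V->NumPaths = 1; // EXIT node
  long Total = 0;
  for (size_t I = 0; I < V->Succs.size(); ++I) {
    V->Succs[I].second = Total; // paths counted before taking this edge
    Total += numberPaths(V->Succs[I].first);
  }
  return V->NumPaths = Total;
}

The path number register maintained by the instrumentation is exactly this running sum of edge values.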
-//===----------------------------------------------------------------------===// -#define DEBUG_TYPE "insert-path-profiling" - -#include "llvm/Transforms/Instrumentation.h" -#include "ProfilingUtils.h" -#include "llvm/Analysis/PathNumbering.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/InstrTypes.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/TypeBuilder.h" -#include "llvm/Pass.h" -#include "llvm/Support/CFG.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Compiler.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include <vector> - -#define HASH_THRESHHOLD 100000 - -using namespace llvm; - -namespace { -class BLInstrumentationNode; -class BLInstrumentationEdge; -class BLInstrumentationDag; - -// --------------------------------------------------------------------------- -// BLInstrumentationNode extends BallLarusNode with member used by the -// instrumentation algortihms. -// --------------------------------------------------------------------------- -class BLInstrumentationNode : public BallLarusNode { -public: - // Creates a new BLInstrumentationNode from a BasicBlock. - BLInstrumentationNode(BasicBlock* BB); - - // Get/sets the Value corresponding to the pathNumber register, - // constant or phinode. Used by the instrumentation code to remember - // path number Values. - Value* getStartingPathNumber(); - void setStartingPathNumber(Value* pathNumber); - - Value* getEndingPathNumber(); - void setEndingPathNumber(Value* pathNumber); - - // Get/set the PHINode Instruction for this node. - PHINode* getPathPHI(); - void setPathPHI(PHINode* pathPHI); - -private: - - Value* _startingPathNumber; // The Value for the current pathNumber. - Value* _endingPathNumber; // The Value for the current pathNumber. - PHINode* _pathPHI; // The PHINode for current pathNumber. -}; - -// -------------------------------------------------------------------------- -// BLInstrumentationEdge extends BallLarusEdge with data about the -// instrumentation that will end up on each edge. -// -------------------------------------------------------------------------- -class BLInstrumentationEdge : public BallLarusEdge { -public: - BLInstrumentationEdge(BLInstrumentationNode* source, - BLInstrumentationNode* target); - - // Sets the target node of this edge. Required to split edges. - void setTarget(BallLarusNode* node); - - // Get/set whether edge is in the spanning tree. - bool isInSpanningTree() const; - void setIsInSpanningTree(bool isInSpanningTree); - - // Get/ set whether this edge will be instrumented with a path number - // initialization. - bool isInitialization() const; - void setIsInitialization(bool isInitialization); - - // Get/set whether this edge will be instrumented with a path counter - // increment. Notice this is incrementing the path counter - // corresponding to the path number register. The path number - // increment is determined by getIncrement(). - bool isCounterIncrement() const; - void setIsCounterIncrement(bool isCounterIncrement); - - // Get/set the path number increment that this edge will be instrumented - // with. This is distinct from the path counter increment and the - // weight. The counter increment counts the number of executions of - // some path, whereas the path number keeps track of which path number - // the program is on. 
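To keep the two kinds of increments apart, the following editorial sketch shows what an instrumented function does at run time in the array-storage case; the branch structure and increment values are invented:

#include <cstdint>

static uint32_t PathCounters[4]; // hypothetical: NumberOfPaths == 4

void instrumentedRun(bool a, bool b) {
  uint32_t r = 0;    // path number register (initialization)
  if (!a) r += 2;    // path number increment on a chord edge
  if (!b) r += 1;    // path number increment on a chord edge
  ++PathCounters[r]; // path counter increment: one bump per execution
}

Each of the four paths lands on a distinct index r, so the counter array accumulates per-path execution counts.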
- long getIncrement() const; - void setIncrement(long increment); - - // Get/set whether the edge has been instrumented. - bool hasInstrumentation(); - void setHasInstrumentation(bool hasInstrumentation); - - // Returns the successor number of this edge in the source. - unsigned getSuccessorNumber(); - -private: - // The increment that the code will be instrumented with. - long long _increment; - - // Whether this edge is in the spanning tree. - bool _isInSpanningTree; - - // Whether this edge is an initialiation of the path number. - bool _isInitialization; - - // Whether this edge is a path counter increment. - bool _isCounterIncrement; - - // Whether this edge has been instrumented. - bool _hasInstrumentation; -}; - -// --------------------------------------------------------------------------- -// BLInstrumentationDag extends BallLarusDag with algorithms that -// determine where instrumentation should be placed. -// --------------------------------------------------------------------------- -class BLInstrumentationDag : public BallLarusDag { -public: - BLInstrumentationDag(Function &F); - - // Returns the Exit->Root edge. This edge is required for creating - // directed cycles in the algorithm for moving instrumentation off of - // the spanning tree - BallLarusEdge* getExitRootEdge(); - - // Returns an array of phony edges which mark those nodes - // with function calls - BLEdgeVector getCallPhonyEdges(); - - // Gets/sets the path counter array - GlobalVariable* getCounterArray(); - void setCounterArray(GlobalVariable* c); - - // Calculates the increments for the chords, thereby removing - // instrumentation from the spanning tree edges. Implementation is based - // on the algorithm in Figure 4 of [Ball94] - void calculateChordIncrements(); - - // Updates the state when an edge has been split - void splitUpdate(BLInstrumentationEdge* formerEdge, BasicBlock* newBlock); - - // Calculates a spanning tree of the DAG ignoring cycles. Whichever - // edges are in the spanning tree will not be instrumented, but this - // implementation does not try to minimize the instrumentation overhead - // by trying to find hot edges. - void calculateSpanningTree(); - - // Pushes initialization further down in order to group the first - // increment and initialization. - void pushInitialization(); - - // Pushes the path counter increments up in order to group the last path - // number increment. - void pushCounters(); - - // Removes phony edges from the successor list of the source, and the - // predecessor list of the target. - void unlinkPhony(); - - // Generate dot graph for the function - void generateDotGraph(); - -protected: - // BLInstrumentationDag creates BLInstrumentationNode objects in this - // method overriding the creation of BallLarusNode objects. - // - // Allows subclasses to determine which type of Node is created. - // Override this method to produce subclasses of BallLarusNode if - // necessary. - virtual BallLarusNode* createNode(BasicBlock* BB); - - // BLInstrumentationDag create BLInstrumentationEdges. - // - // Allows subclasses to determine which type of Edge is created. - // Override this method to produce subclasses of BallLarusEdge if - // necessary. Parameters source and target will have been created by - // createNode and can be cast to the subclass of BallLarusNode* - // returned by createNode. - virtual BallLarusEdge* createEdge( - BallLarusNode* source, BallLarusNode* target, unsigned edgeNumber); - -private: - BLEdgeVector _treeEdges; // All edges in the spanning tree. 
- BLEdgeVector _chordEdges; // All edges not in the spanning tree. - GlobalVariable* _counterArray; // Array to store path counters - - // Removes the edge from the appropriate predecessor and successor lists. - void unlinkEdge(BallLarusEdge* edge); - - // Makes an edge part of the spanning tree. - void makeEdgeSpanning(BLInstrumentationEdge* edge); - - // Pushes initialization and calls itself recursively. - void pushInitializationFromEdge(BLInstrumentationEdge* edge); - - // Pushes path counter increments up recursively. - void pushCountersFromEdge(BLInstrumentationEdge* edge); - - // Depth first algorithm for determining the chord increments.f - void calculateChordIncrementsDfs( - long weight, BallLarusNode* v, BallLarusEdge* e); - - // Determines the relative direction of two edges. - int calculateChordIncrementsDir(BallLarusEdge* e, BallLarusEdge* f); -}; - -// --------------------------------------------------------------------------- -// PathProfiler is a module pass which instruments path profiling instructions -// --------------------------------------------------------------------------- -class PathProfiler : public ModulePass { -private: - // Current context for multi threading support. - LLVMContext* Context; - - // Which function are we currently instrumenting - unsigned currentFunctionNumber; - - // The function prototype in the profiling runtime for incrementing a - // single path counter in a hash table. - Constant* llvmIncrementHashFunction; - Constant* llvmDecrementHashFunction; - - // Instruments each function with path profiling. 'main' is instrumented - // with code to save the profile to disk. - bool runOnModule(Module &M); - - // Analyzes the function for Ball-Larus path profiling, and inserts code. - void runOnFunction(std::vector<Constant*> &ftInit, Function &F, Module &M); - - // Creates an increment constant representing incr. - ConstantInt* createIncrementConstant(long incr, int bitsize); - - // Creates an increment constant representing the value in - // edge->getIncrement(). - ConstantInt* createIncrementConstant(BLInstrumentationEdge* edge); - - // Finds the insertion point after pathNumber in block. PathNumber may - // be NULL. - BasicBlock::iterator getInsertionPoint( - BasicBlock* block, Value* pathNumber); - - // Inserts source's pathNumber Value* into target. Target may or may not - // have multiple predecessors, and may or may not have its phiNode - // initalized. - void pushValueIntoNode( - BLInstrumentationNode* source, BLInstrumentationNode* target); - - // Inserts source's pathNumber Value* into the appropriate slot of - // target's phiNode. - void pushValueIntoPHI( - BLInstrumentationNode* target, BLInstrumentationNode* source); - - // The Value* in node, oldVal, is updated with a Value* correspodning to - // oldVal + addition. - void insertNumberIncrement(BLInstrumentationNode* node, Value* addition, - bool atBeginning); - - // Creates a counter increment in the given node. The Value* in node is - // taken as the index into a hash table. - void insertCounterIncrement( - Value* incValue, - BasicBlock::iterator insertPoint, - BLInstrumentationDag* dag, - bool increment = true); - - // A PHINode is created in the node, and its values initialized to -1U. - void preparePHI(BLInstrumentationNode* node); - - // Inserts instrumentation for the given edge - // - // Pre: The edge's source node has pathNumber set if edge is non zero - // path number increment. 
- // - // Post: Edge's target node has a pathNumber set to the path number Value - // corresponding to the value of the path register after edge's - // execution. - void insertInstrumentationStartingAt( - BLInstrumentationEdge* edge, - BLInstrumentationDag* dag); - - // If this edge is a critical edge, then inserts a node at this edge. - // This edge becomes the first edge, and a new BallLarusEdge is created. - bool splitCritical(BLInstrumentationEdge* edge, BLInstrumentationDag* dag); - - // Inserts instrumentation according to the marked edges in dag. Phony - // edges must be unlinked from the DAG, but accessible from the - // backedges. Dag must have initializations, path number increments, and - // counter increments present. - // - // Counter storage is created here. - void insertInstrumentation( BLInstrumentationDag& dag, Module &M); - -public: - static char ID; // Pass identification, replacement for typeid - PathProfiler() : ModulePass(ID) { - initializePathProfilerPass(*PassRegistry::getPassRegistry()); - } - - virtual const char *getPassName() const { - return "Path Profiler"; - } -}; -} // end anonymous namespace - -// Should we print the dot-graphs -static cl::opt<bool> DotPathDag("path-profile-pathdag", cl::Hidden, - cl::desc("Output the path profiling DAG for each function.")); - -// Register the path profiler as a pass -char PathProfiler::ID = 0; -INITIALIZE_PASS(PathProfiler, "insert-path-profiling", - "Insert instrumentation for Ball-Larus path profiling", - false, false) - -ModulePass *llvm::createPathProfilerPass() { return new PathProfiler(); } - -namespace llvm { - class PathProfilingFunctionTable {}; - - // Type for global array storing references to hashes or arrays - template<bool xcompile> class TypeBuilder<PathProfilingFunctionTable, - xcompile> { - public: - static StructType *get(LLVMContext& C) { - return( StructType::get( - TypeBuilder<types::i<32>, xcompile>::get(C), // type - TypeBuilder<types::i<32>, xcompile>::get(C), // array size - TypeBuilder<types::i<8>*, xcompile>::get(C), // array/hash ptr - NULL)); - } - }; - - typedef TypeBuilder<PathProfilingFunctionTable, true> - ftEntryTypeBuilder; - - // BallLarusEdge << operator overloading - raw_ostream& operator<<(raw_ostream& os, - const BLInstrumentationEdge& edge) - LLVM_ATTRIBUTE_USED; - raw_ostream& operator<<(raw_ostream& os, - const BLInstrumentationEdge& edge) { - os << "[" << edge.getSource()->getName() << " -> " - << edge.getTarget()->getName() << "] init: " - << (edge.isInitialization() ? "yes" : "no") - << " incr:" << edge.getIncrement() << " cinc: " - << (edge.isCounterIncrement() ? "yes" : "no"); - return(os); - } -} - -// Creates a new BLInstrumentationNode from a BasicBlock. -BLInstrumentationNode::BLInstrumentationNode(BasicBlock* BB) : - BallLarusNode(BB), - _startingPathNumber(NULL), _endingPathNumber(NULL), _pathPHI(NULL) {} - -// Constructor for BLInstrumentationEdge. -BLInstrumentationEdge::BLInstrumentationEdge(BLInstrumentationNode* source, - BLInstrumentationNode* target) - : BallLarusEdge(source, target, 0), - _increment(0), _isInSpanningTree(false), _isInitialization(false), - _isCounterIncrement(false), _hasInstrumentation(false) {} - -// Sets the target node of this edge. Required to split edges. -void BLInstrumentationEdge::setTarget(BallLarusNode* node) { - _target = node; -} - -// Returns whether this edge is in the spanning tree. -bool BLInstrumentationEdge::isInSpanningTree() const { - return(_isInSpanningTree); -} - -// Sets whether this edge is in the spanning tree. 
-void BLInstrumentationEdge::setIsInSpanningTree(bool isInSpanningTree) { - _isInSpanningTree = isInSpanningTree; -} - -// Returns whether this edge will be instrumented with a path number -// initialization. -bool BLInstrumentationEdge::isInitialization() const { - return(_isInitialization); -} - -// Sets whether this edge will be instrumented with a path number -// initialization. -void BLInstrumentationEdge::setIsInitialization(bool isInitialization) { - _isInitialization = isInitialization; -} - -// Returns whether this edge will be instrumented with a path counter -// increment. Notice this is incrementing the path counter -// corresponding to the path number register. The path number -// increment is determined by getIncrement(). -bool BLInstrumentationEdge::isCounterIncrement() const { - return(_isCounterIncrement); -} - -// Sets whether this edge will be instrumented with a path counter -// increment. -void BLInstrumentationEdge::setIsCounterIncrement(bool isCounterIncrement) { - _isCounterIncrement = isCounterIncrement; -} - -// Gets the path number increment that this edge will be instrumented -// with. This is distinct from the path counter increment and the -// weight. The counter increment is counts the number of executions of -// some path, whereas the path number keeps track of which path number -// the program is on. -long BLInstrumentationEdge::getIncrement() const { - return(_increment); -} - -// Set whether this edge will be instrumented with a path number -// increment. -void BLInstrumentationEdge::setIncrement(long increment) { - _increment = increment; -} - -// True iff the edge has already been instrumented. -bool BLInstrumentationEdge::hasInstrumentation() { - return(_hasInstrumentation); -} - -// Set whether this edge has been instrumented. -void BLInstrumentationEdge::setHasInstrumentation(bool hasInstrumentation) { - _hasInstrumentation = hasInstrumentation; -} - -// Returns the successor number of this edge in the source. -unsigned BLInstrumentationEdge::getSuccessorNumber() { - BallLarusNode* sourceNode = getSource(); - BallLarusNode* targetNode = getTarget(); - BasicBlock* source = sourceNode->getBlock(); - BasicBlock* target = targetNode->getBlock(); - - if(source == NULL || target == NULL) - return(0); - - TerminatorInst* terminator = source->getTerminator(); - - unsigned i; - for(i=0; i < terminator->getNumSuccessors(); i++) { - if(terminator->getSuccessor(i) == target) - break; - } - - return(i); -} - -// BLInstrumentationDag constructor initializes a DAG for the given Function. -BLInstrumentationDag::BLInstrumentationDag(Function &F) : BallLarusDag(F), - _counterArray(0) { -} - -// Returns the Exit->Root edge. 
This edge is required for creating -// directed cycles in the algorithm for moving instrumentation off of -// the spanning tree -BallLarusEdge* BLInstrumentationDag::getExitRootEdge() { - BLEdgeIterator erEdge = getExit()->succBegin(); - return(*erEdge); -} - -BLEdgeVector BLInstrumentationDag::getCallPhonyEdges () { - BLEdgeVector callEdges; - - for( BLEdgeIterator edge = _edges.begin(), end = _edges.end(); - edge != end; edge++ ) { - if( (*edge)->getType() == BallLarusEdge::CALLEDGE_PHONY ) - callEdges.push_back(*edge); - } - - return callEdges; -} - -// Gets the path counter array -GlobalVariable* BLInstrumentationDag::getCounterArray() { - return _counterArray; -} - -void BLInstrumentationDag::setCounterArray(GlobalVariable* c) { - _counterArray = c; -} - -// Calculates the increment for the chords, thereby removing -// instrumentation from the spanning tree edges. Implementation is based on -// the algorithm in Figure 4 of [Ball94] -void BLInstrumentationDag::calculateChordIncrements() { - calculateChordIncrementsDfs(0, getRoot(), NULL); - - BLInstrumentationEdge* chord; - for(BLEdgeIterator chordEdge = _chordEdges.begin(), - end = _chordEdges.end(); chordEdge != end; chordEdge++) { - chord = (BLInstrumentationEdge*) *chordEdge; - chord->setIncrement(chord->getIncrement() + chord->getWeight()); - } -} - -// Updates the state when an edge has been split -void BLInstrumentationDag::splitUpdate(BLInstrumentationEdge* formerEdge, - BasicBlock* newBlock) { - BallLarusNode* oldTarget = formerEdge->getTarget(); - BallLarusNode* newNode = addNode(newBlock); - formerEdge->setTarget(newNode); - newNode->addPredEdge(formerEdge); - - DEBUG(dbgs() << " Edge split: " << *formerEdge << "\n"); - - oldTarget->removePredEdge(formerEdge); - BallLarusEdge* newEdge = addEdge(newNode, oldTarget,0); - - if( formerEdge->getType() == BallLarusEdge::BACKEDGE || - formerEdge->getType() == BallLarusEdge::SPLITEDGE) { - newEdge->setType(formerEdge->getType()); - newEdge->setPhonyRoot(formerEdge->getPhonyRoot()); - newEdge->setPhonyExit(formerEdge->getPhonyExit()); - formerEdge->setType(BallLarusEdge::NORMAL); - formerEdge->setPhonyRoot(NULL); - formerEdge->setPhonyExit(NULL); - } -} - -// Calculates a spanning tree of the DAG ignoring cycles. Whichever -// edges are in the spanning tree will not be instrumented, but this -// implementation does not try to minimize the instrumentation overhead -// by trying to find hot edges. -void BLInstrumentationDag::calculateSpanningTree() { - std::stack<BallLarusNode*> dfsStack; - - for(BLNodeIterator nodeIt = _nodes.begin(), end = _nodes.end(); - nodeIt != end; nodeIt++) { - (*nodeIt)->setColor(BallLarusNode::WHITE); - } - - dfsStack.push(getRoot()); - while(dfsStack.size() > 0) { - BallLarusNode* node = dfsStack.top(); - dfsStack.pop(); - - if(node->getColor() == BallLarusNode::WHITE) - continue; - - BallLarusNode* nextNode; - bool forward = true; - BLEdgeIterator succEnd = node->succEnd(); - - node->setColor(BallLarusNode::WHITE); - // first iterate over successors then predecessors - for(BLEdgeIterator edge = node->succBegin(), predEnd = node->predEnd(); - edge != predEnd; edge++) { - if(edge == succEnd) { - edge = node->predBegin(); - forward = false; - } - - // Ignore split edges - if ((*edge)->getType() == BallLarusEdge::SPLITEDGE) - continue; - - nextNode = forward? 
(*edge)->getTarget(): (*edge)->getSource(); - if(nextNode->getColor() != BallLarusNode::WHITE) { - nextNode->setColor(BallLarusNode::WHITE); - makeEdgeSpanning((BLInstrumentationEdge*)(*edge)); - } - } - } - - for(BLEdgeIterator edge = _edges.begin(), end = _edges.end(); - edge != end; edge++) { - BLInstrumentationEdge* instEdge = (BLInstrumentationEdge*) (*edge); - // safe since createEdge is overriden - if(!instEdge->isInSpanningTree() && (*edge)->getType() - != BallLarusEdge::SPLITEDGE) - _chordEdges.push_back(instEdge); - } -} - -// Pushes initialization further down in order to group the first -// increment and initialization. -void BLInstrumentationDag::pushInitialization() { - BLInstrumentationEdge* exitRootEdge = - (BLInstrumentationEdge*) getExitRootEdge(); - exitRootEdge->setIsInitialization(true); - pushInitializationFromEdge(exitRootEdge); -} - -// Pushes the path counter increments up in order to group the last path -// number increment. -void BLInstrumentationDag::pushCounters() { - BLInstrumentationEdge* exitRootEdge = - (BLInstrumentationEdge*) getExitRootEdge(); - exitRootEdge->setIsCounterIncrement(true); - pushCountersFromEdge(exitRootEdge); -} - -// Removes phony edges from the successor list of the source, and the -// predecessor list of the target. -void BLInstrumentationDag::unlinkPhony() { - BallLarusEdge* edge; - - for(BLEdgeIterator next = _edges.begin(), - end = _edges.end(); next != end; next++) { - edge = (*next); - - if( edge->getType() == BallLarusEdge::BACKEDGE_PHONY || - edge->getType() == BallLarusEdge::SPLITEDGE_PHONY || - edge->getType() == BallLarusEdge::CALLEDGE_PHONY ) { - unlinkEdge(edge); - } - } -} - -// Generate a .dot graph to represent the DAG and pathNumbers -void BLInstrumentationDag::generateDotGraph() { - std::string errorInfo; - std::string functionName = getFunction().getName().str(); - std::string filename = "pathdag." + functionName + ".dot"; - - DEBUG (dbgs() << "Writing '" << filename << "'...\n"); - raw_fd_ostream dotFile(filename.c_str(), errorInfo); - - if (!errorInfo.empty()) { - errs() << "Error opening '" << filename.c_str() <<"' for writing!"; - errs() << "\n"; - return; - } - - dotFile << "digraph " << functionName << " {\n"; - - for( BLEdgeIterator edge = _edges.begin(), end = _edges.end(); - edge != end; edge++) { - std::string sourceName = (*edge)->getSource()->getName(); - std::string targetName = (*edge)->getTarget()->getName(); - - dotFile << "\t\"" << sourceName.c_str() << "\" -> \"" - << targetName.c_str() << "\" "; - - long inc = ((BLInstrumentationEdge*)(*edge))->getIncrement(); - - switch( (*edge)->getType() ) { - case BallLarusEdge::NORMAL: - dotFile << "[label=" << inc << "] [color=black];\n"; - break; - - case BallLarusEdge::BACKEDGE: - dotFile << "[color=cyan];\n"; - break; - - case BallLarusEdge::BACKEDGE_PHONY: - dotFile << "[label=" << inc - << "] [color=blue];\n"; - break; - - case BallLarusEdge::SPLITEDGE: - dotFile << "[color=violet];\n"; - break; - - case BallLarusEdge::SPLITEDGE_PHONY: - dotFile << "[label=" << inc << "] [color=red];\n"; - break; - - case BallLarusEdge::CALLEDGE_PHONY: - dotFile << "[label=" << inc << "] [color=green];\n"; - break; - } - } - - dotFile << "}\n"; -} - -// Allows subclasses to determine which type of Node is created. -// Override this method to produce subclasses of BallLarusNode if -// necessary. The destructor of BallLarusDag will call free on each pointer -// created. 
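The factory hooks described above follow the classic virtual-constructor pattern; a hypothetical further subclass would extend the DAG as in this sketch (MyNode and MyDag are invented names; the pass's own overrides follow immediately below):

// Invented subclass pair, for illustration of the override pattern only.
class MyNode : public BLInstrumentationNode {
public:
  MyNode(BasicBlock* BB) : BLInstrumentationNode(BB) {}
};

class MyDag : public BLInstrumentationDag {
public:
  MyDag(Function &F) : BLInstrumentationDag(F) {}
protected:
  virtual BallLarusNode* createNode(BasicBlock* BB) {
    return new MyNode(BB); // the base-class builder now materializes MyNodes
  }
};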
-BallLarusNode* BLInstrumentationDag::createNode(BasicBlock* BB) { - return( new BLInstrumentationNode(BB) ); -} - -// Allows subclasses to determine which type of Edge is created. -// Override this method to produce subclasses of BallLarusEdge if -// necessary. The destructor of BallLarusDag will call free on each pointer -// created. -BallLarusEdge* BLInstrumentationDag::createEdge(BallLarusNode* source, - BallLarusNode* target, unsigned edgeNumber) { - // One can cast from BallLarusNode to BLInstrumentationNode since createNode - // is overriden to produce BLInstrumentationNode. - return( new BLInstrumentationEdge((BLInstrumentationNode*)source, - (BLInstrumentationNode*)target) ); -} - -// Sets the Value corresponding to the pathNumber register, constant, -// or phinode. Used by the instrumentation code to remember path -// number Values. -Value* BLInstrumentationNode::getStartingPathNumber(){ - return(_startingPathNumber); -} - -// Sets the Value of the pathNumber. Used by the instrumentation code. -void BLInstrumentationNode::setStartingPathNumber(Value* pathNumber) { - DEBUG(dbgs() << " SPN-" << getName() << " <-- " << (pathNumber ? - pathNumber->getName() : - "unused") << "\n"); - _startingPathNumber = pathNumber; -} - -Value* BLInstrumentationNode::getEndingPathNumber(){ - return(_endingPathNumber); -} - -void BLInstrumentationNode::setEndingPathNumber(Value* pathNumber) { - DEBUG(dbgs() << " EPN-" << getName() << " <-- " - << (pathNumber ? pathNumber->getName() : "unused") << "\n"); - _endingPathNumber = pathNumber; -} - -// Get the PHINode Instruction for this node. Used by instrumentation -// code. -PHINode* BLInstrumentationNode::getPathPHI() { - return(_pathPHI); -} - -// Set the PHINode Instruction for this node. Used by instrumentation -// code. -void BLInstrumentationNode::setPathPHI(PHINode* pathPHI) { - _pathPHI = pathPHI; -} - -// Removes the edge from the appropriate predecessor and successor -// lists. -void BLInstrumentationDag::unlinkEdge(BallLarusEdge* edge) { - if(edge == getExitRootEdge()) - DEBUG(dbgs() << " Removing exit->root edge\n"); - - edge->getSource()->removeSuccEdge(edge); - edge->getTarget()->removePredEdge(edge); -} - -// Makes an edge part of the spanning tree. -void BLInstrumentationDag::makeEdgeSpanning(BLInstrumentationEdge* edge) { - edge->setIsInSpanningTree(true); - _treeEdges.push_back(edge); -} - -// Pushes initialization and calls itself recursively. -void BLInstrumentationDag::pushInitializationFromEdge( - BLInstrumentationEdge* edge) { - BallLarusNode* target; - - target = edge->getTarget(); - if( target->getNumberPredEdges() > 1 || target == getExit() ) { - return; - } else { - for(BLEdgeIterator next = target->succBegin(), - end = target->succEnd(); next != end; next++) { - BLInstrumentationEdge* intoEdge = (BLInstrumentationEdge*) *next; - - // Skip split edges - if (intoEdge->getType() == BallLarusEdge::SPLITEDGE) - continue; - - intoEdge->setIncrement(intoEdge->getIncrement() + - edge->getIncrement()); - intoEdge->setIsInitialization(true); - pushInitializationFromEdge(intoEdge); - } - - edge->setIncrement(0); - edge->setIsInitialization(false); - } -} - -// Pushes path counter increments up recursively. 
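A note on why pushInitializationFromEdge above (and pushCountersFromEdge below) may move increments at all: a node with a single predecessor lies on every path that uses that incoming edge, so folding the edge's increment into each outgoing edge leaves every path's total unchanged. A toy check of that arithmetic, with invented increment values:

#include <cassert>

int main() {
  long incE = 5, incOut1 = 2, incOut2 = 7; // edge e into a node, two out-edges
  // Before the push, a path pays incE plus one outgoing increment.
  long pathA = incE + incOut1, pathB = incE + incOut2;
  // After the push, e's increment is folded into each outgoing edge.
  long pushedE = 0, pushedOut1 = incOut1 + incE, pushedOut2 = incOut2 + incE;
  assert(pathA == pushedE + pushedOut1);
  assert(pathB == pushedE + pushedOut2);
  return 0;
}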
-void BLInstrumentationDag::pushCountersFromEdge(BLInstrumentationEdge* edge) { - BallLarusNode* source; - - source = edge->getSource(); - if(source->getNumberSuccEdges() > 1 || source == getRoot() - || edge->isInitialization()) { - return; - } else { - for(BLEdgeIterator previous = source->predBegin(), - end = source->predEnd(); previous != end; previous++) { - BLInstrumentationEdge* fromEdge = (BLInstrumentationEdge*) *previous; - - // Skip split edges - if (fromEdge->getType() == BallLarusEdge::SPLITEDGE) - continue; - - fromEdge->setIncrement(fromEdge->getIncrement() + - edge->getIncrement()); - fromEdge->setIsCounterIncrement(true); - pushCountersFromEdge(fromEdge); - } - - edge->setIncrement(0); - edge->setIsCounterIncrement(false); - } -} - -// Depth first algorithm for determining the chord increments. -void BLInstrumentationDag::calculateChordIncrementsDfs(long weight, - BallLarusNode* v, BallLarusEdge* e) { - BLInstrumentationEdge* f; - - for(BLEdgeIterator treeEdge = _treeEdges.begin(), - end = _treeEdges.end(); treeEdge != end; treeEdge++) { - f = (BLInstrumentationEdge*) *treeEdge; - if(e != f && v == f->getTarget()) { - calculateChordIncrementsDfs( - calculateChordIncrementsDir(e,f)*(weight) + - f->getWeight(), f->getSource(), f); - } - if(e != f && v == f->getSource()) { - calculateChordIncrementsDfs( - calculateChordIncrementsDir(e,f)*(weight) + - f->getWeight(), f->getTarget(), f); - } - } - - for(BLEdgeIterator chordEdge = _chordEdges.begin(), - end = _chordEdges.end(); chordEdge != end; chordEdge++) { - f = (BLInstrumentationEdge*) *chordEdge; - if(v == f->getSource() || v == f->getTarget()) { - f->setIncrement(f->getIncrement() + - calculateChordIncrementsDir(e,f)*weight); - } - } -} - -// Determines the relative direction of two edges. -int BLInstrumentationDag::calculateChordIncrementsDir(BallLarusEdge* e, - BallLarusEdge* f) { - if( e == NULL) - return(1); - else if(e->getSource() == f->getTarget() - || e->getTarget() == f->getSource()) - return(1); - - return(-1); -} - -// Creates an increment constant representing incr. -ConstantInt* PathProfiler::createIncrementConstant(long incr, - int bitsize) { - return(ConstantInt::get(IntegerType::get(*Context, 32), incr)); -} - -// Creates an increment constant representing the value in -// edge->getIncrement(). -ConstantInt* PathProfiler::createIncrementConstant( - BLInstrumentationEdge* edge) { - return(createIncrementConstant(edge->getIncrement(), 32)); -} - -// Finds the insertion point after pathNumber in block. PathNumber may -// be NULL. -BasicBlock::iterator PathProfiler::getInsertionPoint(BasicBlock* block, Value* - pathNumber) { - if(pathNumber == NULL || isa<ConstantInt>(pathNumber) - || (((Instruction*)(pathNumber))->getParent()) != block) { - return(block->getFirstInsertionPt()); - } else { - Instruction* pathNumberInst = (Instruction*) (pathNumber); - BasicBlock::iterator insertPoint; - BasicBlock::iterator end = block->end(); - - for(insertPoint = block->begin(); - insertPoint != end; insertPoint++) { - Instruction* insertInst = &(*insertPoint); - - if(insertInst == pathNumberInst) - return(++insertPoint); - } - - return(insertPoint); - } -} - -// A PHINode is created in the node, and its values initialized to -1U. 
-void PathProfiler::preparePHI(BLInstrumentationNode* node) {
-  BasicBlock* block = node->getBlock();
-  BasicBlock::iterator insertPoint = block->getFirstInsertionPt();
-  pred_iterator PB = pred_begin(node->getBlock()),
-    PE = pred_end(node->getBlock());
-  PHINode* phi = PHINode::Create(Type::getInt32Ty(*Context),
-                                 std::distance(PB, PE), "pathNumber",
-                                 insertPoint);
-  node->setPathPHI(phi);
-  node->setStartingPathNumber(phi);
-  node->setEndingPathNumber(phi);
-
-  for(pred_iterator predIt = PB; predIt != PE; predIt++) {
-    BasicBlock* pred = (*predIt);
-
-    if(pred != NULL)
-      phi->addIncoming(createIncrementConstant((long)-1, 32), pred);
-  }
-}
-
-// Inserts source's pathNumber Value* into target. Target may or may not
-// have multiple predecessors, and may or may not have its phiNode
-// initialized.
-void PathProfiler::pushValueIntoNode(BLInstrumentationNode* source,
-                                     BLInstrumentationNode* target) {
-  if(target->getBlock() == NULL)
-    return;
-
-  if(target->getNumberPredEdges() <= 1) {
-    assert(target->getStartingPathNumber() == NULL &&
-           "Target already has path number");
-    target->setStartingPathNumber(source->getEndingPathNumber());
-    target->setEndingPathNumber(source->getEndingPathNumber());
-    DEBUG(dbgs() << "  Passing path number"
-          << (source->getEndingPathNumber() ? "" : " (null)")
-          << " value through.\n");
-  } else {
-    if(target->getPathPHI() == NULL) {
-      DEBUG(dbgs() << "  Initializing PHI node for block '"
-            << target->getName() << "'\n");
-      preparePHI(target);
-    }
-    pushValueIntoPHI(target, source);
-    DEBUG(dbgs() << "  Passing number value into PHI for block '"
-          << target->getName() << "'\n");
-  }
-}
-
-// Inserts source's pathNumber Value* into the appropriate slot of
-// target's phiNode.
-void PathProfiler::pushValueIntoPHI(BLInstrumentationNode* target,
-                                    BLInstrumentationNode* source) {
-  PHINode* phi = target->getPathPHI();
-  assert(phi != NULL && " Tried to push value into node with PHI, but node"
-         " actually had no PHI.");
-  phi->removeIncomingValue(source->getBlock(), false);
-  phi->addIncoming(source->getEndingPathNumber(), source->getBlock());
-}
-
-// The Value* in node, oldVal, is updated with a Value* corresponding to
-// oldVal + addition.
-void PathProfiler::insertNumberIncrement(BLInstrumentationNode* node,
-                                         Value* addition, bool atBeginning) {
-  BasicBlock* block = node->getBlock();
-  assert(node->getStartingPathNumber() != NULL);
-  assert(node->getEndingPathNumber() != NULL);
-
-  BasicBlock::iterator insertPoint;
-
-  if( atBeginning )
-    insertPoint = block->getFirstInsertionPt();
-  else
-    insertPoint = block->getTerminator();
-
-  DEBUG(errs() << "  Creating addition instruction.\n");
-  Value* newpn = BinaryOperator::Create(Instruction::Add,
-                                        node->getStartingPathNumber(),
-                                        addition, "pathNumber", insertPoint);
-
-  node->setEndingPathNumber(newpn);
-
-  if( atBeginning )
-    node->setStartingPathNumber(newpn);
-}
-
-// Creates a counter increment in the given node. The Value* in node is
-// taken as the index into an array or hash table. The hash table access
-// is a call to the runtime.
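For the array case, the IR assembled below computes a saturating bump: the ICmp/Select pair (the ICmp is named "isMax" in the code, though it is true while the counter is still below the maximum) pins a counter at 0xffffffff once it gets there. A C++ sketch of the emitted semantics, with an invented array name:

#include <cstdint>

static uint32_t Counters[100000]; // stands in for the pass's counter array

void bumpSaturating(uint32_t pathNumber, bool increment) {
  uint32_t oldPc = Counters[pathNumber];   // the load
  bool notSaturated = oldPc < 0xffffffffu; // the ICmp (named "isMax")
  uint32_t inc = notSaturated
                     ? (increment ? 1u : 0xffffffffu) // +1, or -1 mod 2^32
                     : 0u;                            // the Select
  Counters[pathNumber] = oldPc + inc;      // the add and store back
}

The decrement flavor is used on call phony edges, where insertInstrumentation pairs an increment before the call with a decrement at the block's terminator.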
-void PathProfiler::insertCounterIncrement(Value* incValue, - BasicBlock::iterator insertPoint, - BLInstrumentationDag* dag, - bool increment) { - // Counter increment for array - if( dag->getNumberOfPaths() <= HASH_THRESHHOLD ) { - // Get pointer to the array location - std::vector<Value*> gepIndices(2); - gepIndices[0] = Constant::getNullValue(Type::getInt32Ty(*Context)); - gepIndices[1] = incValue; - - GetElementPtrInst* pcPointer = - GetElementPtrInst::Create(dag->getCounterArray(), gepIndices, - "counterInc", insertPoint); - - // Load from the array - call it oldPC - LoadInst* oldPc = new LoadInst(pcPointer, "oldPC", insertPoint); - - // Test to see whether adding 1 will overflow the counter - ICmpInst* isMax = new ICmpInst(insertPoint, CmpInst::ICMP_ULT, oldPc, - createIncrementConstant(0xffffffff, 32), - "isMax"); - - // Select increment for the path counter based on overflow - SelectInst* inc = - SelectInst::Create( isMax, createIncrementConstant(increment?1:-1,32), - createIncrementConstant(0,32), - "pathInc", insertPoint); - - // newPc = oldPc + inc - BinaryOperator* newPc = BinaryOperator::Create(Instruction::Add, - oldPc, inc, "newPC", - insertPoint); - - // Store back in to the array - new StoreInst(newPc, pcPointer, insertPoint); - } else { // Counter increment for hash - std::vector<Value*> args(2); - args[0] = ConstantInt::get(Type::getInt32Ty(*Context), - currentFunctionNumber); - args[1] = incValue; - - CallInst::Create( - increment ? llvmIncrementHashFunction : llvmDecrementHashFunction, - args, "", insertPoint); - } -} - -// Inserts instrumentation for the given edge -// -// Pre: The edge's source node has pathNumber set if edge is non zero -// path number increment. -// -// Post: Edge's target node has a pathNumber set to the path number Value -// corresponding to the value of the path register after edge's -// execution. -// -// FIXME: This should be reworked so it's not recursive. -void PathProfiler::insertInstrumentationStartingAt(BLInstrumentationEdge* edge, - BLInstrumentationDag* dag) { - // Mark the edge as instrumented - edge->setHasInstrumentation(true); - DEBUG(dbgs() << "\nInstrumenting edge: " << (*edge) << "\n"); - - // create a new node for this edge's instrumentation - splitCritical(edge, dag); - - BLInstrumentationNode* sourceNode = (BLInstrumentationNode*)edge->getSource(); - BLInstrumentationNode* targetNode = (BLInstrumentationNode*)edge->getTarget(); - BLInstrumentationNode* instrumentNode; - BLInstrumentationNode* nextSourceNode; - - bool atBeginning = false; - - // Source node has only 1 successor so any information can be simply - // inserted in to it without splitting - if( sourceNode->getBlock() && sourceNode->getNumberSuccEdges() <= 1) { - DEBUG(dbgs() << " Potential instructions to be placed in: " - << sourceNode->getName() << " (at end)\n"); - instrumentNode = sourceNode; - nextSourceNode = targetNode; // ... since we never made any new nodes - } - - // The target node only has one predecessor, so we can safely insert edge - // instrumentation into it. If there was splitting, it must have been - // successful. - else if( targetNode->getNumberPredEdges() == 1 ) { - DEBUG(dbgs() << " Potential instructions to be placed in: " - << targetNode->getName() << " (at beginning)\n"); - pushValueIntoNode(sourceNode, targetNode); - instrumentNode = targetNode; - nextSourceNode = NULL; // ... otherwise we'll just keep splitting - atBeginning = true; - } - - // Somehow, splitting must have failed. 
- else { - errs() << "Instrumenting could not split a critical edge.\n"; - DEBUG(dbgs() << " Couldn't split edge " << (*edge) << ".\n"); - return; - } - - // Insert instrumentation if this is a back or split edge - if( edge->getType() == BallLarusEdge::BACKEDGE || - edge->getType() == BallLarusEdge::SPLITEDGE ) { - BLInstrumentationEdge* top = - (BLInstrumentationEdge*) edge->getPhonyRoot(); - BLInstrumentationEdge* bottom = - (BLInstrumentationEdge*) edge->getPhonyExit(); - - assert( top->isInitialization() && " Top phony edge did not" - " contain a path number initialization."); - assert( bottom->isCounterIncrement() && " Bottom phony edge" - " did not contain a path counter increment."); - - // split edge has yet to be initialized - if( !instrumentNode->getEndingPathNumber() ) { - instrumentNode->setStartingPathNumber(createIncrementConstant(0,32)); - instrumentNode->setEndingPathNumber(createIncrementConstant(0,32)); - } - - BasicBlock::iterator insertPoint = atBeginning ? - instrumentNode->getBlock()->getFirstInsertionPt() : - instrumentNode->getBlock()->getTerminator(); - - // add information from the bottom edge, if it exists - if( bottom->getIncrement() ) { - Value* newpn = - BinaryOperator::Create(Instruction::Add, - instrumentNode->getStartingPathNumber(), - createIncrementConstant(bottom), - "pathNumber", insertPoint); - instrumentNode->setEndingPathNumber(newpn); - } - - insertCounterIncrement(instrumentNode->getEndingPathNumber(), - insertPoint, dag); - - if( atBeginning ) - instrumentNode->setStartingPathNumber(createIncrementConstant(top)); - - instrumentNode->setEndingPathNumber(createIncrementConstant(top)); - - // Check for path counter increments - if( top->isCounterIncrement() ) { - insertCounterIncrement(instrumentNode->getEndingPathNumber(), - instrumentNode->getBlock()->getTerminator(),dag); - instrumentNode->setEndingPathNumber(0); - } - } - - // Insert instrumentation if this is a normal edge - else { - BasicBlock::iterator insertPoint = atBeginning ? - instrumentNode->getBlock()->getFirstInsertionPt() : - instrumentNode->getBlock()->getTerminator(); - - if( edge->isInitialization() ) { // initialize path number - instrumentNode->setEndingPathNumber(createIncrementConstant(edge)); - } else if( edge->getIncrement() ) {// increment path number - Value* newpn = - BinaryOperator::Create(Instruction::Add, - instrumentNode->getStartingPathNumber(), - createIncrementConstant(edge), - "pathNumber", insertPoint); - instrumentNode->setEndingPathNumber(newpn); - - if( atBeginning ) - instrumentNode->setStartingPathNumber(newpn); - } - - // Check for path counter increments - if( edge->isCounterIncrement() ) { - insertCounterIncrement(instrumentNode->getEndingPathNumber(), - insertPoint, dag); - instrumentNode->setEndingPathNumber(0); - } - } - - // Push it along - if (nextSourceNode && instrumentNode->getEndingPathNumber()) - pushValueIntoNode(instrumentNode, nextSourceNode); - - // Add all the successors - for( BLEdgeIterator next = targetNode->succBegin(), - end = targetNode->succEnd(); next != end; next++ ) { - // So long as it is un-instrumented, add it to the list - if( !((BLInstrumentationEdge*)(*next))->hasInstrumentation() ) - insertInstrumentationStartingAt((BLInstrumentationEdge*)*next,dag); - else - DEBUG(dbgs() << " Edge " << *(BLInstrumentationEdge*)(*next) - << " already instrumented.\n"); - } -} - -// Inserts instrumentation according to the marked edges in dag. Phony edges -// must be unlinked from the DAG, but accessible from the backedges. 
Dag -// must have initializations, path number increments, and counter increments -// present. -// -// Counter storage is created here. -void PathProfiler::insertInstrumentation( - BLInstrumentationDag& dag, Module &M) { - - BLInstrumentationEdge* exitRootEdge = - (BLInstrumentationEdge*) dag.getExitRootEdge(); - insertInstrumentationStartingAt(exitRootEdge, &dag); - - // Iterate through each call edge and apply the appropriate hash increment - // and decrement functions - BLEdgeVector callEdges = dag.getCallPhonyEdges(); - for( BLEdgeIterator edge = callEdges.begin(), - end = callEdges.end(); edge != end; edge++ ) { - BLInstrumentationNode* node = - (BLInstrumentationNode*)(*edge)->getSource(); - BasicBlock::iterator insertPoint = node->getBlock()->getFirstInsertionPt(); - - // Find the first function call - while( ((Instruction&)(*insertPoint)).getOpcode() != Instruction::Call ) - insertPoint++; - - DEBUG(dbgs() << "\nInstrumenting method call block '" - << node->getBlock()->getName() << "'\n"); - DEBUG(dbgs() << " Path number initialized: " - << ((node->getStartingPathNumber()) ? "yes" : "no") << "\n"); - - Value* newpn; - if( node->getStartingPathNumber() ) { - long inc = ((BLInstrumentationEdge*)(*edge))->getIncrement(); - if ( inc ) - newpn = BinaryOperator::Create(Instruction::Add, - node->getStartingPathNumber(), - createIncrementConstant(inc,32), - "pathNumber", insertPoint); - else - newpn = node->getStartingPathNumber(); - } else { - newpn = (Value*)createIncrementConstant( - ((BLInstrumentationEdge*)(*edge))->getIncrement(), 32); - } - - insertCounterIncrement(newpn, insertPoint, &dag); - insertCounterIncrement(newpn, node->getBlock()->getTerminator(), - &dag, false); - } -} - -// Entry point of the module -void PathProfiler::runOnFunction(std::vector<Constant*> &ftInit, - Function &F, Module &M) { - // Build DAG from CFG - BLInstrumentationDag dag = BLInstrumentationDag(F); - dag.init(); - - // give each path a unique integer value - dag.calculatePathNumbers(); - - // modify path increments to increase the efficiency - // of instrumentation - dag.calculateSpanningTree(); - dag.calculateChordIncrements(); - dag.pushInitialization(); - dag.pushCounters(); - dag.unlinkPhony(); - - // potentially generate .dot graph for the dag - if (DotPathDag) - dag.generateDotGraph (); - - // Should we store the information in an array or hash - if( dag.getNumberOfPaths() <= HASH_THRESHHOLD ) { - Type* t = ArrayType::get(Type::getInt32Ty(*Context), - dag.getNumberOfPaths()); - - dag.setCounterArray(new GlobalVariable(M, t, false, - GlobalValue::InternalLinkage, - Constant::getNullValue(t), "")); - } - - insertInstrumentation(dag, M); - - // Add to global function reference table - unsigned type; - Type* voidPtr = TypeBuilder<types::i<8>*, true>::get(*Context); - - if( dag.getNumberOfPaths() <= HASH_THRESHHOLD ) - type = ProfilingArray; - else - type = ProfilingHash; - - std::vector<Constant*> entryArray(3); - entryArray[0] = createIncrementConstant(type,32); - entryArray[1] = createIncrementConstant(dag.getNumberOfPaths(),32); - entryArray[2] = dag.getCounterArray() ? 
- ConstantExpr::getBitCast(dag.getCounterArray(), voidPtr) : - Constant::getNullValue(voidPtr); - - StructType* at = ftEntryTypeBuilder::get(*Context); - ConstantStruct* functionEntry = - (ConstantStruct*)ConstantStruct::get(at, entryArray); - ftInit.push_back(functionEntry); -} - -// Output the bitcode if we want to observe instrumentation changess -#define PRINT_MODULE dbgs() << \ - "\n\n============= MODULE BEGIN ===============\n" << M << \ - "\n============== MODULE END ================\n" - -bool PathProfiler::runOnModule(Module &M) { - Context = &M.getContext(); - - DEBUG(dbgs() - << "****************************************\n" - << "****************************************\n" - << "** **\n" - << "** PATH PROFILING INSTRUMENTATION **\n" - << "** **\n" - << "****************************************\n" - << "****************************************\n"); - - // No main, no instrumentation! - Function *Main = M.getFunction("main"); - - // Using fortran? ... this kind of works - if (!Main) - Main = M.getFunction("MAIN__"); - - if (!Main) { - errs() << "WARNING: cannot insert path profiling into a module" - << " with no main function!\n"; - return false; - } - - llvmIncrementHashFunction = M.getOrInsertFunction( - "llvm_increment_path_count", - Type::getVoidTy(*Context), // return type - Type::getInt32Ty(*Context), // function number - Type::getInt32Ty(*Context), // path number - NULL ); - - llvmDecrementHashFunction = M.getOrInsertFunction( - "llvm_decrement_path_count", - Type::getVoidTy(*Context), // return type - Type::getInt32Ty(*Context), // function number - Type::getInt32Ty(*Context), // path number - NULL ); - - std::vector<Constant*> ftInit; - unsigned functionNumber = 0; - for (Module::iterator F = M.begin(), E = M.end(); F != E; F++) { - if (F->isDeclaration()) - continue; - - DEBUG(dbgs() << "Function: " << F->getName() << "\n"); - functionNumber++; - - // set function number - currentFunctionNumber = functionNumber; - runOnFunction(ftInit, *F, M); - } - - Type *t = ftEntryTypeBuilder::get(*Context); - ArrayType* ftArrayType = ArrayType::get(t, ftInit.size()); - Constant* ftInitConstant = ConstantArray::get(ftArrayType, ftInit); - - DEBUG(dbgs() << " ftArrayType:" << *ftArrayType << "\n"); - - GlobalVariable* functionTable = - new GlobalVariable(M, ftArrayType, false, GlobalValue::InternalLinkage, - ftInitConstant, "functionPathTable"); - Type *eltType = ftArrayType->getTypeAtIndex((unsigned)0); - InsertProfilingInitCall(Main, "llvm_start_path_profiling", functionTable, - PointerType::getUnqual(eltType)); - - DEBUG(PRINT_MODULE); - - return true; -} - -// If this edge is a critical edge, then inserts a node at this edge. -// This edge becomes the first edge, and a new BallLarusEdge is created. 
-// Returns true if the edge was split -bool PathProfiler::splitCritical(BLInstrumentationEdge* edge, - BLInstrumentationDag* dag) { - unsigned succNum = edge->getSuccessorNumber(); - BallLarusNode* sourceNode = edge->getSource(); - BallLarusNode* targetNode = edge->getTarget(); - BasicBlock* sourceBlock = sourceNode->getBlock(); - BasicBlock* targetBlock = targetNode->getBlock(); - - if(sourceBlock == NULL || targetBlock == NULL - || sourceNode->getNumberSuccEdges() <= 1 - || targetNode->getNumberPredEdges() == 1 ) { - return(false); - } - - TerminatorInst* terminator = sourceBlock->getTerminator(); - - if( SplitCriticalEdge(terminator, succNum, this, false)) { - BasicBlock* newBlock = terminator->getSuccessor(succNum); - dag->splitUpdate(edge, newBlock); - return(true); - } else - return(false); -} diff --git a/lib/Transforms/Instrumentation/ProfilingUtils.cpp b/lib/Transforms/Instrumentation/ProfilingUtils.cpp deleted file mode 100644 index 4b3de6d..0000000 --- a/lib/Transforms/Instrumentation/ProfilingUtils.cpp +++ /dev/null @@ -1,169 +0,0 @@ -//===- ProfilingUtils.cpp - Helper functions shared by profilers ----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements a few helper functions which are used by profile -// instrumentation code to instrument the code. This allows the profiler pass -// to worry about *what* to insert, and these functions take care of *how* to do -// it. -// -//===----------------------------------------------------------------------===// - -#include "ProfilingUtils.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" - -void llvm::InsertProfilingInitCall(Function *MainFn, const char *FnName, - GlobalValue *Array, - PointerType *arrayType) { - LLVMContext &Context = MainFn->getContext(); - Type *ArgVTy = - PointerType::getUnqual(Type::getInt8PtrTy(Context)); - PointerType *UIntPtr = arrayType ? arrayType : - Type::getInt32PtrTy(Context); - Module &M = *MainFn->getParent(); - Constant *InitFn = M.getOrInsertFunction(FnName, Type::getInt32Ty(Context), - Type::getInt32Ty(Context), - ArgVTy, UIntPtr, - Type::getInt32Ty(Context), - (Type *)0); - - // This could force argc and argv into programs that wouldn't otherwise have - // them, but instead we just pass null values in. - std::vector<Value*> Args(4); - Args[0] = Constant::getNullValue(Type::getInt32Ty(Context)); - Args[1] = Constant::getNullValue(ArgVTy); - - // Skip over any allocas in the entry block. - BasicBlock *Entry = MainFn->begin(); - BasicBlock::iterator InsertPos = Entry->begin(); - while (isa<AllocaInst>(InsertPos)) ++InsertPos; - - std::vector<Constant*> GEPIndices(2, - Constant::getNullValue(Type::getInt32Ty(Context))); - unsigned NumElements = 0; - if (Array) { - Args[2] = ConstantExpr::getGetElementPtr(Array, GEPIndices); - NumElements = - cast<ArrayType>(Array->getType()->getElementType())->getNumElements(); - } else { - // If this profiling instrumentation doesn't have a constant array, just - // pass null. 
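For orientation while reading the argument setup here: in C terms, the runtime entry point being called has roughly this shape for the edge profilers (a sketch inferred from the getOrInsertFunction call above; parameter names are invented, and the path profiler passes a function-table pointer in place of the plain counter array):

// Returns the (possibly synthesized) argc value; see the argc rewrite below.
int llvm_start_opt_edge_profiling(int argc, char **argv,
                                  unsigned *counters, unsigned numElements);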
- Args[2] = ConstantPointerNull::get(UIntPtr); - } - Args[3] = ConstantInt::get(Type::getInt32Ty(Context), NumElements); - - CallInst *InitCall = CallInst::Create(InitFn, Args, "newargc", InsertPos); - - // If argc or argv are not available in main, just pass null values in. - Function::arg_iterator AI; - switch (MainFn->arg_size()) { - default: - case 2: - AI = MainFn->arg_begin(); ++AI; - if (AI->getType() != ArgVTy) { - Instruction::CastOps opcode = CastInst::getCastOpcode(AI, false, ArgVTy, - false); - InitCall->setArgOperand(1, - CastInst::Create(opcode, AI, ArgVTy, "argv.cast", InitCall)); - } else { - InitCall->setArgOperand(1, AI); - } - /* FALL THROUGH */ - - case 1: - AI = MainFn->arg_begin(); - // If the program looked at argc, have it look at the return value of the - // init call instead. - if (!AI->getType()->isIntegerTy(32)) { - Instruction::CastOps opcode; - if (!AI->use_empty()) { - opcode = CastInst::getCastOpcode(InitCall, true, AI->getType(), true); - AI->replaceAllUsesWith( - CastInst::Create(opcode, InitCall, AI->getType(), "", InsertPos)); - } - opcode = CastInst::getCastOpcode(AI, true, - Type::getInt32Ty(Context), true); - InitCall->setArgOperand(0, - CastInst::Create(opcode, AI, Type::getInt32Ty(Context), - "argc.cast", InitCall)); - } else { - AI->replaceAllUsesWith(InitCall); - InitCall->setArgOperand(0, AI); - } - - case 0: break; - } -} - -void llvm::IncrementCounterInBlock(BasicBlock *BB, unsigned CounterNum, - GlobalValue *CounterArray, bool beginning) { - // Insert the increment after any alloca or PHI instructions... - BasicBlock::iterator InsertPos = beginning ? BB->getFirstInsertionPt() : - BB->getTerminator(); - while (isa<AllocaInst>(InsertPos)) - ++InsertPos; - - LLVMContext &Context = BB->getContext(); - - // Create the getelementptr constant expression - std::vector<Constant*> Indices(2); - Indices[0] = Constant::getNullValue(Type::getInt32Ty(Context)); - Indices[1] = ConstantInt::get(Type::getInt32Ty(Context), CounterNum); - Constant *ElementPtr = - ConstantExpr::getGetElementPtr(CounterArray, Indices); - - // Load, increment and store the value back. - Value *OldVal = new LoadInst(ElementPtr, "OldFuncCounter", InsertPos); - Value *NewVal = BinaryOperator::Create(Instruction::Add, OldVal, - ConstantInt::get(Type::getInt32Ty(Context), 1), - "NewFuncCounter", InsertPos); - new StoreInst(NewVal, ElementPtr, InsertPos); -} - -void llvm::InsertProfilingShutdownCall(Function *Callee, Module *Mod) { - // llvm.global_dtors is an array of type { i32, void ()* }. Prepare those - // types. - Type *GlobalDtorElems[2] = { - Type::getInt32Ty(Mod->getContext()), - FunctionType::get(Type::getVoidTy(Mod->getContext()), false)->getPointerTo() - }; - StructType *GlobalDtorElemTy = - StructType::get(Mod->getContext(), GlobalDtorElems, false); - - // Construct the new element we'll be adding. - Constant *Elem[2] = { - ConstantInt::get(Type::getInt32Ty(Mod->getContext()), 65535), - ConstantExpr::getBitCast(Callee, GlobalDtorElems[1]) - }; - - // If llvm.global_dtors exists, make a copy of the things in its list and - // delete it, to replace it with one that has a larger array type. 
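What llvm.global_dtors models, in C++ terms, is an appending array of (priority, function) entries that the runtime walks at shutdown; since the array type bakes in its length, adding an entry means rebuilding the global one element larger, morally like this sketch (names invented):

struct DtorEntry { int Priority; void (*Fn)(); };

static void profilingShutdown() {} // hypothetical callee being registered

static DtorEntry GlobalDtors[] = {
  // ...entries copied from the old initializer would be re-emitted here...
  { 65535, &profilingShutdown },   // the newly appended element
};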
- std::vector<Constant *> dtors; - if (GlobalVariable *GlobalDtors = Mod->getNamedGlobal("llvm.global_dtors")) { - if (ConstantArray *InitList = - dyn_cast<ConstantArray>(GlobalDtors->getInitializer())) { - for (unsigned i = 0, e = InitList->getType()->getNumElements(); - i != e; ++i) - dtors.push_back(cast<Constant>(InitList->getOperand(i))); - } - GlobalDtors->eraseFromParent(); - } - - // Build up llvm.global_dtors with our new item in it. - GlobalVariable *GlobalDtors = new GlobalVariable( - *Mod, ArrayType::get(GlobalDtorElemTy, 1), false, - GlobalValue::AppendingLinkage, NULL, "llvm.global_dtors"); - - dtors.push_back(ConstantStruct::get(GlobalDtorElemTy, Elem)); - GlobalDtors->setInitializer(ConstantArray::get( - cast<ArrayType>(GlobalDtors->getType()->getElementType()), dtors)); -} diff --git a/lib/Transforms/Instrumentation/ProfilingUtils.h b/lib/Transforms/Instrumentation/ProfilingUtils.h deleted file mode 100644 index 09b2217..0000000 --- a/lib/Transforms/Instrumentation/ProfilingUtils.h +++ /dev/null @@ -1,36 +0,0 @@ -//===- ProfilingUtils.h - Helper functions shared by profilers --*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines a few helper functions which are used by profile -// instrumentation code to instrument the code. This allows the profiler pass -// to worry about *what* to insert, and these functions take care of *how* to do -// it. -// -//===----------------------------------------------------------------------===// - -#ifndef PROFILINGUTILS_H -#define PROFILINGUTILS_H - -namespace llvm { - class BasicBlock; - class Function; - class GlobalValue; - class Module; - class PointerType; - - void InsertProfilingInitCall(Function *MainFn, const char *FnName, - GlobalValue *Arr = 0, - PointerType *arrayType = 0); - void IncrementCounterInBlock(BasicBlock *BB, unsigned CounterNum, - GlobalValue *CounterArray, - bool beginning = true); - void InsertProfilingShutdownCall(Function *Callee, Module *Mod); -} - -#endif diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index cc971a3..89fb746 100644 --- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -227,7 +227,7 @@ bool ThreadSanitizer::doInitialization(Module &M) { TD = getAnalysisIfAvailable<DataLayout>(); if (!TD) return false; - BL.reset(new SpecialCaseList(BlacklistFile)); + BL.reset(SpecialCaseList::createOrDie(BlacklistFile)); // Always insert a call to __tsan_init into the module's CTORs. IRBuilder<> IRB(M.getContext()); @@ -240,12 +240,8 @@ bool ThreadSanitizer::doInitialization(Module &M) { } static bool isVtableAccess(Instruction *I) { - if (MDNode *Tag = I->getMetadata(LLVMContext::MD_tbaa)) { - if (Tag->getNumOperands() < 1) return false; - if (MDString *Tag1 = dyn_cast<MDString>(Tag->getOperand(0))) { - if (Tag1->getString() == "vtable pointer") return true; - } - } + if (MDNode *Tag = I->getMetadata(LLVMContext::MD_tbaa)) + return Tag->isTBAAVtableAccess(); return false; } @@ -362,7 +358,7 @@ bool ThreadSanitizer::runOnFunction(Function &F) { // (e.g. variables that do not escape, etc). // Instrument memory accesses. 
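  // Illustrative aside (not part of this commit): the gate added below keys
  // on the function attribute that clang attaches for -fsanitize=thread; in
  // textual IR the attribute is spelled sanitize_thread, e.g.:
  //
  //   define void @f() sanitize_thread { ... }   ; instrumented
  //   define void @g() { ... }                   ; left alone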
- if (ClInstrumentMemoryAccesses) + if (ClInstrumentMemoryAccesses && F.hasFnAttribute(Attribute::SanitizeThread)) for (size_t i = 0, n = AllLoadsAndStores.size(); i < n; ++i) { Res |= instrumentLoadOrStore(AllLoadsAndStores[i]); } diff --git a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp index 6f94a7c..2976df6 100644 --- a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp +++ b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp @@ -176,91 +176,6 @@ static const Value *FindSingleUseIdentifiedObject(const Value *Arg) { return 0; } -/// \brief Test whether the given retainable object pointer escapes. -/// -/// This differs from regular escape analysis in that a use as an -/// argument to a call is not considered an escape. -/// -static bool DoesRetainableObjPtrEscape(const User *Ptr) { - DEBUG(dbgs() << "DoesRetainableObjPtrEscape: Target: " << *Ptr << "\n"); - - // Walk the def-use chains. - SmallVector<const Value *, 4> Worklist; - Worklist.push_back(Ptr); - // If Ptr has any operands add them as well. - for (User::const_op_iterator I = Ptr->op_begin(), E = Ptr->op_end(); I != E; - ++I) { - Worklist.push_back(*I); - } - - // Ensure we do not visit any value twice. - SmallPtrSet<const Value *, 8> VisitedSet; - - do { - const Value *V = Worklist.pop_back_val(); - - DEBUG(dbgs() << "Visiting: " << *V << "\n"); - - for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end(); - UI != UE; ++UI) { - const User *UUser = *UI; - - DEBUG(dbgs() << "User: " << *UUser << "\n"); - - // Special - Use by a call (callee or argument) is not considered - // to be an escape. - switch (GetBasicInstructionClass(UUser)) { - case IC_StoreWeak: - case IC_InitWeak: - case IC_StoreStrong: - case IC_Autorelease: - case IC_AutoreleaseRV: { - DEBUG(dbgs() << "User copies pointer arguments. Pointer Escapes!\n"); - // These special functions make copies of their pointer arguments. - return true; - } - case IC_IntrinsicUser: - // Use by the use intrinsic is not an escape. - continue; - case IC_User: - case IC_None: - // Use by an instruction which copies the value is an escape if the - // result is an escape. - if (isa<BitCastInst>(UUser) || isa<GetElementPtrInst>(UUser) || - isa<PHINode>(UUser) || isa<SelectInst>(UUser)) { - - if (VisitedSet.insert(UUser)) { - DEBUG(dbgs() << "User copies value. Ptr escapes if result escapes." - " Adding to list.\n"); - Worklist.push_back(UUser); - } else { - DEBUG(dbgs() << "Already visited node.\n"); - } - continue; - } - // Use by a load is not an escape. - if (isa<LoadInst>(UUser)) - continue; - // Use by a store is not an escape if the use is the address. - if (const StoreInst *SI = dyn_cast<StoreInst>(UUser)) - if (V != SI->getValueOperand()) - continue; - break; - default: - // Regular calls and other stuff are not considered escapes. - continue; - } - // Otherwise, conservatively assume an escape. - DEBUG(dbgs() << "Assuming ptr escapes.\n"); - return true; - } - } while (!Worklist.empty()); - - // No escapes found. - DEBUG(dbgs() << "Ptr does not escape.\n"); - return false; -} - /// This is a wrapper around getUnderlyingObjCPtr along the lines of /// GetUnderlyingObjects except that it returns early when it sees the first /// alloca. @@ -517,7 +432,7 @@ namespace { bool Partial; /// The current position in the sequence. - Sequence Seq : 8; + unsigned char Seq : 8; /// Unidirectional information about the current sequence. 
 RRInfo RRI;
@@ -583,7 +498,7 @@
     }
 
     Sequence GetSeq() const {
-      return Seq;
+      return static_cast<Sequence>(Seq);
     }
 
     void ClearSequenceProgress() {
@@ -623,7 +538,8 @@
 
   void
   PtrState::Merge(const PtrState &Other, bool TopDown) {
-    Seq = MergeSeqs(Seq, Other.Seq, TopDown);
+    Seq = MergeSeqs(static_cast<Sequence>(Seq), static_cast<Sequence>(Other.Seq),
+                    TopDown);
     KnownPositiveRefCount &= Other.KnownPositiveRefCount;
 
     // If we're not in a sequence (anymore), drop all associated state.
@@ -674,7 +590,9 @@
     SmallVector<BasicBlock *, 2> Succs;
 
   public:
-    BBState() : TopDownPathCount(0), BottomUpPathCount(0) {}
+    static const unsigned OverflowOccurredValue;
+
+    BBState() : TopDownPathCount(0), BottomUpPathCount(0) { }
 
     typedef MapTy::iterator ptr_iterator;
     typedef MapTy::const_iterator ptr_const_iterator;
@@ -745,27 +663,31 @@
     /// Returns true if overflow occured. Returns false if overflow did not
     /// occur.
     bool GetAllPathCountWithOverflow(unsigned &PathCount) const {
-      assert(TopDownPathCount != 0);
-      assert(BottomUpPathCount != 0);
+      if (TopDownPathCount == OverflowOccurredValue ||
+          BottomUpPathCount == OverflowOccurredValue)
+        return true;
       unsigned long long Product =
         (unsigned long long)TopDownPathCount*BottomUpPathCount;
-      PathCount = Product;
-      // Overflow occured if any of the upper bits of Product are set.
-      return Product >> 32;
+      // Overflow occurred if any of the upper bits of Product are set, or if
+      // all of the lower bits of Product are set.
+      return (Product >> 32) ||
+             ((PathCount = Product) == OverflowOccurredValue);
     }
 
     // Specialized CFG utilities.
     typedef SmallVectorImpl<BasicBlock *>::const_iterator edge_iterator;
-    edge_iterator pred_begin() { return Preds.begin(); }
-    edge_iterator pred_end() { return Preds.end(); }
-    edge_iterator succ_begin() { return Succs.begin(); }
-    edge_iterator succ_end() { return Succs.end(); }
+    edge_iterator pred_begin() const { return Preds.begin(); }
+    edge_iterator pred_end() const { return Preds.end(); }
+    edge_iterator succ_begin() const { return Succs.begin(); }
+    edge_iterator succ_end() const { return Succs.end(); }
 
     void addSucc(BasicBlock *Succ) { Succs.push_back(Succ); }
     void addPred(BasicBlock *Pred) { Preds.push_back(Pred); }
 
     bool isExit() const { return Succs.empty(); }
   };
+
+  const unsigned BBState::OverflowOccurredValue = 0xffffffff;
 }
 
 void BBState::InitFromPred(const BBState &Other) {
@@ -781,13 +703,25 @@ void BBState::InitFromSucc(const BBState &Other) {
 /// The top-down traversal uses this to merge information about predecessors to
 /// form the initial state for a new block.
 void BBState::MergePred(const BBState &Other) {
+  if (TopDownPathCount == OverflowOccurredValue)
+    return;
+
   // Other.TopDownPathCount can be 0, in which case it is either dead or a
   // loop backedge. Loop backedges are special.
   TopDownPathCount += Other.TopDownPathCount;
 
+  // To stay consistent, we clear the top-down pointers when the addition
+  // makes TopDownPathCount equal to OverflowOccurredValue, even though "true"
+  // overflow has not occurred.
+  if (TopDownPathCount == OverflowOccurredValue) {
+    clearTopDownPointers();
+    return;
+  }
+
   // Check for overflow. If we have overflow, fall back to conservative
   // behavior.
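  // Illustrative aside (hypothetical helper, not part of this commit): the
  // saturating-add idiom used by MergePred/MergeSucc, in one place. Unsigned
  // addition wraps modulo 2^32, so comparing the sum against an addend
  // detects the wrap after the fact:
  //
  //   static unsigned SaturatingAdd(unsigned A, unsigned B) {
  //     unsigned Sum = A + B;  // may wrap
  //     if (Sum < B)           // wrapped: saturate to the sentinel
  //       return 0xffffffffu;  // == BBState::OverflowOccurredValue
  //     return Sum;
  //   }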
   if (TopDownPathCount < Other.TopDownPathCount) {
+    TopDownPathCount = OverflowOccurredValue;
     clearTopDownPointers();
     return;
   }
@@ -813,13 +747,25 @@ void BBState::MergePred(const BBState &Other) {
 /// The bottom-up traversal uses this to merge information about successors to
 /// form the initial state for a new block.
 void BBState::MergeSucc(const BBState &Other) {
+  if (BottomUpPathCount == OverflowOccurredValue)
+    return;
+
   // Other.BottomUpPathCount can be 0, in which case it is either dead or a
   // loop backedge. Loop backedges are special.
   BottomUpPathCount += Other.BottomUpPathCount;
 
+  // To stay consistent, we clear the bottom-up pointers when the addition
+  // makes BottomUpPathCount equal to OverflowOccurredValue, even though "true"
+  // overflow has not occurred.
+  if (BottomUpPathCount == OverflowOccurredValue) {
+    clearBottomUpPointers();
+    return;
+  }
+
   // Check for overflow. If we have overflow, fall back to conservative
   // behavior.
   if (BottomUpPathCount < Other.BottomUpPathCount) {
+    BottomUpPathCount = OverflowOccurredValue;
     clearBottomUpPointers();
     return;
   }
@@ -1158,13 +1104,9 @@ namespace {
     unsigned ARCAnnotationProvenanceSourceMDKind;
 #endif // ARC_ANNOATIONS
 
-    bool IsRetainBlockOptimizable(const Instruction *Inst);
-
     bool OptimizeRetainRVCall(Function &F, Instruction *RetainRV);
     void OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV,
                                    InstructionClass &Class);
-    bool OptimizeRetainBlockCall(Function &F, Instruction *RetainBlock,
-                                 InstructionClass &Class);
 
     void OptimizeIndividualCalls(Function &F);
 
     void CheckForCFGHazards(const BasicBlock *BB,
@@ -1253,22 +1195,6 @@ void ObjCARCOpt::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesCFG();
 }
 
-bool ObjCARCOpt::IsRetainBlockOptimizable(const Instruction *Inst) {
-  // Without the magic metadata tag, we have to assume this might be an
-  // objc_retainBlock call inserted to convert a block pointer to an id,
-  // in which case it really is needed.
-  if (!Inst->getMetadata(CopyOnEscapeMDKind))
-    return false;
-
-  // If the pointer "escapes" (not including being used in a call),
-  // the copy may be needed.
-  if (DoesRetainableObjPtrEscape(Inst))
-    return false;
-
-  // Otherwise, it's not needed.
-  return true;
-}
-
 /// Turn objc_retainAutoreleasedReturnValue into objc_retain if the operand is
 /// not a return value. Or, if it can be paired with an
 /// objc_autoreleaseReturnValue, delete the pair and return true.
@@ -1369,41 +1295,6 @@ ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV,
 }
 
-// \brief Attempt to strength reduce objc_retainBlock calls to objc_retain
-// calls.
-//
-// Specifically: If an objc_retainBlock call has the copy_on_escape metadata and
-// does not escape (following the rules of block escaping), strength reduce the
-// objc_retainBlock to an objc_retain.
-//
-// TODO: If an objc_retainBlock call is dominated period by a previous
-// objc_retainBlock call, strength reduce the objc_retainBlock to an
-// objc_retain.
-bool
-ObjCARCOpt::OptimizeRetainBlockCall(Function &F, Instruction *Inst,
-                                    InstructionClass &Class) {
-  assert(GetBasicInstructionClass(Inst) == Class);
-  assert(IC_RetainBlock == Class);
-
-  // If we can not optimize Inst, return false.
- if (!IsRetainBlockOptimizable(Inst)) - return false; - - Changed = true; - ++NumPeeps; - - DEBUG(dbgs() << "Strength reduced retainBlock => retain.\n"); - DEBUG(dbgs() << "Old: " << *Inst << "\n"); - CallInst *RetainBlock = cast<CallInst>(Inst); - Constant *NewDecl = EP.get(ARCRuntimeEntryPoints::EPT_Retain); - RetainBlock->setCalledFunction(NewDecl); - // Remove copy_on_escape metadata. - RetainBlock->setMetadata(CopyOnEscapeMDKind, 0); - Class = IC_Retain; - DEBUG(dbgs() << "New: " << *Inst << "\n"); - return true; -} - /// Visit each call, one at a time, and make simplifications without doing any /// additional analysis. void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { @@ -1480,11 +1371,6 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { } break; } - case IC_RetainBlock: - // If we strength reduce an objc_retainBlock to an objc_retain, continue - // onto the objc_retain peephole optimizations. Otherwise break. - OptimizeRetainBlockCall(F, Inst, Class); - break; case IC_RetainRV: if (OptimizeRetainRVCall(F, Inst)) continue; @@ -2520,15 +2406,26 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> if (Jt == Releases.end()) return false; const RRInfo &NewRetainReleaseRRI = Jt->second; - assert(NewRetainReleaseRRI.Calls.count(NewRetain)); + + // If the release does not have a reference to the retain as well, + // something happened which is unaccounted for. Do not do anything. + // + // This can happen if we catch an additive overflow during path count + // merging. + if (!NewRetainReleaseRRI.Calls.count(NewRetain)) + return false; + if (ReleasesToMove.Calls.insert(NewRetainRelease)) { // If we overflow when we compute the path count, don't remove/move // anything. const BBState &NRRBBState = BBStates[NewRetainRelease->getParent()]; - unsigned PathCount; + unsigned PathCount = BBState::OverflowOccurredValue; if (NRRBBState.GetAllPathCountWithOverflow(PathCount)) return false; + assert(PathCount != BBState::OverflowOccurredValue && + "PathCount at this point can not be " + "OverflowOccurredValue."); OldDelta -= PathCount; // Merge the ReleaseMetadata and IsTailCallRelease values. @@ -2558,8 +2455,12 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> // If we overflow when we compute the path count, don't // remove/move anything. const BBState &RIPBBState = BBStates[RIP->getParent()]; + PathCount = BBState::OverflowOccurredValue; if (RIPBBState.GetAllPathCountWithOverflow(PathCount)) return false; + assert(PathCount != BBState::OverflowOccurredValue && + "PathCount at this point can not be " + "OverflowOccurredValue."); NewDelta -= PathCount; } } @@ -2589,15 +2490,25 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> if (Jt == Retains.end()) return false; const RRInfo &NewReleaseRetainRRI = Jt->second; - assert(NewReleaseRetainRRI.Calls.count(NewRelease)); - if (RetainsToMove.Calls.insert(NewReleaseRetain)) { + // If the retain does not have a reference to the release as well, + // something happened which is unaccounted for. Do not do anything. + // + // This can happen if we catch an additive overflow during path count + // merging. + if (!NewReleaseRetainRRI.Calls.count(NewRelease)) + return false; + + if (RetainsToMove.Calls.insert(NewReleaseRetain)) { // If we overflow when we compute the path count, don't remove/move // anything. 
const BBState &NRRBBState = BBStates[NewReleaseRetain->getParent()]; - unsigned PathCount; + unsigned PathCount = BBState::OverflowOccurredValue; if (NRRBBState.GetAllPathCountWithOverflow(PathCount)) return false; + assert(PathCount != BBState::OverflowOccurredValue && + "PathCount at this point can not be " + "OverflowOccurredValue."); OldDelta += PathCount; OldCount += PathCount; @@ -2612,8 +2523,13 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> // If we overflow when we compute the path count, don't // remove/move anything. const BBState &RIPBBState = BBStates[RIP->getParent()]; + + PathCount = BBState::OverflowOccurredValue; if (RIPBBState.GetAllPathCountWithOverflow(PathCount)) return false; + assert(PathCount != BBState::OverflowOccurredValue && + "PathCount at this point can not be " + "OverflowOccurredValue."); NewDelta += PathCount; NewCount += PathCount; } diff --git a/lib/Transforms/Scalar/BasicBlockPlacement.cpp b/lib/Transforms/Scalar/BasicBlockPlacement.cpp deleted file mode 100644 index e755008..0000000 --- a/lib/Transforms/Scalar/BasicBlockPlacement.cpp +++ /dev/null @@ -1,152 +0,0 @@ -//===-- BasicBlockPlacement.cpp - Basic Block Code Layout optimization ----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements a very simple profile guided basic block placement -// algorithm. The idea is to put frequently executed blocks together at the -// start of the function, and hopefully increase the number of fall-through -// conditional branches. If there is no profile information for a particular -// function, this pass basically orders blocks in depth-first order -// -// The algorithm implemented here is basically "Algo1" from "Profile Guided Code -// Positioning" by Pettis and Hansen, except that it uses basic block counts -// instead of edge counts. This should be improved in many ways, but is very -// simple for now. -// -// Basically we "place" the entry block, then loop over all successors in a DFO, -// placing the most frequently executed successor until we run out of blocks. I -// told you this was _extremely_ simplistic. :) This is also much slower than it -// could be. When it becomes important, this pass will be rewritten to use a -// better algorithm, and then we can worry about efficiency. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "block-placement" -#include "llvm/Transforms/Scalar.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/ProfileInfo.h" -#include "llvm/IR/Function.h" -#include "llvm/Pass.h" -#include "llvm/Support/CFG.h" -#include <set> -using namespace llvm; - -STATISTIC(NumMoved, "Number of basic blocks moved"); - -namespace { - struct BlockPlacement : public FunctionPass { - static char ID; // Pass identification, replacement for typeid - BlockPlacement() : FunctionPass(ID) { - initializeBlockPlacementPass(*PassRegistry::getPassRegistry()); - } - - virtual bool runOnFunction(Function &F); - - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesCFG(); - AU.addRequired<ProfileInfo>(); - //AU.addPreserved<ProfileInfo>(); // Does this work? - } - private: - /// PI - The profile information that is guiding us. - /// - ProfileInfo *PI; - - /// NumMovedBlocks - Every time we move a block, increment this counter. 
- /// - unsigned NumMovedBlocks; - - /// PlacedBlocks - Every time we place a block, remember it so we don't get - /// into infinite loops. - std::set<BasicBlock*> PlacedBlocks; - - /// InsertPos - This an iterator to the next place we want to insert a - /// block. - Function::iterator InsertPos; - - /// PlaceBlocks - Recursively place the specified blocks and any unplaced - /// successors. - void PlaceBlocks(BasicBlock *BB); - }; -} - -char BlockPlacement::ID = 0; -INITIALIZE_PASS_BEGIN(BlockPlacement, "block-placement", - "Profile Guided Basic Block Placement", false, false) -INITIALIZE_AG_DEPENDENCY(ProfileInfo) -INITIALIZE_PASS_END(BlockPlacement, "block-placement", - "Profile Guided Basic Block Placement", false, false) - -FunctionPass *llvm::createBlockPlacementPass() { return new BlockPlacement(); } - -bool BlockPlacement::runOnFunction(Function &F) { - PI = &getAnalysis<ProfileInfo>(); - - NumMovedBlocks = 0; - InsertPos = F.begin(); - - // Recursively place all blocks. - PlaceBlocks(F.begin()); - - PlacedBlocks.clear(); - NumMoved += NumMovedBlocks; - return NumMovedBlocks != 0; -} - - -/// PlaceBlocks - Recursively place the specified blocks and any unplaced -/// successors. -void BlockPlacement::PlaceBlocks(BasicBlock *BB) { - assert(!PlacedBlocks.count(BB) && "Already placed this block!"); - PlacedBlocks.insert(BB); - - // Place the specified block. - if (&*InsertPos != BB) { - // Use splice to move the block into the right place. This avoids having to - // remove the block from the function then readd it, which causes a bunch of - // symbol table traffic that is entirely pointless. - Function::BasicBlockListType &Blocks = BB->getParent()->getBasicBlockList(); - Blocks.splice(InsertPos, Blocks, BB); - - ++NumMovedBlocks; - } else { - // This block is already in the right place, we don't have to do anything. - ++InsertPos; - } - - // Keep placing successors until we run out of ones to place. Note that this - // loop is very inefficient (N^2) for blocks with many successors, like switch - // statements. FIXME! - while (1) { - // Okay, now place any unplaced successors. - succ_iterator SI = succ_begin(BB), E = succ_end(BB); - - // Scan for the first unplaced successor. - for (; SI != E && PlacedBlocks.count(*SI); ++SI) - /*empty*/; - if (SI == E) return; // No more successors to place. - - double MaxExecutionCount = PI->getExecutionCount(*SI); - BasicBlock *MaxSuccessor = *SI; - - // Scan for more frequently executed successors - for (; SI != E; ++SI) - if (!PlacedBlocks.count(*SI)) { - double Count = PI->getExecutionCount(*SI); - if (Count > MaxExecutionCount || - // Prefer to not disturb the code. - (Count == MaxExecutionCount && *SI == &*InsertPos)) { - MaxExecutionCount = Count; - MaxSuccessor = *SI; - } - } - - // Now that we picked the maximally executed successor, place it. 
- PlaceBlocks(MaxSuccessor); - } -} diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index f5d1db1..626c810 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -1,6 +1,5 @@ add_llvm_library(LLVMScalarOpts ADCE.cpp - BasicBlockPlacement.cpp CodeGenPrepare.cpp ConstantProp.cpp CorrelatedValuePropagation.cpp @@ -17,12 +16,15 @@ add_llvm_library(LLVMScalarOpts LoopInstSimplify.cpp LoopRotation.cpp LoopStrengthReduce.cpp + LoopRerollPass.cpp LoopUnrollPass.cpp LoopUnswitch.cpp LowerAtomic.cpp MemCpyOptimizer.cpp + PartiallyInlineLibCalls.cpp Reassociate.cpp Reg2Mem.cpp + SampleProfile.cpp SCCP.cpp SROA.cpp Scalar.cpp diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp index 44804a2..007e9b7 100644 --- a/lib/Transforms/Scalar/CodeGenPrepare.cpp +++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp @@ -22,7 +22,6 @@ #include "llvm/Analysis/DominatorInternals.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/ProfileInfo.h" #include "llvm/Assembly/Writer.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -80,7 +79,6 @@ namespace { const TargetLowering *TLI; const TargetLibraryInfo *TLInfo; DominatorTree *DT; - ProfileInfo *PFI; /// CurInstIterator - As we scan instructions optimizing them, this is the /// next instruction to optimize. Xforms that can invalidate this should @@ -111,7 +109,6 @@ namespace { virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved<DominatorTree>(); - AU.addPreserved<ProfileInfo>(); AU.addRequired<TargetLibraryInfo>(); } @@ -151,7 +148,6 @@ bool CodeGenPrepare::runOnFunction(Function &F) { if (TM) TLI = TM->getTargetLowering(); TLInfo = &getAnalysis<TargetLibraryInfo>(); DT = getAnalysisIfAvailable<DominatorTree>(); - PFI = getAnalysisIfAvailable<ProfileInfo>(); OptSize = F.getAttributes().hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize); @@ -442,10 +438,6 @@ void CodeGenPrepare::EliminateMostlyEmptyBlock(BasicBlock *BB) { DT->changeImmediateDominator(DestBB, NewIDom); DT->eraseNode(BB); } - if (PFI) { - PFI->replaceAllUses(BB, DestBB); - PFI->removeEdge(ProfileInfo::getEdge(BB, DestBB)); - } BB->eraseFromParent(); ++NumBlocksElim; @@ -840,10 +832,12 @@ struct ExtAddrMode : public TargetLowering::AddrMode { } }; +#ifndef NDEBUG static inline raw_ostream &operator<<(raw_ostream &OS, const ExtAddrMode &AM) { AM.print(OS); return OS; } +#endif void ExtAddrMode::print(raw_ostream &OS) const { bool NeedPlus = false; @@ -1035,7 +1029,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, case Instruction::IntToPtr: // This inttoptr is a no-op if the integer type is pointer sized. if (TLI.getValueType(AddrInst->getOperand(0)->getType()) == - TLI.getPointerTy()) + TLI.getPointerTy(AddrInst->getType()->getPointerAddressSpace())) return MatchAddr(AddrInst->getOperand(0), Depth); return false; case Instruction::BitCast: @@ -1418,8 +1412,7 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, Value *Address = User->getOperand(OpNo); if (!Address->getType()->isPointerTy()) return false; - Type *AddressAccessTy = - cast<PointerType>(Address->getType())->getElementType(); + Type *AddressAccessTy = Address->getType()->getPointerElementType(); // Do a match against the root of this address, ignoring profitability. 
This // will tell us if the addressing mode for the memory operation will @@ -1573,9 +1566,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, } else { DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " << *MemoryInst); - Type *IntPtrTy = - TLI->getDataLayout()->getIntPtrType(AccessTy->getContext()); - + Type *IntPtrTy = TLI->getDataLayout()->getIntPtrType(Addr->getType()); Value *Result = 0; // Start with the base register. Do this first so that subsequent address @@ -1894,7 +1885,8 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) { // It is possible for very late stage optimizations (such as SimplifyCFG) // to introduce PHI nodes too late to be cleaned up. If we detect such a // trivial PHI, go ahead and zap it here. - if (Value *V = SimplifyInstruction(P)) { + if (Value *V = SimplifyInstruction(P, TLI ? TLI->getDataLayout() : 0, + TLInfo, DT)) { P->replaceAllUsesWith(V); P->eraseFromParent(); ++NumPHIsElim; diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp index 3c08634..5266894 100644 --- a/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/lib/Transforms/Scalar/EarlyCSE.cpp @@ -72,11 +72,6 @@ namespace { } namespace llvm { -// SimpleValue is POD. -template<> struct isPodLike<SimpleValue> { - static const bool value = true; -}; - template<> struct DenseMapInfo<SimpleValue> { static inline SimpleValue getEmptyKey() { return DenseMapInfo<Instruction*>::getEmptyKey(); @@ -220,11 +215,6 @@ namespace { } namespace llvm { - // CallValue is POD. - template<> struct isPodLike<CallValue> { - static const bool value = true; - }; - template<> struct DenseMapInfo<CallValue> { static inline CallValue getEmptyKey() { return DenseMapInfo<Instruction*>::getEmptyKey(); diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index bc418af..6af269d 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -21,6 +21,7 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CFG.h" @@ -507,7 +508,9 @@ namespace { enum ValType { SimpleVal, // A simple offsetted value that is accessed. LoadVal, // A value produced by a load. - MemIntrin // A memory intrinsic which is loaded from. + MemIntrin, // A memory intrinsic which is loaded from. + UndefVal // A UndefValue representing a value from dead block (which + // is not yet physically removed from the CFG). }; /// V - The value that is live out of the block. 
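  // Illustrative aside (not from this commit; use() is a hypothetical
  // consumer): with UndefVal added to the tag enum above, the tagged value in
  // each AvailableValueInBlock distinguishes four cases, and a dispatch over
  // the accessors added below would look like:
  //
  //   if (AV.isSimpleValue())           use(AV.getSimpleValue());
  //   else if (AV.isCoercedLoadValue()) use(AV.getCoercedLoadValue());
  //   else if (AV.isMemIntrinValue())   use(AV.getMemIntrinValue());
  //   else /* AV.isUndefValue() */      use(UndefValue::get(LoadTy));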
@@ -545,10 +548,20 @@ namespace { Res.Offset = Offset; return Res; } - + + static AvailableValueInBlock getUndef(BasicBlock *BB) { + AvailableValueInBlock Res; + Res.BB = BB; + Res.Val.setPointer(0); + Res.Val.setInt(UndefVal); + Res.Offset = 0; + return Res; + } + bool isSimpleValue() const { return Val.getInt() == SimpleVal; } bool isCoercedLoadValue() const { return Val.getInt() == LoadVal; } bool isMemIntrinValue() const { return Val.getInt() == MemIntrin; } + bool isUndefValue() const { return Val.getInt() == UndefVal; } Value *getSimpleValue() const { assert(isSimpleValue() && "Wrong accessor"); @@ -576,6 +589,7 @@ namespace { DominatorTree *DT; const DataLayout *TD; const TargetLibraryInfo *TLI; + SetVector<BasicBlock *> DeadBlocks; ValueTable VN; @@ -698,6 +712,9 @@ namespace { unsigned replaceAllDominatedUsesWith(Value *From, Value *To, const BasicBlockEdge &Root); bool propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root); + bool processFoldableCondBr(BranchInst *BI); + void addDeadBlock(BasicBlock *BB); + void assignValNumForDeadCode(); }; char GVN::ID = 0; @@ -1071,14 +1088,15 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, if (Offset == -1) return Offset; + unsigned AS = Src->getType()->getPointerAddressSpace(); // Otherwise, see if we can constant fold a load from the constant with the // offset applied as appropriate. Src = ConstantExpr::getBitCast(Src, - llvm::Type::getInt8PtrTy(Src->getContext())); + Type::getInt8PtrTy(Src->getContext(), AS)); Constant *OffsetCst = ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset); Src = ConstantExpr::getGetElementPtr(Src, OffsetCst); - Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy)); + Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS)); if (ConstantFoldLoadFromConstPtr(Src, &TD)) return Offset; return -1; @@ -1155,7 +1173,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, Type *DestPTy = IntegerType::get(LoadTy->getContext(), NewLoadSize*8); DestPTy = PointerType::get(DestPTy, - cast<PointerType>(PtrVal->getType())->getAddressSpace()); + PtrVal->getType()->getPointerAddressSpace()); Builder.SetCurrentDebugLocation(SrcVal->getDebugLoc()); PtrVal = Builder.CreateBitCast(PtrVal, DestPTy); LoadInst *NewLoad = Builder.CreateLoad(PtrVal); @@ -1230,15 +1248,16 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, // Otherwise, this is a memcpy/memmove from a constant global. MemTransferInst *MTI = cast<MemTransferInst>(SrcInst); Constant *Src = cast<Constant>(MTI->getSource()); + unsigned AS = Src->getType()->getPointerAddressSpace(); // Otherwise, see if we can constant fold a load from the constant with the // offset applied as appropriate. Src = ConstantExpr::getBitCast(Src, - llvm::Type::getInt8PtrTy(Src->getContext())); + Type::getInt8PtrTy(Src->getContext(), AS)); Constant *OffsetCst = - ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset); + ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset); Src = ConstantExpr::getGetElementPtr(Src, OffsetCst); - Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy)); + Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS)); return ConstantFoldLoadFromConstPtr(Src, &TD); } @@ -1253,8 +1272,10 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, // just use the dominating value directly. 
   if (ValuesPerBlock.size() == 1 &&
       gvn.getDominatorTree().properlyDominates(ValuesPerBlock[0].BB,
-                                               LI->getParent()))
+                                               LI->getParent())) {
+    assert(!ValuesPerBlock[0].isUndefValue() && "Dead BB dominates this block");
     return ValuesPerBlock[0].MaterializeAdjustedValue(LI->getType(), gvn);
+  }
 
   // Otherwise, we have to construct SSA form.
   SmallVector<PHINode*, 8> NewPHIs;
@@ -1324,7 +1345,7 @@ Value *AvailableValueInBlock::MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) c
             << *getCoercedLoadValue() << '\n'
             << *Res << '\n' << "\n\n\n");
     }
-  } else {
+  } else if (isMemIntrinValue()) {
     const DataLayout *TD = gvn.getDataLayout();
     assert(TD && "Need target data to handle type mismatch case");
     Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset,
@@ -1332,6 +1353,10 @@ Value *AvailableValueInBlock::MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) c
     DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset
                  << "  " << *getMemIntrinValue() << '\n'
                  << *Res << '\n' << "\n\n\n");
+  } else {
+    assert(isUndefValue() && "Should be UndefVal");
+    DEBUG(dbgs() << "GVN COERCED NONLOCAL Undef:\n";);
+    return UndefValue::get(LoadTy);
   }
   return Res;
 }
@@ -1355,6 +1380,13 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps,
     BasicBlock *DepBB = Deps[i].getBB();
     MemDepResult DepInfo = Deps[i].getResult();
 
+    if (DeadBlocks.count(DepBB)) {
+      // A dead dependent mem-op is treated as a load evaluating the same
+      // value as the load in question.
+      ValuesPerBlock.push_back(AvailableValueInBlock::getUndef(DepBB));
+      continue;
+    }
+
     if (!DepInfo.isDef() && !DepInfo.isClobber()) {
       UnavailableBlocks.push_back(DepBB);
       continue;
     }
@@ -2191,11 +2223,13 @@ bool GVN::processInstruction(Instruction *I) {
   // For conditional branches, we can perform simple conditional propagation on
   // the condition value itself.
   if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
-    if (!BI->isConditional() || isa<Constant>(BI->getCondition()))
+    if (!BI->isConditional())
       return false;
 
-    Value *BranchCond = BI->getCondition();
+    if (isa<Constant>(BI->getCondition()))
+      return processFoldableCondBr(BI);
 
+    Value *BranchCond = BI->getCondition();
     BasicBlock *TrueSucc = BI->getSuccessor(0);
     BasicBlock *FalseSucc = BI->getSuccessor(1);
     // Avoid multiple edges early.
@@ -2312,6 +2346,9 @@ bool GVN::runOnFunction(Function& F) {
   }
 
   if (EnablePRE) {
+    // Fabricate val-nums for dead code in order to suppress an assertion
+    // in performPRE().
+    assignValNumForDeadCode();
     bool PREChanged = true;
     while (PREChanged) {
       PREChanged = performPRE(F);
@@ -2325,6 +2362,9 @@ bool GVN::runOnFunction(Function& F) {
   // Actually, when this happens, we should just fully integrate PRE into GVN.
 
   cleanupGlobalSets();
+  // Do not clean up DeadBlocks in cleanupGlobalSets(), as that function is
+  // called for each iteration.
+  DeadBlocks.clear();
 
   return Changed;
 }
@@ -2335,6 +2375,9 @@ bool GVN::processBlock(BasicBlock *BB) {
   // (and incrementing BI before processing an instruction).
   assert(InstrsToErase.empty() &&
          "We expect InstrsToErase to be empty across iterations");
+  if (DeadBlocks.count(BB))
+    return false;
+
   bool ChangedFunction = false;
 
   for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
@@ -2628,3 +2671,133 @@ void GVN::verifyRemoved(const Instruction *Inst) const {
     }
   }
 }
+
+// BB is declared dead, which implies that other blocks become dead as well.
+// This function adds all such blocks to "DeadBlocks". For the dead blocks'
+// live successors, update their phi nodes by replacing the operands
+// corresponding to dead blocks with UndefVal.
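// Illustrative example (not from this commit): for the CFG
//
//        A
//       / \
//      B   C
//       \ /
//        D
//
// passing B as the dead root marks B (and everything B dominates) dead, and
// leaves D, the live block on the dominance frontier, with the PHI operand
// for its dead incoming edge replaced:
//
//   D:  %p = phi i32 [ %x, %B ], [ %y, %C ]
//   -->     %p = phi i32 [ undef, %B ], [ %y, %C ]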
+//
+void GVN::addDeadBlock(BasicBlock *BB) {
+  SmallVector<BasicBlock *, 4> NewDead;
+  SmallSetVector<BasicBlock *, 4> DF;
+
+  NewDead.push_back(BB);
+  while (!NewDead.empty()) {
+    BasicBlock *D = NewDead.pop_back_val();
+    if (DeadBlocks.count(D))
+      continue;
+
+    // All blocks dominated by D are dead.
+    SmallVector<BasicBlock *, 8> Dom;
+    DT->getDescendants(D, Dom);
+    DeadBlocks.insert(Dom.begin(), Dom.end());
+
+    // Figure out the dominance-frontier(D).
+    for (SmallVectorImpl<BasicBlock *>::iterator I = Dom.begin(),
+           E = Dom.end(); I != E; I++) {
+      BasicBlock *B = *I;
+      for (succ_iterator SI = succ_begin(B), SE = succ_end(B); SI != SE; SI++) {
+        BasicBlock *S = *SI;
+        if (DeadBlocks.count(S))
+          continue;
+
+        bool AllPredDead = true;
+        for (pred_iterator PI = pred_begin(S), PE = pred_end(S); PI != PE; PI++)
+          if (!DeadBlocks.count(*PI)) {
+            AllPredDead = false;
+            break;
+          }
+
+        if (!AllPredDead) {
+          // S could be proved dead later on. That is why we don't update phi
+          // operands at this moment.
+          DF.insert(S);
+        } else {
+          // Although S is not dominated by D, it is dead by now. This can
+          // happen if S already has a dead predecessor before D is declared
+          // dead.
+          NewDead.push_back(S);
+        }
+      }
+    }
+  }
+
+  // For the dead blocks' live successors, update their phi nodes by replacing
+  // the operands corresponding to dead blocks with UndefVal.
+  for(SmallSetVector<BasicBlock *, 4>::iterator I = DF.begin(), E = DF.end();
+        I != E; I++) {
+    BasicBlock *B = *I;
+    if (DeadBlocks.count(B))
+      continue;
+
+    SmallVector<BasicBlock *, 4> Preds(pred_begin(B), pred_end(B));
+    for (SmallVectorImpl<BasicBlock *>::iterator PI = Preds.begin(),
+           PE = Preds.end(); PI != PE; PI++) {
+      BasicBlock *P = *PI;
+
+      if (!DeadBlocks.count(P))
+        continue;
+
+      if (isCriticalEdge(P->getTerminator(), GetSuccessorNumber(P, B))) {
+        if (BasicBlock *S = splitCriticalEdges(P, B))
+          DeadBlocks.insert(P = S);
+      }
+
+      for (BasicBlock::iterator II = B->begin(); isa<PHINode>(II); ++II) {
+        PHINode &Phi = cast<PHINode>(*II);
+        Phi.setIncomingValue(Phi.getBasicBlockIndex(P),
+                             UndefValue::get(Phi.getType()));
+      }
+    }
+  }
+}
+
+// If the given branch is recognized as a foldable branch (i.e. a conditional
+// branch with a constant condition), this function performs the following
+// analyses and transformations:
+//  1) If the dead outgoing edge is a critical edge, split it. Let R be the
+//     target of the dead outgoing edge.
+//  2) Identify the set of dead blocks implied by the branch's dead outgoing
+//     edge. The result of this step will be {X | X is dominated by R}.
+//  3) Identify those blocks that have at least one dead predecessor. The
+//     result of this step will be dominance-frontier(R).
+//  4) Update the PHIs in DF(R) by replacing the operands corresponding to
+//     dead blocks with "UndefVal", in the hope that these PHIs will be
+//     optimized away.
+//
+// Return true iff *NEW* dead code is found.
+bool GVN::processFoldableCondBr(BranchInst *BI) {
+  if (!BI || BI->isUnconditional())
+    return false;
+
+  ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition());
+  if (!Cond)
+    return false;
+
+  BasicBlock *DeadRoot = Cond->getZExtValue() ?
+                         BI->getSuccessor(1) : BI->getSuccessor(0);
+  if (DeadBlocks.count(DeadRoot))
+    return false;
+
+  if (!DeadRoot->getSinglePredecessor())
+    DeadRoot = splitCriticalEdges(BI->getParent(), DeadRoot);
+
+  addDeadBlock(DeadRoot);
+  return true;
+}
+
+// performPRE() will trigger an assert if it comes across an instruction without
+// an associated val-num.
As it normally has far more live instructions than dead +// instructions, it makes more sense just to "fabricate" a val-number for the +// dead code than checking if instruction involved is dead or not. +void GVN::assignValNumForDeadCode() { + for (SetVector<BasicBlock *>::iterator I = DeadBlocks.begin(), + E = DeadBlocks.end(); I != E; I++) { + BasicBlock *BB = *I; + for (BasicBlock::iterator II = BB->begin(), EE = BB->end(); + II != EE; II++) { + Instruction *Inst = &*II; + unsigned ValNum = VN.lookup_or_add(Inst); + addToLeaderTable(ValNum, Inst, BB); + } + } +} diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index d51e034..235aaaa 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -532,7 +532,8 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { // and varies predictably *inside* the loop. Evaluate the value it // contains when the loop exits, if possible. const SCEV *ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop()); - if (!SE->isLoopInvariant(ExitValue, L) || !isSafeToExpand(ExitValue)) + if (!SE->isLoopInvariant(ExitValue, L) || + !isSafeToExpand(ExitValue, *SE)) continue; // Computing the value outside of the loop brings no benefit if : @@ -1479,8 +1480,14 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L, if (IndVar->getType()->isPointerTy() && !IVCount->getType()->isPointerTy()) { + // IVOffset will be the new GEP offset that is interpreted by GEP as a + // signed value. IVCount on the other hand represents the loop trip count, + // which is an unsigned value. FindLoopCounter only allows induction + // variables that have a positive unit stride of one. This means we don't + // have to handle the case of negative offsets (yet) and just need to zero + // extend IVCount. Type *OfsTy = SE->getEffectiveSCEVType(IVInit->getType()); - const SCEV *IVOffset = SE->getTruncateOrSignExtend(IVCount, OfsTy); + const SCEV *IVOffset = SE->getTruncateOrZeroExtend(IVCount, OfsTy); // Expand the code for the iteration count. assert(SE->isLoopInvariant(IVOffset, L) && @@ -1492,7 +1499,7 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L, assert(AR->getStart() == SE->getSCEV(GEPBase) && "bad loop counter"); // We could handle pointer IVs other than i8*, but we need to compensate for // gep index scaling. See canExpandBackedgeTakenCount comments. - assert(SE->getSizeOfExpr( + assert(SE->getSizeOfExpr(IntegerType::getInt64Ty(IndVar->getContext()), cast<PointerType>(GEPBase->getType())->getElementType())->isOne() && "unit stride pointer IV must be i8*"); @@ -1506,9 +1513,10 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L, // BECount = (IVEnd - IVInit - 1) => IVLimit = IVInit (postinc). // // Valid Cases: (1) both integers is most common; (2) both may be pointers - // for simple memset-style loops; (3) IVInit is an integer and IVCount is a - // pointer may occur when enable-iv-rewrite generates a canonical IV on top - // of case #2. + // for simple memset-style loops. + // + // IVInit integer and IVCount pointer would only occur if a canonical IV + // were generated on top of case #2, which is not expected. const SCEV *IVLimit = 0; // For unit stride, IVCount = Start + BECount with 2's complement overflow. 
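A standalone illustration of why the hunk above switches genLoopLimit from
getTruncateOrSignExtend to getTruncateOrZeroExtend (hypothetical values, not
from this commit): a 32-bit trip count with its high bit set must be
zero-extended, not sign-extended, before it is used as a wider GEP offset.

    #include <cstdint>
    int main() {
      uint32_t IVCount = 0x80000000u;            // unsigned trip count
      int64_t Sext = (int64_t)(int32_t)IVCount;  // -2147483648: wrong offset
      int64_t Zext = (int64_t)IVCount;           //  2147483648: intended
      return Sext == Zext;                       // 0: the two disagree
    }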
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index 0b8906d..b3ec2fc 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -827,7 +827,6 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { return false; } - /// SimplifyPartiallyRedundantLoad - If LI is an obviously partially redundant /// load instruction, eliminate it by replacing it with a PHI node. This is an /// important optimization that encourages jump threading, and needs to be run @@ -842,6 +841,12 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { if (LoadBB->getSinglePredecessor()) return false; + // If the load is defined in a landing pad, it can't be partially redundant, + // because the edges between the invoke and the landing pad cannot have other + // instructions between them. + if (LoadBB->isLandingPad()) + return false; + Value *LoadedPtr = LI->getOperand(0); // If the loaded operand is defined in the LoadBB, it can't be available. diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 07d991b..952b76b 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -314,7 +314,7 @@ bool NclPopcountRecognize::preliminaryScreen() { if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware) return false; - // Counting population are usually conducted by few arithmetic instrutions. + // Counting population are usually conducted by few arithmetic instructions. // Such instructions can be easilly "absorbed" by vacant slots in a // non-compact loop. Therefore, recognizing popcount idiom only makes sense // in a compact loop. @@ -953,6 +953,8 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, Value *SplatValue = isBytewiseValue(StoredVal); Constant *PatternValue = 0; + unsigned DestAS = DestPtr->getType()->getPointerAddressSpace(); + // If we're allowed to form a memset, and the stored value would be acceptable // for memset, use it. if (SplatValue && TLI->has(LibFunc::memset) && @@ -961,8 +963,10 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, CurLoop->isLoopInvariant(SplatValue)) { // Keep and use SplatValue. PatternValue = 0; - } else if (TLI->has(LibFunc::memset_pattern16) && + } else if (DestAS == 0 && + TLI->has(LibFunc::memset_pattern16) && (PatternValue = getMemSetPatternValue(StoredVal, *TD))) { + // Don't create memset_pattern16s with address spaces. // It looks like we can use PatternValue! SplatValue = 0; } else { @@ -978,20 +982,20 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, IRBuilder<> Builder(Preheader->getTerminator()); SCEVExpander Expander(*SE, "loop-idiom"); + Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS); + // Okay, we have a strided store "p[i]" of a splattable value. We can turn // this into a memset in the loop preheader now if we want. However, this // would be unsafe to do if there is anything else in the loop that may read // or write to the aliased location. Check for any overlap by generating the // base pointer and checking the region. 
- unsigned AddrSpace = cast<PointerType>(DestPtr->getType())->getAddressSpace(); Value *BasePtr = - Expander.expandCodeFor(Ev->getStart(), Builder.getInt8PtrTy(AddrSpace), + Expander.expandCodeFor(Ev->getStart(), DestInt8PtrTy, Preheader->getTerminator()); - if (mayLoopAccessLocation(BasePtr, AliasAnalysis::ModRef, CurLoop, BECount, - StoreSize, getAnalysis<AliasAnalysis>(), TheStore)){ + StoreSize, getAnalysis<AliasAnalysis>(), TheStore)) { Expander.clear(); // If we generated new code for the base pointer, clean up. deleteIfDeadInstruction(BasePtr, *SE, TLI); @@ -1002,27 +1006,35 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, // The # stored bytes is (BECount+1)*Size. Expand the trip count out to // pointer size if it isn't already. - Type *IntPtr = TD->getIntPtrType(DestPtr->getContext()); + Type *IntPtr = Builder.getIntPtrTy(TD, DestAS); BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr); const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1), SCEV::FlagNUW); - if (StoreSize != 1) + if (StoreSize != 1) { NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize), SCEV::FlagNUW); + } Value *NumBytes = Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator()); CallInst *NewCall; - if (SplatValue) - NewCall = Builder.CreateMemSet(BasePtr, SplatValue,NumBytes,StoreAlignment); - else { + if (SplatValue) { + NewCall = Builder.CreateMemSet(BasePtr, + SplatValue, + NumBytes, + StoreAlignment); + } else { + // Everything is emitted in default address space + Type *Int8PtrTy = DestInt8PtrTy; + Module *M = TheStore->getParent()->getParent()->getParent(); Value *MSP = M->getOrInsertFunction("memset_pattern16", Builder.getVoidTy(), - Builder.getInt8PtrTy(), - Builder.getInt8PtrTy(), IntPtr, + Int8PtrTy, + Int8PtrTy, + IntPtr, (void*)0); // Otherwise we should form a memset_pattern16. PatternValue is known to be @@ -1032,7 +1044,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, PatternValue, ".memset_pattern"); GV->setUnnamedAddr(true); // Ok to merge these. GV->setAlignment(16); - Value *PatternPtr = ConstantExpr::getBitCast(GV, Builder.getInt8PtrTy()); + Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy); NewCall = Builder.CreateCall3(MSP, BasePtr, PatternPtr, NumBytes); } @@ -1108,17 +1120,17 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, // The # stored bytes is (BECount+1)*Size. Expand the trip count out to // pointer size if it isn't already. 
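  // Illustrative arithmetic (not from this commit): for a loop copying a
  // 4-byte element with backedge-taken count BECount = 99, the expansion
  // below computes NumBytes = (99 + 1) * 4 = 400 in the pointer-width integer
  // type of the store's address space, so the emitted memcpy moves 400 bytes.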
- Type *IntPtr = TD->getIntPtrType(SI->getContext()); - BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr); + Type *IntPtrTy = Builder.getIntPtrTy(TD, SI->getPointerAddressSpace()); + BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy); - const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1), + const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtrTy, 1), SCEV::FlagNUW); if (StoreSize != 1) - NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize), + NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize), SCEV::FlagNUW); Value *NumBytes = - Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator()); + Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator()); CallInst *NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, diff --git a/lib/Transforms/Scalar/LoopRerollPass.cpp b/lib/Transforms/Scalar/LoopRerollPass.cpp new file mode 100644 index 0000000..335af81 --- /dev/null +++ b/lib/Transforms/Scalar/LoopRerollPass.cpp @@ -0,0 +1,1184 @@ +//===-- LoopReroll.cpp - Loop rerolling pass ------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass implements a simple loop reroller. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "loop-reroll" +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopUtils.h" + +using namespace llvm; + +STATISTIC(NumRerolledLoops, "Number of rerolled loops"); + +static cl::opt<unsigned> +MaxInc("max-reroll-increment", cl::init(2048), cl::Hidden, + cl::desc("The maximum increment for loop rerolling")); + +// This loop re-rolling transformation aims to transform loops like this: +// +// int foo(int a); +// void bar(int *x) { +// for (int i = 0; i < 500; i += 3) { +// foo(i); +// foo(i+1); +// foo(i+2); +// } +// } +// +// into a loop like this: +// +// void bar(int *x) { +// for (int i = 0; i < 500; ++i) +// foo(i); +// } +// +// It does this by looking for loops that, besides the latch code, are composed +// of isomorphic DAGs of instructions, with each DAG rooted at some increment +// to the induction variable, and where each DAG is isomorphic to the DAG +// rooted at the induction variable (excepting the sub-DAGs which root the +// other induction-variable increments). 
In other words, we're looking for loop
+// bodies of the form:
+//
+// %iv = phi [ (preheader, ...), (body, %iv.next) ]
+// f(%iv)
+// %iv.1 = add %iv, 1        <-- a root increment
+// f(%iv.1)
+// %iv.2 = add %iv, 2        <-- a root increment
+// f(%iv.2)
+// %iv.scale_m_1 = add %iv, scale-1  <-- a root increment
+// f(%iv.scale_m_1)
+// ...
+// %iv.next = add %iv, scale
+// %cmp = icmp(%iv, ...)
+// br %cmp, header, exit
+//
+// where each f(i) is a set of instructions that, collectively, are a function
+// only of i (and other loop-invariant values).
+//
+// As a special case, we can also reroll loops like this:
+//
+// int foo(int);
+// void bar(int *x) {
+//   for (int i = 0; i < 500; ++i) {
+//     x[3*i] = foo(0);
+//     x[3*i+1] = foo(0);
+//     x[3*i+2] = foo(0);
+//   }
+// }
+//
+// into this:
+//
+// void bar(int *x) {
+//   for (int i = 0; i < 1500; ++i)
+//     x[i] = foo(0);
+// }
+//
+// in which case, we're looking for inputs like this:
+//
+// %iv = phi [ (preheader, ...), (body, %iv.next) ]
+// %scaled.iv = mul %iv, scale
+// f(%scaled.iv)
+// %scaled.iv.1 = add %scaled.iv, 1
+// f(%scaled.iv.1)
+// %scaled.iv.2 = add %scaled.iv, 2
+// f(%scaled.iv.2)
+// %scaled.iv.scale_m_1 = add %scaled.iv, scale-1
+// f(%scaled.iv.scale_m_1)
+// ...
+// %iv.next = add %iv, 1
+// %cmp = icmp(%iv, ...)
+// br %cmp, header, exit
+
+namespace {
+  class LoopReroll : public LoopPass {
+  public:
+    static char ID; // Pass ID, replacement for typeid
+    LoopReroll() : LoopPass(ID) {
+      initializeLoopRerollPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnLoop(Loop *L, LPPassManager &LPM);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<AliasAnalysis>();
+      AU.addRequired<LoopInfo>();
+      AU.addPreserved<LoopInfo>();
+      AU.addRequired<DominatorTree>();
+      AU.addPreserved<DominatorTree>();
+      AU.addRequired<ScalarEvolution>();
+      AU.addRequired<TargetLibraryInfo>();
+    }
+
+protected:
+    AliasAnalysis *AA;
+    LoopInfo *LI;
+    ScalarEvolution *SE;
+    DataLayout *DL;
+    TargetLibraryInfo *TLI;
+    DominatorTree *DT;
+
+    typedef SmallVector<Instruction *, 16> SmallInstructionVector;
+    typedef SmallSet<Instruction *, 16> SmallInstructionSet;
+
+    // A chain of isomorphic instructions, identified by a single-use PHI,
+    // representing a reduction. Only the last value may be used outside the
+    // loop.
+    struct SimpleLoopReduction {
+      SimpleLoopReduction(Instruction *P, Loop *L)
+        : Valid(false), Instructions(1, P) {
+        assert(isa<PHINode>(P) && "First reduction instruction must be a PHI");
+        add(L);
+      }
+
+      bool valid() const {
+        return Valid;
+      }
+
+      Instruction *getPHI() const {
+        assert(Valid && "Using invalid reduction");
+        return Instructions.front();
+      }
+
+      Instruction *getReducedValue() const {
+        assert(Valid && "Using invalid reduction");
+        return Instructions.back();
+      }
+
+      Instruction *get(size_t i) const {
+        assert(Valid && "Using invalid reduction");
+        return Instructions[i+1];
+      }
+
+      Instruction *operator [] (size_t i) const { return get(i); }
+
+      // The size, ignoring the initial PHI.
+ size_t size() const { + assert(Valid && "Using invalid reduction"); + return Instructions.size()-1; + } + + typedef SmallInstructionVector::iterator iterator; + typedef SmallInstructionVector::const_iterator const_iterator; + + iterator begin() { + assert(Valid && "Using invalid reduction"); + return llvm::next(Instructions.begin()); + } + + const_iterator begin() const { + assert(Valid && "Using invalid reduction"); + return llvm::next(Instructions.begin()); + } + + iterator end() { return Instructions.end(); } + const_iterator end() const { return Instructions.end(); } + + protected: + bool Valid; + SmallInstructionVector Instructions; + + void add(Loop *L); + }; + + // The set of all reductions, and state tracking of possible reductions + // during loop instruction processing. + struct ReductionTracker { + typedef SmallVector<SimpleLoopReduction, 16> SmallReductionVector; + + // Add a new possible reduction. + void addSLR(SimpleLoopReduction &SLR) { + PossibleReds.push_back(SLR); + } + + // Setup to track possible reductions corresponding to the provided + // rerolling scale. Only reductions with a number of non-PHI instructions + // that is divisible by the scale are considered. Three instructions sets + // are filled in: + // - A set of all possible instructions in eligible reductions. + // - A set of all PHIs in eligible reductions + // - A set of all reduced values (last instructions) in eligible reductions. + void restrictToScale(uint64_t Scale, + SmallInstructionSet &PossibleRedSet, + SmallInstructionSet &PossibleRedPHISet, + SmallInstructionSet &PossibleRedLastSet) { + PossibleRedIdx.clear(); + PossibleRedIter.clear(); + Reds.clear(); + + for (unsigned i = 0, e = PossibleReds.size(); i != e; ++i) + if (PossibleReds[i].size() % Scale == 0) { + PossibleRedLastSet.insert(PossibleReds[i].getReducedValue()); + PossibleRedPHISet.insert(PossibleReds[i].getPHI()); + + PossibleRedSet.insert(PossibleReds[i].getPHI()); + PossibleRedIdx[PossibleReds[i].getPHI()] = i; + for (SimpleLoopReduction::iterator J = PossibleReds[i].begin(), + JE = PossibleReds[i].end(); J != JE; ++J) { + PossibleRedSet.insert(*J); + PossibleRedIdx[*J] = i; + } + } + } + + // The functions below are used while processing the loop instructions. + + // Are the two instructions both from reductions, and furthermore, from + // the same reduction? + bool isPairInSame(Instruction *J1, Instruction *J2) { + DenseMap<Instruction *, int>::iterator J1I = PossibleRedIdx.find(J1); + if (J1I != PossibleRedIdx.end()) { + DenseMap<Instruction *, int>::iterator J2I = PossibleRedIdx.find(J2); + if (J2I != PossibleRedIdx.end() && J1I->second == J2I->second) + return true; + } + + return false; + } + + // The two provided instructions, the first from the base iteration, and + // the second from iteration i, form a matched pair. If these are part of + // a reduction, record that fact. + void recordPair(Instruction *J1, Instruction *J2, unsigned i) { + if (PossibleRedIdx.count(J1)) { + assert(PossibleRedIdx.count(J2) && + "Recording reduction vs. non-reduction instruction?"); + + PossibleRedIter[J1] = 0; + PossibleRedIter[J2] = i; + + int Idx = PossibleRedIdx[J1]; + assert(Idx == PossibleRedIdx[J2] && + "Recording pair from different reductions?"); + Reds.insert(Idx); + } + } + + // The functions below can be called after we've finished processing all + // instructions in the loop, and we know which reductions were selected. + + // Is the provided instruction the PHI of a reduction selected for + // rerolling? 
+ bool isSelectedPHI(Instruction *J) { + if (!isa<PHINode>(J)) + return false; + + for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end(); + RI != RIE; ++RI) { + int i = *RI; + if (cast<Instruction>(J) == PossibleReds[i].getPHI()) + return true; + } + + return false; + } + + bool validateSelected(); + void replaceSelected(); + + protected: + // The vector of all possible reductions (for any scale). + SmallReductionVector PossibleReds; + + DenseMap<Instruction *, int> PossibleRedIdx; + DenseMap<Instruction *, int> PossibleRedIter; + DenseSet<int> Reds; + }; + + void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs); + void collectPossibleReductions(Loop *L, + ReductionTracker &Reductions); + void collectInLoopUserSet(Loop *L, + const SmallInstructionVector &Roots, + const SmallInstructionSet &Exclude, + const SmallInstructionSet &Final, + DenseSet<Instruction *> &Users); + void collectInLoopUserSet(Loop *L, + Instruction * Root, + const SmallInstructionSet &Exclude, + const SmallInstructionSet &Final, + DenseSet<Instruction *> &Users); + bool findScaleFromMul(Instruction *RealIV, uint64_t &Scale, + Instruction *&IV, + SmallInstructionVector &LoopIncs); + bool collectAllRoots(Loop *L, uint64_t Inc, uint64_t Scale, Instruction *IV, + SmallVector<SmallInstructionVector, 32> &Roots, + SmallInstructionSet &AllRoots, + SmallInstructionVector &LoopIncs); + bool reroll(Instruction *IV, Loop *L, BasicBlock *Header, const SCEV *IterCount, + ReductionTracker &Reductions); + }; +} + +char LoopReroll::ID = 0; +INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_END(LoopReroll, "loop-reroll", "Reroll loops", false, false) + +Pass *llvm::createLoopRerollPass() { + return new LoopReroll; +} + +// Returns true if the provided instruction is used outside the given loop. +// This operates like Instruction::isUsedOutsideOfBlock, but considers PHIs in +// non-loop blocks to be outside the loop. +static bool hasUsesOutsideLoop(Instruction *I, Loop *L) { + for (Value::use_iterator UI = I->use_begin(), + UIE = I->use_end(); UI != UIE; ++UI) { + Instruction *User = cast<Instruction>(*UI); + if (!L->contains(User)) + return true; + } + + return false; +} + +// Collect the list of loop induction variables with respect to which it might +// be possible to reroll the loop. +void LoopReroll::collectPossibleIVs(Loop *L, + SmallInstructionVector &PossibleIVs) { + BasicBlock *Header = L->getHeader(); + for (BasicBlock::iterator I = Header->begin(), + IE = Header->getFirstInsertionPt(); I != IE; ++I) { + if (!isa<PHINode>(I)) + continue; + if (!I->getType()->isIntegerTy()) + continue; + + if (const SCEVAddRecExpr *PHISCEV = + dyn_cast<SCEVAddRecExpr>(SE->getSCEV(I))) { + if (PHISCEV->getLoop() != L) + continue; + if (!PHISCEV->isAffine()) + continue; + if (const SCEVConstant *IncSCEV = + dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE))) { + if (!IncSCEV->getValue()->getValue().isStrictlyPositive()) + continue; + if (IncSCEV->getValue()->uge(MaxInc)) + continue; + + DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << + *PHISCEV << "\n"); + PossibleIVs.push_back(I); + } + } + } +} + +// Add the remainder of the reduction-variable chain to the instruction vector +// (the initial PHINode has already been added). 
If successful, the object is
+// marked as valid.
+void LoopReroll::SimpleLoopReduction::add(Loop *L) {
+  assert(!Valid && "Cannot add to an already-valid chain");
+
+  // The reduction variable must be a chain of single-use instructions
+  // (including the PHI), except for the last value (which is used by the PHI
+  // and also outside the loop).
+  Instruction *C = Instructions.front();
+
+  do {
+    C = cast<Instruction>(*C->use_begin());
+    if (C->hasOneUse()) {
+      if (!C->isBinaryOp())
+        return;
+
+      if (!(isa<PHINode>(Instructions.back()) ||
+            C->isSameOperationAs(Instructions.back())))
+        return;
+
+      Instructions.push_back(C);
+    }
+  } while (C->hasOneUse());
+
+  if (Instructions.size() < 2 ||
+      !C->isSameOperationAs(Instructions.back()) ||
+      C->use_begin() == C->use_end())
+    return;
+
+  // C is now the (potential) last instruction in the reduction chain.
+  for (Value::use_iterator UI = C->use_begin(), UIE = C->use_end();
+       UI != UIE; ++UI) {
+    // The only in-loop user can be the initial PHI.
+    if (L->contains(cast<Instruction>(*UI)))
+      if (cast<Instruction>(*UI) != Instructions.front())
+        return;
+  }
+
+  Instructions.push_back(C);
+  Valid = true;
+}
+
+// Collect the vector of possible reduction variables.
+void LoopReroll::collectPossibleReductions(Loop *L,
+  ReductionTracker &Reductions) {
+  BasicBlock *Header = L->getHeader();
+  for (BasicBlock::iterator I = Header->begin(),
+       IE = Header->getFirstInsertionPt(); I != IE; ++I) {
+    if (!isa<PHINode>(I))
+      continue;
+    if (!I->getType()->isSingleValueType())
+      continue;
+
+    SimpleLoopReduction SLR(I, L);
+    if (!SLR.valid())
+      continue;
+
+    DEBUG(dbgs() << "LRR: Possible reduction: " << *I << " (with " <<
+          SLR.size() << " chained instructions)\n");
+    Reductions.addSLR(SLR);
+  }
+}
+
+// Collect the set of all users of the provided root instruction. This set of
+// users contains not only the direct users of the root instruction, but also
+// all users of those users, and so on. There are two exceptions:
+//
+// 1. Instructions in the set of excluded instructions are never added to the
+//    use set (even if they are users). This is used, for example, to exclude
+//    root increments from the use set of the primary IV.
+//
+// 2. Instructions in the set of final instructions are added to the use set
+//    if they are users, but their users are not added. This is used, for
+//    example, to prevent a reduction update from forcing all later reduction
+//    updates into the use set.
+void LoopReroll::collectInLoopUserSet(Loop *L,
+  Instruction *Root, const SmallInstructionSet &Exclude,
+  const SmallInstructionSet &Final,
+  DenseSet<Instruction *> &Users) {
+  SmallInstructionVector Queue(1, Root);
+  while (!Queue.empty()) {
+    Instruction *I = Queue.pop_back_val();
+    if (!Users.insert(I).second)
+      continue;
+
+    if (!Final.count(I))
+      for (Value::use_iterator UI = I->use_begin(),
+           UIE = I->use_end(); UI != UIE; ++UI) {
+        Instruction *User = cast<Instruction>(*UI);
+        if (PHINode *PN = dyn_cast<PHINode>(User)) {
+          // Ignore "wrap-around" uses to PHIs of this loop's header.
+          if (PN->getIncomingBlock(UI) == L->getHeader())
+            continue;
+        }
+
+        if (L->contains(User) && !Exclude.count(User)) {
+          Queue.push_back(User);
+        }
+      }
+
+    // We also want to collect single-user "feeder" values.
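+    // (Feeder values are in-loop operands with exactly one use; they exist
+    // only to feed this use graph, so they must be matched along with it.)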
+    for (User::op_iterator OI = I->op_begin(),
+         OIE = I->op_end(); OI != OIE; ++OI) {
+      if (Instruction *Op = dyn_cast<Instruction>(*OI))
+        if (Op->hasOneUse() && L->contains(Op) && !Exclude.count(Op) &&
+            !Final.count(Op))
+          Queue.push_back(Op);
+    }
+  }
+}
+
+// Collect all of the users of all of the provided root instructions (combined
+// into a single set).
+void LoopReroll::collectInLoopUserSet(Loop *L,
+  const SmallInstructionVector &Roots,
+  const SmallInstructionSet &Exclude,
+  const SmallInstructionSet &Final,
+  DenseSet<Instruction *> &Users) {
+  for (SmallInstructionVector::const_iterator I = Roots.begin(),
+       IE = Roots.end(); I != IE; ++I)
+    collectInLoopUserSet(L, *I, Exclude, Final, Users);
+}
+
+static bool isSimpleLoadStore(Instruction *I) {
+  if (LoadInst *LI = dyn_cast<LoadInst>(I))
+    return LI->isSimple();
+  if (StoreInst *SI = dyn_cast<StoreInst>(I))
+    return SI->isSimple();
+  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
+    return !MI->isVolatile();
+  return false;
+}
+
+// Recognize loops that are set up like this:
+//
+// %iv = phi [ (preheader, ...), (body, %iv.next) ]
+// %scaled.iv = mul %iv, scale
+// f(%scaled.iv)
+// %scaled.iv.1 = add %scaled.iv, 1
+// f(%scaled.iv.1)
+// %scaled.iv.2 = add %scaled.iv, 2
+// f(%scaled.iv.2)
+// %scaled.iv.scale_m_1 = add %scaled.iv, scale-1
+// f(%scaled.iv.scale_m_1)
+// ...
+// %iv.next = add %iv, 1
+// %cmp = icmp(%iv, ...)
+// br %cmp, header, exit
+//
+// and, if found, set IV = %scaled.iv, and add %iv.next to LoopIncs.
+bool LoopReroll::findScaleFromMul(Instruction *RealIV, uint64_t &Scale,
+                                  Instruction *&IV,
+                                  SmallInstructionVector &LoopIncs) {
+  // This is a special case: here we're looking for all uses (except for
+  // the increment) to be multiplied by a common factor. The increment must
+  // be by one. This is to capture loops like:
+  //   for (int i = 0; i < 500; ++i) {
+  //     foo(3*i); foo(3*i+1); foo(3*i+2);
+  //   }
+  if (RealIV->getNumUses() != 2)
+    return false;
+  const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(RealIV));
+  Instruction *User1 = cast<Instruction>(*RealIV->use_begin()),
+              *User2 = cast<Instruction>(*llvm::next(RealIV->use_begin()));
+  if (!SE->isSCEVable(User1->getType()) || !SE->isSCEVable(User2->getType()))
+    return false;
+  const SCEVAddRecExpr *User1SCEV =
+                         dyn_cast<SCEVAddRecExpr>(SE->getSCEV(User1)),
+                       *User2SCEV =
+                         dyn_cast<SCEVAddRecExpr>(SE->getSCEV(User2));
+  if (!User1SCEV || !User1SCEV->isAffine() ||
+      !User2SCEV || !User2SCEV->isAffine())
+    return false;
+
+  // We assume below that User1 is the scale multiply and User2 is the
+  // increment. If that assignment is backwards, swap them.
+  if (User1SCEV == RealIVSCEV->getPostIncExpr(*SE)) {
+    std::swap(User1, User2);
+    std::swap(User1SCEV, User2SCEV);
+  }
+
+  if (User2SCEV != RealIVSCEV->getPostIncExpr(*SE))
+    return false;
+  assert(User2SCEV->getStepRecurrence(*SE)->isOne() &&
+         "Invalid non-unit step for multiplicative scaling");
+  LoopIncs.push_back(User2);
+
+  if (const SCEVConstant *MulScale =
+      dyn_cast<SCEVConstant>(User1SCEV->getStepRecurrence(*SE))) {
+    // Make sure that both the start and step have the same multiplier.
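+    // (In other words, the scaled IV must be exactly MulScale * RealIV, with
+    // no additive offset; otherwise the roots would not correspond to
+    // %scaled.iv + 1, %scaled.iv + 2, etc.)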
+ if (RealIVSCEV->getStart()->getType() != MulScale->getType()) + return false; + if (SE->getMulExpr(RealIVSCEV->getStart(), MulScale) != + User1SCEV->getStart()) + return false; + + ConstantInt *MulScaleCI = MulScale->getValue(); + if (!MulScaleCI->uge(2) || MulScaleCI->uge(MaxInc)) + return false; + Scale = MulScaleCI->getZExtValue(); + IV = User1; + } else + return false; + + DEBUG(dbgs() << "LRR: Found possible scaling " << *User1 << "\n"); + return true; +} + +// Collect all root increments with respect to the provided induction variable +// (normally the PHI, but sometimes a multiply). A root increment is an +// instruction, normally an add, with a positive constant less than Scale. In a +// rerollable loop, each of these increments is the root of an instruction +// graph isomorphic to the others. Also, we collect the final induction +// increment (the increment equal to the Scale), and its users in LoopIncs. +bool LoopReroll::collectAllRoots(Loop *L, uint64_t Inc, uint64_t Scale, + Instruction *IV, + SmallVector<SmallInstructionVector, 32> &Roots, + SmallInstructionSet &AllRoots, + SmallInstructionVector &LoopIncs) { + for (Value::use_iterator UI = IV->use_begin(), + UIE = IV->use_end(); UI != UIE; ++UI) { + Instruction *User = cast<Instruction>(*UI); + if (!SE->isSCEVable(User->getType())) + continue; + if (User->getType() != IV->getType()) + continue; + if (!L->contains(User)) + continue; + if (hasUsesOutsideLoop(User, L)) + continue; + + if (const SCEVConstant *Diff = dyn_cast<SCEVConstant>(SE->getMinusSCEV( + SE->getSCEV(User), SE->getSCEV(IV)))) { + uint64_t Idx = Diff->getValue()->getValue().getZExtValue(); + if (Idx > 0 && Idx < Scale) { + Roots[Idx-1].push_back(User); + AllRoots.insert(User); + } else if (Idx == Scale && Inc > 1) { + LoopIncs.push_back(User); + } + } + } + + if (Roots[0].empty()) + return false; + bool AllSame = true; + for (unsigned i = 1; i < Scale-1; ++i) + if (Roots[i].size() != Roots[0].size()) { + AllSame = false; + break; + } + + if (!AllSame) + return false; + + return true; +} + +// Validate the selected reductions. All iterations must have an isomorphic +// part of the reduction chain and, for non-associative reductions, the chain +// entries must appear in order. +bool LoopReroll::ReductionTracker::validateSelected() { + // For a non-associative reduction, the chain entries must appear in order. + for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end(); + RI != RIE; ++RI) { + int i = *RI; + int PrevIter = 0, BaseCount = 0, Count = 0; + for (SimpleLoopReduction::iterator J = PossibleReds[i].begin(), + JE = PossibleReds[i].end(); J != JE; ++J) { + // Note that all instructions in the chain must have been found because + // all instructions in the function must have been assigned to some + // iteration. + int Iter = PossibleRedIter[*J]; + if (Iter != PrevIter && Iter != PrevIter + 1 && + !PossibleReds[i].getReducedValue()->isAssociative()) { + DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " << + *J << "\n"); + return false; + } + + if (Iter != PrevIter) { + if (Count != BaseCount) { + DEBUG(dbgs() << "LRR: Iteration " << PrevIter << + " reduction use count " << Count << + " is not equal to the base use count " << + BaseCount << "\n"); + return false; + } + + Count = 0; + } + + ++Count; + if (Iter == 0) + ++BaseCount; + + PrevIter = Iter; + } + } + + return true; +} + +// For all selected reductions, remove all parts except those in the first +// iteration (and the PHI). 
Replace outside uses of the reduced value with uses
+// of the first-iteration reduced value (in other words, reroll the selected
+// reductions).
+void LoopReroll::ReductionTracker::replaceSelected() {
+  // Fixup reductions to refer to the last instruction associated with the
+  // first iteration (not the last).
+  for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end();
+       RI != RIE; ++RI) {
+    int i = *RI;
+    int j = 0;
+    for (int e = PossibleReds[i].size(); j != e; ++j)
+      if (PossibleRedIter[PossibleReds[i][j]] != 0) {
+        --j;
+        break;
+      }
+
+    // Replace users with the new end-of-chain value.
+    SmallInstructionVector Users;
+    for (Value::use_iterator UI =
+         PossibleReds[i].getReducedValue()->use_begin(),
+         UIE = PossibleReds[i].getReducedValue()->use_end(); UI != UIE; ++UI)
+      Users.push_back(cast<Instruction>(*UI));
+
+    for (SmallInstructionVector::iterator J = Users.begin(),
+         JE = Users.end(); J != JE; ++J)
+      (*J)->replaceUsesOfWith(PossibleReds[i].getReducedValue(),
+                              PossibleReds[i][j]);
+  }
+}
+
+// Reroll the provided loop with respect to the provided induction variable.
+// Generally, we're looking for a loop like this:
+//
+// %iv = phi [ (preheader, ...), (body, %iv.next) ]
+// f(%iv)
+// %iv.1 = add %iv, 1 <-- a root increment
+// f(%iv.1)
+// %iv.2 = add %iv, 2 <-- a root increment
+// f(%iv.2)
+// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment
+// f(%iv.scale_m_1)
+// ...
+// %iv.next = add %iv, scale
+// %cmp = icmp(%iv, ...)
+// br %cmp, header, exit
+//
+// Notably, we do not require that f(%iv), f(%iv.1), etc. be isolated groups of
+// instructions. In other words, the instructions in f(%iv), f(%iv.1), etc. can
+// be intermixed with each other. The restriction imposed by this algorithm is
+// that the relative order of the isomorphic instructions in f(%iv), f(%iv.1),
+// etc. be the same.
+//
+// First, we collect the use set of %iv, excluding the other increment roots.
+// This gives us f(%iv). Then we iterate over the loop instructions (scale-1)
+// times, having collected the use set of f(%iv.(i+1)), during which we:
+//   - Ensure that the next unmatched instruction in f(%iv) is isomorphic to
+//     the next unmatched instruction in f(%iv.(i+1)).
+//   - Ensure that both matched instructions don't have any external users
+//     (with the exception of last-in-chain reduction instructions).
+//   - Track the (aliasing) write set, and other side effects, of all
+//     instructions that belong to future iterations that come before the
+//     matched instructions. If the matched instructions read from that write
+//     set, then f(%iv) or f(%iv.(i+1)) has some dependency on instructions in
+//     f(%iv.(j+1)) for some j > i, and we cannot reroll the loop. Similarly,
+//     if any of these future instructions have side effects (could not be
+//     speculatively executed), and so do the matched instructions, then we
+//     cannot reorder those side-effect-producing instructions, and rerolling
+//     fails.
+//
+// Finally, we make sure that all loop instructions are either loop increment
+// roots, belong to simple latch code, are parts of validated reductions, or
+// are part of f(%iv) or of some f(%iv.i). If all of that is true (and all
+// reductions have been validated), then we reroll the loop.
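+//
+// For example, with scale 2, write the base iteration's instructions as
+// A1 B1 and the next iteration's as A2 B2. The interleavings A1 B1 A2 B2 and
+// A1 A2 B1 B2 can both be matched, but A1 B2 B1 A2 cannot: within f(%iv.1)
+// the relative order (B2 before A2) differs from the base order.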
+bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
+                        const SCEV *IterCount,
+                        ReductionTracker &Reductions) {
+  const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(IV));
+  uint64_t Inc = cast<SCEVConstant>(RealIVSCEV->getOperand(1))->
+                   getValue()->getZExtValue();
+  // The collection of loop increment instructions.
+  SmallInstructionVector LoopIncs;
+  uint64_t Scale = Inc;
+
+  // The effective induction variable, IV, is normally also the real induction
+  // variable. When we're dealing with a loop like:
+  //   for (int i = 0; i < 500; ++i) {
+  //     x[3*i] = ...;
+  //     x[3*i+1] = ...;
+  //     x[3*i+2] = ...;
+  //   }
+  // then the real IV is still i, but the effective IV is (3*i).
+  Instruction *RealIV = IV;
+  if (Inc == 1 && !findScaleFromMul(RealIV, Scale, IV, LoopIncs))
+    return false;
+
+  assert(Scale <= MaxInc && "Scale is too large");
+  assert(Scale > 1 && "Scale must be at least 2");
+
+  // The set of increment instructions for each increment value.
+  SmallVector<SmallInstructionVector, 32> Roots(Scale-1);
+  SmallInstructionSet AllRoots;
+  if (!collectAllRoots(L, Inc, Scale, IV, Roots, AllRoots, LoopIncs))
+    return false;
+
+  DEBUG(dbgs() << "LRR: Found all root induction increments for: " <<
+        *RealIV << "\n");
+
+  // An array of just the possible reductions for this scale factor. When we
+  // collect the set of all users of some root instructions, these reduction
+  // instructions are treated as 'final' (their uses are not considered).
+  // This is important because we don't want the root use set to search down
+  // the reduction chain.
+  SmallInstructionSet PossibleRedSet;
+  SmallInstructionSet PossibleRedLastSet, PossibleRedPHISet;
+  Reductions.restrictToScale(Scale, PossibleRedSet, PossibleRedPHISet,
+                             PossibleRedLastSet);
+
+  // We now need to check for equivalence of the use graph of each root with
+  // that of the primary induction variable (excluding the roots). Our goal
+  // here is not to solve the full graph isomorphism problem, but rather to
+  // catch common cases without a lot of work. As a result, we will assume
+  // that the relative order of the instructions in each unrolled iteration
+  // is the same (although we will not make an assumption about how the
+  // different iterations are intermixed). Note that while the order must be
+  // the same, the instructions may not be in the same basic block.
+  SmallInstructionSet Exclude(AllRoots);
+  Exclude.insert(LoopIncs.begin(), LoopIncs.end());
+
+  DenseSet<Instruction *> BaseUseSet;
+  collectInLoopUserSet(L, IV, Exclude, PossibleRedSet, BaseUseSet);
+
+  DenseSet<Instruction *> AllRootUses;
+  std::vector<DenseSet<Instruction *> > RootUseSets(Scale-1);
+
+  bool MatchFailed = false;
+  for (unsigned i = 0; i < Scale-1 && !MatchFailed; ++i) {
+    DenseSet<Instruction *> &RootUseSet = RootUseSets[i];
+    collectInLoopUserSet(L, Roots[i], SmallInstructionSet(),
+                         PossibleRedSet, RootUseSet);
+
+    DEBUG(dbgs() << "LRR: base use set size: " << BaseUseSet.size() <<
+          " vs. iteration increment " << (i+1) <<
+          " use set size: " << RootUseSet.size() << "\n");
+
+    if (BaseUseSet.size() != RootUseSet.size()) {
+      MatchFailed = true;
+      break;
+    }
+
+    // In addition to regular aliasing information, we need to look for
+    // instructions from later (future) iterations that have side effects
+    // preventing us from reordering them past other instructions with side
+    // effects.
+    bool FutureSideEffects = false;
+    AliasSetTracker AST(*AA);
+
+    // The map between instructions in f(%iv.(i+1)) and f(%iv).
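+    // (Used below to translate each matched instruction's operands back to
+    // their base-iteration equivalents before comparing them.)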
+    DenseMap<Value *, Value *> BaseMap;
+
+    assert(L->getNumBlocks() == 1 && "Cannot handle multi-block loops");
+    for (BasicBlock::iterator J1 = Header->begin(), J2 = Header->begin(),
+         JE = Header->end(); J1 != JE && !MatchFailed; ++J1) {
+      if (cast<Instruction>(J1) == RealIV)
+        continue;
+      if (cast<Instruction>(J1) == IV)
+        continue;
+      if (!BaseUseSet.count(J1))
+        continue;
+      if (PossibleRedPHISet.count(J1)) // Skip reduction PHIs.
+        continue;
+
+      while (J2 != JE && (!RootUseSet.count(J2) ||
+             std::find(Roots[i].begin(), Roots[i].end(), J2) !=
+             Roots[i].end())) {
+        // As we iterate through the instructions, instructions that don't
+        // belong to previous iterations (or the base case), must belong to
+        // future iterations. We want to track the alias set of writes from
+        // previous iterations.
+        if (!isa<PHINode>(J2) && !BaseUseSet.count(J2) &&
+            !AllRootUses.count(J2)) {
+          if (J2->mayWriteToMemory())
+            AST.add(J2);
+
+          // Note: This is specifically guarded by a check on isa<PHINode>,
+          // which, while a valid (somewhat arbitrary) micro-optimization, is
+          // needed because otherwise isSafeToSpeculativelyExecute returns
+          // false on PHI nodes.
+          if (!isSimpleLoadStore(J2) && !isSafeToSpeculativelyExecute(J2, DL))
+            FutureSideEffects = true;
+        }
+
+        ++J2;
+      }
+
+      if (!J1->isSameOperationAs(J2)) {
+        DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
+              " vs. " << *J2 << "\n");
+        MatchFailed = true;
+        break;
+      }
+
+      // Make sure that this instruction, which is in the use set of this
+      // root instruction, does not also belong to the base set or the set of
+      // some previous root instruction.
+      if (BaseUseSet.count(J2) || AllRootUses.count(J2)) {
+        DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
+              " vs. " << *J2 << " (prev. case overlap)\n");
+        MatchFailed = true;
+        break;
+      }
+
+      // Make sure that we don't alias with any instruction in the alias set
+      // tracker. If we do, then we depend on a future iteration, and we
+      // can't reroll.
+      if (J2->mayReadFromMemory()) {
+        for (AliasSetTracker::iterator K = AST.begin(), KE = AST.end();
+             K != KE && !MatchFailed; ++K) {
+          if (K->aliasesUnknownInst(J2, *AA)) {
+            DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
+                  " vs. " << *J2 << " (depends on future store)\n");
+            MatchFailed = true;
+            break;
+          }
+        }
+      }
+
+      // If we've passed an instruction from a future iteration that may have
+      // side effects, and this instruction might also, then we can't reorder
+      // them, and this matching fails. As an exception, we allow the alias
+      // set tracker to handle regular (simple) load/store dependencies.
+      if (FutureSideEffects &&
+          ((!isSimpleLoadStore(J1) && !isSafeToSpeculativelyExecute(J1)) ||
+           (!isSimpleLoadStore(J2) && !isSafeToSpeculativelyExecute(J2)))) {
+        DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
+              " vs. " << *J2 <<
+              " (side effects prevent reordering)\n");
+        MatchFailed = true;
+        break;
+      }
+
+      // For instructions that are part of a reduction, if the operation is
+      // associative, then don't bother matching the operands (because we
+      // already know that the instructions are isomorphic, and the order
+      // within the iteration does not matter). For non-associative reductions,
+      // we do need to match the operands, because we need to reject
+      // out-of-order instructions within an iteration!
+      // For example (assume floating-point addition), we need to reject this:
+      //   x += a[i]; x += b[i];
+      //   x += a[i+1]; x += b[i+1];
+      //   x += b[i+2]; x += a[i+2];
+      bool InReduction = Reductions.isPairInSame(J1, J2);
+
+      if (!(InReduction && J1->isAssociative())) {
+        bool Swapped = false, SomeOpMatched = false;
+        for (unsigned j = 0; j < J1->getNumOperands() && !MatchFailed; ++j) {
+          Value *Op2 = J2->getOperand(j);
+
+          // If this is part of a reduction (and the operation is not
+          // associative), then we match all operands, but not those that are
+          // part of the reduction.
+          if (InReduction)
+            if (Instruction *Op2I = dyn_cast<Instruction>(Op2))
+              if (Reductions.isPairInSame(J2, Op2I))
+                continue;
+
+          DenseMap<Value *, Value *>::iterator BMI = BaseMap.find(Op2);
+          if (BMI != BaseMap.end())
+            Op2 = BMI->second;
+          else if (std::find(Roots[i].begin(), Roots[i].end(),
+                             (Instruction*) Op2) != Roots[i].end())
+            Op2 = IV;
+
+          if (J1->getOperand(Swapped ? unsigned(!j) : j) != Op2) {
+            // If we've not already decided to swap the matched operands, and
+            // we've not already matched our first operand (note that we could
+            // have skipped matching the first operand because it is part of a
+            // reduction above), and the instruction is commutative, then try
+            // the swapped match.
+            if (!Swapped && J1->isCommutative() && !SomeOpMatched &&
+                J1->getOperand(!j) == Op2) {
+              Swapped = true;
+            } else {
+              DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
+                    " vs. " << *J2 << " (operand " << j << ")\n");
+              MatchFailed = true;
+              break;
+            }
+          }
+
+          SomeOpMatched = true;
+        }
+      }
+
+      if ((!PossibleRedLastSet.count(J1) && hasUsesOutsideLoop(J1, L)) ||
+          (!PossibleRedLastSet.count(J2) && hasUsesOutsideLoop(J2, L))) {
+        DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
+              " vs. " << *J2 << " (uses outside loop)\n");
+        MatchFailed = true;
+        break;
+      }
+
+      if (!MatchFailed)
+        BaseMap.insert(std::pair<Value *, Value *>(J2, J1));
+
+      AllRootUses.insert(J2);
+      Reductions.recordPair(J1, J2, i+1);
+
+      ++J2;
+    }
+  }
+
+  if (MatchFailed)
+    return false;
+
+  DEBUG(dbgs() << "LRR: Matched all iteration increments for " <<
+        *RealIV << "\n");
+
+  DenseSet<Instruction *> LoopIncUseSet;
+  collectInLoopUserSet(L, LoopIncs, SmallInstructionSet(),
+                       SmallInstructionSet(), LoopIncUseSet);
+  DEBUG(dbgs() << "LRR: Loop increment set size: " <<
+        LoopIncUseSet.size() << "\n");
+
+  // Make sure that all instructions in the loop have been included in some
+  // use set.
+  for (BasicBlock::iterator J = Header->begin(), JE = Header->end();
+       J != JE; ++J) {
+    if (isa<DbgInfoIntrinsic>(J))
+      continue;
+    if (cast<Instruction>(J) == RealIV)
+      continue;
+    if (cast<Instruction>(J) == IV)
+      continue;
+    if (BaseUseSet.count(J) || AllRootUses.count(J) ||
+        (LoopIncUseSet.count(J) && (J->isTerminator() ||
+                                    isSafeToSpeculativelyExecute(J, DL))))
+      continue;
+
+    if (AllRoots.count(J))
+      continue;
+
+    if (Reductions.isSelectedPHI(J))
+      continue;
+
+    DEBUG(dbgs() << "LRR: aborting reroll based on " << *RealIV <<
+          " unprocessed instruction found: " << *J << "\n");
+    MatchFailed = true;
+    break;
+  }
+
+  if (MatchFailed)
+    return false;
+
+  DEBUG(dbgs() << "LRR: all instructions processed from " <<
+        *RealIV << "\n");
+
+  if (!Reductions.validateSelected())
+    return false;
+
+  // At this point, we've validated the rerolling, and we're committed to
+  // making changes!
+
+  Reductions.replaceSelected();
+
+  // Remove instructions associated with non-base iterations.
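+  // Walking the block in reverse visits uses before their definitions, so
+  // each dead instruction can be erased as soon as it is seen.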
+ for (BasicBlock::reverse_iterator J = Header->rbegin(); + J != Header->rend();) { + if (AllRootUses.count(&*J)) { + Instruction *D = &*J; + DEBUG(dbgs() << "LRR: removing: " << *D << "\n"); + D->eraseFromParent(); + continue; + } + + ++J; + } + + // Insert the new induction variable. + const SCEV *Start = RealIVSCEV->getStart(); + if (Inc == 1) + Start = SE->getMulExpr(Start, + SE->getConstant(Start->getType(), Scale)); + const SCEVAddRecExpr *H = + cast<SCEVAddRecExpr>(SE->getAddRecExpr(Start, + SE->getConstant(RealIVSCEV->getType(), 1), + L, SCEV::FlagAnyWrap)); + { // Limit the lifetime of SCEVExpander. + SCEVExpander Expander(*SE, "reroll"); + PHINode *NewIV = + cast<PHINode>(Expander.expandCodeFor(H, IV->getType(), + Header->begin())); + for (DenseSet<Instruction *>::iterator J = BaseUseSet.begin(), + JE = BaseUseSet.end(); J != JE; ++J) + (*J)->replaceUsesOfWith(IV, NewIV); + + if (BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator())) { + if (LoopIncUseSet.count(BI)) { + const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE); + if (Inc == 1) + ICSCEV = + SE->getMulExpr(ICSCEV, SE->getConstant(ICSCEV->getType(), Scale)); + Value *IC; + if (isa<SCEVConstant>(ICSCEV)) { + IC = Expander.expandCodeFor(ICSCEV, NewIV->getType(), BI); + } else { + BasicBlock *Preheader = L->getLoopPreheader(); + if (!Preheader) + Preheader = InsertPreheaderForLoop(L, this); + + IC = Expander.expandCodeFor(ICSCEV, NewIV->getType(), + Preheader->getTerminator()); + } + + Value *NewIVNext = NewIV->getIncomingValueForBlock(Header); + Value *Cond = new ICmpInst(BI, CmpInst::ICMP_EQ, NewIVNext, IC, + "exitcond"); + BI->setCondition(Cond); + + if (BI->getSuccessor(1) != Header) + BI->swapSuccessors(); + } + } + } + + SimplifyInstructionsInBlock(Header, DL, TLI); + DeleteDeadPHIs(Header, TLI); + ++NumRerolledLoops; + return true; +} + +bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) { + AA = &getAnalysis<AliasAnalysis>(); + LI = &getAnalysis<LoopInfo>(); + SE = &getAnalysis<ScalarEvolution>(); + TLI = &getAnalysis<TargetLibraryInfo>(); + DL = getAnalysisIfAvailable<DataLayout>(); + DT = &getAnalysis<DominatorTree>(); + + BasicBlock *Header = L->getHeader(); + DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() << + "] Loop %" << Header->getName() << " (" << + L->getNumBlocks() << " block(s))\n"); + + bool Changed = false; + + // For now, we'll handle only single BB loops. + if (L->getNumBlocks() > 1) + return Changed; + + if (!SE->hasLoopInvariantBackedgeTakenCount(L)) + return Changed; + + const SCEV *LIBETC = SE->getBackedgeTakenCount(L); + const SCEV *IterCount = + SE->getAddExpr(LIBETC, SE->getConstant(LIBETC->getType(), 1)); + DEBUG(dbgs() << "LRR: iteration count = " << *IterCount << "\n"); + + // First, we need to find the induction variable with respect to which we can + // reroll (there may be several possible options). + SmallInstructionVector PossibleIVs; + collectPossibleIVs(L, PossibleIVs); + + if (PossibleIVs.empty()) { + DEBUG(dbgs() << "LRR: No possible IVs found\n"); + return Changed; + } + + ReductionTracker Reductions; + collectPossibleReductions(L, Reductions); + + // For each possible IV, collect the associated possible set of 'root' nodes + // (i+1, i+2, etc.). 
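+  // We stop at the first IV that rerolls successfully: rerolling rewrites
+  // the loop body, so any remaining candidates would be stale.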
+ for (SmallInstructionVector::iterator I = PossibleIVs.begin(), + IE = PossibleIVs.end(); I != IE; ++I) + if (reroll(*I, L, Header, IterCount, Reductions)) { + Changed = true; + break; + } + + return Changed; +} + diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 14cb979..eff5268 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -1170,6 +1170,13 @@ public: /// may be used. bool AllFixupsOutsideLoop; + /// RigidFormula is set to true to guarantee that this use will be associated + /// with a single formula--the one that initially matched. Some SCEV + /// expressions cannot be expanded. This allows LSR to consider the registers + /// used by those expressions without the need to expand them later after + /// changing the formula. + bool RigidFormula; + /// WidestFixupType - This records the widest use type for any fixup using /// this LSRUse. FindUseWithSimilarFormula can't consider uses with different /// max fixup widths to be equivalent, because the narrower one may be relying @@ -1188,6 +1195,7 @@ public: MinOffset(INT64_MAX), MaxOffset(INT64_MIN), AllFixupsOutsideLoop(true), + RigidFormula(false), WidestFixupType(0) {} bool HasFormulaWithSameRegs(const Formula &F) const; @@ -1214,6 +1222,9 @@ bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const { /// InsertFormula - If the given formula has not yet been inserted, add it to /// the list, and return true. Return false otherwise. bool LSRUse::InsertFormula(const Formula &F) { + if (!Formulae.empty() && RigidFormula) + return false; + SmallVector<const SCEV *, 4> Key = F.BaseRegs; if (F.ScaledReg) Key.push_back(F.ScaledReg); // Unstable sort by host order ok, because this is only used for uniquifying. @@ -1433,7 +1444,7 @@ static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, } case LSRUse::ICmpZero: // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg. - // Therefore, return 0 in case F.Scale == -1. + // Therefore, return 0 in case F.Scale == -1. return F.Scale != -1; case LSRUse::Basic: @@ -2943,7 +2954,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { // x == y --> x - y == 0 const SCEV *N = SE.getSCEV(NV); - if (SE.isLoopInvariant(N, L) && isSafeToExpand(N)) { + if (SE.isLoopInvariant(N, L) && isSafeToExpand(N, SE)) { // S is normalized, so normalize N before folding it into S // to keep the result normalized. N = TransformForPostIncUse(Normalize, N, CI, 0, @@ -2986,6 +2997,10 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { /// and loop-computable portions. void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) { + // Mark uses whose expressions cannot be expanded. + if (!isSafeToExpand(S, SE)) + LU.RigidFormula = true; + Formula F; F.InitialMatch(S, L, SE); bool Inserted = InsertFormula(LU, LUIdx, F); @@ -4353,6 +4368,8 @@ Value *LSRInstance::Expand(const LSRFixup &LF, SCEVExpander &Rewriter, SmallVectorImpl<WeakVH> &DeadInsts) const { const LSRUse &LU = Uses[LF.LUIdx]; + if (LU.RigidFormula) + return LF.OperandValToReplace; // Determine an input position which will be dominated by the operands and // which will dominate the result. 
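For context, both of the loop passes touched around this point are exposed through factory functions: createLoopRerollPass() is added by the LoopRerolling.cpp patch above, and the LoopUnrollPass.cpp diff that follows extends createLoopUnrollPass() with a fourth, Runtime parameter. Here is a minimal sketch of driving them from a 3.4-era legacy PassManager; the helper name addLoopPasses is illustrative and not part of the patch:

  #include "llvm/PassManager.h"
  #include "llvm/Transforms/Scalar.h"

  static void addLoopPasses(llvm::PassManager &PM) {
    // Factory added by the LoopRerolling.cpp patch above.
    PM.add(llvm::createLoopRerollPass());
    // All four unroller parameters keep the "-1 means use the default"
    // convention, so the new Runtime flag stays backward compatible.
    PM.add(llvm::createLoopUnrollPass(/*Threshold=*/-1, /*Count=*/-1,
                                      /*AllowPartial=*/-1, /*Runtime=*/-1));
  }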
diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index 80d060b..08ac38d 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -49,12 +49,17 @@ namespace { class LoopUnroll : public LoopPass { public: static char ID; // Pass ID, replacement for typeid - LoopUnroll(int T = -1, int C = -1, int P = -1) : LoopPass(ID) { + LoopUnroll(int T = -1, int C = -1, int P = -1, int R = -1) : LoopPass(ID) { CurrentThreshold = (T == -1) ? UnrollThreshold : unsigned(T); CurrentCount = (C == -1) ? UnrollCount : unsigned(C); CurrentAllowPartial = (P == -1) ? UnrollAllowPartial : (bool)P; + CurrentRuntime = (R == -1) ? UnrollRuntime : (bool)R; UserThreshold = (T != -1) || (UnrollThreshold.getNumOccurrences() > 0); + UserAllowPartial = (P != -1) || + (UnrollAllowPartial.getNumOccurrences() > 0); + UserRuntime = (R != -1) || (UnrollRuntime.getNumOccurrences() > 0); + UserCount = (C != -1) || (UnrollCount.getNumOccurrences() > 0); initializeLoopUnrollPass(*PassRegistry::getPassRegistry()); } @@ -75,7 +80,11 @@ namespace { unsigned CurrentCount; unsigned CurrentThreshold; bool CurrentAllowPartial; + bool CurrentRuntime; + bool UserCount; // CurrentCount is user-specified. bool UserThreshold; // CurrentThreshold is user-specified. + bool UserAllowPartial; // CurrentAllowPartial is user-specified. + bool UserRuntime; // CurrentRuntime is user-specified. bool runOnLoop(Loop *L, LPPassManager &LPM); @@ -110,8 +119,9 @@ INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false) -Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial) { - return new LoopUnroll(Threshold, Count, AllowPartial); +Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial, + int Runtime) { + return new LoopUnroll(Threshold, Count, AllowPartial, Runtime); } /// ApproximateLoopSize - Approximate the size of the loop. @@ -145,16 +155,24 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { << "] Loop %" << Header->getName() << "\n"); (void)Header; + TargetTransformInfo::UnrollingPreferences UP; + UP.Threshold = CurrentThreshold; + UP.OptSizeThreshold = OptSizeUnrollThreshold; + UP.Count = CurrentCount; + UP.Partial = CurrentAllowPartial; + UP.Runtime = CurrentRuntime; + TTI.getUnrollingPreferences(L, UP); + // Determine the current unrolling threshold. While this is normally set // from UnrollThreshold, it is overridden to a smaller value if the current // function is marked as optimize-for-size, and the unroll threshold was // not user specified. - unsigned Threshold = CurrentThreshold; + unsigned Threshold = UserThreshold ? CurrentThreshold : UP.Threshold; if (!UserThreshold && Header->getParent()->getAttributes(). hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize)) - Threshold = OptSizeUnrollThreshold; + Threshold = UP.OptSizeThreshold; // Find trip count and trip multiple if count is not available unsigned TripCount = 0; @@ -167,11 +185,14 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { TripCount = SE->getSmallConstantTripCount(L, LatchBlock); TripMultiple = SE->getSmallConstantTripMultiple(L, LatchBlock); } + + bool Runtime = UserRuntime ? CurrentRuntime : UP.Runtime; + // Use a default unroll-count if the user doesn't specify a value // and the trip count is a run-time value. The default is different // for run-time or compile-time trip count loops. 
- unsigned Count = CurrentCount; - if (UnrollRuntime && CurrentCount == 0 && TripCount == 0) + unsigned Count = UserCount ? CurrentCount : UP.Count; + if (Runtime && Count == 0 && TripCount == 0) Count = UnrollRuntimeCount; if (Count == 0) { @@ -204,7 +225,8 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { if (TripCount != 1 && Size > Threshold) { DEBUG(dbgs() << " Too large to fully unroll with count: " << Count << " because size: " << Size << ">" << Threshold << "\n"); - if (!CurrentAllowPartial && !(UnrollRuntime && TripCount == 0)) { + bool AllowPartial = UserAllowPartial ? CurrentAllowPartial : UP.Partial; + if (!AllowPartial && !(Runtime && TripCount == 0)) { DEBUG(dbgs() << " will not try to unroll partially because " << "-unroll-allow-partial not given\n"); return false; @@ -215,7 +237,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { while (Count != 0 && TripCount%Count != 0) Count--; } - else if (UnrollRuntime) { + else if (Runtime) { // Reduce unroll count to be a lower power-of-two value while (Count != 0 && Size > Threshold) { Count >>= 1; @@ -231,7 +253,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { } // Unroll the loop. - if (!UnrollLoop(L, Count, TripCount, UnrollRuntime, TripMultiple, LI, &LPM)) + if (!UnrollLoop(L, Count, TripCount, Runtime, TripMultiple, LI, &LPM)) return false; return true; diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index 59aff31..c4ebfd5 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -212,8 +212,6 @@ namespace { Instruction *InsertPt); void SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L); - void RemoveBlockIfDead(BasicBlock *BB, - std::vector<Instruction*> &Worklist, Loop *l); void RemoveLoopFromHierarchy(Loop *L); bool IsTrivialUnswitchCondition(Value *Cond, Constant **Val = 0, BasicBlock **LoopExit = 0); @@ -946,114 +944,6 @@ static void ReplaceUsesOfWith(Instruction *I, Value *V, ++NumSimplify; } -/// RemoveBlockIfDead - If the specified block is dead, remove it, update loop -/// information, and remove any dead successors it has. -/// -void LoopUnswitch::RemoveBlockIfDead(BasicBlock *BB, - std::vector<Instruction*> &Worklist, - Loop *L) { - if (pred_begin(BB) != pred_end(BB)) { - // This block isn't dead, since an edge to BB was just removed, see if there - // are any easy simplifications we can do now. - if (BasicBlock *Pred = BB->getSinglePredecessor()) { - // If it has one pred, fold phi nodes in BB. - while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) - ReplaceUsesOfWith(PN, PN->getIncomingValue(0), Worklist, L, LPM); - - // If this is the header of a loop and the only pred is the latch, we now - // have an unreachable loop. - if (Loop *L = LI->getLoopFor(BB)) - if (loopHeader == BB && L->contains(Pred)) { - // Remove the branch from the latch to the header block, this makes - // the header dead, which will make the latch dead (because the header - // dominates the latch). - LPM->deleteSimpleAnalysisValue(Pred->getTerminator(), L); - Pred->getTerminator()->eraseFromParent(); - new UnreachableInst(BB->getContext(), Pred); - - // The loop is now broken, remove it from LI. - RemoveLoopFromHierarchy(L); - - // Reprocess the header, which now IS dead. - RemoveBlockIfDead(BB, Worklist, L); - return; - } - - // If pred ends in a uncond branch, add uncond branch to worklist so that - // the two blocks will get merged. 
- if (BranchInst *BI = dyn_cast<BranchInst>(Pred->getTerminator())) - if (BI->isUnconditional()) - Worklist.push_back(BI); - } - return; - } - - DEBUG(dbgs() << "Nuking dead block: " << *BB); - - // Remove the instructions in the basic block from the worklist. - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { - RemoveFromWorklist(I, Worklist); - - // Anything that uses the instructions in this basic block should have their - // uses replaced with undefs. - // If I is not void type then replaceAllUsesWith undef. - // This allows ValueHandlers and custom metadata to adjust itself. - if (!I->getType()->isVoidTy()) - I->replaceAllUsesWith(UndefValue::get(I->getType())); - } - - // If this is the edge to the header block for a loop, remove the loop and - // promote all subloops. - if (Loop *BBLoop = LI->getLoopFor(BB)) { - if (BBLoop->getLoopLatch() == BB) { - RemoveLoopFromHierarchy(BBLoop); - if (currentLoop == BBLoop) { - currentLoop = 0; - redoLoop = false; - } - } - } - - // Remove the block from the loop info, which removes it from any loops it - // was in. - LI->removeBlock(BB); - - // Remove phi node entries in successors for this block. - TerminatorInst *TI = BB->getTerminator(); - SmallVector<BasicBlock*, 4> Succs; - for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) { - Succs.push_back(TI->getSuccessor(i)); - TI->getSuccessor(i)->removePredecessor(BB); - } - - // Unique the successors, remove anything with multiple uses. - array_pod_sort(Succs.begin(), Succs.end()); - Succs.erase(std::unique(Succs.begin(), Succs.end()), Succs.end()); - - // Remove the basic block, including all of the instructions contained in it. - LPM->deleteSimpleAnalysisValue(BB, L); - BB->eraseFromParent(); - // Remove successor blocks here that are not dead, so that we know we only - // have dead blocks in this list. Nondead blocks have a way of becoming dead, - // then getting removed before we revisit them, which is badness. - // - for (unsigned i = 0; i != Succs.size(); ++i) - if (pred_begin(Succs[i]) != pred_end(Succs[i])) { - // One exception is loop headers. If this block was the preheader for a - // loop, then we DO want to visit the loop so the loop gets deleted. - // We know that if the successor is a loop header, that this loop had to - // be the preheader: the case where this was the latch block was handled - // above and headers can only have two predecessors. - if (!LI->isLoopHeader(Succs[i])) { - Succs.erase(Succs.begin()+i); - --i; - } - } - - for (unsigned i = 0, e = Succs.size(); i != e; ++i) - RemoveBlockIfDead(Succs[i], Worklist, L); -} - /// RemoveLoopFromHierarchy - We have discovered that the specified loop has /// become unwrapped, either because the backedge was deleted, or because the /// edge into the header was removed. If the edge into the header from the @@ -1262,23 +1152,6 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { continue; } - if (ConstantInt *CB = dyn_cast<ConstantInt>(BI->getCondition())){ - // Conditional branch. Turn it into an unconditional branch, then - // remove dead blocks. - continue; // FIXME: Enable. 
- - DEBUG(dbgs() << "Folded branch: " << *BI); - BasicBlock *DeadSucc = BI->getSuccessor(CB->getZExtValue()); - BasicBlock *LiveSucc = BI->getSuccessor(!CB->getZExtValue()); - DeadSucc->removePredecessor(BI->getParent(), true); - Worklist.push_back(BranchInst::Create(LiveSucc, BI)); - LPM->deleteSimpleAnalysisValue(BI, L); - BI->eraseFromParent(); - RemoveFromWorklist(BI, Worklist); - ++NumSimplify; - - RemoveBlockIfDead(DeadSucc, Worklist, L); - } continue; } } diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 8f61ffd..9912d3d 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -170,14 +170,17 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &TD) const { // pessimize the llvm optimizer. // // Since we don't have perfect knowledge here, make some assumptions: assume - // the maximum GPR width is the same size as the pointer size and assume that - // this width can be stored. If so, check to see whether we will end up - // actually reducing the number of stores used. + // the maximum GPR width is the same size as the largest legal integer + // size. If so, check to see whether we will end up actually reducing the + // number of stores used. unsigned Bytes = unsigned(End-Start); - unsigned NumPointerStores = Bytes/TD.getPointerSize(); + unsigned MaxIntSize = TD.getLargestLegalIntTypeSize(); + if (MaxIntSize == 0) + MaxIntSize = 1; + unsigned NumPointerStores = Bytes / MaxIntSize; // Assume the remaining bytes if any are done a byte at a time. - unsigned NumByteStores = Bytes - NumPointerStores*TD.getPointerSize(); + unsigned NumByteStores = Bytes - NumPointerStores * MaxIntSize; // If we will reduce the # stores (according to this heuristic), do the // transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32 diff --git a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp new file mode 100644 index 0000000..15cee44 --- /dev/null +++ b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp @@ -0,0 +1,156 @@ +//===--- PartiallyInlineLibCalls.cpp - Partially inline libcalls ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass tries to partially inline the fast path of well-known library +// functions, such as using square-root instructions for cases where sqrt() +// does not need to set errno. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "partially-inline-libcalls" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +namespace { + class PartiallyInlineLibCalls : public FunctionPass { + public: + static char ID; + + PartiallyInlineLibCalls() : + FunctionPass(ID) { + initializePartiallyInlineLibCallsPass(*PassRegistry::getPassRegistry()); + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + virtual bool runOnFunction(Function &F); + + private: + /// Optimize calls to sqrt. 
+    bool optimizeSQRT(CallInst *Call, Function *CalledFunc,
+                      BasicBlock &CurrBB, Function::iterator &BB);
+  };
+
+  char PartiallyInlineLibCalls::ID = 0;
+}
+
+INITIALIZE_PASS(PartiallyInlineLibCalls, "partially-inline-libcalls",
+                "Partially inline calls to library functions", false, false)
+
+void PartiallyInlineLibCalls::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<TargetLibraryInfo>();
+  AU.addRequired<TargetTransformInfo>();
+  FunctionPass::getAnalysisUsage(AU);
+}
+
+bool PartiallyInlineLibCalls::runOnFunction(Function &F) {
+  bool Changed = false;
+  Function::iterator CurrBB;
+  TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
+  const TargetTransformInfo *TTI = &getAnalysis<TargetTransformInfo>();
+  for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE;) {
+    CurrBB = BB++;
+
+    for (BasicBlock::iterator II = CurrBB->begin(), IE = CurrBB->end();
+         II != IE; ++II) {
+      CallInst *Call = dyn_cast<CallInst>(&*II);
+      Function *CalledFunc;
+
+      if (!Call || !(CalledFunc = Call->getCalledFunction()))
+        continue;
+
+      // Skip if the function either has local linkage or is not a known
+      // library function.
+      LibFunc::Func LibFunc;
+      if (CalledFunc->hasLocalLinkage() || !CalledFunc->hasName() ||
+          !TLI->getLibFunc(CalledFunc->getName(), LibFunc))
+        continue;
+
+      switch (LibFunc) {
+      case LibFunc::sqrtf:
+      case LibFunc::sqrt:
+        if (TTI->haveFastSqrt(Call->getType()) &&
+            optimizeSQRT(Call, CalledFunc, *CurrBB, BB))
+          break;
+        continue;
+      default:
+        continue;
+      }
+
+      Changed = true;
+      break;
+    }
+  }
+
+  return Changed;
+}
+
+bool PartiallyInlineLibCalls::optimizeSQRT(CallInst *Call,
+                                           Function *CalledFunc,
+                                           BasicBlock &CurrBB,
+                                           Function::iterator &BB) {
+  // There is no need to change the IR, since the backend will emit a sqrt
+  // instruction if the call has already been marked read-only.
+  if (Call->onlyReadsMemory())
+    return false;
+
+  // Do the following transformation:
+  //
+  // (before)
+  // dst = sqrt(src)
+  //
+  // (after)
+  // v0 = sqrt_noreadmem(src) # native sqrt instruction.
+  // if (v0 is a NaN)
+  //   v1 = sqrt(src)         # library call.
+  // dst = phi(v0, v1)
+  //
+
+  // Move all instructions following Call to newly created block JoinBB.
+  // Create phi and replace all uses.
+  BasicBlock *JoinBB = llvm::SplitBlock(&CurrBB, Call->getNextNode(), this);
+  IRBuilder<> Builder(JoinBB, JoinBB->begin());
+  PHINode *Phi = Builder.CreatePHI(Call->getType(), 2);
+  Call->replaceAllUsesWith(Phi);
+
+  // Create basic block LibCallBB and insert a call to library function sqrt.
+  BasicBlock *LibCallBB = BasicBlock::Create(CurrBB.getContext(), "call.sqrt",
+                                             CurrBB.getParent(), JoinBB);
+  Builder.SetInsertPoint(LibCallBB);
+  Instruction *LibCall = Call->clone();
+  Builder.Insert(LibCall);
+  Builder.CreateBr(JoinBB);
+
+  // Add attribute "readnone" so that the backend can use a native sqrt
+  // instruction for this call. Insert a FP compare instruction and a
+  // conditional branch at the end of CurrBB.
+  Call->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone);
+  CurrBB.getTerminator()->eraseFromParent();
+  Builder.SetInsertPoint(&CurrBB);
+  Value *FCmp = Builder.CreateFCmpOEQ(Call, Call);
+  Builder.CreateCondBr(FCmp, JoinBB, LibCallBB);
+
+  // Add phi operands.
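+  // The fast-path result (the now-readnone Call) flows in from CurrBB; the
+  // slow-path result (LibCall) flows in from LibCallBB.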
+ Phi->addIncoming(Call, &CurrBB); + Phi->addIncoming(LibCall, LibCallBB); + + BB = JoinBB; + return true; +} + +FunctionPass *llvm::createPartiallyInlineLibCallsPass() { + return new PartiallyInlineLibCalls(); +} diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index 5c55143..9f3fc83 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -733,9 +733,9 @@ class AllocaPromoter : public LoadAndStorePromoter { SmallVector<DbgValueInst *, 4> DVIs; public: - AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S, + AllocaPromoter(const SmallVectorImpl<Instruction *> &Insts, SSAUpdater &S, AllocaInst &AI, DIBuilder &DIB) - : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {} + : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {} void run(const SmallVectorImpl<Instruction*> &Insts) { // Retain the debug information attached to the alloca for use when @@ -762,9 +762,30 @@ public: virtual bool isInstInList(Instruction *I, const SmallVectorImpl<Instruction*> &Insts) const { + Value *Ptr; if (LoadInst *LI = dyn_cast<LoadInst>(I)) - return LI->getOperand(0) == &AI; - return cast<StoreInst>(I)->getPointerOperand() == &AI; + Ptr = LI->getOperand(0); + else + Ptr = cast<StoreInst>(I)->getPointerOperand(); + + // Only used to detect cycles, which will be rare and quickly found as + // we're walking up a chain of defs rather than down through uses. + SmallPtrSet<Value *, 4> Visited; + + do { + if (Ptr == &AI) + return true; + + if (BitCastInst *BCI = dyn_cast<BitCastInst>(Ptr)) + Ptr = BCI->getOperand(0); + else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr)) + Ptr = GEPI->getPointerOperand(); + else + return false; + + } while (Visited.insert(Ptr)); + + return false; } virtual void updateDebugInfo(Instruction *Inst) const { @@ -917,6 +938,7 @@ static Type *findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E, uint64_t EndOffset) { Type *Ty = 0; + bool IgnoreNonIntegralTypes = false; for (AllocaSlices::const_iterator I = B; I != E; ++I) { Use *U = I->getUse(); if (isa<IntrinsicInst>(*U->getUser())) @@ -925,29 +947,37 @@ static Type *findCommonType(AllocaSlices::const_iterator B, continue; Type *UserTy = 0; - if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) + if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { UserTy = LI->getType(); - else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) + } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) { UserTy = SI->getValueOperand()->getType(); - else - return 0; // Bail if we have weird uses. + } else { + IgnoreNonIntegralTypes = true; // Give up on anything but an iN type. + continue; + } if (IntegerType *ITy = dyn_cast<IntegerType>(UserTy)) { // If the type is larger than the partition, skip it. We only encounter // this for split integer operations where we want to use the type of the - // entity causing the split. - if (ITy->getBitWidth() / 8 > (EndOffset - B->beginOffset())) + // entity causing the split. Also skip if the type is not a byte width + // multiple. + if (ITy->getBitWidth() % 8 != 0 || + ITy->getBitWidth() / 8 > (EndOffset - B->beginOffset())) continue; // If we have found an integer type use covering the alloca, use that - // regardless of the other types, as integers are often used for a - // "bucket - // of bits" type. + // regardless of the other types, as integers are often used for + // a "bucket of bits" type. 
+      //
+      // NB: This *must* be the only return from inside the loop so that the
+      // order of slices doesn't impact the computed type.
       return ITy;
+    } else if (IgnoreNonIntegralTypes) {
+      continue;
     }
     if (Ty && Ty != UserTy)
-      return 0;
+      IgnoreNonIntegralTypes = true; // Give up on anything but an iN type.
     Ty = UserTy;
   }
@@ -1431,6 +1461,10 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
   if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
     return false;
+  // We can convert pointers to integers and vice-versa. Same for vectors
+  // of pointers and integers.
+  OldTy = OldTy->getScalarType();
+  NewTy = NewTy->getScalarType();
   if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
     if (NewTy->isPointerTy() && OldTy->isPointerTy())
       return true;
@@ -1449,21 +1483,53 @@
 /// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
 /// two types for viability with this routine.
 static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
-                           Type *Ty) {
-  assert(canConvertValue(DL, V->getType(), Ty) &&
-         "Value not convertable to type");
-  if (V->getType() == Ty)
+                           Type *NewTy) {
+  Type *OldTy = V->getType();
+  assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertible to type");
+
+  if (OldTy == NewTy)
     return V;
-  if (IntegerType *OldITy = dyn_cast<IntegerType>(V->getType()))
-    if (IntegerType *NewITy = dyn_cast<IntegerType>(Ty))
+
+  if (IntegerType *OldITy = dyn_cast<IntegerType>(OldTy))
+    if (IntegerType *NewITy = dyn_cast<IntegerType>(NewTy))
       if (NewITy->getBitWidth() > OldITy->getBitWidth())
         return IRB.CreateZExt(V, NewITy);
-  if (V->getType()->isIntegerTy() && Ty->isPointerTy())
-    return IRB.CreateIntToPtr(V, Ty);
-  if (V->getType()->isPointerTy() && Ty->isIntegerTy())
-    return IRB.CreatePtrToInt(V, Ty);
-  return IRB.CreateBitCast(V, Ty);
+
+  // See if we need inttoptr for this type pair. A cast involving both scalars
+  // and vectors requires an additional bitcast.
+  if (OldTy->getScalarType()->isIntegerTy() &&
+      NewTy->getScalarType()->isPointerTy()) {
+    // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
+    if (OldTy->isVectorTy() && !NewTy->isVectorTy())
+      return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
+                                NewTy);
+
+    // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*>
+    if (!OldTy->isVectorTy() && NewTy->isVectorTy())
+      return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
+                                NewTy);
+
+    return IRB.CreateIntToPtr(V, NewTy);
+  }
+
+  // See if we need ptrtoint for this type pair. A cast involving both scalars
+  // and vectors requires an additional bitcast.
+  if (OldTy->getScalarType()->isPointerTy() &&
+      NewTy->getScalarType()->isIntegerTy()) {
+    // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128
+    if (OldTy->isVectorTy() && !NewTy->isVectorTy())
+      return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
+                               NewTy);
+
+    // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32>
+    if (!OldTy->isVectorTy() && NewTy->isVectorTy())
+      return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
+                               NewTy);
+
+    return IRB.CreatePtrToInt(V, NewTy);
+  }
+
+  return IRB.CreateBitCast(V, NewTy);
 }
 
 /// \brief Test whether the given slice use can be promoted to a vector.
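The mixed scalar/vector cases added to convertValue above always round-trip through the pointer-sized integer type. Here is a standalone sketch of the same cast chain using IRBuilder; the helper name vecToPtr and its parameters are illustrative, not part of the patch:

  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/IRBuilder.h"

  // Convert, e.g., a <2 x i32> value V to an i8* the way convertValue does:
  // bitcast to the pointer-sized integer first, then inttoptr.
  static llvm::Value *vecToPtr(llvm::IRBuilder<> &B, const llvm::DataLayout &DL,
                               llvm::Value *V, llvm::Type *PtrTy) {
    llvm::Value *AsInt = B.CreateBitCast(V, DL.getIntPtrType(PtrTy));
    return B.CreateIntToPtr(AsInt, PtrTy);
  }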
@@ -3364,12 +3430,12 @@ void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) { } static void enqueueUsersInWorklist(Instruction &I, - SmallVectorImpl<Use *> &UseWorklist, - SmallPtrSet<Use *, 8> &VisitedUses) { + SmallVectorImpl<Instruction *> &Worklist, + SmallPtrSet<Instruction *, 8> &Visited) { for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE; ++UI) - if (VisitedUses.insert(&UI.getUse())) - UseWorklist.push_back(&UI.getUse()); + if (Visited.insert(cast<Instruction>(*UI))) + Worklist.push_back(cast<Instruction>(*UI)); } /// \brief Promote the allocas, using the best available technique. @@ -3388,7 +3454,7 @@ bool SROA::promoteAllocas(Function &F) { if (DT && !ForceSSAUpdater) { DEBUG(dbgs() << "Promoting allocas with mem2reg...\n"); - PromoteMemToReg(PromotableAllocas, *DT, DL); + PromoteMemToReg(PromotableAllocas, *DT); PromotableAllocas.clear(); return true; } @@ -3396,29 +3462,29 @@ bool SROA::promoteAllocas(Function &F) { DEBUG(dbgs() << "Promoting allocas with SSAUpdater...\n"); SSAUpdater SSA; DIBuilder DIB(*F.getParent()); - SmallVector<Instruction*, 64> Insts; + SmallVector<Instruction *, 64> Insts; // We need a worklist to walk the uses of each alloca. - SmallVector<Use *, 8> UseWorklist; - SmallPtrSet<Use *, 8> VisitedUses; + SmallVector<Instruction *, 8> Worklist; + SmallPtrSet<Instruction *, 8> Visited; SmallVector<Instruction *, 32> DeadInsts; for (unsigned Idx = 0, Size = PromotableAllocas.size(); Idx != Size; ++Idx) { AllocaInst *AI = PromotableAllocas[Idx]; - UseWorklist.clear(); - VisitedUses.clear(); + Insts.clear(); + Worklist.clear(); + Visited.clear(); - enqueueUsersInWorklist(*AI, UseWorklist, VisitedUses); + enqueueUsersInWorklist(*AI, Worklist, Visited); - while (!UseWorklist.empty()) { - Use *U = UseWorklist.pop_back_val(); - Instruction &I = *cast<Instruction>(U->getUser()); + while (!Worklist.empty()) { + Instruction *I = Worklist.pop_back_val(); // FIXME: Currently the SSAUpdater infrastructure doesn't reason about // lifetime intrinsics and so we strip them (and the bitcasts+GEPs // leading to them) here. Eventually it should use them to optimize the // scalar values produced. - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { assert(II->getIntrinsicID() == Intrinsic::lifetime_start || II->getIntrinsicID() == Intrinsic::lifetime_end); II->eraseFromParent(); @@ -3428,12 +3494,12 @@ bool SROA::promoteAllocas(Function &F) { // Push the loads and stores we find onto the list. SROA will already // have validated that all loads and stores are viable candidates for // promotion. - if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { + if (LoadInst *LI = dyn_cast<LoadInst>(I)) { assert(LI->getType() == AI->getAllocatedType()); Insts.push_back(LI); continue; } - if (StoreInst *SI = dyn_cast<StoreInst>(&I)) { + if (StoreInst *SI = dyn_cast<StoreInst>(I)) { assert(SI->getValueOperand()->getType() == AI->getAllocatedType()); Insts.push_back(SI); continue; @@ -3442,11 +3508,10 @@ bool SROA::promoteAllocas(Function &F) { // For everything else, we know that only no-op bitcasts and GEPs will // make it this far, just recurse through them and recall them for later // removal. 
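The promoteAllocas rewrite above (the remainder of its loop follows below) switches from a use-based worklist to an instruction-based one: a visited set makes each user enqueue at most once, so diamond-shaped or cyclic use graphs terminate. Stripped to standard C++, the bare pattern is roughly:

#include <unordered_set>
#include <vector>

struct Node {
  std::vector<Node *> Users; // stands in for an instruction's use list
};

// Sketch of enqueueUsersInWorklist(): each user is pushed at most once.
static void enqueueUsers(Node &N, std::vector<Node *> &Worklist,
                         std::unordered_set<Node *> &Visited) {
  for (Node *U : N.Users)
    if (Visited.insert(U).second) // insert() reports whether U was new
      Worklist.push_back(U);
}

void walk(Node &Root) {
  std::vector<Node *> Worklist;
  std::unordered_set<Node *> Visited;
  enqueueUsers(Root, Worklist, Visited);
  while (!Worklist.empty()) {
    Node *N = Worklist.back();
    Worklist.pop_back();
    // ... classify N (load, store, intrinsic, ...) as the pass does, then
    // recurse through its users; Visited guarantees termination on cycles.
    enqueueUsers(*N, Worklist, Visited);
  }
}

int main() {
  Node A, B;
  A.Users.push_back(&B);
  B.Users.push_back(&A); // a cycle: still terminates thanks to Visited
  walk(A);
}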
- DeadInsts.push_back(&I); - enqueueUsersInWorklist(I, UseWorklist, VisitedUses); + DeadInsts.push_back(I); + enqueueUsersInWorklist(*I, Worklist, Visited); } AllocaPromoter(Insts, SSA, *AI, DIB).run(Insts); - Insts.clear(); while (!DeadInsts.empty()) DeadInsts.pop_back_val()->eraseFromParent(); AI->eraseFromParent(); diff --git a/lib/Transforms/Scalar/SampleProfile.cpp b/lib/Transforms/Scalar/SampleProfile.cpp new file mode 100644 index 0000000..9bcd702 --- /dev/null +++ b/lib/Transforms/Scalar/SampleProfile.cpp @@ -0,0 +1,479 @@ +//===- SampleProfile.cpp - Incorporate sample profiles into the IR --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the SampleProfileLoader transformation. This pass +// reads a profile file generated by a sampling profiler (e.g. Linux Perf - +// http://perf.wiki.kernel.org/) and generates IR metadata to reflect the +// profile information in the given profile. +// +// This pass generates branch weight annotations on the IR: +// +// - prof: Represents branch weights. This annotation is added to branches +// to indicate the weights of each edge coming out of the branch. +// The weight of each edge is the weight of the target block for +// that edge. The weight of a block B is computed as the maximum +// number of samples found in B. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "sample-profile" + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/OwningPtr.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/DebugInfo/DIContext.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/InstIterator.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Regex.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" + +using namespace llvm; + +// Command line option to specify the file to read samples from. This is +// mainly used for debugging. +static cl::opt<std::string> SampleProfileFile( + "sample-profile-file", cl::init(""), cl::value_desc("filename"), + cl::desc("Profile file loaded by -sample-profile"), cl::Hidden); + +namespace { +/// \brief Sample-based profile reader. +/// +/// Each profile contains sample counts for all the functions +/// executed. Inside each function, statements are annotated with the +/// collected samples on all the instructions associated with that +/// statement. +/// +/// For this to produce meaningful data, the program needs to be +/// compiled with some debug information (at minimum, line numbers: +/// -gline-tables-only). Otherwise, it will be impossible to match IR +/// instructions to the line numbers collected by the profiler. +/// +/// From the profile file, we are interested in collecting the +/// following information: +/// +/// * A list of functions included in the profile (mangled names). +/// +/// * For each function F: +/// 1. The total number of samples collected in F. +/// +/// 2. The samples collected at each line in F. 
To provide some +/// protection against source code shuffling, line numbers should +/// be relative to the start of the function. +class SampleProfile { +public: + SampleProfile(StringRef F) : Profiles(0), Filename(F) {} + + void dump(); + void loadText(); + void loadNative() { llvm_unreachable("not implemented"); } + bool emitAnnotations(Function &F); + void printFunctionProfile(raw_ostream &OS, StringRef FName); + void dumpFunctionProfile(StringRef FName); + +protected: + typedef DenseMap<uint32_t, uint32_t> BodySampleMap; + typedef DenseMap<BasicBlock *, uint32_t> BlockWeightMap; + + /// \brief Representation of the runtime profile for a function. + /// + /// This data structure contains the runtime profile for a given + /// function. It contains the total number of samples collected + /// in the function and a map of samples collected in every statement. + struct FunctionProfile { + /// \brief Total number of samples collected inside this function. + /// + /// Samples are cumulative, they include all the samples collected + /// inside this function and all its inlined callees. + unsigned TotalSamples; + + // \brief Total number of samples collected at the head of the function. + unsigned TotalHeadSamples; + + /// \brief Map line offsets to collected samples. + /// + /// Each entry in this map contains the number of samples + /// collected at the corresponding line offset. All line locations + /// are an offset from the start of the function. + BodySampleMap BodySamples; + + /// \brief Map basic blocks to their computed weights. + /// + /// The weight of a basic block is defined to be the maximum + /// of all the instruction weights in that block. + BlockWeightMap BlockWeights; + }; + + uint32_t getInstWeight(Instruction &I, unsigned FirstLineno, + BodySampleMap &BodySamples); + uint32_t computeBlockWeight(BasicBlock *B, unsigned FirstLineno, + BodySampleMap &BodySamples); + + /// \brief Map every function to its associated profile. + /// + /// The profile of every function executed at runtime is collected + /// in the structure FunctionProfile. This maps function objects + /// to their corresponding profiles. + StringMap<FunctionProfile> Profiles; + + /// \brief Path name to the file holding the profile data. + /// + /// The format of this file is defined by each profiler + /// independently. If possible, the profiler should have a text + /// version of the profile format to be used in constructing test + /// cases and debugging. + StringRef Filename; +}; + +/// \brief Loader class for text-based profiles. +/// +/// This class defines a simple interface to read text files containing +/// profiles. It keeps track of line number information and location of +/// the file pointer. Users of this class are responsible for actually +/// parsing the lines returned by the readLine function. +/// +/// TODO - This does not really belong here. It is a generic text file +/// reader. It should be moved to the Support library and made more general. +class ExternalProfileTextLoader { +public: + ExternalProfileTextLoader(StringRef F) : Filename(F) { + error_code EC; + EC = MemoryBuffer::getFile(Filename, Buffer); + if (EC) + report_fatal_error("Could not open profile file " + Filename + ": " + + EC.message()); + FP = Buffer->getBufferStart(); + Lineno = 0; + } + + /// \brief Read a line from the mapped file. 
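ExternalProfileTextLoader, whose readLine implementation follows, is essentially a cursor over a memory-mapped buffer plus a line counter. A self-contained approximation over a std::string buffer behaves the same way; the TextLoader name and in-memory input here are invented for the sketch:

#include <cstdio>
#include <string>
#include <utility>

// Minimal stand-in for the mapped-buffer reader: a cursor plus a line count.
class TextLoader {
  std::string Buffer; // stands in for the MemoryBuffer contents
  size_t Pos;
  long Lineno;

public:
  explicit TextLoader(std::string B)
      : Buffer(std::move(B)), Pos(0), Lineno(0) {}

  bool atEOF() const { return Pos == Buffer.size(); }

  // Return the next line without its trailing '\n' and advance the cursor.
  std::string readLine() {
    size_t Start = Pos;
    while (Pos != Buffer.size() && Buffer[Pos] != '\n')
      ++Pos;
    std::string Line = Buffer.substr(Start, Pos - Start);
    if (Pos != Buffer.size())
      ++Pos; // consume the newline itself
    ++Lineno;
    return Line;
  }

  long line() const { return Lineno; }
};

int main() {
  TextLoader L("symbol table\n2\nmain\nfoo\n");
  while (!L.atEOF()) {
    std::string Line = L.readLine();
    std::printf("%ld: %s\n", L.line(), Line.c_str());
  }
}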
+ StringRef readLine() {
+ size_t Length = 0;
+ const char *start = FP;
+ while (FP != Buffer->getBufferEnd() && *FP != '\n') {
+ Length++;
+ FP++;
+ }
+ if (FP != Buffer->getBufferEnd())
+ FP++;
+ Lineno++;
+ return StringRef(start, Length);
+ }
+
+ /// \brief Return true if we've reached EOF.
+ bool atEOF() const { return FP == Buffer->getBufferEnd(); }
+
+ /// \brief Report a parse error message and stop compilation.
+ void reportParseError(Twine Msg) const {
+ report_fatal_error(Filename + ":" + Twine(Lineno) + ": " + Msg + "\n");
+ }
+
+private:
+ /// \brief Memory buffer holding the text file.
+ OwningPtr<MemoryBuffer> Buffer;
+
+ /// \brief Current position into the memory buffer.
+ const char *FP;
+
+ /// \brief Current line number.
+ int64_t Lineno;
+
+ /// \brief Path name of the profile file.
+ StringRef Filename;
+};
+
+/// \brief Sample profile pass.
+///
+/// This pass reads profile data from the file specified by
+/// -sample-profile-file and annotates every affected function with the
+/// profile information found in that file.
+class SampleProfileLoader : public FunctionPass {
+public:
+ // Class identification, replacement for typeinfo
+ static char ID;
+
+ SampleProfileLoader(StringRef Name = SampleProfileFile)
+ : FunctionPass(ID), Profiler(0), Filename(Name) {
+ initializeSampleProfileLoaderPass(*PassRegistry::getPassRegistry());
+ }
+
+ virtual bool doInitialization(Module &M);
+
+ void dump() { Profiler->dump(); }
+
+ virtual const char *getPassName() const { return "Sample profile pass"; }
+
+ virtual bool runOnFunction(Function &F);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ }
+
+protected:
+ /// \brief Profile reader object.
+ OwningPtr<SampleProfile> Profiler;
+
+ /// \brief Name of the profile file to load.
+ StringRef Filename;
+};
+}
+
+/// \brief Print the function profile for \p FName on stream \p OS.
+///
+/// \param OS Stream to emit the output to.
+/// \param FName Name of the function to print.
+void SampleProfile::printFunctionProfile(raw_ostream &OS, StringRef FName) {
+ FunctionProfile FProfile = Profiles[FName];
+ OS << "Function: " << FName << ", " << FProfile.TotalSamples << ", "
+ << FProfile.TotalHeadSamples << ", " << FProfile.BodySamples.size()
+ << " sampled lines\n";
+ for (BodySampleMap::const_iterator SI = FProfile.BodySamples.begin(),
+ SE = FProfile.BodySamples.end();
+ SI != SE; ++SI)
+ OS << "\tline offset: " << SI->first
+ << ", number of samples: " << SI->second << "\n";
+ OS << "\n";
+}
+
+/// \brief Dump the function profile for \p FName.
+///
+/// \param FName Name of the function to print.
+void SampleProfile::dumpFunctionProfile(StringRef FName) {
+ printFunctionProfile(dbgs(), FName);
+}
+
+/// \brief Dump all the function profiles found.
+void SampleProfile::dump() {
+ for (StringMap<FunctionProfile>::const_iterator I = Profiles.begin(),
+ E = Profiles.end();
+ I != E; ++I)
+ dumpFunctionProfile(I->getKey());
+}
+
+/// \brief Load samples from a text file.
+///
+/// The file is divided into two segments:
+///
+/// Symbol table (represented with the string "symbol table")
+/// Number of symbols in the table
+/// symbol 1
+/// symbol 2
+/// ...
+/// symbol N
+///
+/// Function body profiles
+/// function1:total_samples:total_head_samples:number_of_locations
+/// location_offset_1: number_of_samples
+/// location_offset_2: number_of_samples
+/// ...
+/// location_offset_N: number_of_samples
+///
+/// Function names must be mangled in order for the profile loader to
+/// match them in the current translation unit.
+///
+/// Since this is a flat profile, a function that shows up more than
+/// once gets all its samples aggregated across all its instances.
+/// TODO - flat profiles are too imprecise to provide good optimization
+/// opportunities. Convert them to context-sensitive profiles.
+///
+/// This textual representation is useful to generate unit tests and
+/// for debugging purposes, but it should not be used to generate
+/// profiles for large programs, as the representation is extremely
+/// inefficient.
+void SampleProfile::loadText() {
+ ExternalProfileTextLoader Loader(Filename);
+
+ // Read the symbol table.
+ StringRef Line = Loader.readLine();
+ if (Line != "symbol table")
+ Loader.reportParseError("Expected 'symbol table', found " + Line);
+ int NumSymbols;
+ Line = Loader.readLine();
+ if (Line.getAsInteger(10, NumSymbols))
+ Loader.reportParseError("Expected a number, found " + Line);
+ for (int I = 0; I < NumSymbols; I++) {
+ StringRef FName = Loader.readLine();
+ FunctionProfile &FProfile = Profiles[FName];
+ FProfile.BodySamples.clear();
+ FProfile.TotalSamples = 0;
+ FProfile.TotalHeadSamples = 0;
+ }
+
+ // Read the profile of each function. Since each function may be
+ // mentioned more than once, and we are collecting flat profiles,
+ // accumulate samples as we parse them.
+ Regex HeadRE("^([^:]+):([0-9]+):([0-9]+):([0-9]+)$");
+ Regex LineSample("^([0-9]+): ([0-9]+)$");
+ while (!Loader.atEOF()) {
+ SmallVector<StringRef, 4> Matches;
+ Line = Loader.readLine();
+ if (!HeadRE.match(Line, &Matches))
+ Loader.reportParseError("Expected 'mangled_name:NUM:NUM:NUM', found " +
+ Line);
+ assert(Matches.size() == 5);
+ StringRef FName = Matches[1];
+ unsigned NumSamples, NumHeadSamples, NumSampledLines;
+ Matches[2].getAsInteger(10, NumSamples);
+ Matches[3].getAsInteger(10, NumHeadSamples);
+ Matches[4].getAsInteger(10, NumSampledLines);
+ FunctionProfile &FProfile = Profiles[FName];
+ FProfile.TotalSamples += NumSamples;
+ FProfile.TotalHeadSamples += NumHeadSamples;
+ BodySampleMap &SampleMap = FProfile.BodySamples;
+ unsigned I;
+ for (I = 0; I < NumSampledLines && !Loader.atEOF(); I++) {
+ Line = Loader.readLine();
+ if (!LineSample.match(Line, &Matches))
+ Loader.reportParseError("Expected 'NUM: NUM', found " + Line);
+ assert(Matches.size() == 3);
+ unsigned LineOffset, NumSamples;
+ Matches[1].getAsInteger(10, LineOffset);
+ Matches[2].getAsInteger(10, NumSamples);
+ SampleMap[LineOffset] += NumSamples;
+ }
+
+ if (I < NumSampledLines)
+ Loader.reportParseError("Unexpected end of file");
+ }
+}
+
+/// \brief Get the weight for an instruction.
+///
+/// The "weight" of an instruction \p Inst is the number of samples
+/// collected on that instruction at runtime. To retrieve it, we
+/// need to compute the line number of \p Inst relative to the start of its
+/// function. We use \p FirstLineno to compute the offset. We then
+/// look up the samples collected for \p Inst using \p BodySamples.
+///
+/// \param Inst Instruction to query.
+/// \param FirstLineno Line number of the first instruction in the function.
+/// \param BodySamples Map of relative source line locations to samples.
+///
+/// \returns The profiled weight of \p Inst.
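A tiny standalone parser for the body-line grammar documented above ("offset: samples", with samples accumulated because the profile is flat) can be written with std::regex in place of llvm::Regex; the names and input below are illustrative only. getInstWeight itself follows.

#include <iostream>
#include <map>
#include <regex>
#include <string>

int main() {
  // Accumulate "offset: samples" body lines into a per-offset map, mirroring
  // how loadText() folds repeated entries of a flat profile together.
  std::map<unsigned, unsigned> BodySamples;
  const std::regex LineSample("^([0-9]+): ([0-9]+)$");
  const std::string Input[] = {"1: 10", "3: 4", "1: 5"};

  for (const std::string &Line : Input) {
    std::smatch M;
    if (!std::regex_match(Line, M, LineSample)) {
      std::cerr << "expected 'NUM: NUM', found " << Line << "\n";
      return 1;
    }
    unsigned Offset = std::stoul(M[1].str());
    unsigned Samples = std::stoul(M[2].str());
    BodySamples[Offset] += Samples; // flat profile: samples aggregate
  }

  for (const auto &KV : BodySamples) // prints 1 -> 15, 3 -> 4
    std::cout << "line offset " << KV.first << " -> " << KV.second << "\n";
}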
+uint32_t SampleProfile::getInstWeight(Instruction &Inst, unsigned FirstLineno, + BodySampleMap &BodySamples) { + unsigned LOffset = Inst.getDebugLoc().getLine() - FirstLineno + 1; + return BodySamples.lookup(LOffset); +} + +/// \brief Compute the weight of a basic block. +/// +/// The weight of basic block \p B is the maximum weight of all the +/// instructions in B. +/// +/// \param B The basic block to query. +/// \param FirstLineno The line number for the first line in the +/// function holding B. +/// \param BodySamples The map containing all the samples collected in that +/// function. +/// +/// \returns The computed weight of B. +uint32_t SampleProfile::computeBlockWeight(BasicBlock *B, unsigned FirstLineno, + BodySampleMap &BodySamples) { + // If we've computed B's weight before, return it. + Function *F = B->getParent(); + FunctionProfile &FProfile = Profiles[F->getName()]; + std::pair<BlockWeightMap::iterator, bool> Entry = + FProfile.BlockWeights.insert(std::make_pair(B, 0)); + if (!Entry.second) + return Entry.first->second; + + // Otherwise, compute and cache B's weight. + uint32_t Weight = 0; + for (BasicBlock::iterator I = B->begin(), E = B->end(); I != E; ++I) { + uint32_t InstWeight = getInstWeight(*I, FirstLineno, BodySamples); + if (InstWeight > Weight) + Weight = InstWeight; + } + Entry.first->second = Weight; + return Weight; +} + +/// \brief Generate branch weight metadata for all branches in \p F. +/// +/// For every branch instruction B in \p F, we compute the weight of the +/// target block for each of the edges out of B. This is the weight +/// that we associate with that branch. +/// +/// TODO - This weight assignment will most likely be wrong if the +/// target branch has more than two predecessors. This needs to be done +/// using some form of flow propagation. +/// +/// Once all the branch weights are computed, we emit the MD_prof +/// metadata on B using the computed values. +/// +/// \param F The function to query. +bool SampleProfile::emitAnnotations(Function &F) { + bool Changed = false; + FunctionProfile &FProfile = Profiles[F.getName()]; + unsigned FirstLineno = inst_begin(F)->getDebugLoc().getLine(); + MDBuilder MDB(F.getContext()); + + // Clear the block weights cache. + FProfile.BlockWeights.clear(); + + // When we find a branch instruction: For each edge E out of the branch, + // the weight of E is the weight of the target block. 
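computeBlockWeight above relies on the insert-then-check idiom: BlockWeights.insert(std::make_pair(B, 0)) either returns the cached entry or reserves a zeroed slot that is filled in after the scan. The same memoization shape in isolation, with plain-C++ stand-ins for the LLVM types; the branch-annotation loop of emitAnnotations continues just below:

#include <algorithm>
#include <iostream>
#include <map>
#include <vector>

typedef std::vector<unsigned> Block; // per-instruction weights, illustrative

std::map<const Block *, unsigned> BlockWeights;

// Memoized max-of-instruction-weights, mirroring computeBlockWeight().
unsigned blockWeight(const Block &B) {
  auto Entry = BlockWeights.insert(std::make_pair(&B, 0u));
  if (!Entry.second)
    return Entry.first->second; // cache hit: weight was computed earlier

  unsigned Weight = 0;
  for (unsigned InstWeight : B)
    Weight = std::max(Weight, InstWeight);
  Entry.first->second = Weight; // fill in the slot reserved by insert()
  return Weight;
}

int main() {
  Block B = {3, 7, 2};
  std::cout << blockWeight(B) << " " << blockWeight(B) << "\n"; // "7 7"
}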
+ for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { + BasicBlock *B = I; + TerminatorInst *TI = B->getTerminator(); + if (TI->getNumSuccessors() == 1) + continue; + if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI)) + continue; + + SmallVector<uint32_t, 4> Weights; + unsigned NSuccs = TI->getNumSuccessors(); + for (unsigned I = 0; I < NSuccs; ++I) { + BasicBlock *Succ = TI->getSuccessor(I); + uint32_t Weight = + computeBlockWeight(Succ, FirstLineno, FProfile.BodySamples); + Weights.push_back(Weight); + } + + TI->setMetadata(llvm::LLVMContext::MD_prof, + MDB.createBranchWeights(Weights)); + Changed = true; + } + + return Changed; +} + +char SampleProfileLoader::ID = 0; +INITIALIZE_PASS(SampleProfileLoader, "sample-profile", "Sample Profile loader", + false, false) + +bool SampleProfileLoader::runOnFunction(Function &F) { + return Profiler->emitAnnotations(F); +} + +bool SampleProfileLoader::doInitialization(Module &M) { + Profiler.reset(new SampleProfile(Filename)); + Profiler->loadText(); + return true; +} + +FunctionPass *llvm::createSampleProfileLoaderPass() { + return new SampleProfileLoader(SampleProfileFile); +} + +FunctionPass *llvm::createSampleProfileLoaderPass(StringRef Name) { + return new SampleProfileLoader(Name); +} diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index 758334d..857597e 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -28,7 +28,7 @@ using namespace llvm; /// ScalarOpts library. void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeADCEPass(Registry); - initializeBlockPlacementPass(Registry); + initializeSampleProfileLoaderPass(Registry); initializeCodeGenPreparePass(Registry); initializeConstantPropagationPass(Registry); initializeCorrelatedValuePropagationPass(Registry); @@ -44,12 +44,14 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeLoopInstSimplifyPass(Registry); initializeLoopRotatePass(Registry); initializeLoopStrengthReducePass(Registry); + initializeLoopRerollPass(Registry); initializeLoopUnrollPass(Registry); initializeLoopUnswitchPass(Registry); initializeLoopIdiomRecognizePass(Registry); initializeLowerAtomicPass(Registry); initializeLowerExpectIntrinsicPass(Registry); initializeMemCpyOptPass(Registry); + initializePartiallyInlineLibCallsPass(Registry); initializeReassociatePass(Registry); initializeRegToMemPass(Registry); initializeSCCPPass(Registry); @@ -111,6 +113,10 @@ void LLVMAddLoopRotatePass(LLVMPassManagerRef PM) { unwrap(PM)->add(createLoopRotatePass()); } +void LLVMAddLoopRerollPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createLoopRerollPass()); +} + void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createLoopUnrollPass()); } @@ -123,6 +129,10 @@ void LLVMAddMemCpyOptPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createMemCpyOptPass()); } +void LLVMAddPartiallyInlineLibCallsPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createPartiallyInlineLibCallsPass()); +} + void LLVMAddPromoteMemoryToRegisterPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createPromoteMemoryToRegisterPass()); } diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp index 73b2edf..57b290e 100644 --- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -963,7 +963,7 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, if (SV->getType()->isFloatingPointTy() || SV->getType()->isVectorTy()) SV = Builder.CreateBitCast(SV, 
IntegerType::get(SV->getContext(),SrcWidth)); else if (SV->getType()->isPointerTy()) - SV = Builder.CreatePtrToInt(SV, TD.getIntPtrType(SV->getContext())); + SV = Builder.CreatePtrToInt(SV, TD.getIntPtrType(SV->getType())); // Zero extend or truncate the value if needed. if (SV->getType() != AllocaType) { @@ -1426,7 +1426,7 @@ bool SROA::performPromotion(Function &F) { if (Allocas.empty()) break; if (HasDomTree) - PromoteMemToReg(Allocas, *DT, TD); + PromoteMemToReg(Allocas, *DT); else { SSAUpdater SSA; for (unsigned i = 0, e = Allocas.size(); i != e; ++i) { diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp index 6d05640..8371f6d 100644 --- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -66,161 +66,6 @@ FunctionPass *llvm::createCFGSimplificationPass() { return new CFGSimplifyPass(); } -/// changeToUnreachable - Insert an unreachable instruction before the specified -/// instruction, making it and the rest of the code in the block dead. -static void changeToUnreachable(Instruction *I, bool UseLLVMTrap) { - BasicBlock *BB = I->getParent(); - // Loop over all of the successors, removing BB's entry from any PHI - // nodes. - for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) - (*SI)->removePredecessor(BB); - - // Insert a call to llvm.trap right before this. This turns the undefined - // behavior into a hard fail instead of falling through into random code. - if (UseLLVMTrap) { - Function *TrapFn = - Intrinsic::getDeclaration(BB->getParent()->getParent(), Intrinsic::trap); - CallInst *CallTrap = CallInst::Create(TrapFn, "", I); - CallTrap->setDebugLoc(I->getDebugLoc()); - } - new UnreachableInst(I->getContext(), I); - - // All instructions after this are dead. - BasicBlock::iterator BBI = I, BBE = BB->end(); - while (BBI != BBE) { - if (!BBI->use_empty()) - BBI->replaceAllUsesWith(UndefValue::get(BBI->getType())); - BB->getInstList().erase(BBI++); - } -} - -/// changeToCall - Convert the specified invoke into a normal call. -static void changeToCall(InvokeInst *II) { - SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3); - CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args, "", II); - NewCall->takeName(II); - NewCall->setCallingConv(II->getCallingConv()); - NewCall->setAttributes(II->getAttributes()); - NewCall->setDebugLoc(II->getDebugLoc()); - II->replaceAllUsesWith(NewCall); - - // Follow the call by a branch to the normal destination. - BranchInst::Create(II->getNormalDest(), II); - - // Update PHI nodes in the unwind destination - II->getUnwindDest()->removePredecessor(II->getParent()); - II->eraseFromParent(); -} - -static bool markAliveBlocks(BasicBlock *BB, - SmallPtrSet<BasicBlock*, 128> &Reachable) { - - SmallVector<BasicBlock*, 128> Worklist; - Worklist.push_back(BB); - Reachable.insert(BB); - bool Changed = false; - do { - BB = Worklist.pop_back_val(); - - // Do a quick scan of the basic block, turning any obviously unreachable - // instructions into LLVM unreachable insts. The instruction combining pass - // canonicalizes unreachable insts into stores to null or undef. - for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;++BBI){ - if (CallInst *CI = dyn_cast<CallInst>(BBI)) { - if (CI->doesNotReturn()) { - // If we found a call to a no-return function, insert an unreachable - // instruction after it. Make sure there isn't *already* one there - // though. 
- ++BBI; - if (!isa<UnreachableInst>(BBI)) { - // Don't insert a call to llvm.trap right before the unreachable. - changeToUnreachable(BBI, false); - Changed = true; - } - break; - } - } - - // Store to undef and store to null are undefined and used to signal that - // they should be changed to unreachable by passes that can't modify the - // CFG. - if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) { - // Don't touch volatile stores. - if (SI->isVolatile()) continue; - - Value *Ptr = SI->getOperand(1); - - if (isa<UndefValue>(Ptr) || - (isa<ConstantPointerNull>(Ptr) && - SI->getPointerAddressSpace() == 0)) { - changeToUnreachable(SI, true); - Changed = true; - break; - } - } - } - - // Turn invokes that call 'nounwind' functions into ordinary calls. - if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) { - Value *Callee = II->getCalledValue(); - if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) { - changeToUnreachable(II, true); - Changed = true; - } else if (II->doesNotThrow()) { - if (II->use_empty() && II->onlyReadsMemory()) { - // jump to the normal destination branch. - BranchInst::Create(II->getNormalDest(), II); - II->getUnwindDest()->removePredecessor(II->getParent()); - II->eraseFromParent(); - } else - changeToCall(II); - Changed = true; - } - } - - Changed |= ConstantFoldTerminator(BB, true); - for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) - if (Reachable.insert(*SI)) - Worklist.push_back(*SI); - } while (!Worklist.empty()); - return Changed; -} - -/// removeUnreachableBlocksFromFn - Remove blocks that are not reachable, even -/// if they are in a dead cycle. Return true if a change was made, false -/// otherwise. -static bool removeUnreachableBlocksFromFn(Function &F) { - SmallPtrSet<BasicBlock*, 128> Reachable; - bool Changed = markAliveBlocks(F.begin(), Reachable); - - // If there are unreachable blocks in the CFG... - if (Reachable.size() == F.size()) - return Changed; - - assert(Reachable.size() < F.size()); - NumSimpl += F.size()-Reachable.size(); - - // Loop over all of the basic blocks that are not reachable, dropping all of - // their internal references... - for (Function::iterator BB = ++F.begin(), E = F.end(); BB != E; ++BB) { - if (Reachable.count(BB)) - continue; - - for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) - if (Reachable.count(*SI)) - (*SI)->removePredecessor(BB); - BB->dropAllReferences(); - } - - for (Function::iterator I = ++F.begin(); I != F.end();) - if (!Reachable.count(I)) - I = F.getBasicBlockList().erase(I); - else - ++I; - - return true; -} - /// mergeEmptyReturnBlocks - If we have more than one empty (other than phi /// node) return blocks, merge them together to promote recursive block merging. static bool mergeEmptyReturnBlocks(Function &F) { @@ -325,7 +170,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, bool CFGSimplifyPass::runOnFunction(Function &F) { const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>(); const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); - bool EverChanged = removeUnreachableBlocksFromFn(F); + bool EverChanged = removeUnreachableBlocks(F); EverChanged |= mergeEmptyReturnBlocks(F); EverChanged |= iterativelySimplifyCFG(F, TTI, TD); @@ -333,16 +178,16 @@ bool CFGSimplifyPass::runOnFunction(Function &F) { if (!EverChanged) return false; // iterativelySimplifyCFG can (rarely) make some loops dead. 
If this happens, - // removeUnreachableBlocksFromFn is needed to nuke them, which means we should + // removeUnreachableBlocks is needed to nuke them, which means we should // iterate between the two optimizations. We structure the code like this to // avoid reruning iterativelySimplifyCFG if the second pass of - // removeUnreachableBlocksFromFn doesn't do anything. - if (!removeUnreachableBlocksFromFn(F)) + // removeUnreachableBlocks doesn't do anything. + if (!removeUnreachableBlocks(F)) return true; do { EverChanged = iterativelySimplifyCFG(F, TTI, TD); - EverChanged |= removeUnreachableBlocksFromFn(F); + EverChanged |= removeUnreachableBlocks(F); } while (EverChanged); return true; diff --git a/lib/Transforms/Scalar/StructurizeCFG.cpp b/lib/Transforms/Scalar/StructurizeCFG.cpp index bb6f163..5045ff8 100644 --- a/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -231,7 +231,7 @@ public: StructurizeCFG() : RegionPass(ID) { - initializeRegionInfoPass(*PassRegistry::getPassRegistry()); + initializeStructurizeCFGPass(*PassRegistry::getPassRegistry()); } using Pass::doInitialization; @@ -244,6 +244,7 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequiredID(LowerSwitchID); AU.addRequired<DominatorTree>(); AU.addPreserved<DominatorTree>(); RegionPass::getAnalysisUsage(AU); @@ -256,6 +257,7 @@ char StructurizeCFG::ID = 0; INITIALIZE_PASS_BEGIN(StructurizeCFG, "structurizecfg", "Structurize the CFG", false, false) +INITIALIZE_PASS_DEPENDENCY(LowerSwitch) INITIALIZE_PASS_DEPENDENCY(DominatorTree) INITIALIZE_PASS_DEPENDENCY(RegionInfo) INITIALIZE_PASS_END(StructurizeCFG, "structurizecfg", "Structurize the CFG", @@ -321,21 +323,32 @@ Value *StructurizeCFG::invert(Value *Condition) { if (match(Condition, m_Not(m_Value(Condition)))) return Condition; - // Third: Check all the users for an invert - BasicBlock *Parent = cast<Instruction>(Condition)->getParent(); - for (Value::use_iterator I = Condition->use_begin(), - E = Condition->use_end(); I != E; ++I) { + if (Instruction *Inst = dyn_cast<Instruction>(Condition)) { + // Third: Check all the users for an invert + BasicBlock *Parent = Inst->getParent(); + for (Value::use_iterator I = Condition->use_begin(), + E = Condition->use_end(); I != E; ++I) { - Instruction *User = dyn_cast<Instruction>(*I); - if (!User || User->getParent() != Parent) - continue; + Instruction *User = dyn_cast<Instruction>(*I); + if (!User || User->getParent() != Parent) + continue; + + if (match(*I, m_Not(m_Specific(Condition)))) + return *I; + } - if (match(*I, m_Not(m_Specific(Condition)))) - return *I; + // Last option: Create a new instruction + return BinaryOperator::CreateNot(Condition, "", Parent->getTerminator()); } - // Last option: Create a new instruction - return BinaryOperator::CreateNot(Condition, "", Parent->getTerminator()); + if (Argument *Arg = dyn_cast<Argument>(Condition)) { + BasicBlock &EntryBlock = Arg->getParent()->getEntryBlock(); + return BinaryOperator::CreateNot(Condition, + Arg->getName() + ".inv", + EntryBlock.getTerminator()); + } + + llvm_unreachable("Unhandled condition to invert"); } /// \brief Build the condition for one edge @@ -766,6 +779,20 @@ void StructurizeCFG::handleLoops(bool ExitUseAllowed, handleLoops(false, LoopEnd); } + // If the start of the loop is the entry block, we can't branch to it so + // insert a new dummy entry block. 
+ Function *LoopFunc = LoopStart->getParent(); + if (LoopStart == &LoopFunc->getEntryBlock()) { + LoopStart->setName("entry.orig"); + + BasicBlock *NewEntry = + BasicBlock::Create(LoopStart->getContext(), + "entry", + LoopFunc, + LoopStart); + BranchInst::Create(LoopStart, NewEntry); + } + // Create an extra loop end node LoopEnd = needPrefix(false); BasicBlock *Next = needPostfix(LoopEnd, ExitUseAllowed); diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp index e17a416..12de9ee 100644 --- a/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -248,7 +248,6 @@ BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, Pass *P) { // If the edge isn't critical, then BB has a single successor or Succ has a // single pred. Split the block. - BasicBlock::iterator SplitPoint; if (BasicBlock *SP = Succ->getSinglePredecessor()) { // If the successor only has a single pred, split the top of the successor // block. @@ -401,8 +400,12 @@ static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB, // If all incoming values for the new PHI would be the same, just don't // make a new PHI. Instead, just remove the incoming values from the old // PHI. - for (unsigned i = 0, e = Preds.size(); i != e; ++i) - PN->removeIncomingValue(Preds[i], false); + for (unsigned i = 0, e = Preds.size(); i != e; ++i) { + // Explicitly check the BB index here to handle duplicates in Preds. + int Idx = PN->getBasicBlockIndex(Preds[i]); + if (Idx >= 0) + PN->removeIncomingValue(Idx, false); + } } else { // If the values coming into the block are not the same, we need a PHI. // Create the new PHI node, insert it into NewBB at the end of the block diff --git a/lib/Transforms/Utils/BreakCriticalEdges.cpp b/lib/Transforms/Utils/BreakCriticalEdges.cpp index 8f3ff96..0e7f7f7 100644 --- a/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -22,7 +22,6 @@ #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/ProfileInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Type.h" @@ -45,7 +44,6 @@ namespace { virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved<DominatorTree>(); AU.addPreserved<LoopInfo>(); - AU.addPreserved<ProfileInfo>(); // No loop canonicalization guarantees are broken by this pass. AU.addPreservedID(LoopSimplifyID); @@ -213,10 +211,9 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, DominatorTree *DT = P->getAnalysisIfAvailable<DominatorTree>(); LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>(); - ProfileInfo *PI = P->getAnalysisIfAvailable<ProfileInfo>(); // If we have nothing to update, just return. - if (DT == 0 && LI == 0 && PI == 0) + if (DT == 0 && LI == 0) return NewBB; // Now update analysis information. Since the only predecessor of NewBB is @@ -369,9 +366,5 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, } } - // Update ProfileInfo if it is around. 
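The UpdatePHINodes fix earlier in this hunk guards each removal with getBasicBlockIndex, so a predecessor listed twice in Preds is removed only once; the second lookup returns a negative index and is skipped. The guard reduces to this shape, with plain containers standing in for PHINode and BasicBlock:

#include <cassert>
#include <string>
#include <utility>
#include <vector>

// (pred name, incoming value) pairs stand in for a PHINode's incoming list.
typedef std::vector<std::pair<std::string, int> > Phi;

// Same contract as PHINode::getBasicBlockIndex(): index of Pred, or -1.
int basicBlockIndex(const Phi &PN, const std::string &Pred) {
  for (size_t i = 0; i < PN.size(); ++i)
    if (PN[i].first == Pred)
      return static_cast<int>(i);
  return -1;
}

int main() {
  Phi PN = {{"bb1", 10}, {"bb2", 20}};
  std::vector<std::string> Preds = {"bb1", "bb1"}; // duplicate predecessor

  for (const std::string &P : Preds) {
    int Idx = basicBlockIndex(PN, P);
    if (Idx >= 0) // the second "bb1" finds nothing and is skipped
      PN.erase(PN.begin() + Idx);
  }
  assert(PN.size() == 1 && PN[0].first == "bb2");
  return 0;
}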
- if (PI)
- PI->splitEdge(TIBB, DestBB, NewBB, MergeIdenticalEdges);
-
return NewBB;
}
diff --git a/lib/Transforms/Utils/CMakeLists.txt b/lib/Transforms/Utils/CMakeLists.txt
index 3648fd6..5afd6b8 100644
--- a/lib/Transforms/Utils/CMakeLists.txt
+++ b/lib/Transforms/Utils/CMakeLists.txt
@@ -8,6 +8,7 @@ add_llvm_library(LLVMTransformUtils
CmpInstAnalysis.cpp
CodeExtractor.cpp
DemoteRegToStack.cpp
+ GlobalStatus.cpp
InlineFunction.cpp
InstructionNamer.cpp
IntegerDivision.cpp
diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp
index 82013f9..6f00864 100644
--- a/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/lib/Transforms/Utils/CodeExtractor.cpp
@@ -665,8 +665,7 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
TheSwitch->setCondition(call);
TheSwitch->setDefaultDest(TheSwitch->getSuccessor(NumExitBlocks));
// Remove redundant case
- SwitchInst::CaseIt ToBeRemoved(TheSwitch, NumExitBlocks-1);
- TheSwitch->removeCase(ToBeRemoved);
+ TheSwitch->removeCase(SwitchInst::CaseIt(TheSwitch, NumExitBlocks-1));
break;
}
}
diff --git a/lib/Transforms/Utils/FlattenCFG.cpp b/lib/Transforms/Utils/FlattenCFG.cpp
index 9cbe15d..1da226b 100644
--- a/lib/Transforms/Utils/FlattenCFG.cpp
+++ b/lib/Transforms/Utils/FlattenCFG.cpp
@@ -266,8 +266,7 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder,
BasicBlock *CB;
BranchInst *PBI = dyn_cast<BranchInst>(FirstCondBlock->getTerminator());
bool Iteration = true;
- BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
- BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
+ IRBuilder<>::InsertPointGuard Guard(Builder);
Value *PC = PBI->getCondition();
do {
@@ -298,7 +297,6 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder,
new UnreachableInst(CB->getContext(), CB);
} while (Iteration);
- Builder.SetInsertPoint(SaveInsertBB, SaveInsertPt);
DEBUG(dbgs() << "Use parallel and/or in:\n" << *FirstCondBlock);
return true;
}
@@ -372,7 +370,7 @@ bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2,
/// Check whether \param BB is the merge block of a if-region. If yes, check
/// whether there exists an adjacent if-region upstream, the two if-regions
-/// contain identical instuctions and can be legally merged. \returns true if
+/// contain identical instructions and can be legally merged. \returns true if
/// the two if-regions are merged.
///
/// From:
diff --git a/lib/Transforms/Utils/GlobalStatus.cpp b/lib/Transforms/Utils/GlobalStatus.cpp
new file mode 100644
index 0000000..5f0a563
--- /dev/null
+++ b/lib/Transforms/Utils/GlobalStatus.cpp
@@ -0,0 +1,183 @@
+//===-- GlobalStatus.cpp - Compute status info for globals -----------------==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
+
+using namespace llvm;
+
+/// Return the stronger of the two orderings. If the two orderings are acquire
+/// and release, then return AcquireRelease.
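strongerOrdering, whose definition follows, exploits the fact that the AtomicOrdering enumerators are numerically ordered by strength, so std::max computes the join in every case except the incomparable acquire/release pair. A standalone rendering of the same lattice join; the enumerator values mirror the LLVM enum of the time (3 is unused in the real enum):

#include <algorithm>
#include <cassert>

enum AtomicOrdering {
  NotAtomic = 0,
  Unordered = 1,
  Monotonic = 2,
  Acquire = 4,
  Release = 5,
  AcquireRelease = 6,
  SequentiallyConsistent = 7
};

AtomicOrdering strongerOrdering(AtomicOrdering X, AtomicOrdering Y) {
  // Acquire and Release are incomparable; their least upper bound is
  // AcquireRelease, the one case std::max cannot produce.
  if ((X == Acquire && Y == Release) || (X == Release && Y == Acquire))
    return AcquireRelease;
  return std::max(X, Y); // every other pair is ordered by enumerator value
}

int main() {
  assert(strongerOrdering(Acquire, Release) == AcquireRelease);
  assert(strongerOrdering(Monotonic, SequentiallyConsistent) ==
         SequentiallyConsistent);
  assert(strongerOrdering(NotAtomic, Acquire) == Acquire);
  return 0;
}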
+/// +static AtomicOrdering strongerOrdering(AtomicOrdering X, AtomicOrdering Y) { + if (X == Acquire && Y == Release) + return AcquireRelease; + if (Y == Acquire && X == Release) + return AcquireRelease; + return (AtomicOrdering)std::max(X, Y); +} + +/// It is safe to destroy a constant iff it is only used by constants itself. +/// Note that constants cannot be cyclic, so this test is pretty easy to +/// implement recursively. +/// +bool llvm::isSafeToDestroyConstant(const Constant *C) { + if (isa<GlobalValue>(C)) + return false; + + for (Value::const_use_iterator UI = C->use_begin(), E = C->use_end(); UI != E; + ++UI) + if (const Constant *CU = dyn_cast<Constant>(*UI)) { + if (!isSafeToDestroyConstant(CU)) + return false; + } else + return false; + return true; +} + +static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS, + SmallPtrSet<const PHINode *, 16> &PhiUsers) { + for (Value::const_use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; + ++UI) { + const User *U = *UI; + if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) { + GS.HasNonInstructionUser = true; + + // If the result of the constantexpr isn't pointer type, then we won't + // know to expect it in various places. Just reject early. + if (!isa<PointerType>(CE->getType())) + return true; + + if (analyzeGlobalAux(CE, GS, PhiUsers)) + return true; + } else if (const Instruction *I = dyn_cast<Instruction>(U)) { + if (!GS.HasMultipleAccessingFunctions) { + const Function *F = I->getParent()->getParent(); + if (GS.AccessingFunction == 0) + GS.AccessingFunction = F; + else if (GS.AccessingFunction != F) + GS.HasMultipleAccessingFunctions = true; + } + if (const LoadInst *LI = dyn_cast<LoadInst>(I)) { + GS.IsLoaded = true; + // Don't hack on volatile loads. + if (LI->isVolatile()) + return true; + GS.Ordering = strongerOrdering(GS.Ordering, LI->getOrdering()); + } else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) { + // Don't allow a store OF the address, only stores TO the address. + if (SI->getOperand(0) == V) + return true; + + // Don't hack on volatile stores. + if (SI->isVolatile()) + return true; + + GS.Ordering = strongerOrdering(GS.Ordering, SI->getOrdering()); + + // If this is a direct store to the global (i.e., the global is a scalar + // value, not an aggregate), keep more specific information about + // stores. + if (GS.StoredType != GlobalStatus::Stored) { + if (const GlobalVariable *GV = + dyn_cast<GlobalVariable>(SI->getOperand(1))) { + Value *StoredVal = SI->getOperand(0); + + if (Constant *C = dyn_cast<Constant>(StoredVal)) { + if (C->isThreadDependent()) { + // The stored value changes between threads; don't track it. + return true; + } + } + + if (StoredVal == GV->getInitializer()) { + if (GS.StoredType < GlobalStatus::InitializerStored) + GS.StoredType = GlobalStatus::InitializerStored; + } else if (isa<LoadInst>(StoredVal) && + cast<LoadInst>(StoredVal)->getOperand(0) == GV) { + if (GS.StoredType < GlobalStatus::InitializerStored) + GS.StoredType = GlobalStatus::InitializerStored; + } else if (GS.StoredType < GlobalStatus::StoredOnce) { + GS.StoredType = GlobalStatus::StoredOnce; + GS.StoredOnceValue = StoredVal; + } else if (GS.StoredType == GlobalStatus::StoredOnce && + GS.StoredOnceValue == StoredVal) { + // noop. 
+ } else { + GS.StoredType = GlobalStatus::Stored; + } + } else { + GS.StoredType = GlobalStatus::Stored; + } + } + } else if (isa<BitCastInst>(I)) { + if (analyzeGlobalAux(I, GS, PhiUsers)) + return true; + } else if (isa<GetElementPtrInst>(I)) { + if (analyzeGlobalAux(I, GS, PhiUsers)) + return true; + } else if (isa<SelectInst>(I)) { + if (analyzeGlobalAux(I, GS, PhiUsers)) + return true; + } else if (const PHINode *PN = dyn_cast<PHINode>(I)) { + // PHI nodes we can check just like select or GEP instructions, but we + // have to be careful about infinite recursion. + if (PhiUsers.insert(PN)) // Not already visited. + if (analyzeGlobalAux(I, GS, PhiUsers)) + return true; + } else if (isa<CmpInst>(I)) { + GS.IsCompared = true; + } else if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(I)) { + if (MTI->isVolatile()) + return true; + if (MTI->getArgOperand(0) == V) + GS.StoredType = GlobalStatus::Stored; + if (MTI->getArgOperand(1) == V) + GS.IsLoaded = true; + } else if (const MemSetInst *MSI = dyn_cast<MemSetInst>(I)) { + assert(MSI->getArgOperand(0) == V && "Memset only takes one pointer!"); + if (MSI->isVolatile()) + return true; + GS.StoredType = GlobalStatus::Stored; + } else if (ImmutableCallSite C = I) { + if (!C.isCallee(UI)) + return true; + GS.IsLoaded = true; + } else { + return true; // Any other non-load instruction might take address! + } + } else if (const Constant *C = dyn_cast<Constant>(U)) { + GS.HasNonInstructionUser = true; + // We might have a dead and dangling constant hanging off of here. + if (!isSafeToDestroyConstant(C)) + return true; + } else { + GS.HasNonInstructionUser = true; + // Otherwise must be some other user. + return true; + } + } + + return false; +} + +bool GlobalStatus::analyzeGlobal(const Value *V, GlobalStatus &GS) { + SmallPtrSet<const PHINode *, 16> PhiUsers; + return analyzeGlobalAux(V, GS, PhiUsers); +} + +GlobalStatus::GlobalStatus() + : IsCompared(false), IsLoaded(false), StoredType(NotStored), + StoredOnceValue(0), AccessingFunction(0), + HasMultipleAccessingFunctions(false), HasNonInstructionUser(false), + Ordering(NotAtomic) {} diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp index dabb67b..d021bce 100644 --- a/lib/Transforms/Utils/InlineFunction.cpp +++ b/lib/Transforms/Utils/InlineFunction.cpp @@ -193,7 +193,8 @@ static bool HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, CallInst *CI = dyn_cast<CallInst>(I); // If this call cannot unwind, don't convert it to an invoke. - if (!CI || CI->doesNotThrow()) + // Inline asm calls cannot throw. + if (!CI || CI->doesNotThrow() || isa<InlineAsm>(CI->getCalledValue())) continue; // Convert this function call into an invoke instruction. First, split the diff --git a/lib/Transforms/Utils/LCSSA.cpp b/lib/Transforms/Utils/LCSSA.cpp index 2d1b166..f15e8d5 100644 --- a/lib/Transforms/Utils/LCSSA.cpp +++ b/lib/Transforms/Utils/LCSSA.cpp @@ -55,7 +55,6 @@ namespace { DominatorTree *DT; LoopInfo *LI; ScalarEvolution *SE; - std::vector<BasicBlock*> LoopBlocks; PredIteratorCache PredCache; Loop *L; @@ -82,11 +81,6 @@ namespace { // Check the special guarantees that LCSSA makes. 
assert(L->isLCSSAForm(*DT) && "LCSSA form not preserved!"); } - - /// inLoop - returns true if the given block is within the current loop - bool inLoop(BasicBlock *B) const { - return std::binary_search(LoopBlocks.begin(), LoopBlocks.end(), B); - } }; } @@ -129,11 +123,6 @@ bool LCSSA::runOnLoop(Loop *TheLoop, LPPassManager &LPM) { if (ExitBlocks.empty()) return false; - // Speed up queries by creating a sorted vector of blocks. - LoopBlocks.clear(); - LoopBlocks.insert(LoopBlocks.end(), L->block_begin(), L->block_end()); - array_pod_sort(LoopBlocks.begin(), LoopBlocks.end()); - // Look at all the instructions in the loop, checking to see if they have uses // outside the loop. If so, rewrite those uses. bool MadeChange = false; @@ -198,7 +187,7 @@ bool LCSSA::ProcessInstruction(Instruction *Inst, if (PHINode *PN = dyn_cast<PHINode>(U)) UserBB = PN->getIncomingBlock(UI); - if (InstBB != UserBB && !inLoop(UserBB)) + if (InstBB != UserBB && !L->contains(UserBB)) UsesToRewrite.push_back(&UI.getUse()); } @@ -244,7 +233,7 @@ bool LCSSA::ProcessInstruction(Instruction *Inst, // If the exit block has a predecessor not within the loop, arrange for // the incoming value use corresponding to that predecessor to be // rewritten in terms of a different LCSSA PHI. - if (!inLoop(*PI)) + if (!L->contains(*PI)) UsesToRewrite.push_back( &PN->getOperandUse( PN->getOperandNumForIncomingValue(PN->getNumIncomingValues()-1))); diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index 08e1808..2768041 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -16,10 +16,10 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/MemoryBuiltins.h" -#include "llvm/Analysis/ProfileInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/DIBuilder.h" #include "llvm/DebugInfo.h" @@ -43,6 +43,8 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; +STATISTIC(NumRemoved, "Number of unreachable basic blocks removed"); + //===----------------------------------------------------------------------===// // Local constant propagation. // @@ -193,33 +195,28 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, // Otherwise, we can fold this switch into a conditional branch // instruction if it has only one non-default destination. SwitchInst::CaseIt FirstCase = SI->case_begin(); - IntegersSubset& Case = FirstCase.getCaseValueEx(); - if (Case.isSingleNumber()) { - // FIXME: Currently work with ConstantInt based numbers. - Value *Cond = Builder.CreateICmpEQ(SI->getCondition(), - Case.getSingleNumber(0).toConstantInt(), - "cond"); - - // Insert the new branch. - BranchInst *NewBr = Builder.CreateCondBr(Cond, - FirstCase.getCaseSuccessor(), - SI->getDefaultDest()); - MDNode* MD = SI->getMetadata(LLVMContext::MD_prof); - if (MD && MD->getNumOperands() == 3) { - ConstantInt *SICase = dyn_cast<ConstantInt>(MD->getOperand(2)); - ConstantInt *SIDef = dyn_cast<ConstantInt>(MD->getOperand(1)); - assert(SICase && SIDef); - // The TrueWeight should be the weight for the single case of SI. - NewBr->setMetadata(LLVMContext::MD_prof, - MDBuilder(BB->getContext()). 
- createBranchWeights(SICase->getValue().getZExtValue(), - SIDef->getValue().getZExtValue())); - } + Value *Cond = Builder.CreateICmpEQ(SI->getCondition(), + FirstCase.getCaseValue(), "cond"); - // Delete the old switch. - SI->eraseFromParent(); - return true; + // Insert the new branch. + BranchInst *NewBr = Builder.CreateCondBr(Cond, + FirstCase.getCaseSuccessor(), + SI->getDefaultDest()); + MDNode* MD = SI->getMetadata(LLVMContext::MD_prof); + if (MD && MD->getNumOperands() == 3) { + ConstantInt *SICase = dyn_cast<ConstantInt>(MD->getOperand(2)); + ConstantInt *SIDef = dyn_cast<ConstantInt>(MD->getOperand(1)); + assert(SICase && SIDef); + // The TrueWeight should be the weight for the single case of SI. + NewBr->setMetadata(LLVMContext::MD_prof, + MDBuilder(BB->getContext()). + createBranchWeights(SICase->getValue().getZExtValue(), + SIDef->getValue().getZExtValue())); } + + // Delete the old switch. + SI->eraseFromParent(); + return true; } return false; } @@ -415,7 +412,7 @@ bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, const DataLayout *TD, Instruction *Inst = BI++; WeakVH BIHandle(BI); - if (recursivelySimplifyInstruction(Inst, TD)) { + if (recursivelySimplifyInstruction(Inst, TD, TLI)) { MadeChange = true; if (BIHandle != BI) BI = BB->begin(); @@ -515,11 +512,6 @@ void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, Pass *P) { DT->changeImmediateDominator(DestBB, PredBBIDom); DT->eraseNode(PredBB); } - ProfileInfo *PI = P->getAnalysisIfAvailable<ProfileInfo>(); - if (PI) { - PI->replaceAllUses(PredBB, DestBB); - PI->removeEdge(ProfileInfo::getEdge(PredBB, DestBB)); - } } // Nuke BB. PredBB->eraseFromParent(); @@ -533,7 +525,7 @@ static bool CanMergeValues(Value *First, Value *Second) { } /// CanPropagatePredecessorsForPHIs - Return true if we can fold BB, an -/// almost-empty BB ending in an unconditional branch to Succ, into succ. +/// almost-empty BB ending in an unconditional branch to Succ, into Succ. /// /// Assumption: Succ is the single successor for BB. /// @@ -1053,7 +1045,11 @@ bool llvm::LowerDbgDeclare(Function &F) { for (SmallVectorImpl<DbgDeclareInst *>::iterator I = Dbgs.begin(), E = Dbgs.end(); I != E; ++I) { DbgDeclareInst *DDI = *I; - if (AllocaInst *AI = dyn_cast_or_null<AllocaInst>(DDI->getAddress())) { + AllocaInst *AI = dyn_cast_or_null<AllocaInst>(DDI->getAddress()); + // If this is an alloca for a scalar variable, insert a dbg.value + // at each load and store to the alloca and erase the dbg.declare. + if (AI && !AI->isArrayAllocation()) { + // We only remove the dbg.declare intrinsic if all uses are // converted to dbg.value intrinsics. bool RemoveDDI = true; @@ -1121,33 +1117,153 @@ bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, return true; } -bool llvm::removeUnreachableBlocks(Function &F) { - SmallPtrSet<BasicBlock*, 16> Reachable; +/// changeToUnreachable - Insert an unreachable instruction before the specified +/// instruction, making it and the rest of the code in the block dead. +static void changeToUnreachable(Instruction *I, bool UseLLVMTrap) { + BasicBlock *BB = I->getParent(); + // Loop over all of the successors, removing BB's entry from any PHI + // nodes. + for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) + (*SI)->removePredecessor(BB); + + // Insert a call to llvm.trap right before this. This turns the undefined + // behavior into a hard fail instead of falling through into random code. 
+ if (UseLLVMTrap) { + Function *TrapFn = + Intrinsic::getDeclaration(BB->getParent()->getParent(), Intrinsic::trap); + CallInst *CallTrap = CallInst::Create(TrapFn, "", I); + CallTrap->setDebugLoc(I->getDebugLoc()); + } + new UnreachableInst(I->getContext(), I); + + // All instructions after this are dead. + BasicBlock::iterator BBI = I, BBE = BB->end(); + while (BBI != BBE) { + if (!BBI->use_empty()) + BBI->replaceAllUsesWith(UndefValue::get(BBI->getType())); + BB->getInstList().erase(BBI++); + } +} + +/// changeToCall - Convert the specified invoke into a normal call. +static void changeToCall(InvokeInst *II) { + SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3); + CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args, "", II); + NewCall->takeName(II); + NewCall->setCallingConv(II->getCallingConv()); + NewCall->setAttributes(II->getAttributes()); + NewCall->setDebugLoc(II->getDebugLoc()); + II->replaceAllUsesWith(NewCall); + + // Follow the call by a branch to the normal destination. + BranchInst::Create(II->getNormalDest(), II); + + // Update PHI nodes in the unwind destination + II->getUnwindDest()->removePredecessor(II->getParent()); + II->eraseFromParent(); +} + +static bool markAliveBlocks(BasicBlock *BB, + SmallPtrSet<BasicBlock*, 128> &Reachable) { + SmallVector<BasicBlock*, 128> Worklist; - Worklist.push_back(&F.getEntryBlock()); - Reachable.insert(&F.getEntryBlock()); + Worklist.push_back(BB); + Reachable.insert(BB); + bool Changed = false; do { - BasicBlock *BB = Worklist.pop_back_val(); + BB = Worklist.pop_back_val(); + + // Do a quick scan of the basic block, turning any obviously unreachable + // instructions into LLVM unreachable insts. The instruction combining pass + // canonicalizes unreachable insts into stores to null or undef. + for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;++BBI){ + if (CallInst *CI = dyn_cast<CallInst>(BBI)) { + if (CI->doesNotReturn()) { + // If we found a call to a no-return function, insert an unreachable + // instruction after it. Make sure there isn't *already* one there + // though. + ++BBI; + if (!isa<UnreachableInst>(BBI)) { + // Don't insert a call to llvm.trap right before the unreachable. + changeToUnreachable(BBI, false); + Changed = true; + } + break; + } + } + + // Store to undef and store to null are undefined and used to signal that + // they should be changed to unreachable by passes that can't modify the + // CFG. + if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) { + // Don't touch volatile stores. + if (SI->isVolatile()) continue; + + Value *Ptr = SI->getOperand(1); + + if (isa<UndefValue>(Ptr) || + (isa<ConstantPointerNull>(Ptr) && + SI->getPointerAddressSpace() == 0)) { + changeToUnreachable(SI, true); + Changed = true; + break; + } + } + } + + // Turn invokes that call 'nounwind' functions into ordinary calls. + if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) { + Value *Callee = II->getCalledValue(); + if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) { + changeToUnreachable(II, true); + Changed = true; + } else if (II->doesNotThrow()) { + if (II->use_empty() && II->onlyReadsMemory()) { + // jump to the normal destination branch. 
+ BranchInst::Create(II->getNormalDest(), II);
+ II->getUnwindDest()->removePredecessor(II->getParent());
+ II->eraseFromParent();
+ } else
+ changeToCall(II);
+ Changed = true;
+ }
+ }
+
+ Changed |= ConstantFoldTerminator(BB, true);
for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
if (Reachable.insert(*SI))
Worklist.push_back(*SI);
} while (!Worklist.empty());
+ return Changed;
+}
+/// removeUnreachableBlocks - Remove blocks that are not reachable, even
+/// if they are in a dead cycle. Return true if a change was made, false
+/// otherwise.
+bool llvm::removeUnreachableBlocks(Function &F) {
+ SmallPtrSet<BasicBlock*, 128> Reachable;
+ bool Changed = markAliveBlocks(F.begin(), Reachable);
+
+ // If there are unreachable blocks in the CFG...
if (Reachable.size() == F.size())
- return false;
+ return Changed;
assert(Reachable.size() < F.size());
- for (Function::iterator I = llvm::next(F.begin()), E = F.end(); I != E; ++I) {
- if (Reachable.count(I))
+ NumRemoved += F.size()-Reachable.size();
+
+ // Loop over all of the basic blocks that are not reachable, dropping all of
+ // their internal references...
+ for (Function::iterator BB = ++F.begin(), E = F.end(); BB != E; ++BB) {
+ if (Reachable.count(BB))
continue;
- for (succ_iterator SI = succ_begin(I), SE = succ_end(I); SI != SE; ++SI)
+ for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
if (Reachable.count(*SI))
- (*SI)->removePredecessor(I);
- I->dropAllReferences();
+ (*SI)->removePredecessor(BB);
+ BB->dropAllReferences();
}
- for (Function::iterator I = llvm::next(F.begin()), E=F.end(); I != E;)
+ for (Function::iterator I = ++F.begin(); I != F.end();)
if (!Reachable.count(I))
I = F.getBasicBlockList().erase(I);
else
diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp
index cb581b3..162807d 100644
--- a/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/lib/Transforms/Utils/LoopUnroll.cpp
@@ -90,7 +90,8 @@ static BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI,
// Move all definitions in the successor to the predecessor...
OnlyPred->getInstList().splice(OnlyPred->end(), BB->getInstList());
- std::string OldName = BB->getName();
+ // OldName will be valid until erased.
+ StringRef OldName = BB->getName();
// Erase basic block from the function...
@@ -102,12 +103,13 @@ static BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI,
}
}
LI->removeBlock(BB);
- BB->eraseFromParent();
// Inherit predecessor's name if it exists...
if (!OldName.empty() && !OnlyPred->hasName()) OnlyPred->setName(OldName); + BB->eraseFromParent(); + return OnlyPred; } @@ -239,8 +241,6 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, DEBUG(dbgs() << "!\n"); } - std::vector<BasicBlock*> LoopBlocks = L->getBlocks(); - bool ContinueOnTrue = L->contains(BI->getSuccessor(0)); BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue); diff --git a/lib/Transforms/Utils/LowerExpectIntrinsic.cpp b/lib/Transforms/Utils/LowerExpectIntrinsic.cpp index 4aee8ff..e017f50 100644 --- a/lib/Transforms/Utils/LowerExpectIntrinsic.cpp +++ b/lib/Transforms/Utils/LowerExpectIntrinsic.cpp @@ -29,7 +29,7 @@ using namespace llvm; -STATISTIC(IfHandled, "Number of 'expect' intrinsic intructions handled"); +STATISTIC(IfHandled, "Number of 'expect' intrinsic instructions handled"); static cl::opt<uint32_t> LikelyBranchWeight("likely-branch-weight", cl::Hidden, cl::init(64), diff --git a/lib/Transforms/Utils/LowerInvoke.cpp b/lib/Transforms/Utils/LowerInvoke.cpp index f66b54d..9799a30 100644 --- a/lib/Transforms/Utils/LowerInvoke.cpp +++ b/lib/Transforms/Utils/LowerInvoke.cpp @@ -346,7 +346,6 @@ splitLiveRangesLiveAcrossInvokes(SmallVectorImpl<InvokeInst*> &Invokes) { // Scan all of the uses and see if the live range is live across an unwind // edge. If we find a use live across an invoke edge, create an alloca // and spill the value. - std::set<InvokeInst*> InvokesWithStoreInserted; // Find all of the blocks that this value is live in. std::set<BasicBlock*> LiveBBs; diff --git a/lib/Transforms/Utils/LowerSwitch.cpp b/lib/Transforms/Utils/LowerSwitch.cpp index 955b853..2d2a8a5 100644 --- a/lib/Transforms/Utils/LowerSwitch.cpp +++ b/lib/Transforms/Utils/LowerSwitch.cpp @@ -66,6 +66,18 @@ namespace { BasicBlock* OrigBlock, BasicBlock* Default); unsigned Clusterify(CaseVector& Cases, SwitchInst *SI); }; + + /// The comparison function for sorting the switch case values in the vector. + /// WARNING: Case ranges should be disjoint! 
+ struct CaseCmp {
+ bool operator () (const LowerSwitch::CaseRange& C1,
+ const LowerSwitch::CaseRange& C2) {
+
+ const ConstantInt* CI1 = cast<const ConstantInt>(C1.Low);
+ const ConstantInt* CI2 = cast<const ConstantInt>(C2.High);
+ return CI1->getValue().slt(CI2->getValue());
+ }
+ };
 }
 char LowerSwitch::ID = 0;
@@ -147,7 +159,7 @@ BasicBlock* LowerSwitch::switchConvert(CaseItr Begin, CaseItr End,
 Function::iterator FI = OrigBlock;
 F->getBasicBlockList().insert(++FI, NewNode);
- ICmpInst* Comp = new ICmpInst(ICmpInst::ICMP_ULT,
+ ICmpInst* Comp = new ICmpInst(ICmpInst::ICMP_SLT,
 Val, Pivot.Low, "Pivot");
 NewNode->getInstList().push_back(Comp);
 BranchInst::Create(LBranch, RBranch, Comp, NewNode);
@@ -222,34 +234,40 @@ BasicBlock* LowerSwitch::newLeafBlock(CaseRange& Leaf, Value* Val,
 // Clusterify - Transform simple list of Cases into list of CaseRange's
 unsigned LowerSwitch::Clusterify(CaseVector& Cases, SwitchInst *SI) {
-
- IntegersSubsetToBB TheClusterifier;
+ unsigned numCmps = 0;
 // Start with "simple" cases
- for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
- i != e; ++i) {
- BasicBlock *SuccBB = i.getCaseSuccessor();
- IntegersSubset CaseRanges = i.getCaseValueEx();
- TheClusterifier.add(CaseRanges, SuccBB);
- }
-
- TheClusterifier.optimize();
+ for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i)
+ Cases.push_back(CaseRange(i.getCaseValue(), i.getCaseValue(),
+ i.getCaseSuccessor()));
- size_t numCmps = 0;
- for (IntegersSubsetToBB::RangeIterator i = TheClusterifier.begin(),
- e = TheClusterifier.end(); i != e; ++i, ++numCmps) {
- IntegersSubsetToBB::Cluster &C = *i;
-
- // FIXME: Currently work with ConstantInt based numbers.
- // Changing it to APInt based is a pretty heavy for this commit.
- Cases.push_back(CaseRange(C.first.getLow().toConstantInt(),
- C.first.getHigh().toConstantInt(), C.second));
- if (C.first.isSingleNumber())
+ std::sort(Cases.begin(), Cases.end(), CaseCmp());
+
+ // Merge adjacent cases into clusters
+ if (Cases.size()>=2)
+ for (CaseItr I=Cases.begin(), J=llvm::next(Cases.begin()); J!=Cases.end(); ) {
+ int64_t nextValue = cast<ConstantInt>(J->Low)->getSExtValue();
+ int64_t currentValue = cast<ConstantInt>(I->High)->getSExtValue();
+ BasicBlock* nextBB = J->BB;
+ BasicBlock* currentBB = I->BB;
+
+ // If the two neighboring cases go to the same destination, merge them
+ // into a single case.
+ if ((nextValue-currentValue==1) && (currentBB == nextBB)) {
+ I->High = J->High;
+ J = Cases.erase(J);
+ } else {
+ I = J++;
+ }
+ }
+
+ for (CaseItr I=Cases.begin(), E=Cases.end(); I!=E; ++I, ++numCmps) {
+ if (I->Low != I->High)
 // A range counts double, since it requires two compares.
++numCmps; } - return numCmps; + return numCmps; } // processSwitchInst - Replace the specified switch instruction with a sequence diff --git a/lib/Transforms/Utils/Mem2Reg.cpp b/lib/Transforms/Utils/Mem2Reg.cpp index ebd7db6..61b3965 100644 --- a/lib/Transforms/Utils/Mem2Reg.cpp +++ b/lib/Transforms/Utils/Mem2Reg.cpp @@ -16,7 +16,6 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/Dominators.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" @@ -28,7 +27,6 @@ STATISTIC(NumPromoted, "Number of alloca's promoted"); namespace { struct PromotePass : public FunctionPass { static char ID; // Pass identification, replacement for typeid - PromotePass() : FunctionPass(ID) { initializePromotePassPass(*PassRegistry::getPassRegistry()); } @@ -64,7 +62,6 @@ bool PromotePass::runOnFunction(Function &F) { bool Changed = false; DominatorTree &DT = getAnalysis<DominatorTree>(); - const DataLayout *DL = getAnalysisIfAvailable<DataLayout>(); while (1) { Allocas.clear(); @@ -73,12 +70,12 @@ bool PromotePass::runOnFunction(Function &F) { // the entry node for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I) if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) // Is it an alloca? - if (isAllocaPromotable(AI, DL)) + if (isAllocaPromotable(AI)) Allocas.push_back(AI); if (Allocas.empty()) break; - PromoteMemToReg(Allocas, DT, DL); + PromoteMemToReg(Allocas, DT); NumPromoted += Allocas.size(); Changed = true; } diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index 6910180..8f6eee3 100644 --- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -30,7 +30,6 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -46,7 +45,6 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Metadata.h" -#include "llvm/InstVisitor.h" #include "llvm/Support/CFG.h" #include "llvm/Transforms/Utils/Local.h" #include <algorithm> @@ -58,16 +56,56 @@ STATISTIC(NumSingleStore, "Number of alloca's promoted with a single store"); STATISTIC(NumDeadAlloca, "Number of dead alloca's removed"); STATISTIC(NumPHIInsert, "Number of PHI nodes inserted"); -namespace { +bool llvm::isAllocaPromotable(const AllocaInst *AI) { + // FIXME: If the memory unit is of pointer or integer type, we can permit + // assignments to subsections of the memory unit. + + // Only allow direct and non-volatile loads and stores... + for (Value::const_use_iterator UI = AI->use_begin(), UE = AI->use_end(); + UI != UE; ++UI) { // Loop over all of the uses of the alloca + const User *U = *UI; + if (const LoadInst *LI = dyn_cast<LoadInst>(U)) { + // Note that atomic loads can be transformed; atomic semantics do + // not have any meaning for a local alloca. + if (LI->isVolatile()) + return false; + } else if (const StoreInst *SI = dyn_cast<StoreInst>(U)) { + if (SI->getOperand(0) == AI) + return false; // Don't allow a store OF the AI, only INTO the AI. + // Note that atomic stores can be transformed; atomic semantics do + // not have any meaning for a local alloca. 
+ if (SI->isVolatile()) + return false; + } else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) { + if (II->getIntrinsicID() != Intrinsic::lifetime_start && + II->getIntrinsicID() != Intrinsic::lifetime_end) + return false; + } else if (const BitCastInst *BCI = dyn_cast<BitCastInst>(U)) { + if (BCI->getType() != Type::getInt8PtrTy(U->getContext())) + return false; + if (!onlyUsedByLifetimeMarkers(BCI)) + return false; + } else if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) { + if (GEPI->getType() != Type::getInt8PtrTy(U->getContext())) + return false; + if (!GEPI->hasAllZeroIndices()) + return false; + if (!onlyUsedByLifetimeMarkers(GEPI)) + return false; + } else { + return false; + } + } -struct AllocaInfo : private InstVisitor<AllocaInfo, bool> { - const DataLayout *DL; + return true; +} + +namespace { +struct AllocaInfo { SmallVector<BasicBlock *, 32> DefiningBlocks; SmallVector<BasicBlock *, 32> UsingBlocks; - SmallVector<Instruction *, 8> DeadInsts; - Type *AllocaTy; StoreInst *OnlyStore; BasicBlock *OnlyBlock; bool OnlyUsedInOneBlock; @@ -75,13 +113,9 @@ struct AllocaInfo : private InstVisitor<AllocaInfo, bool> { Value *AllocaPointerVal; DbgDeclareInst *DbgDeclare; - AllocaInfo(const DataLayout *DL) : DL(DL) {} - void clear() { DefiningBlocks.clear(); UsingBlocks.clear(); - DeadInsts.clear(); - AllocaTy = 0; OnlyStore = 0; OnlyBlock = 0; OnlyUsedInOneBlock = true; @@ -91,116 +125,39 @@ struct AllocaInfo : private InstVisitor<AllocaInfo, bool> { /// Scan the uses of the specified alloca, filling in the AllocaInfo used /// by the rest of the pass to reason about the uses of this alloca. - bool analyzeAlloca(AllocaInst &AI) { + void AnalyzeAlloca(AllocaInst *AI) { clear(); - AllocaTy = AI.getAllocatedType(); - enqueueUsers(AI); - - // Walk queued up uses in the worklist to handle nested uses. - while (!UseWorklist.empty()) { - U = UseWorklist.pop_back_val(); - Instruction &I = *cast<Instruction>(U->getUser()); - if (!visit(I)) - return false; // Propagate failure to promote up. + // As we scan the uses of the alloca instruction, keep track of stores, + // and decide whether all of the loads and stores to the alloca are within + // the same basic block. + for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); + UI != E;) { + Instruction *User = cast<Instruction>(*UI++); + + if (StoreInst *SI = dyn_cast<StoreInst>(User)) { + // Remember the basic blocks which define new values for the alloca + DefiningBlocks.push_back(SI->getParent()); + AllocaPointerVal = SI->getOperand(0); + OnlyStore = SI; + } else { + LoadInst *LI = cast<LoadInst>(User); + // Otherwise it must be a load instruction, keep track of variable + // reads. + UsingBlocks.push_back(LI->getParent()); + AllocaPointerVal = LI; + } if (OnlyUsedInOneBlock) { if (OnlyBlock == 0) - OnlyBlock = I.getParent(); - else if (OnlyBlock != I.getParent()) + OnlyBlock = User->getParent(); + else if (OnlyBlock != User->getParent()) OnlyUsedInOneBlock = false; } } - DbgDeclare = FindAllocaDbgDeclare(&AI); - return true; - } - -private: - // Befriend the base class so it can call through private visitor methods. - friend class InstVisitor<AllocaInfo, bool>; - - /// \brief A use pointer that is non-null when visiting uses. - Use *U; - - /// \brief A worklist for recursively visiting all uses of an alloca. - SmallVector<Use *, 8> UseWorklist; - - /// \brief A set for preventing cyclic visitation. 
- SmallPtrSet<Use *, 8> VisitedUses; - - void enqueueUsers(Instruction &I) { - for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE; - ++UI) - if (VisitedUses.insert(&UI.getUse())) - UseWorklist.push_back(&UI.getUse()); + DbgDeclare = FindAllocaDbgDeclare(AI); } - - bool visitLoadInst(LoadInst &LI) { - if (LI.isVolatile() || LI.getType() != AllocaTy) - return false; - - // Keep track of variable reads. - UsingBlocks.push_back(LI.getParent()); - AllocaPointerVal = &LI; - return true; - } - - bool visitStoreInst(StoreInst &SI) { - if (SI.isVolatile() || SI.getValueOperand() == U->get() || - SI.getValueOperand()->getType() != AllocaTy) - return false; - - // Remember the basic blocks which define new values for the alloca - DefiningBlocks.push_back(SI.getParent()); - AllocaPointerVal = SI.getOperand(0); - OnlyStore = &SI; - return true; - } - - bool visitBitCastInst(BitCastInst &BC) { - if (BC.use_empty()) - DeadInsts.push_back(&BC); - else - enqueueUsers(BC); - return true; - } - - bool visitGetElementPtrInst(GetElementPtrInst &GEPI) { - if (GEPI.use_empty()) { - DeadInsts.push_back(&GEPI); - return true; - } - - enqueueUsers(GEPI); - - return GEPI.hasAllZeroIndices(); - } - - // We can promote through debug info intrinsics as they don't alter the - // value stored in memory. - bool visitDbgInfoIntrinsic(DbgInfoIntrinsic &I) { - DeadInsts.push_back(&I); - return true; - } - - bool visitIntrinsicInst(IntrinsicInst &II) { - switch (II.getIntrinsicID()) { - default: - return false; - - // Lifetime intrinsics don't preclude promoting the memory to a register. - // FIXME: We should use these to promote to undef when outside of a valid - // lifetime. - case Intrinsic::lifetime_start: - case Intrinsic::lifetime_end: - DeadInsts.push_back(&II); - return true; - } - } - - // The fallback is that the alloca cannot be promoted. - bool visitInstruction(Instruction &I) { return false; } }; // Data package used by RenamePass() @@ -278,7 +235,6 @@ struct PromoteMem2Reg { std::vector<AllocaInst *> Allocas; DominatorTree &DT; DIBuilder DIB; - const DataLayout *DL; /// An AliasSetTracker object to update. If null, don't update it. AliasSetTracker *AST; @@ -324,9 +280,9 @@ struct PromoteMem2Reg { public: PromoteMem2Reg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT, - const DataLayout *DL, AliasSetTracker *AST) + AliasSetTracker *AST) : Allocas(Allocas.begin(), Allocas.end()), DT(DT), - DIB(*DT.getRoot()->getParent()->getParent()), DL(DL), AST(AST) {} + DIB(*DT.getRoot()->getParent()->getParent()), AST(AST) {} void run(); @@ -357,39 +313,27 @@ private: } // end of anonymous namespace -/// \brief Walk a small vector of dead instructions and recursively remove them -/// and subsequently dead instructions. -/// -/// This is only valid to call on dead instructions using an alloca which is -/// promotable, as we leverage that assumption to delete them faster. -static void removeDeadInstructions(AllocaInst *AI, - SmallVectorImpl<Instruction *> &DeadInsts) { - while (!DeadInsts.empty()) { - Instruction *I = DeadInsts.pop_back_val(); - - // Don't delete the alloca itself. - if (I == AI) - continue; +static void removeLifetimeIntrinsicUsers(AllocaInst *AI) { + // Knowing that this alloca is promotable, we know that it's safe to kill all + // instructions except for load and store. 
- // Note that we open code the deletion algorithm here because we know - // apriori that all of the instructions using an alloca that reaches here - // are trivially dead when their use list becomes empty (The only risk are - // lifetime markers which we specifically want to nuke). By coding it here - // we can skip the triviality test and be more efficient. - // - // Null out all of the instruction's operands to see if any operand becomes - // dead as we go. - for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; - ++OI) { - Instruction *Op = dyn_cast<Instruction>(*OI); - if (!Op) - continue; - - OI->set(0); - if (!Op->use_empty()) - continue; + for (Value::use_iterator UI = AI->use_begin(), UE = AI->use_end(); + UI != UE;) { + Instruction *I = cast<Instruction>(*UI); + ++UI; + if (isa<LoadInst>(I) || isa<StoreInst>(I)) + continue; - DeadInsts.push_back(Op); + if (!I->getType()->isVoidTy()) { + // The only users of this bitcast/GEP instruction are lifetime intrinsics. + // Follow the use/def chain to erase them now instead of leaving it for + // dead code elimination later. + for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); + UI != UE;) { + Instruction *Inst = cast<Instruction>(*UI); + ++UI; + Inst->eraseFromParent(); + } } I->eraseFromParent(); } @@ -474,6 +418,7 @@ static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info, DIBuilder DIB(*AI->getParent()->getParent()->getParent()); ConvertDebugDeclareToDebugValue(DDI, Info.OnlyStore, DIB); DDI->eraseFromParent(); + LBI.deleteValue(DDI); } // Remove the (now dead) store and alloca. Info.OnlyStore->eraseFromParent(); @@ -486,16 +431,6 @@ static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info, return true; } -namespace { -/// This is a helper predicate used to search by the first element of a pair. -struct StoreIndexSearchPredicate { - bool operator()(const std::pair<unsigned, StoreInst *> &LHS, - const std::pair<unsigned, StoreInst *> &RHS) { - return LHS.first < RHS.first; - } -}; -} - /// Many allocas are only used within a single basic block. If this is the /// case, avoid traversing the CFG and inserting a lot of potentially useless /// PHI nodes by just performing a single linear pass over the basic block @@ -528,8 +463,7 @@ static void promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, // Sort the stores by their index, making it efficient to do a lookup with a // binary search. - std::sort(StoresByIndex.begin(), StoresByIndex.end(), - StoreIndexSearchPredicate()); + std::sort(StoresByIndex.begin(), StoresByIndex.end(), less_first()); // Walk all of the loads from this alloca, replacing them with the nearest // store above them, if any. @@ -544,7 +478,7 @@ static void promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, StoresByIndexTy::iterator I = std::lower_bound(StoresByIndex.begin(), StoresByIndex.end(), std::make_pair(LoadIdx, static_cast<StoreInst *>(0)), - StoreIndexSearchPredicate()); + less_first()); if (I == StoresByIndex.begin()) // If there is no store before this load, the load takes the undef value. @@ -577,8 +511,10 @@ static void promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, LBI.deleteValue(AI); // The alloca's debuginfo can be removed as well. 
- if (DbgDeclareInst *DDI = Info.DbgDeclare) + if (DbgDeclareInst *DDI = Info.DbgDeclare) { DDI->eraseFromParent(); + LBI.deleteValue(DDI); + } ++NumLocalPromoted; } @@ -590,23 +526,17 @@ void PromoteMem2Reg::run() { PointerAllocaValues.resize(Allocas.size()); AllocaDbgDeclares.resize(Allocas.size()); - AllocaInfo Info(DL); + AllocaInfo Info; LargeBlockInfo LBI; for (unsigned AllocaNum = 0; AllocaNum != Allocas.size(); ++AllocaNum) { AllocaInst *AI = Allocas[AllocaNum]; + assert(isAllocaPromotable(AI) && "Cannot promote non-promotable alloca!"); assert(AI->getParent()->getParent() == &F && "All allocas should be in the same function, which is same as DF!"); - // Calculate the set of read and write-locations for each alloca. This is - // analogous to finding the 'uses' and 'definitions' of each variable. - bool Good = Info.analyzeAlloca(*AI); - (void)Good; - assert(Good && "Cannot promote non-promotable alloca!"); - - // Nuke all of the dead instructions. - removeDeadInstructions(AI, Info.DeadInsts); + removeLifetimeIntrinsicUsers(AI); if (AI->use_empty()) { // If there are no uses of the alloca, just delete it now. @@ -620,6 +550,10 @@ void PromoteMem2Reg::run() { continue; } + // Calculate the set of read and write-locations for each alloca. This is + // analogous to finding the 'uses' and 'definitions' of each variable. + Info.AnalyzeAlloca(AI); + // If there is only a single store to this value, replace any loads of // it that are directly dominated by the definition with the value stored. if (Info.DefiningBlocks.size() == 1) { @@ -904,16 +838,6 @@ void PromoteMem2Reg::ComputeLiveInBlocks( } } -namespace { -typedef std::pair<DomTreeNode *, unsigned> DomTreeNodePair; - -struct DomTreeNodeCompare { - bool operator()(const DomTreeNodePair &LHS, const DomTreeNodePair &RHS) { - return LHS.second < RHS.second; - } -}; -} // end anonymous namespace - /// At this point, we're committed to promoting the alloca using IDF's, and the /// standard SSA construction algorithm. Determine which blocks need phi nodes /// and see if we can optimize out some work by avoiding insertion of dead phi @@ -931,9 +855,9 @@ void PromoteMem2Reg::DetermineInsertionPoint(AllocaInst *AI, unsigned AllocaNum, // Use a priority queue keyed on dominator tree level so that inserted nodes // are handled from the bottom of the dominator tree upwards. - typedef std::priority_queue<DomTreeNodePair, - SmallVector<DomTreeNodePair, 32>, - DomTreeNodeCompare> IDFPriorityQueue; + typedef std::pair<DomTreeNode *, unsigned> DomTreeNodePair; + typedef std::priority_queue<DomTreeNodePair, SmallVector<DomTreeNodePair, 32>, + less_second> IDFPriorityQueue; IDFPriorityQueue PQ; for (SmallPtrSet<BasicBlock *, 32>::const_iterator I = DefBlocks.begin(), @@ -1145,19 +1069,11 @@ NextIteration: goto NextIteration; } -bool llvm::isAllocaPromotable(const AllocaInst *AI, const DataLayout *DL) { - // We cast away constness because we re-use the non-const analysis that the - // actual promotion routine uses. While it is non-const, it doesn't actually - // mutate anything at this phase, and we discard the non-const results that - // promotion uses to mutate the alloca. - return AllocaInfo(DL).analyzeAlloca(*const_cast<AllocaInst *>(AI)); -} - void llvm::PromoteMemToReg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT, - const DataLayout *DL, AliasSetTracker *AST) { + AliasSetTracker *AST) { // If there is nothing to do, bail out... 
if (Allocas.empty()) return; - PromoteMem2Reg(Allocas, DT, DL, AST).run(); + PromoteMem2Reg(Allocas, DT, AST).run(); } diff --git a/lib/Transforms/Utils/SSAUpdater.cpp b/lib/Transforms/Utils/SSAUpdater.cpp index fc85ef3..30adbfa 100644 --- a/lib/Transforms/Utils/SSAUpdater.cpp +++ b/lib/Transforms/Utils/SSAUpdater.cpp @@ -63,7 +63,7 @@ void SSAUpdater::AddAvailableValue(BasicBlock *BB, Value *V) { } static bool IsEquivalentPHI(PHINode *PHI, - DenseMap<BasicBlock*, Value*> &ValueMapping) { + SmallDenseMap<BasicBlock*, Value*, 8> &ValueMapping) { unsigned PHINumValues = PHI->getNumIncomingValues(); if (PHINumValues != ValueMapping.size()) return false; @@ -136,8 +136,8 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { // Otherwise, we do need a PHI: check to see if we already have one available // in this block that produces the right value. if (isa<PHINode>(BB->begin())) { - DenseMap<BasicBlock*, Value*> ValueMapping(PredValues.begin(), - PredValues.end()); + SmallDenseMap<BasicBlock*, Value*, 8> ValueMapping(PredValues.begin(), + PredValues.end()); PHINode *SomePHI; for (BasicBlock::iterator It = BB->begin(); (SomePHI = dyn_cast<PHINode>(It)); ++It) { diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index c4c1423..ff50b12 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -19,6 +19,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -475,9 +476,13 @@ Value *SimplifyCFGOpt::isValueEqualityComparison(TerminatorInst *TI) { CV = ICI->getOperand(0); // Unwrap any lossless ptrtoint cast. - if (TD && CV && CV->getType() == TD->getIntPtrType(CV->getContext())) - if (PtrToIntInst *PTII = dyn_cast<PtrToIntInst>(CV)) - CV = PTII->getOperand(0); + if (TD && CV) { + if (PtrToIntInst *PTII = dyn_cast<PtrToIntInst>(CV)) { + Value *Ptr = PTII->getPointerOperand(); + if (PTII->getType() == TD->getIntPtrType(Ptr->getType())) + CV = Ptr; + } + } return CV; } @@ -699,9 +704,10 @@ namespace { }; } -static int ConstantIntSortPredicate(const void *P1, const void *P2) { - const ConstantInt *LHS = *(const ConstantInt*const*)P1; - const ConstantInt *RHS = *(const ConstantInt*const*)P2; +static int ConstantIntSortPredicate(ConstantInt *const *P1, + ConstantInt *const *P2) { + const ConstantInt *LHS = *P1; + const ConstantInt *RHS = *P2; if (LHS->getValue().ult(RHS->getValue())) return 1; if (LHS->getValue() == RHS->getValue()) @@ -924,7 +930,7 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, // Convert pointer to int before we switch. if (CV->getType()->isPointerTy()) { assert(TD && "Cannot switch on pointer without DataLayout"); - CV = Builder.CreatePtrToInt(CV, TD->getIntPtrType(CV->getContext()), + CV = Builder.CreatePtrToInt(CV, TD->getIntPtrType(CV->getType()), "magicptr"); } @@ -1556,6 +1562,19 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB) { return true; } +/// \returns True if this block contains a CallInst with the NoDuplicate +/// attribute. 
+static bool HasNoDuplicateCall(const BasicBlock *BB) { + for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + const CallInst *CI = dyn_cast<CallInst>(I); + if (!CI) + continue; + if (CI->cannotDuplicate()) + return true; + } + return false; +} + /// BlockIsSimpleEnoughToThreadThrough - Return true if we can thread a branch /// across this block. static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) { @@ -1603,6 +1622,8 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout *TD) { // Now we know that this block has multiple preds and two succs. if (!BlockIsSimpleEnoughToThreadThrough(BB)) return false; + if (HasNoDuplicateCall(BB)) return false; + // Okay, this is a simple enough basic block. See if any phi values are // constants. for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { @@ -2069,14 +2090,19 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { // Ensure that any values used in the bonus instruction are also used // by the terminator of the predecessor. This means that those values // must already have been resolved, so we won't be inhibiting the - // out-of-order core by speculating them earlier. - if (BonusInst) { + // out-of-order core by speculating them earlier. We also allow + // instructions that are used by the terminator's condition because it + // exposes more merging opportunities. + bool UsedByBranch = (BonusInst && BonusInst->hasOneUse() && + *BonusInst->use_begin() == Cond); + + if (BonusInst && !UsedByBranch) { // Collect the values used by the bonus inst SmallPtrSet<Value*, 4> UsedValues; for (Instruction::op_iterator OI = BonusInst->op_begin(), OE = BonusInst->op_end(); OI != OE; ++OI) { Value *V = *OI; - if (!isa<Constant>(V)) + if (!isa<Constant>(V) && !isa<Argument>(V)) UsedValues.insert(V); } @@ -2787,7 +2813,7 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, const DataLayout *TD, if (CompVal->getType()->isPointerTy()) { assert(TD && "Cannot switch on pointer without DataLayout"); CompVal = Builder.CreatePtrToInt(CompVal, - TD->getIntPtrType(CompVal->getContext()), + TD->getIntPtrType(CompVal->getType()), "magicptr"); } @@ -3160,7 +3186,7 @@ static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) { /// and use it to remove dead cases. static bool EliminateDeadSwitchCases(SwitchInst *SI) { Value *Cond = SI->getCondition(); - unsigned Bits = cast<IntegerType>(Cond->getType())->getBitWidth(); + unsigned Bits = Cond->getType()->getIntegerBitWidth(); APInt KnownZero(Bits, 0), KnownOne(Bits, 0); ComputeMaskedBits(Cond, KnownZero, KnownOne); @@ -3303,28 +3329,10 @@ static Constant *LookupConstant(Value *V, /// simple instructions such as binary operations where both operands are /// constant or can be replaced by constants from the ConstantPool. Returns the /// resulting constant on success, 0 otherwise. 
-static Constant *ConstantFold(Instruction *I, - const SmallDenseMap<Value*, Constant*>& ConstantPool) { - if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) { - Constant *A = LookupConstant(BO->getOperand(0), ConstantPool); - if (!A) - return 0; - Constant *B = LookupConstant(BO->getOperand(1), ConstantPool); - if (!B) - return 0; - return ConstantExpr::get(BO->getOpcode(), A, B); - } - - if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) { - Constant *A = LookupConstant(I->getOperand(0), ConstantPool); - if (!A) - return 0; - Constant *B = LookupConstant(I->getOperand(1), ConstantPool); - if (!B) - return 0; - return ConstantExpr::getCompare(Cmp->getPredicate(), A, B); - } - +static Constant * +ConstantFold(Instruction *I, + const SmallDenseMap<Value *, Constant *> &ConstantPool, + const DataLayout *DL) { if (SelectInst *Select = dyn_cast<SelectInst>(I)) { Constant *A = LookupConstant(Select->getCondition(), ConstantPool); if (!A) @@ -3336,14 +3344,19 @@ static Constant *ConstantFold(Instruction *I, return 0; } - if (CastInst *Cast = dyn_cast<CastInst>(I)) { - Constant *A = LookupConstant(I->getOperand(0), ConstantPool); - if (!A) + SmallVector<Constant *, 4> COps; + for (unsigned N = 0, E = I->getNumOperands(); N != E; ++N) { + if (Constant *A = LookupConstant(I->getOperand(N), ConstantPool)) + COps.push_back(A); + else return 0; - return ConstantExpr::getCast(Cast->getOpcode(), A, Cast->getDestTy()); } - return 0; + if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) + return ConstantFoldCompareInstOperands(Cmp->getPredicate(), COps[0], + COps[1], DL); + + return ConstantFoldInstOperands(I->getOpcode(), I->getType(), COps, DL); } /// GetCaseResults - Try to determine the resulting constant values in phi nodes @@ -3355,7 +3368,8 @@ GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest, BasicBlock **CommonDest, - SmallVectorImpl<std::pair<PHINode*,Constant*> > &Res) { + SmallVectorImpl<std::pair<PHINode *, Constant *> > &Res, + const DataLayout *DL) { // The block from which we enter the common destination. BasicBlock *Pred = SI->getParent(); @@ -3374,7 +3388,7 @@ GetCaseResults(SwitchInst *SI, } else if (isa<DbgInfoIntrinsic>(I)) { // Skip debug intrinsic. continue; - } else if (Constant *C = ConstantFold(I, ConstantPool)) { + } else if (Constant *C = ConstantFold(I, ConstantPool, DL)) { // Instruction is side-effect free and constant. ConstantPool.insert(std::make_pair(I, C)); } else { @@ -3698,7 +3712,7 @@ static bool SwitchToLookupTable(SwitchInst *SI, typedef SmallVector<std::pair<PHINode*, Constant*>, 4> ResultsTy; ResultsTy Results; if (!GetCaseResults(SI, CaseVal, CI.getCaseSuccessor(), &CommonDest, - Results)) + Results, TD)) return false; // Append the result from this case to the list for each phi. @@ -3712,7 +3726,7 @@ static bool SwitchToLookupTable(SwitchInst *SI, // Get the resulting values for the default case. SmallVector<std::pair<PHINode*, Constant*>, 4> DefaultResultsList; if (!GetCaseResults(SI, 0, SI->getDefaultDest(), &CommonDest, - DefaultResultsList)) + DefaultResultsList, TD)) return false; for (size_t I = 0, E = DefaultResultsList.size(); I != E; ++I) { PHINode *PHI = DefaultResultsList[I].first; @@ -3733,14 +3747,32 @@ static bool SwitchToLookupTable(SwitchInst *SI, CommonDest->getParent(), CommonDest); - // Check whether the condition value is within the case range, and branch to - // the new BB. + // Compute the table index value. 
 Builder.SetInsertPoint(SI);
 Value *TableIndex = Builder.CreateSub(SI->getCondition(), MinCaseVal,
 "switch.tableidx");
- Value *Cmp = Builder.CreateICmpULT(TableIndex, ConstantInt::get(
- MinCaseVal->getType(), TableSize));
- Builder.CreateCondBr(Cmp, LookupBB, SI->getDefaultDest());
+
+ // Compute the maximum table size representable by the integer type we are
+ // switching upon.
+ unsigned CaseSize = MinCaseVal->getType()->getPrimitiveSizeInBits();
+ uint64_t MaxTableSize = CaseSize > 63? UINT64_MAX : 1ULL << CaseSize;
+ assert(MaxTableSize >= TableSize &&
+ "It is impossible for a switch to have more entries than the max "
+ "representable value of its input integer type's size.");
+
+ // If we have a fully covered lookup table, unconditionally branch to the
+ // lookup table BB. Otherwise, check if the condition value is within the case
+ // range. If so, branch to the new BB. Otherwise branch to SI's default
+ // destination.
+ const bool GeneratingCoveredLookupTable = MaxTableSize == TableSize;
+ if (GeneratingCoveredLookupTable) {
+ Builder.CreateBr(LookupBB);
+ SI->getDefaultDest()->removePredecessor(SI->getParent());
+ } else {
+ Value *Cmp = Builder.CreateICmpULT(TableIndex, ConstantInt::get(
+ MinCaseVal->getType(), TableSize));
+ Builder.CreateCondBr(Cmp, LookupBB, SI->getDefaultDest());
+ }
 // Populate the BB that does the lookups.
 Builder.SetInsertPoint(LookupBB);
@@ -3769,9 +3801,11 @@ static bool SwitchToLookupTable(SwitchInst *SI,
 Builder.CreateBr(CommonDest);
 // Remove the switch.
- for (unsigned i = 0; i < SI->getNumSuccessors(); ++i) {
+ for (unsigned i = 0, e = SI->getNumSuccessors(); i < e; ++i) {
 BasicBlock *Succ = SI->getSuccessor(i);
- if (Succ == SI->getDefaultDest()) continue;
+
+ if (Succ == SI->getDefaultDest())
+ continue;
 Succ->removePredecessor(SI->getParent());
 }
 SI->eraseFromParent();
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 094c201..15b3e66 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -17,6 +17,7 @@
 #include "llvm/Transforms/Utils/SimplifyLibCalls.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/Triple.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Function.h"
@@ -26,11 +27,16 @@
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/Allocator.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/BuildLibCalls.h"
 using namespace llvm;
+static cl::opt<bool>
+ColdErrorCalls("error-reporting-is-cold", cl::init(true),
+ cl::Hidden, cl::desc("Treat error-reporting calls as cold"));
+
 /// This class is the abstract base class for the set of optimizations that
 /// corresponds to one library call.
 namespace {
@@ -118,6 +124,21 @@ static bool callHasFloatingPointArgument(const CallInst *CI) {
 return false;
 }
+/// \brief Check whether the overloaded unary floating point function
+/// corresponding to \a Ty is available.
+static bool hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty, + LibFunc::Func DoubleFn, LibFunc::Func FloatFn, + LibFunc::Func LongDoubleFn) { + switch (Ty->getTypeID()) { + case Type::FloatTyID: + return TLI->has(FloatFn); + case Type::DoubleTyID: + return TLI->has(DoubleFn); + default: + return TLI->has(LongDoubleFn); + } +} + //===----------------------------------------------------------------------===// // Fortified Library Call Optimizations //===----------------------------------------------------------------------===// @@ -477,7 +498,7 @@ struct StrChrOpt : public LibCallOptimization { // Compute the offset, make sure to handle the case when we're searching for // zero (a weird way to spell strlen). - size_t I = CharC->getSExtValue() == 0 ? + size_t I = (0xFF & CharC->getSExtValue()) == 0 ? Str.size() : Str.find(CharC->getSExtValue()); if (I == StringRef::npos) // Didn't find the char. strchr returns null. return Constant::getNullValue(CI->getType()); @@ -513,7 +534,7 @@ struct StrRChrOpt : public LibCallOptimization { } // Compute the offset. - size_t I = CharC->getSExtValue() == 0 ? + size_t I = (0xFF & CharC->getSExtValue()) == 0 ? Str.size() : Str.rfind(CharC->getSExtValue()); if (I == StringRef::npos) // Didn't find the char. Return null. return Constant::getNullValue(CI->getType()); @@ -774,7 +795,7 @@ struct StrPBrkOpt : public LibCallOptimization { // Constant folding. if (HasS1 && HasS2) { size_t I = S1.find_first_of(S2); - if (I == std::string::npos) // No match. + if (I == StringRef::npos) // No match. return Constant::getNullValue(CI->getType()); return B.CreateGEP(CI->getArgOperand(0), B.getInt64(I), "strpbrk"); @@ -912,7 +933,7 @@ struct StrStrOpt : public LibCallOptimization { // If both strings are known, constant fold it. if (HasStr1 && HasStr2) { - std::string::size_type Offset = SearchStr.find(ToFindStr); + size_t Offset = SearchStr.find(ToFindStr); if (Offset == StringRef::npos) // strstr("foo", "bar") -> null return Constant::getNullValue(CI->getType()); @@ -1031,7 +1052,7 @@ struct MemSetOpt : public LibCallOptimization { if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isIntegerTy() || - FT->getParamType(2) != TD->getIntPtrType(*Context)) + FT->getParamType(2) != TD->getIntPtrType(FT->getParamType(0))) return 0; // memset(p, v, n) -> llvm.memset(p, v, n, 1) @@ -1133,9 +1154,13 @@ struct PowOpt : public UnsafeFPLibCallOptimization { Value *Op1 = CI->getArgOperand(0), *Op2 = CI->getArgOperand(1); if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) { - if (Op1C->isExactlyValue(1.0)) // pow(1.0, x) -> 1.0 + // pow(1.0, x) -> 1.0 + if (Op1C->isExactlyValue(1.0)) return Op1C; - if (Op1C->isExactlyValue(2.0)) // pow(2.0, x) -> exp2(x) + // pow(2.0, x) -> exp2(x) + if (Op1C->isExactlyValue(2.0) && + hasUnaryFloatFn(TLI, Op1->getType(), LibFunc::exp2, LibFunc::exp2f, + LibFunc::exp2l)) return EmitUnaryFloatFnCall(Op2, "exp2", B, Callee->getAttributes()); } @@ -1145,7 +1170,11 @@ struct PowOpt : public UnsafeFPLibCallOptimization { if (Op2C->getValueAPF().isZero()) // pow(x, 0.0) -> 1.0 return ConstantFP::get(CI->getType(), 1.0); - if (Op2C->isExactlyValue(0.5)) { + if (Op2C->isExactlyValue(0.5) && + hasUnaryFloatFn(TLI, Op2->getType(), LibFunc::sqrt, LibFunc::sqrtf, + LibFunc::sqrtl) && + hasUnaryFloatFn(TLI, Op2->getType(), LibFunc::fabs, LibFunc::fabsf, + LibFunc::fabsl)) { // Expand pow(x, 0.5) to (x == -infinity ? +infinity : fabs(sqrt(x))). 
// This is faster than calling pow, and still handles negative zero // and negative infinity correctly. @@ -1178,7 +1207,7 @@ struct Exp2Opt : public UnsafeFPLibCallOptimization { virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { Value *Ret = NULL; if (UnsafeFPShrink && Callee->getName() == "exp2" && - TLI->has(LibFunc::exp2)) { + TLI->has(LibFunc::exp2f)) { UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true); Ret = UnsafeUnaryDoubleFP.callOptimizer(Callee, CI, B); } @@ -1229,6 +1258,155 @@ struct Exp2Opt : public UnsafeFPLibCallOptimization { } }; +struct SinCosPiOpt : public LibCallOptimization { + SinCosPiOpt() {} + + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Make sure the prototype is as expected, otherwise the rest of the + // function is probably invalid and likely to abort. + if (!isTrigLibCall(CI)) + return 0; + + Value *Arg = CI->getArgOperand(0); + SmallVector<CallInst *, 1> SinCalls; + SmallVector<CallInst *, 1> CosCalls; + SmallVector<CallInst *, 1> SinCosCalls; + + bool IsFloat = Arg->getType()->isFloatTy(); + + // Look for all compatible sinpi, cospi and sincospi calls with the same + // argument. If there are enough (in some sense) we can make the + // substitution. + for (Value::use_iterator UI = Arg->use_begin(), UE = Arg->use_end(); + UI != UE; ++UI) + classifyArgUse(*UI, CI->getParent(), IsFloat, SinCalls, CosCalls, + SinCosCalls); + + // It's only worthwhile if both sinpi and cospi are actually used. + if (SinCosCalls.empty() && (SinCalls.empty() || CosCalls.empty())) + return 0; + + Value *Sin, *Cos, *SinCos; + insertSinCosCall(B, CI->getCalledFunction(), Arg, IsFloat, Sin, Cos, + SinCos); + + replaceTrigInsts(SinCalls, Sin); + replaceTrigInsts(CosCalls, Cos); + replaceTrigInsts(SinCosCalls, SinCos); + + return 0; + } + + bool isTrigLibCall(CallInst *CI) { + Function *Callee = CI->getCalledFunction(); + FunctionType *FT = Callee->getFunctionType(); + + // We can only hope to do anything useful if we can ignore things like errno + // and floating-point exceptions. 
+ bool AttributesSafe = CI->hasFnAttr(Attribute::NoUnwind) && + CI->hasFnAttr(Attribute::ReadNone); + + // Other than that we need float(float) or double(double) + return AttributesSafe && FT->getNumParams() == 1 && + FT->getReturnType() == FT->getParamType(0) && + (FT->getParamType(0)->isFloatTy() || + FT->getParamType(0)->isDoubleTy()); + } + + void classifyArgUse(Value *Val, BasicBlock *BB, bool IsFloat, + SmallVectorImpl<CallInst *> &SinCalls, + SmallVectorImpl<CallInst *> &CosCalls, + SmallVectorImpl<CallInst *> &SinCosCalls) { + CallInst *CI = dyn_cast<CallInst>(Val); + + if (!CI) + return; + + Function *Callee = CI->getCalledFunction(); + StringRef FuncName = Callee->getName(); + LibFunc::Func Func; + if (!TLI->getLibFunc(FuncName, Func) || !TLI->has(Func) || + !isTrigLibCall(CI)) + return; + + if (IsFloat) { + if (Func == LibFunc::sinpif) + SinCalls.push_back(CI); + else if (Func == LibFunc::cospif) + CosCalls.push_back(CI); + else if (Func == LibFunc::sincospi_stretf) + SinCosCalls.push_back(CI); + } else { + if (Func == LibFunc::sinpi) + SinCalls.push_back(CI); + else if (Func == LibFunc::cospi) + CosCalls.push_back(CI); + else if (Func == LibFunc::sincospi_stret) + SinCosCalls.push_back(CI); + } + } + + void replaceTrigInsts(SmallVectorImpl<CallInst*> &Calls, Value *Res) { + for (SmallVectorImpl<CallInst*>::iterator I = Calls.begin(), + E = Calls.end(); + I != E; ++I) { + LCS->replaceAllUsesWith(*I, Res); + } + } + + void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg, + bool UseFloat, Value *&Sin, Value *&Cos, + Value *&SinCos) { + Type *ArgTy = Arg->getType(); + Type *ResTy; + StringRef Name; + + Triple T(OrigCallee->getParent()->getTargetTriple()); + if (UseFloat) { + Name = "__sincospi_stretf"; + + assert(T.getArch() != Triple::x86 && "x86 messy and unsupported for now"); + // x86_64 can't use {float, float} since that would be returned in both + // xmm0 and xmm1, which isn't what a real struct would do. + ResTy = T.getArch() == Triple::x86_64 + ? static_cast<Type *>(VectorType::get(ArgTy, 2)) + : static_cast<Type *>(StructType::get(ArgTy, ArgTy, NULL)); + } else { + Name = "__sincospi_stret"; + ResTy = StructType::get(ArgTy, ArgTy, NULL); + } + + Module *M = OrigCallee->getParent(); + Value *Callee = M->getOrInsertFunction(Name, OrigCallee->getAttributes(), + ResTy, ArgTy, NULL); + + if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) { + // If the argument is an instruction, it must dominate all uses so put our + // sincos call there. + BasicBlock::iterator Loc = ArgInst; + B.SetInsertPoint(ArgInst->getParent(), ++Loc); + } else { + // Otherwise (e.g. for a constant) the beginning of the function is as + // good a place as any. 
+ BasicBlock &EntryBB = B.GetInsertBlock()->getParent()->getEntryBlock(); + B.SetInsertPoint(&EntryBB, EntryBB.begin()); + } + + SinCos = B.CreateCall(Callee, Arg, "sincospi"); + + if (SinCos->getType()->isStructTy()) { + Sin = B.CreateExtractValue(SinCos, 0, "sinpi"); + Cos = B.CreateExtractValue(SinCos, 1, "cospi"); + } else { + Sin = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 0), + "sinpi"); + Cos = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 1), + "cospi"); + } + } + +}; + //===----------------------------------------------------------------------===// // Integer Library Call Optimizations //===----------------------------------------------------------------------===// @@ -1333,6 +1511,54 @@ struct ToAsciiOpt : public LibCallOptimization { // Formatting and IO Library Call Optimizations //===----------------------------------------------------------------------===// +struct ErrorReportingOpt : public LibCallOptimization { + ErrorReportingOpt(int S = -1) : StreamArg(S) {} + + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &) { + // Error reporting calls should be cold, mark them as such. + // This applies even to non-builtin calls: it is only a hint and applies to + // functions that the frontend might not understand as builtins. + + // This heuristic was suggested in: + // Improving Static Branch Prediction in a Compiler + // Brian L. Deitrich, Ben-Chung Cheng, Wen-mei W. Hwu + // Proceedings of PACT'98, Oct. 1998, IEEE + + if (!CI->hasFnAttr(Attribute::Cold) && isReportingError(Callee, CI)) { + CI->addAttribute(AttributeSet::FunctionIndex, Attribute::Cold); + } + + return 0; + } + +protected: + bool isReportingError(Function *Callee, CallInst *CI) { + if (!ColdErrorCalls) + return false; + + if (!Callee || !Callee->isDeclaration()) + return false; + + if (StreamArg < 0) + return true; + + // These functions might be considered cold, but only if their stream + // argument is stderr. + + if (StreamArg >= (int) CI->getNumArgOperands()) + return false; + LoadInst *LI = dyn_cast<LoadInst>(CI->getArgOperand(StreamArg)); + if (!LI) + return false; + GlobalVariable *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand()); + if (!GV || !GV->isDeclaration()) + return false; + return GV->getName() == "stderr"; + } + + int StreamArg; +}; + struct PrintFOpt : public LibCallOptimization { Value *optimizeFixedFormatString(Function *Callee, CallInst *CI, IRBuilder<> &B) { @@ -1361,7 +1587,7 @@ struct PrintFOpt : public LibCallOptimization { // printf("foo\n") --> puts("foo") if (FormatStr[FormatStr.size()-1] == '\n' && - FormatStr.find('%') == std::string::npos) { // no format characters. + FormatStr.find('%') == StringRef::npos) { // No format characters. // Create a string literal with no \n on it. We expect the constant merge // pass to be run after this pass, to merge duplicate strings. FormatStr = FormatStr.drop_back(); @@ -1513,6 +1739,9 @@ struct SPrintFOpt : public LibCallOptimization { struct FPrintFOpt : public LibCallOptimization { Value *optimizeFixedFormatString(Function *Callee, CallInst *CI, IRBuilder<> &B) { + ErrorReportingOpt ER(/* StreamArg = */ 0); + (void) ER.callOptimizer(Callee, CI, B); + // All the optimizations depend on the format string. 
StringRef FormatStr; if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr)) @@ -1590,6 +1819,9 @@ struct FPrintFOpt : public LibCallOptimization { struct FWriteOpt : public LibCallOptimization { virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + ErrorReportingOpt ER(/* StreamArg = */ 3); + (void) ER.callOptimizer(Callee, CI, B); + // Require a pointer, an integer, an integer, a pointer, returning integer. FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 4 || !FT->getParamType(0)->isPointerTy() || @@ -1623,6 +1855,9 @@ struct FWriteOpt : public LibCallOptimization { struct FPutsOpt : public LibCallOptimization { virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + ErrorReportingOpt ER(/* StreamArg = */ 1); + (void) ER.callOptimizer(Callee, CI, B); + // These optimizations require DataLayout. if (!TD) return 0; @@ -1741,6 +1976,7 @@ static MemSetOpt MemSet; // Math library call optimizations. static UnaryDoubleFPOpt UnaryDoubleFP(false); static UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true); +static SinCosPiOpt SinCosPi; // Integer library call optimizations. static FFSOpt FFS; @@ -1750,6 +1986,9 @@ static IsAsciiOpt IsAscii; static ToAsciiOpt ToAscii; // Formatting and IO library call optimizations. +static ErrorReportingOpt ErrorReporting; +static ErrorReportingOpt ErrorReporting0(0); +static ErrorReportingOpt ErrorReporting1(1); static PrintFOpt PrintF; static SPrintFOpt SPrintF; static FPrintFOpt FPrintF; @@ -1825,6 +2064,11 @@ LibCallOptimization *LibCallSimplifierImpl::lookupOptimization(CallInst *CI) { case LibFunc::cos: case LibFunc::cosl: return &Cos; + case LibFunc::sinpif: + case LibFunc::sinpi: + case LibFunc::cospif: + case LibFunc::cospi: + return &SinCosPi; case LibFunc::powf: case LibFunc::pow: case LibFunc::powl: @@ -1859,6 +2103,13 @@ LibCallOptimization *LibCallSimplifierImpl::lookupOptimization(CallInst *CI) { return &FPuts; case LibFunc::puts: return &Puts; + case LibFunc::perror: + return &ErrorReporting; + case LibFunc::vfprintf: + case LibFunc::fiprintf: + return &ErrorReporting0; + case LibFunc::fputc: + return &ErrorReporting1; case LibFunc::ceil: case LibFunc::fabs: case LibFunc::floor: diff --git a/lib/Transforms/Utils/SpecialCaseList.cpp b/lib/Transforms/Utils/SpecialCaseList.cpp index b98cb5b..2ef692c 100644 --- a/lib/Transforms/Utils/SpecialCaseList.cpp +++ b/lib/Transforms/Utils/SpecialCaseList.cpp @@ -49,29 +49,45 @@ struct SpecialCaseList::Entry { } }; -SpecialCaseList::SpecialCaseList(const StringRef Path) { - // Validate and open blacklist file. 
- if (Path.empty()) return; +SpecialCaseList::SpecialCaseList() : Entries() {} + +SpecialCaseList *SpecialCaseList::create( + const StringRef Path, std::string &Error) { + if (Path.empty()) + return new SpecialCaseList(); OwningPtr<MemoryBuffer> File; if (error_code EC = MemoryBuffer::getFile(Path, File)) { - report_fatal_error("Can't open blacklist file: " + Path + ": " + - EC.message()); + Error = (Twine("Can't open file '") + Path + "': " + EC.message()).str(); + return 0; } + return create(File.get(), Error); +} - init(File.get()); +SpecialCaseList *SpecialCaseList::create( + const MemoryBuffer *MB, std::string &Error) { + OwningPtr<SpecialCaseList> SCL(new SpecialCaseList()); + if (!SCL->parse(MB, Error)) + return 0; + return SCL.take(); } -SpecialCaseList::SpecialCaseList(const MemoryBuffer *MB) { - init(MB); +SpecialCaseList *SpecialCaseList::createOrDie(const StringRef Path) { + std::string Error; + if (SpecialCaseList *SCL = create(Path, Error)) + return SCL; + report_fatal_error(Error); } -void SpecialCaseList::init(const MemoryBuffer *MB) { +bool SpecialCaseList::parse(const MemoryBuffer *MB, std::string &Error) { // Iterate through each line in the blacklist file. SmallVector<StringRef, 16> Lines; SplitString(MB->getBuffer(), Lines, "\n\r"); StringMap<StringMap<std::string> > Regexps; + assert(Entries.empty() && + "parse() should be called on an empty SpecialCaseList"); + int LineNo = 1; for (SmallVectorImpl<StringRef>::iterator I = Lines.begin(), E = Lines.end(); - I != E; ++I) { + I != E; ++I, ++LineNo) { // Ignore empty lines and lines starting with "#" if (I->empty() || I->startswith("#")) continue; @@ -80,7 +96,9 @@ void SpecialCaseList::init(const MemoryBuffer *MB) { StringRef Prefix = SplitLine.first; if (SplitLine.second.empty()) { // Missing ':' in the line. - report_fatal_error("malformed blacklist line: " + SplitLine.first); + Error = (Twine("Malformed line ") + Twine(LineNo) + ": '" + + SplitLine.first + "'").str(); + return false; } std::pair<StringRef, StringRef> SplitRegexp = SplitLine.second.split("="); @@ -113,10 +131,11 @@ void SpecialCaseList::init(const MemoryBuffer *MB) { // Check that the regexp is valid. Regex CheckRE(Regexp); - std::string Error; - if (!CheckRE.isValid(Error)) { - report_fatal_error("malformed blacklist regex: " + SplitLine.second + - ": " + Error); + std::string REError; + if (!CheckRE.isValid(REError)) { + Error = (Twine("Malformed regex in line ") + Twine(LineNo) + ": '" + + SplitLine.second + "': " + REError).str(); + return false; } // Add this regexp into the proper group by its prefix. @@ -135,6 +154,7 @@ void SpecialCaseList::init(const MemoryBuffer *MB) { Entries[I->getKey()][II->getKey()].RegEx = new Regex(II->getValue()); } } + return true; } SpecialCaseList::~SpecialCaseList() { @@ -149,18 +169,12 @@ SpecialCaseList::~SpecialCaseList() { } } -bool SpecialCaseList::findCategory(const Function &F, - StringRef &Category) const { - return findCategory(*F.getParent(), Category) || - findCategory("fun", F.getName(), Category); -} - bool SpecialCaseList::isIn(const Function& F, const StringRef Category) const { return isIn(*F.getParent(), Category) || inSectionCategory("fun", F.getName(), Category); } -static StringRef GetGVTypeString(const GlobalVariable &G) { +static StringRef GetGlobalTypeString(const GlobalValue &G) { // Types of GlobalVariables are always pointer types. Type *GType = G.getType()->getElementType(); // For now we support blacklisting struct types only. 
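The create()/parse() factory interface above replaces the old constructors that aborted via report_fatal_error, so a malformed blacklist now surfaces as an Error string the caller can handle. A minimal consumer sketch using only the entry points shown in this hunk; the Path variable and the "init" category name are illustrative, not part of the patch:

 // Hedged sketch: load a special-case list and honor it in a pass.
 std::string Error;
 OwningPtr<SpecialCaseList> SCL(SpecialCaseList::create(Path, Error));
 if (!SCL)
   report_fatal_error(Error); // same net effect as createOrDie(Path)
 // Queries are unchanged; e.g. skip work for a blacklisted module:
 if (SCL->isIn(M, "init"))
   return false;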
@@ -171,46 +185,29 @@ static StringRef GetGVTypeString(const GlobalVariable &G) { return "<unknown type>"; } -bool SpecialCaseList::findCategory(const GlobalVariable &G, - StringRef &Category) const { - return findCategory(*G.getParent(), Category) || - findCategory("global", G.getName(), Category) || - findCategory("type", GetGVTypeString(G), Category); -} - bool SpecialCaseList::isIn(const GlobalVariable &G, const StringRef Category) const { return isIn(*G.getParent(), Category) || inSectionCategory("global", G.getName(), Category) || - inSectionCategory("type", GetGVTypeString(G), Category); + inSectionCategory("type", GetGlobalTypeString(G), Category); } -bool SpecialCaseList::findCategory(const Module &M, StringRef &Category) const { - return findCategory("src", M.getModuleIdentifier(), Category); +bool SpecialCaseList::isIn(const GlobalAlias &GA, + const StringRef Category) const { + if (isIn(*GA.getParent(), Category)) + return true; + + if (isa<FunctionType>(GA.getType()->getElementType())) + return inSectionCategory("fun", GA.getName(), Category); + + return inSectionCategory("global", GA.getName(), Category) || + inSectionCategory("type", GetGlobalTypeString(GA), Category); } bool SpecialCaseList::isIn(const Module &M, const StringRef Category) const { return inSectionCategory("src", M.getModuleIdentifier(), Category); } -bool SpecialCaseList::findCategory(const StringRef Section, - const StringRef Query, - StringRef &Category) const { - StringMap<StringMap<Entry> >::const_iterator I = Entries.find(Section); - if (I == Entries.end()) return false; - - for (StringMap<Entry>::const_iterator II = I->second.begin(), - IE = I->second.end(); - II != IE; ++II) { - if (II->getValue().match(Query)) { - Category = II->first(); - return true; - } - } - - return false; -} - bool SpecialCaseList::inSectionCategory(const StringRef Section, const StringRef Query, const StringRef Category) const { diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp index cbc1d63..c5e1dcb 100644 --- a/lib/Transforms/Vectorize/BBVectorize.cpp +++ b/lib/Transforms/Vectorize/BBVectorize.cpp @@ -533,7 +533,7 @@ namespace { default: break; case Instruction::GetElementPtr: // We mark this instruction as zero-cost because scalar GEPs are usually - // lowered to the intruction addressing mode. At the moment we don't + // lowered to the instruction addressing mode. At the moment we don't // generate vector GEPs. return 0; case Instruction::Br: @@ -625,10 +625,10 @@ namespace { ConstantInt *IntOff = ConstOffSCEV->getValue(); int64_t Offset = IntOff->getSExtValue(); - Type *VTy = cast<PointerType>(IPtr->getType())->getElementType(); + Type *VTy = IPtr->getType()->getPointerElementType(); int64_t VTyTSS = (int64_t) TD->getTypeStoreSize(VTy); - Type *VTy2 = cast<PointerType>(JPtr->getType())->getElementType(); + Type *VTy2 = JPtr->getType()->getPointerElementType(); if (VTy != VTy2 && Offset < 0) { int64_t VTy2TSS = (int64_t) TD->getTypeStoreSize(VTy2); OffsetInElmts = Offset/VTy2TSS; @@ -1182,6 +1182,8 @@ namespace { // Look for an instruction with which to pair instruction *I... 
DenseSet<Value *> Users; AliasSetTracker WriteSet(*AA); + if (I->mayWriteToMemory()) WriteSet.add(I); + bool JAfterStart = IAfterStart; BasicBlock::iterator J = llvm::next(I); for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) { @@ -1403,6 +1405,8 @@ namespace { DenseSet<Value *> Users; AliasSetTracker WriteSet(*AA); + if (I->mayWriteToMemory()) WriteSet.add(I); + for (BasicBlock::iterator J = llvm::next(I); J != E; ++J) { (void) trackUsesOfI(Users, WriteSet, I, J); @@ -2227,11 +2231,12 @@ namespace { // The pointer value is taken to be the one with the lowest offset. Value *VPtr = IPtr; - Type *ArgTypeI = cast<PointerType>(IPtr->getType())->getElementType(); - Type *ArgTypeJ = cast<PointerType>(JPtr->getType())->getElementType(); + Type *ArgTypeI = IPtr->getType()->getPointerElementType(); + Type *ArgTypeJ = JPtr->getType()->getPointerElementType(); Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); - Type *VArgPtrType = PointerType::get(VArgType, - cast<PointerType>(IPtr->getType())->getAddressSpace()); + Type *VArgPtrType + = PointerType::get(VArgType, + IPtr->getType()->getPointerAddressSpace()); return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o), /* insert before */ I); } @@ -2240,7 +2245,7 @@ namespace { unsigned MaskOffset, unsigned NumInElem, unsigned NumInElem1, unsigned IdxOffset, std::vector<Constant*> &Mask) { - unsigned NumElem1 = cast<VectorType>(J->getType())->getNumElements(); + unsigned NumElem1 = J->getType()->getVectorNumElements(); for (unsigned v = 0; v < NumElem1; ++v) { int m = cast<ShuffleVectorInst>(J)->getMaskValue(v); if (m < 0) { @@ -2267,18 +2272,18 @@ namespace { Type *ArgTypeJ = J->getType(); Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); - unsigned NumElemI = cast<VectorType>(ArgTypeI)->getNumElements(); + unsigned NumElemI = ArgTypeI->getVectorNumElements(); // Get the total number of elements in the fused vector type. // By definition, this must equal the number of elements in // the final mask. - unsigned NumElem = cast<VectorType>(VArgType)->getNumElements(); + unsigned NumElem = VArgType->getVectorNumElements(); std::vector<Constant*> Mask(NumElem); Type *OpTypeI = I->getOperand(0)->getType(); - unsigned NumInElemI = cast<VectorType>(OpTypeI)->getNumElements(); + unsigned NumInElemI = OpTypeI->getVectorNumElements(); Type *OpTypeJ = J->getOperand(0)->getType(); - unsigned NumInElemJ = cast<VectorType>(OpTypeJ)->getNumElements(); + unsigned NumInElemJ = OpTypeJ->getVectorNumElements(); // The fused vector will be: // ----------------------------------------------------- @@ -2340,6 +2345,12 @@ namespace { return ExpandedIEChain; } + static unsigned getNumScalarElements(Type *Ty) { + if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) + return VecTy->getNumElements(); + return 1; + } + // Returns the value to be used as the specified operand of the vector // instruction that fuses I with J. 
Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I, @@ -2355,17 +2366,8 @@ namespace { Instruction *L = I, *H = J; Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ; - unsigned numElemL; - if (ArgTypeL->isVectorTy()) - numElemL = cast<VectorType>(ArgTypeL)->getNumElements(); - else - numElemL = 1; - - unsigned numElemH; - if (ArgTypeH->isVectorTy()) - numElemH = cast<VectorType>(ArgTypeH)->getNumElements(); - else - numElemH = 1; + unsigned numElemL = getNumScalarElements(ArgTypeL); + unsigned numElemH = getNumScalarElements(ArgTypeH); Value *LOp = L->getOperand(o); Value *HOp = H->getOperand(o); @@ -2426,11 +2428,12 @@ namespace { if (CanUseInputs) { unsigned LOpElem = - cast<VectorType>(cast<Instruction>(LOp)->getOperand(0)->getType()) - ->getNumElements(); + cast<Instruction>(LOp)->getOperand(0)->getType() + ->getVectorNumElements(); + unsigned HOpElem = - cast<VectorType>(cast<Instruction>(HOp)->getOperand(0)->getType()) - ->getNumElements(); + cast<Instruction>(HOp)->getOperand(0)->getType() + ->getVectorNumElements(); // We have one or two input vectors. We need to map each index of the // operands to the index of the original vector. @@ -2646,14 +2649,14 @@ namespace { getReplacementName(IBeforeJ ? I : J, true, o, 1)); } - + NHOp->insertBefore(IBeforeJ ? J : I); HOp = NHOp; } } if (ArgType->isVectorTy()) { - unsigned numElem = cast<VectorType>(VArgType)->getNumElements(); + unsigned numElem = VArgType->getVectorNumElements(); std::vector<Constant*> Mask(numElem); for (unsigned v = 0; v < numElem; ++v) { unsigned Idx = v; @@ -2746,16 +2749,8 @@ namespace { VectorType *VType = getVecTypeForPair(IType, JType); unsigned numElem = VType->getNumElements(); - unsigned numElemI, numElemJ; - if (IType->isVectorTy()) - numElemI = cast<VectorType>(IType)->getNumElements(); - else - numElemI = 1; - - if (JType->isVectorTy()) - numElemJ = cast<VectorType>(JType)->getNumElements(); - else - numElemJ = 1; + unsigned numElemI = getNumScalarElements(IType); + unsigned numElemJ = getNumScalarElements(JType); if (IType->isVectorTy()) { std::vector<Constant*> Mask1(numElemI), Mask2(numElemI); @@ -2804,6 +2799,8 @@ namespace { DenseSet<Value *> Users; AliasSetTracker WriteSet(*AA); + if (I->mayWriteToMemory()) WriteSet.add(I); + for (; cast<Instruction>(L) != J; ++L) (void) trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs); @@ -2824,6 +2821,8 @@ namespace { DenseSet<Value *> Users; AliasSetTracker WriteSet(*AA); + if (I->mayWriteToMemory()) WriteSet.add(I); + for (; cast<Instruction>(L) != J;) { if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs)) { // Move this instruction @@ -2853,6 +2852,7 @@ namespace { DenseSet<Value *> Users; AliasSetTracker WriteSet(*AA); + if (I->mayWriteToMemory()) WriteSet.add(I); // Note: We cannot end the loop when we reach J because J could be moved // farther down the use chain by another instruction pairing. Also, J diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index a62fedc..5e75871 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -48,6 +48,7 @@ #include "llvm/Transforms/Vectorize.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/EquivalenceClasses.h" +#include "llvm/ADT/Hashing.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" @@ -126,6 +127,9 @@ static const unsigned MaxVectorWidth = 64; /// Maximum vectorization unroll count. 
static const unsigned MaxUnrollFactor = 16; +/// The cost of a loop that is considered 'small' by the unroller. +static const unsigned SmallLoopCost = 20; + namespace { // Forward declarations. @@ -167,7 +171,9 @@ public: updateAnalysis(); } -private: + virtual ~InnerLoopVectorizer() {} + +protected: /// A small list of PHINodes. typedef SmallVector<PHINode*, 4> PhiVector; /// When we unroll loops we have multiple vector values for each scalar. @@ -187,7 +193,13 @@ private: /// Create an empty loop, based on the loop ranges of the old loop. void createEmptyLoop(LoopVectorizationLegality *Legal); /// Copy and widen the instructions from the old loop. - void vectorizeLoop(LoopVectorizationLegality *Legal); + virtual void vectorizeLoop(LoopVectorizationLegality *Legal); + + /// \brief The Loop exit block may have single value PHI nodes where the + /// incoming value is 'Undef'. While vectorizing we only handled real values + /// that were defined inside the loop. Here we fix the 'undef case'. + /// See PR14725. + void fixLCSSAPHIs(); /// A helper function that computes the predicate of the block BB, assuming /// that the header block of the loop is set to True. It returns the *entry* @@ -201,16 +213,23 @@ private: void vectorizeBlockInLoop(LoopVectorizationLegality *Legal, BasicBlock *BB, PhiVector *PV); + /// Vectorize a single PHINode in a block. This method handles the induction + /// variable canonicalization. It supports both VF = 1 for unrolled loops and + /// arbitrary length vectors. + void widenPHIInstruction(Instruction *PN, VectorParts &Entry, + LoopVectorizationLegality *Legal, + unsigned UF, unsigned VF, PhiVector *PV); + /// Insert the new loop to the loop hierarchy and pass manager /// and update the analysis passes. void updateAnalysis(); /// This instruction is un-vectorizable. Implement it as a sequence /// of scalars. - void scalarizeInstruction(Instruction *Instr); + virtual void scalarizeInstruction(Instruction *Instr); /// Vectorize Load and Store instructions, - void vectorizeMemoryInstruction(Instruction *Instr, + virtual void vectorizeMemoryInstruction(Instruction *Instr, LoopVectorizationLegality *Legal); /// Create a broadcast instruction. This method generates a broadcast @@ -218,12 +237,12 @@ private: /// value. If this is the induction variable then we extend it to N, N+1, ... /// this is needed because each iteration in the loop corresponds to a SIMD /// element. - Value *getBroadcastInstrs(Value *V); + virtual Value *getBroadcastInstrs(Value *V); /// This function adds 0, 1, 2 ... to each vector element, starting at zero. /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...). /// The sequence starts at StartIndex. - Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate); + virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate); /// When we go over instructions in the basic block we rely on previous /// values within the current basic block or on loop invariant values. @@ -233,7 +252,7 @@ private: VectorParts &getVectorValue(Value *V); /// Generate a shuffle sequence that will reverse the vector Vec. - Value *reverseVector(Value *Vec); + virtual Value *reverseVector(Value *Vec); /// This is a helper class that holds the vectorizer state. It maps scalar /// instructions to vector instructions. When the code is 'unrolled' then @@ -291,6 +310,8 @@ private: /// The vectorization SIMD factor to use. Each vector will have this many /// vector elements. 
unsigned VF; + +protected: /// The vectorization unroll factor to use. Each scalar is vectorized to this /// many different vector instructions. unsigned UF; @@ -326,6 +347,22 @@ private: EdgeMaskCache MaskCache; }; +class InnerLoopUnroller : public InnerLoopVectorizer { +public: + InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI, + DominatorTree *DT, DataLayout *DL, + const TargetLibraryInfo *TLI, unsigned UnrollFactor) : + InnerLoopVectorizer(OrigLoop, SE, LI, DT, DL, TLI, 1, UnrollFactor) { } + +private: + virtual void scalarizeInstruction(Instruction *Instr); + virtual void vectorizeMemoryInstruction(Instruction *Instr, + LoopVectorizationLegality *Legal); + virtual Value *getBroadcastInstrs(Value *V); + virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate); + virtual Value *reverseVector(Value *Vec); +}; + /// \brief Look for a meaningful debug location on the instruction or it's /// operands. static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { @@ -409,7 +446,7 @@ public: MRK_FloatMax }; - /// This POD struct holds information about reduction variables. + /// This struct holds information about reduction variables. struct ReductionDescriptor { ReductionDescriptor() : StartValue(0), LoopExitInstr(0), Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {} @@ -446,8 +483,8 @@ public: MinMaxReductionKind MinMaxKind; }; - // This POD struct holds information about the memory runtime legality - // check that a group of pointers do not overlap. + /// This struct holds information about the memory runtime legality + /// check that a group of pointers do not overlap. struct RuntimePointerCheck { RuntimePointerCheck() : Need(false) {} @@ -457,6 +494,8 @@ public: Pointers.clear(); Starts.clear(); Ends.clear(); + IsWritePtr.clear(); + DependencySetId.clear(); } /// Insert a pointer and calculate the start and end SCEVs. @@ -478,7 +517,7 @@ public: SmallVector<unsigned, 2> DependencySetId; }; - /// A POD for saving information about induction variables. + /// A struct for saving information about induction variables. struct InductionInfo { InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {} InductionInfo() : StartValue(0), IK(IK_NoInduction) {} @@ -725,9 +764,9 @@ struct LoopVectorizeHints { /// Vectorization unroll factor. unsigned Unroll; - LoopVectorizeHints(const Loop *L) + LoopVectorizeHints(const Loop *L, bool DisableUnrolling) : Width(VectorizationFactor) - , Unroll(VectorizationUnroll) + , Unroll(DisableUnrolling ? 1 : VectorizationUnroll) , LoopID(L->getLoopID()) { getHints(L); // The command line options override any loop metadata except for when @@ -736,6 +775,9 @@ struct LoopVectorizeHints { Width = VectorizationFactor; if (VectorizationUnroll.getNumOccurrences() > 0) Unroll = VectorizationUnroll; + + DEBUG(if (DisableUnrolling && Unroll == 1) + dbgs() << "LV: Unrolling disabled by the pass manager\n"); } /// Return the loop vectorizer metadata prefix. @@ -762,6 +804,7 @@ struct LoopVectorizeHints { Vals.push_back(LoopID->getOperand(i)); Vals.push_back(createHint(Context, Twine(Prefix(), "width").str(), Width)); + Vals.push_back(createHint(Context, Twine(Prefix(), "unroll").str(), 1)); MDNode *NewLoopID = MDNode::get(Context, Vals); // Set operand 0 to refer to the loop id itself. 
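InnerLoopUnroller above reuses the entire vectorizer skeleton by overriding only the code-generation primitives, so unrolling is simply vectorization with VF = 1. A condensed sketch of that structure, with hypothetical names:

    // The driver stays in the base class; only the widening hooks differ.
    struct WidenerBase {
      virtual ~WidenerBase() {}
      void run() {                    // shared skeleton, cf. vectorize()
        emitLoopSkeleton();
        widenInstructions();
      }
    protected:
      void emitLoopSkeleton() { /* common prologue/epilogue plumbing */ }
      virtual void widenInstructions() = 0;   // strategy-specific codegen
    };

    struct ScalarUnroller : WidenerBase {     // cf. InnerLoopUnroller
      virtual void widenInstructions() { /* emit UF scalar copies, no vectors */ }
    };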
@@ -825,15 +868,18 @@ private: unsigned Val = C->getZExtValue(); if (Hint == "width") { - assert(isPowerOf2_32(Val) && Val <= MaxVectorWidth && - "Invalid width metadata"); - Width = Val; + if (isPowerOf2_32(Val) && Val <= MaxVectorWidth) + Width = Val; + else + DEBUG(dbgs() << "LV: ignoring invalid width hint metadata\n"); } else if (Hint == "unroll") { - assert(isPowerOf2_32(Val) && Val <= MaxUnrollFactor && - "Invalid unroll metadata"); - Unroll = Val; - } else - DEBUG(dbgs() << "LV: ignoring unknown hint " << Hint); + if (isPowerOf2_32(Val) && Val <= MaxUnrollFactor) + Unroll = Val; + else + DEBUG(dbgs() << "LV: ignoring invalid unroll hint metadata\n"); + } else { + DEBUG(dbgs() << "LV: ignoring unknown hint " << Hint << '\n'); + } } }; @@ -842,7 +888,8 @@ struct LoopVectorize : public LoopPass { /// Pass identification, replacement for typeid static char ID; - explicit LoopVectorize() : LoopPass(ID) { + explicit LoopVectorize(bool NoUnrolling = false) + : LoopPass(ID), DisableUnrolling(NoUnrolling) { initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); } @@ -852,6 +899,7 @@ struct LoopVectorize : public LoopPass { TargetTransformInfo *TTI; DominatorTree *DT; TargetLibraryInfo *TLI; + bool DisableUnrolling; virtual bool runOnLoop(Loop *L, LPPassManager &LPM) { // We only vectorize innermost loops. @@ -865,17 +913,22 @@ struct LoopVectorize : public LoopPass { DT = &getAnalysis<DominatorTree>(); TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); + // If the target claims to have no vector registers don't attempt + // vectorization. + if (!TTI->getNumberOfRegisters(true)) + return false; + if (DL == NULL) { - DEBUG(dbgs() << "LV: Not vectorizing because of missing data layout"); + DEBUG(dbgs() << "LV: Not vectorizing because of missing data layout\n"); return false; } DEBUG(dbgs() << "LV: Checking a loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); - LoopVectorizeHints Hints(L); + LoopVectorizeHints Hints(L, DisableUnrolling); - if (Hints.Width == 1) { + if (Hints.Width == 1 && Hints.Unroll == 1) { DEBUG(dbgs() << "LV: Not vectorizing.\n"); return false; } @@ -912,19 +965,23 @@ struct LoopVectorize : public LoopPass { unsigned UF = CM.selectUnrollFactor(OptForSize, Hints.Unroll, VF.Width, VF.Cost); + DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF.Width << ") in "<< + F->getParent()->getModuleIdentifier() << '\n'); + DEBUG(dbgs() << "LV: Unroll Factor is " << UF << '\n'); + if (VF.Width == 1) { DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); - return false; + if (UF == 1) + return false; + // We decided not to vectorize, but we may want to unroll. + InnerLoopUnroller Unroller(L, SE, LI, DT, DL, TLI, UF); + Unroller.vectorize(&LVL); + } else { + // If we decided that it is *legal* to vectorize the loop then do it. + InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF); + LB.vectorize(&LVL); } - DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF.Width << ") in "<< - F->getParent()->getModuleIdentifier()<<"\n"); - DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n"); - - // If we decided that it is *legal* to vectorize the loop then do it. - InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF); - LB.vectorize(&LVL); - // Mark the loop as already vectorized to avoid vectorizing again. Hints.setAlreadyVectorized(L); @@ -971,25 +1028,19 @@ LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE, } Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { - // Save the current insertion location. 
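The width/unroll hints come from user-written loop metadata, which is why the hunk above downgrades hard asserts to debug diagnostics. Distilled to a predicate (the name acceptHint is ours; the real code keeps the previous value rather than taking it as a parameter):

    #include "llvm/Support/MathExtras.h"

    // Accept only power-of-two hint values within the supported cap;
    // anything else silently keeps the prior setting.
    static unsigned acceptHint(unsigned Val, unsigned Max, unsigned Prev) {
      return (llvm::isPowerOf2_32(Val) && Val <= Max) ? Val : Prev;
    }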
- Instruction *Loc = Builder.GetInsertPoint(); - // We need to place the broadcast of invariant variables outside the loop. Instruction *Instr = dyn_cast<Instruction>(V); bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody); bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr; // Place the code for broadcasting invariant variables in the new preheader. + IRBuilder<>::InsertPointGuard Guard(Builder); if (Invariant) Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); // Broadcast the scalar into all locations in the vector. Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); - // Restore the builder insertion point. - if (Invariant) - Builder.SetInsertPoint(Loc); - return Shuf; } @@ -1016,10 +1067,35 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx, return Builder.CreateAdd(Val, Cv, "induction"); } +/// \brief Find the operand of the GEP that should be checked for consecutive +/// stores. This ignores trailing indices that have no effect on the final +/// pointer. +static unsigned getGEPInductionOperand(DataLayout *DL, + const GetElementPtrInst *Gep) { + unsigned LastOperand = Gep->getNumOperands() - 1; + unsigned GEPAllocSize = DL->getTypeAllocSize( + cast<PointerType>(Gep->getType()->getScalarType())->getElementType()); + + // Walk backwards and try to peel off zeros. + while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) { + // Find the type we're currently indexing into. + gep_type_iterator GEPTI = gep_type_begin(Gep); + std::advance(GEPTI, LastOperand - 1); + + // If it's a type with the same allocation size as the result of the GEP we + // can peel off the zero index. + if (DL->getTypeAllocSize(*GEPTI) != GEPAllocSize) + break; + --LastOperand; + } + + return LastOperand; +} + int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr"); // Make sure that the pointer does not point to structs. - if (cast<PointerType>(Ptr->getType())->getElementType()->isAggregateType()) + if (Ptr->getType()->getPointerElementType()->isAggregateType()) return 0; // If this value is a pointer induction variable we know it is consecutive. @@ -1037,8 +1113,6 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { return 0; unsigned NumOperands = Gep->getNumOperands(); - Value *LastIndex = Gep->getOperand(NumOperands - 1); - Value *GpPtr = Gep->getPointerOperand(); // If this GEP value is a consecutive pointer induction variable and all of // the indices are constant then we know it is consecutive. We can @@ -1062,14 +1136,18 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { return -1; } - // Check that all of the gep indices are uniform except for the last. - for (unsigned i = 0; i < NumOperands - 1; ++i) - if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) + unsigned InductionOperand = getGEPInductionOperand(DL, Gep); + + // Check that all of the gep indices are uniform except for our induction + // operand. + for (unsigned i = 0; i != NumOperands; ++i) + if (i != InductionOperand && + !SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) return 0; - // We can emit wide load/stores only if the last index is the induction - // variable. - const SCEV *Last = SE->getSCEV(LastIndex); + // We can emit wide load/stores only if the last non-zero index is the + // induction variable. 
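The manual save/restore of the builder position in getBroadcastInstrs() is replaced above by the RAII guard. A minimal sketch of the same pattern (helper name and parameters are ours):

    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    static Value *splatInPreheader(IRBuilder<> &Builder, BasicBlock *Preheader,
                                   Value *V, unsigned VF) {
      // Saves the insertion point now and restores it when Guard is destroyed,
      // which is what the removed GetInsertPoint()/SetInsertPoint() pair did.
      IRBuilder<>::InsertPointGuard Guard(Builder);
      Builder.SetInsertPoint(Preheader->getTerminator());
      return Builder.CreateVectorSplat(VF, V, "broadcast");
    }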
+ const SCEV *Last = SE->getSCEV(Gep->getOperand(InductionOperand)); if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) { const SCEV *Step = AR->getStepRecurrence(*SE); @@ -1127,6 +1205,10 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, Type *DataTy = VectorType::get(ScalarDataTy, VF); Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment(); + // An alignment of 0 means target abi alignment. We need to use the scalar's + // target abi alignment in such a case. + if (!Alignment) + Alignment = DL->getABITypeAlignment(ScalarDataTy); unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy); unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF; @@ -1166,7 +1248,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, // The last index does not have to be the induction. It can be // consecutive and be a function of the index. For example A[I+1]; unsigned NumOperands = Gep->getNumOperands(); - unsigned LastOperand = NumOperands - 1; + unsigned InductionOperand = getGEPInductionOperand(DL, Gep); // Create the new GEP with the new induction variable. GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); @@ -1175,9 +1257,9 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand); // Update last index or loop invariant instruction anchored in loop. - if (i == LastOperand || + if (i == InductionOperand || (GepOperandInst && OrigLoop->contains(GepOperandInst))) { - assert((i == LastOperand || + assert((i == InductionOperand || SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) && "Must be last index or loop invariant"); @@ -1301,7 +1383,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) { Instruction *Cloned = Instr->clone(); if (!IsVoidRetTy) Cloned->setName(Instr->getName() + ".cloned"); - // Replace the operands of the cloned instrucions with extracted scalars. + // Replace the operands of the cloned instructions with extracted scalars. for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { Value *Op = Params[op][Part]; // Param is a vector. Need to extract the right lane. @@ -1335,11 +1417,9 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, SmallVector<TrackingVH<Value> , 2> Starts; SmallVector<TrackingVH<Value> , 2> Ends; + LLVMContext &Ctx = Loc->getContext(); SCEVExpander Exp(*SE, "induction"); - // Use this type for pointer arithmetic. - Type* PtrArithTy = Type::getInt8PtrTy(Loc->getContext(), 0); - for (unsigned i = 0; i < NumPointers; ++i) { Value *Ptr = PtrRtCheck->Pointers[i]; const SCEV *Sc = SE->getSCEV(Ptr); @@ -1350,7 +1430,11 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, Starts.push_back(Ptr); Ends.push_back(Ptr); } else { - DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n"); + DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr << '\n'); + unsigned AS = Ptr->getType()->getPointerAddressSpace(); + + // Use this type for pointer arithmetic. 
+ Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS); Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc); Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc); @@ -1372,10 +1456,20 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, if (PtrRtCheck->DependencySetId[i] == PtrRtCheck->DependencySetId[j]) continue; - Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy, "bc"); - Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy, "bc"); - Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy, "bc"); - Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy, "bc"); + unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace(); + unsigned AS1 = Starts[j]->getType()->getPointerAddressSpace(); + + assert((AS0 == Ends[j]->getType()->getPointerAddressSpace()) && + (AS1 == Ends[i]->getType()->getPointerAddressSpace()) && + "Trying to bounds check pointers with different address spaces"); + + Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0); + Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1); + + Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy0, "bc"); + Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy1, "bc"); + Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy1, "bc"); + Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy0, "bc"); Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0"); Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1"); @@ -1390,9 +1484,8 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, // We have to do this trickery because the IRBuilder might fold the check to a // constant expression in which case there is no Instruction anchored in a // the block. - LLVMContext &Ctx = Loc->getContext(); - Instruction * Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck, - ConstantInt::getTrue(Ctx)); + Instruction *Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck, + ConstantInt::getTrue(Ctx)); ChkBuilder.Insert(Check, "memcheck.conflict"); return Check; } @@ -1444,6 +1537,16 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop); assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); + // The exit count might have the type of i64 while the phi is i32. This can + // happen if we have an induction variable that is sign extended before the + // compare. The only way that we get a backedge taken count is that the + // induction variable was signed and as such will not overflow. In such a case + // truncation is legal. + if (ExitCount->getType()->getPrimitiveSizeInBits() > + IdxTy->getPrimitiveSizeInBits()) + ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy); + + ExitCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy); // Get the total trip count from the count by adding 1. ExitCount = SE->getAddExpr(ExitCount, SE->getConstant(ExitCount->getType(), 1)); @@ -1496,7 +1599,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // Use this IR builder to create the loop instructions (Phi, Br, Cmp) // inside the loop. - Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); + Builder.SetInsertPoint(VecBody->getFirstNonPHI()); // Generate the induction variable. 
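The bound check built above is plain interval intersection on byte ranges, now with per-pointer address spaces. A scalar model of the predicate it lowers to (the IR version uses CreateICmpULE on the bitcast bounds plus CreateAnd/CreateOr over all pairs):

    #include <cassert>
    #include <cstdint>

    // Ranges [S0,E0] and [S1,E1] may overlap iff each start is <= the
    // other range's end.
    static bool mayConflict(uint64_t S0, uint64_t E0, uint64_t S1, uint64_t E1) {
      return S0 <= E1 && S1 <= E0;
    }

    int main() {
      assert(mayConflict(0, 64, 32, 96));    // overlapping byte windows
      assert(!mayConflict(0, 64, 128, 192)); // disjoint byte windows
      return 0;
    }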
setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction)); @@ -1724,6 +1827,9 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { LoopExitBlock = ExitBlock; LoopVectorBody = VecBody; LoopScalarBody = OldBasicBlock; + + LoopVectorizeHints Hints(Lp, true); + Hints.setAlreadyVectorized(Lp); } /// This function returns the identity element (or neutral element) for @@ -1753,6 +1859,31 @@ LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp) { } } +static Intrinsic::ID checkUnaryFloatSignature(const CallInst &I, + Intrinsic::ID ValidIntrinsicID) { + if (I.getNumArgOperands() != 1 || + !I.getArgOperand(0)->getType()->isFloatingPointTy() || + I.getType() != I.getArgOperand(0)->getType() || + !I.onlyReadsMemory()) + return Intrinsic::not_intrinsic; + + return ValidIntrinsicID; +} + +static Intrinsic::ID checkBinaryFloatSignature(const CallInst &I, + Intrinsic::ID ValidIntrinsicID) { + if (I.getNumArgOperands() != 2 || + !I.getArgOperand(0)->getType()->isFloatingPointTy() || + !I.getArgOperand(1)->getType()->isFloatingPointTy() || + I.getType() != I.getArgOperand(0)->getType() || + I.getType() != I.getArgOperand(1)->getType() || + !I.onlyReadsMemory()) + return Intrinsic::not_intrinsic; + + return ValidIntrinsicID; +} + + static Intrinsic::ID getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) { // If we have an intrinsic call, check if it is trivially vectorizable. @@ -1767,11 +1898,13 @@ getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) { case Intrinsic::log10: case Intrinsic::log2: case Intrinsic::fabs: + case Intrinsic::copysign: case Intrinsic::floor: case Intrinsic::ceil: case Intrinsic::trunc: case Intrinsic::rint: case Intrinsic::nearbyint: + case Intrinsic::round: case Intrinsic::pow: case Intrinsic::fma: case Intrinsic::fmuladd: @@ -1789,8 +1922,9 @@ getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) { LibFunc::Func Func; Function *F = CI->getCalledFunction(); // We're going to make assumptions on the semantics of the functions, check - // that the target knows that it's available in this environment. - if (!F || !TLI->getLibFunc(F->getName(), Func)) + // that the target knows that it's available in this environment and it does + // not have local linkage. 
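checkUnaryFloatSignature()/checkBinaryFloatSignature() above exist because TargetLibraryInfo matches names, not prototypes: a translation unit may redeclare sinf with a bogus signature and the name would still map to LibFunc::sinf. The distilled shape of the unary test (parameter names are ours):

    // Accept only a readonly call of shape T f(T) with T floating point;
    // anything else must map to Intrinsic::not_intrinsic.
    static bool isUnaryFloatShaped(unsigned NumArgs, bool ArgIsFP,
                                   bool RetTypeEqualsArgType,
                                   bool CallOnlyReadsMemory) {
      return NumArgs == 1 && ArgIsFP && RetTypeEqualsArgType &&
             CallOnlyReadsMemory;
    }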
+ if (!F || F->hasLocalLinkage() || !TLI->getLibFunc(F->getName(), Func)) return Intrinsic::not_intrinsic; // Otherwise check if we have a call to a function that can be turned into a @@ -1801,59 +1935,67 @@ getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) { case LibFunc::sin: case LibFunc::sinf: case LibFunc::sinl: - return Intrinsic::sin; + return checkUnaryFloatSignature(*CI, Intrinsic::sin); case LibFunc::cos: case LibFunc::cosf: case LibFunc::cosl: - return Intrinsic::cos; + return checkUnaryFloatSignature(*CI, Intrinsic::cos); case LibFunc::exp: case LibFunc::expf: case LibFunc::expl: - return Intrinsic::exp; + return checkUnaryFloatSignature(*CI, Intrinsic::exp); case LibFunc::exp2: case LibFunc::exp2f: case LibFunc::exp2l: - return Intrinsic::exp2; + return checkUnaryFloatSignature(*CI, Intrinsic::exp2); case LibFunc::log: case LibFunc::logf: case LibFunc::logl: - return Intrinsic::log; + return checkUnaryFloatSignature(*CI, Intrinsic::log); case LibFunc::log10: case LibFunc::log10f: case LibFunc::log10l: - return Intrinsic::log10; + return checkUnaryFloatSignature(*CI, Intrinsic::log10); case LibFunc::log2: case LibFunc::log2f: case LibFunc::log2l: - return Intrinsic::log2; + return checkUnaryFloatSignature(*CI, Intrinsic::log2); case LibFunc::fabs: case LibFunc::fabsf: case LibFunc::fabsl: - return Intrinsic::fabs; + return checkUnaryFloatSignature(*CI, Intrinsic::fabs); + case LibFunc::copysign: + case LibFunc::copysignf: + case LibFunc::copysignl: + return checkBinaryFloatSignature(*CI, Intrinsic::copysign); case LibFunc::floor: case LibFunc::floorf: case LibFunc::floorl: - return Intrinsic::floor; + return checkUnaryFloatSignature(*CI, Intrinsic::floor); case LibFunc::ceil: case LibFunc::ceilf: case LibFunc::ceill: - return Intrinsic::ceil; + return checkUnaryFloatSignature(*CI, Intrinsic::ceil); case LibFunc::trunc: case LibFunc::truncf: case LibFunc::truncl: - return Intrinsic::trunc; + return checkUnaryFloatSignature(*CI, Intrinsic::trunc); case LibFunc::rint: case LibFunc::rintf: case LibFunc::rintl: - return Intrinsic::rint; + return checkUnaryFloatSignature(*CI, Intrinsic::rint); case LibFunc::nearbyint: case LibFunc::nearbyintf: case LibFunc::nearbyintl: - return Intrinsic::nearbyint; + return checkUnaryFloatSignature(*CI, Intrinsic::nearbyint); + case LibFunc::round: + case LibFunc::roundf: + case LibFunc::roundl: + return checkUnaryFloatSignature(*CI, Intrinsic::round); case LibFunc::pow: case LibFunc::powf: case LibFunc::powl: - return Intrinsic::pow; + return checkBinaryFloatSignature(*CI, Intrinsic::pow); } return Intrinsic::not_intrinsic; @@ -1925,6 +2067,54 @@ Value *createMinMaxOp(IRBuilder<> &Builder, return Select; } +namespace { +struct CSEDenseMapInfo { + static bool canHandle(Instruction *I) { + return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || + isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); + } + static inline Instruction *getEmptyKey() { + return DenseMapInfo<Instruction *>::getEmptyKey(); + } + static inline Instruction *getTombstoneKey() { + return DenseMapInfo<Instruction *>::getTombstoneKey(); + } + static unsigned getHashValue(Instruction *I) { + assert(canHandle(I) && "Unknown instruction!"); + return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), + I->value_op_end())); + } + static bool isEqual(Instruction *LHS, Instruction *RHS) { + if (LHS == getEmptyKey() || RHS == getEmptyKey() || + LHS == getTombstoneKey() || RHS == getTombstoneKey()) + return LHS == RHS; + return 
LHS->isIdenticalTo(RHS); + } +}; +} + +///\brief Perform cse of induction variable instructions. +static void cse(BasicBlock *BB) { + // Perform simple cse. + SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { + Instruction *In = I++; + + if (!CSEDenseMapInfo::canHandle(In)) + continue; + + // Check if we can replace this instruction with any of the + // visited instructions. + if (Instruction *V = CSEMap.lookup(In)) { + In->replaceAllUsesWith(V); + In->eraseFromParent(); + continue; + } + + CSEMap[In] = In; + } +} + void InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { //===------------------------------------------------===// @@ -1995,18 +2185,31 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerMinMax || RdxDesc.Kind == LoopVectorizationLegality::RK_FloatMinMax) { // MinMax reduction have the start value as their identify. - VectorStart = Identity = Builder.CreateVectorSplat(VF, RdxDesc.StartValue, - "minmax.ident"); + if (VF == 1) { + VectorStart = Identity = RdxDesc.StartValue; + } else { + VectorStart = Identity = Builder.CreateVectorSplat(VF, + RdxDesc.StartValue, + "minmax.ident"); + } } else { + // Handle other reduction kinds: Constant *Iden = - LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind, - VecTy->getScalarType()); - Identity = ConstantVector::getSplat(VF, Iden); - - // This vector is the Identity vector where the first element is the - // incoming scalar reduction. - VectorStart = Builder.CreateInsertElement(Identity, - RdxDesc.StartValue, Zero); + LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind, + VecTy->getScalarType()); + if (VF == 1) { + Identity = Iden; + // This vector is the Identity vector where the first element is the + // incoming scalar reduction. + VectorStart = RdxDesc.StartValue; + } else { + Identity = ConstantVector::getSplat(VF, Iden); + + // This vector is the Identity vector where the first element is the + // incoming scalar reduction. + VectorStart = Builder.CreateInsertElement(Identity, + RdxDesc.StartValue, Zero); + } } // Fix the vector-loop phi. @@ -2062,37 +2265,40 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { ReducedPartRdx, RdxParts[part]); } - // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles - // and vector ops, reducing the set of values being computed by half each - // round. - assert(isPowerOf2_32(VF) && - "Reduction emission only supported for pow2 vectors!"); - Value *TmpVec = ReducedPartRdx; - SmallVector<Constant*, 32> ShuffleMask(VF, 0); - for (unsigned i = VF; i != 1; i >>= 1) { - // Move the upper half of the vector to the lower half. - for (unsigned j = 0; j != i/2; ++j) - ShuffleMask[j] = Builder.getInt32(i/2 + j); - - // Fill the rest of the mask with undef. - std::fill(&ShuffleMask[i/2], ShuffleMask.end(), - UndefValue::get(Builder.getInt32Ty())); - - Value *Shuf = + if (VF > 1) { + // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles + // and vector ops, reducing the set of values being computed by half each + // round. + assert(isPowerOf2_32(VF) && + "Reduction emission only supported for pow2 vectors!"); + Value *TmpVec = ReducedPartRdx; + SmallVector<Constant*, 32> ShuffleMask(VF, 0); + for (unsigned i = VF; i != 1; i >>= 1) { + // Move the upper half of the vector to the lower half. 
+ for (unsigned j = 0; j != i/2; ++j) + ShuffleMask[j] = Builder.getInt32(i/2 + j); + + // Fill the rest of the mask with undef. + std::fill(&ShuffleMask[i/2], ShuffleMask.end(), + UndefValue::get(Builder.getInt32Ty())); + + Value *Shuf = Builder.CreateShuffleVector(TmpVec, UndefValue::get(TmpVec->getType()), ConstantVector::get(ShuffleMask), "rdx.shuf"); - if (Op != Instruction::ICmp && Op != Instruction::FCmp) - TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf, - "bin.rdx"); - else - TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf); - } + if (Op != Instruction::ICmp && Op != Instruction::FCmp) + TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf, + "bin.rdx"); + else + TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf); + } - // The result is in the first element of the vector. - Value *Scalar0 = Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); + // The result is in the first element of the vector. + ReducedPartRdx = Builder.CreateExtractElement(TmpVec, + Builder.getInt32(0)); + } // Now, we need to fix the users of the reduction variable // inside and outside of the scalar remainder loop. @@ -2101,7 +2307,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { for (BasicBlock::iterator LEI = LoopExitBlock->begin(), LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); - if (!LCSSAPhi) continue; + if (!LCSSAPhi) break; // All PHINodes need to have a single entry edge, or two if // we already fixed them. @@ -2111,7 +2317,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // incoming bypass edge. if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) { // Add an edge coming from the bypass. - LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock); + LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); break; } }// end of the LCSSA phi scan. @@ -2123,23 +2329,26 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); // Pick the other block. int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); - (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0); + (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, ReducedPartRdx); (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr); }// end of for each redux variable. - // The Loop exit block may have single value PHI nodes where the incoming - // value is 'undef'. While vectorizing we only handled real values that - // were defined inside the loop. Here we handle the 'undef case'. - // See PR14725. + fixLCSSAPHIs(); + + // Remove redundant induction instructions. + cse(LoopVectorBody); +} + +void InnerLoopVectorizer::fixLCSSAPHIs() { for (BasicBlock::iterator LEI = LoopExitBlock->begin(), LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); - if (!LCSSAPhi) continue; + if (!LCSSAPhi) break; if (LCSSAPhi->getNumIncomingValues() == 1) LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()), LoopMiddleBlock); } -} +} InnerLoopVectorizer::VectorParts InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { @@ -2200,161 +2409,185 @@ InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { return BlockMask; } -void -InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, - BasicBlock *BB, PhiVector *PV) { - // For each instruction in the old loop. 
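For concreteness, the mask sequence the halving loop above generates for VF = 4 is <2,3,undef,undef> followed by <1,undef,undef,undef>, leaving the reduced value in lane 0 for the final extractelement. A standalone sketch that prints the masks:

    #include <cstdio>

    int main() {
      const unsigned VF = 4; // assumption: any power of two behaves the same
      for (unsigned i = VF; i != 1; i >>= 1) {
        printf("round mask:");
        for (unsigned j = 0; j != i / 2; ++j)
          printf(" %u", i / 2 + j);  // move the upper half down
        for (unsigned j = i / 2; j != VF; ++j)
          printf(" undef");          // the rest is don't-care
        printf("\n");
      }
      return 0;
    }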
- for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { - VectorParts &Entry = WidenMap.get(it); - switch (it->getOpcode()) { - case Instruction::Br: - // Nothing to do for PHIs and BR, since we already took care of the - // loop control flow instructions. - continue; - case Instruction::PHI:{ - PHINode* P = cast<PHINode>(it); - // Handle reduction variables: - if (Legal->getReductionVars()->count(P)) { - for (unsigned part = 0; part < UF; ++part) { - // This is phase one of vectorizing PHIs. - Type *VecTy = VectorType::get(it->getType(), VF); - Entry[part] = PHINode::Create(VecTy, 2, "vec.phi", - LoopVectorBody-> getFirstInsertionPt()); - } - PV->push_back(P); - continue; - } +void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, + InnerLoopVectorizer::VectorParts &Entry, + LoopVectorizationLegality *Legal, + unsigned UF, unsigned VF, PhiVector *PV) { + PHINode* P = cast<PHINode>(PN); + // Handle reduction variables: + if (Legal->getReductionVars()->count(P)) { + for (unsigned part = 0; part < UF; ++part) { + // This is phase one of vectorizing PHIs. + Type *VecTy = (VF == 1) ? PN->getType() : + VectorType::get(PN->getType(), VF); + Entry[part] = PHINode::Create(VecTy, 2, "vec.phi", + LoopVectorBody-> getFirstInsertionPt()); + } + PV->push_back(P); + return; + } - setDebugLocFromInst(Builder, P); - // Check for PHI nodes that are lowered to vector selects. - if (P->getParent() != OrigLoop->getHeader()) { - // We know that all PHIs in non header blocks are converted into - // selects, so we don't have to worry about the insertion order and we - // can just use the builder. - // At this point we generate the predication tree. There may be - // duplications since this is a simple recursive scan, but future - // optimizations will clean it up. - - unsigned NumIncoming = P->getNumIncomingValues(); - - // Generate a sequence of selects of the form: - // SELECT(Mask3, In3, - // SELECT(Mask2, In2, - // ( ...))) - for (unsigned In = 0; In < NumIncoming; In++) { - VectorParts Cond = createEdgeMask(P->getIncomingBlock(In), - P->getParent()); - VectorParts &In0 = getVectorValue(P->getIncomingValue(In)); - - for (unsigned part = 0; part < UF; ++part) { - // We might have single edge PHIs (blocks) - use an identity - // 'select' for the first PHI operand. - if (In == 0) - Entry[part] = Builder.CreateSelect(Cond[part], In0[part], - In0[part]); - else - // Select between the current value and the previous incoming edge - // based on the incoming mask. - Entry[part] = Builder.CreateSelect(Cond[part], In0[part], - Entry[part], "predphi"); - } - } - continue; + setDebugLocFromInst(Builder, P); + // Check for PHI nodes that are lowered to vector selects. + if (P->getParent() != OrigLoop->getHeader()) { + // We know that all PHIs in non header blocks are converted into + // selects, so we don't have to worry about the insertion order and we + // can just use the builder. + // At this point we generate the predication tree. There may be + // duplications since this is a simple recursive scan, but future + // optimizations will clean it up. 
+ + unsigned NumIncoming = P->getNumIncomingValues(); + + // Generate a sequence of selects of the form: + // SELECT(Mask3, In3, + // SELECT(Mask2, In2, + // ( ...))) + for (unsigned In = 0; In < NumIncoming; In++) { + VectorParts Cond = createEdgeMask(P->getIncomingBlock(In), + P->getParent()); + VectorParts &In0 = getVectorValue(P->getIncomingValue(In)); + + for (unsigned part = 0; part < UF; ++part) { + // We might have single edge PHIs (blocks) - use an identity + // 'select' for the first PHI operand. + if (In == 0) + Entry[part] = Builder.CreateSelect(Cond[part], In0[part], + In0[part]); + else + // Select between the current value and the previous incoming edge + // based on the incoming mask. + Entry[part] = Builder.CreateSelect(Cond[part], In0[part], + Entry[part], "predphi"); } + } + return; + } - // This PHINode must be an induction variable. - // Make sure that we know about it. - assert(Legal->getInductionVars()->count(P) && - "Not an induction variable"); - - LoopVectorizationLegality::InductionInfo II = - Legal->getInductionVars()->lookup(P); - - switch (II.IK) { - case LoopVectorizationLegality::IK_NoInduction: - llvm_unreachable("Unknown induction"); - case LoopVectorizationLegality::IK_IntInduction: { - assert(P->getType() == II.StartValue->getType() && "Types must match"); - Type *PhiTy = P->getType(); - Value *Broadcasted; - if (P == OldInduction) { - // Handle the canonical induction variable. We might have had to - // extend the type. - Broadcasted = Builder.CreateTrunc(Induction, PhiTy); - } else { - // Handle other induction variables that are now based on the - // canonical one. - Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx, - "normalized.idx"); - NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy); - Broadcasted = Builder.CreateAdd(II.StartValue, NormalizedIdx, - "offset.idx"); - } - Broadcasted = getBroadcastInstrs(Broadcasted); - // After broadcasting the induction variable we need to make the vector - // consecutive by adding 0, 1, 2, etc. + // This PHINode must be an induction variable. + // Make sure that we know about it. + assert(Legal->getInductionVars()->count(P) && + "Not an induction variable"); + + LoopVectorizationLegality::InductionInfo II = + Legal->getInductionVars()->lookup(P); + + switch (II.IK) { + case LoopVectorizationLegality::IK_NoInduction: + llvm_unreachable("Unknown induction"); + case LoopVectorizationLegality::IK_IntInduction: { + assert(P->getType() == II.StartValue->getType() && "Types must match"); + Type *PhiTy = P->getType(); + Value *Broadcasted; + if (P == OldInduction) { + // Handle the canonical induction variable. We might have had to + // extend the type. + Broadcasted = Builder.CreateTrunc(Induction, PhiTy); + } else { + // Handle other induction variables that are now based on the + // canonical one. + Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx, + "normalized.idx"); + NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy); + Broadcasted = Builder.CreateAdd(II.StartValue, NormalizedIdx, + "offset.idx"); + } + Broadcasted = getBroadcastInstrs(Broadcasted); + // After broadcasting the induction variable we need to make the vector + // consecutive by adding 0, 1, 2, etc. 
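The select chain emitted above for if-converted PHIs has a simple scalar model: seed with an identity select, then fold each later incoming value in under its edge mask. Illustrated for three incoming values (names are ours):

    // Scalar model of the predicated PHI: R starts as In1 (the identity
    // select for In == 0), then each later edge overrides R when its mask
    // is true, mirroring the "predphi" selects.
    static int predPhi(bool M2, bool M3, int In1, int In2, int In3) {
      int R = In1;
      R = M2 ? In2 : R;
      R = M3 ? In3 : R;
      return R;
    }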
+ for (unsigned part = 0; part < UF; ++part) + Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false); + return; + } + case LoopVectorizationLegality::IK_ReverseIntInduction: + case LoopVectorizationLegality::IK_PtrInduction: + case LoopVectorizationLegality::IK_ReversePtrInduction: + // Handle reverse integer and pointer inductions. + Value *StartIdx = ExtendedIdx; + // This is the normalized GEP that starts counting at zero. + Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx, + "normalized.idx"); + + // Handle the reverse integer induction variable case. + if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) { + IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType()); + Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy, + "resize.norm.idx"); + Value *ReverseInd = Builder.CreateSub(II.StartValue, CNI, + "reverse.idx"); + + // This is a new value so do not hoist it out. + Value *Broadcasted = getBroadcastInstrs(ReverseInd); + // After broadcasting the induction variable we need to make the + // vector consecutive by adding ... -3, -2, -1, 0. for (unsigned part = 0; part < UF; ++part) - Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false); - continue; + Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part, + true); + return; } - case LoopVectorizationLegality::IK_ReverseIntInduction: - case LoopVectorizationLegality::IK_PtrInduction: - case LoopVectorizationLegality::IK_ReversePtrInduction: - // Handle reverse integer and pointer inductions. - Value *StartIdx = ExtendedIdx; - // This is the normalized GEP that starts counting at zero. - Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx, - "normalized.idx"); - // Handle the reverse integer induction variable case. - if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) { - IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType()); - Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy, - "resize.norm.idx"); - Value *ReverseInd = Builder.CreateSub(II.StartValue, CNI, - "reverse.idx"); - - // This is a new value so do not hoist it out. - Value *Broadcasted = getBroadcastInstrs(ReverseInd); - // After broadcasting the induction variable we need to make the - // vector consecutive by adding ... -3, -2, -1, 0. - for (unsigned part = 0; part < UF; ++part) - Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part, - true); + // Handle the pointer induction variable case. + assert(P->getType()->isPointerTy() && "Unexpected type."); + + // Is this a reverse induction ptr or a consecutive induction ptr. + bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction == + II.IK); + + // This is the vector of results. Notice that we don't generate + // vector geps because scalar geps result in better code. + for (unsigned part = 0; part < UF; ++part) { + if (VF == 1) { + int EltIndex = (part) * (Reverse ? -1 : 1); + Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex); + Value *GlobalIdx; + if (Reverse) + GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx"); + else + GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx"); + + Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, + "next.gep"); + Entry[part] = SclrGep; continue; } - // Handle the pointer induction variable case. - assert(P->getType()->isPointerTy() && "Unexpected type."); - - // Is this a reverse induction ptr or a consecutive induction ptr. 
- bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction == - II.IK); - - // This is the vector of results. Notice that we don't generate - // vector geps because scalar geps result in better code. - for (unsigned part = 0; part < UF; ++part) { - Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); - for (unsigned int i = 0; i < VF; ++i) { - int EltIndex = (i + part * VF) * (Reverse ? -1 : 1); - Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex); - Value *GlobalIdx; - if (!Reverse) - GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx"); - else - GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx"); - - Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, - "next.gep"); - VecVal = Builder.CreateInsertElement(VecVal, SclrGep, - Builder.getInt32(i), - "insert.gep"); - } - Entry[part] = VecVal; + Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); + for (unsigned int i = 0; i < VF; ++i) { + int EltIndex = (i + part * VF) * (Reverse ? -1 : 1); + Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex); + Value *GlobalIdx; + if (!Reverse) + GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx"); + else + GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx"); + + Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, + "next.gep"); + VecVal = Builder.CreateInsertElement(VecVal, SclrGep, + Builder.getInt32(i), + "insert.gep"); } - continue; + Entry[part] = VecVal; } + return; + } +} +void +InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, + BasicBlock *BB, PhiVector *PV) { + // For each instruction in the old loop. + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + VectorParts &Entry = WidenMap.get(it); + switch (it->getOpcode()) { + case Instruction::Br: + // Nothing to do for PHIs and BR, since we already took care of the + // loop control flow instructions. + continue; + case Instruction::PHI:{ + // Vectorize PHINodes. + widenPHIInstruction(it, Entry, Legal, UF, VF, PV); + continue; }// End of PHI. case Instruction::Add: @@ -2413,8 +2646,10 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, VectorParts &Cond = getVectorValue(it->getOperand(0)); VectorParts &Op0 = getVectorValue(it->getOperand(1)); VectorParts &Op1 = getVectorValue(it->getOperand(2)); - Value *ScalarCond = Builder.CreateExtractElement(Cond[0], - Builder.getInt32(0)); + + Value *ScalarCond = (VF == 1) ? Cond[0] : + Builder.CreateExtractElement(Cond[0], Builder.getInt32(0)); + for (unsigned Part = 0; Part < UF; ++Part) { Entry[Part] = Builder.CreateSelect( InvariantCond ? ScalarCond : Cond[Part], @@ -2475,7 +2710,8 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, break; } /// Vectorize casts. - Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF); + Type *DestTy = (VF == 1) ? 
CI->getType() : + VectorType::get(CI->getType(), VF); VectorParts &A = getVectorValue(it->getOperand(0)); for (unsigned Part = 0; Part < UF; ++Part) @@ -2505,7 +2741,10 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, VectorParts &Arg = getVectorValue(CI->getArgOperand(i)); Args.push_back(Arg[Part]); } - Type *Tys[] = { VectorType::get(CI->getType()->getScalarType(), VF) }; + Type *Tys[] = {CI->getType()}; + if (VF > 1) + Tys[0] = VectorType::get(CI->getType()->getScalarType(), VF); + Function *F = Intrinsic::getDeclaration(M, ID, Tys); Entry[Part] = Builder.CreateCall(F, Args); } @@ -2542,19 +2781,36 @@ void InnerLoopVectorizer::updateAnalysis() { DEBUG(DT->verifyAnalysis()); } +/// \brief Check whether it is safe to if-convert this phi node. +/// +/// Phi nodes with constant expressions that can trap are not safe to if +/// convert. +static bool canIfConvertPHINodes(BasicBlock *BB) { + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + PHINode *Phi = dyn_cast<PHINode>(I); + if (!Phi) + return true; + for (unsigned p = 0, e = Phi->getNumIncomingValues(); p != e; ++p) + if (Constant *C = dyn_cast<Constant>(Phi->getIncomingValue(p))) + if (C->canTrap()) + return false; + } + return true; +} + bool LoopVectorizationLegality::canVectorizeWithIfConvert() { if (!EnableIfConversion) return false; assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable"); - std::vector<BasicBlock*> &LoopBlocks = TheLoop->getBlocksVector(); // A list of pointers that we can safely read and write to. SmallPtrSet<Value *, 8> SafePointes; // Collect safe addresses. - for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) { - BasicBlock *BB = LoopBlocks[i]; + for (Loop::block_iterator BI = TheLoop->block_begin(), + BE = TheLoop->block_end(); BI != BE; ++BI) { + BasicBlock *BB = *BI; if (blockNeedsPredication(BB)) continue; @@ -2568,16 +2824,22 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { } // Collect the blocks that need predication. - for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) { - BasicBlock *BB = LoopBlocks[i]; + BasicBlock *Header = TheLoop->getHeader(); + for (Loop::block_iterator BI = TheLoop->block_begin(), + BE = TheLoop->block_end(); BI != BE; ++BI) { + BasicBlock *BB = *BI; // We don't support switch statements inside loops. if (!isa<BranchInst>(BB->getTerminator())) return false; // We must be able to predicate all blocks that need to be predicated. - if (blockNeedsPredication(BB) && !blockCanBePredicated(BB, SafePointes)) + if (blockNeedsPredication(BB)) { + if (!blockCanBePredicated(BB, SafePointes)) + return false; + } else if (BB != Header && !canIfConvertPHINodes(BB)) return false; + } // We can if-convert this loop. @@ -2602,19 +2864,17 @@ bool LoopVectorizationLegality::canVectorize() { if (!TheLoop->getExitingBlock()) return false; - unsigned NumBlocks = TheLoop->getNumBlocks(); + // We need to have a loop header. + DEBUG(dbgs() << "LV: Found a loop: " << + TheLoop->getHeader()->getName() << '\n'); // Check if we can if-convert non single-bb loops. + unsigned NumBlocks = TheLoop->getNumBlocks(); if (NumBlocks != 1 && !canVectorizeWithIfConvert()) { DEBUG(dbgs() << "LV: Can't if-convert the loop.\n"); return false; } - // We need to have a loop header. - BasicBlock *Latch = TheLoop->getLoopLatch(); - DEBUG(dbgs() << "LV: Found a loop: " << - TheLoop->getHeader()->getName() << "\n"); - // ScalarEvolution needs to be able to find the exit count. 
const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop); if (ExitCount == SE->getCouldNotCompute()) { @@ -2623,6 +2883,7 @@ bool LoopVectorizationLegality::canVectorize() { } // Do not loop-vectorize loops with a tiny trip count. + BasicBlock *Latch = TheLoop->getLoopLatch(); unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch); if (TC > 0u && TC < TinyTripCountVectorThreshold) { DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " << @@ -2657,7 +2918,13 @@ bool LoopVectorizationLegality::canVectorize() { static Type *convertPointerToIntegerType(DataLayout &DL, Type *Ty) { if (Ty->isPointerTy()) - return DL.getIntPtrType(Ty->getContext()); + return DL.getIntPtrType(Ty); + + // It is possible that char's or short's overflow when we ask for the loop's + // trip count, work around this by changing the type size. + if (Ty->getScalarSizeInBits() < 32) + return Type::getInt32Ty(Ty->getContext()); + return Ty; } @@ -2682,7 +2949,7 @@ static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, Instruction *U = cast<Instruction>(*I); // This user may be a reduction exit value. if (!TheLoop->contains(U)) { - DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n"); + DEBUG(dbgs() << "LV: Found an outside user for : " << *U << '\n'); return true; } } @@ -2758,6 +3025,12 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { DEBUG(dbgs() << "LV: Found an induction variable.\n"); Inductions[Phi] = InductionInfo(StartValue, IK); + + // Until we explicitly handle the case of an induction variable with + // an outside loop user we have to give up vectorizing this loop. + if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) + return false; + continue; } @@ -2812,9 +3085,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { } // Check that the instruction return type is vectorizable. - if (!VectorType::isValidElementType(it->getType()) && - !it->getType()->isVoidTy()) { - DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n"); + // Also, we can't vectorize extractelement instructions. + if ((!VectorType::isValidElementType(it->getType()) && + !it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) { + DEBUG(dbgs() << "LV: Found unvectorizable type.\n"); return false; } @@ -2904,7 +3178,7 @@ public: /// non-intersection. bool canCheckPtrAtRT(LoopVectorizationLegality::RuntimePointerCheck &RtCheck, unsigned &NumComparisons, ScalarEvolution *SE, - Loop *TheLoop); + Loop *TheLoop, bool ShouldCheckStride = false); /// \brief Goes over all memory accesses, checks whether a RT check is needed /// and builds sets of dependent accesses. @@ -2918,6 +3192,7 @@ public: bool isRTCheckNeeded() { return IsRTCheckNeeded; } bool isDependencyCheckNeeded() { return !CheckDeps.empty(); } + void resetDepChecks() { CheckDeps.clear(); } MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; } @@ -2972,10 +3247,15 @@ static bool hasComputableBounds(ScalarEvolution *SE, Value *Ptr) { return AR->isAffine(); } +/// \brief Check the stride of the pointer and ensure that it does not wrap in +/// the address space. +static int isStridedPtr(ScalarEvolution *SE, DataLayout *DL, Value *Ptr, + const Loop *Lp); + bool AccessAnalysis::canCheckPtrAtRT( LoopVectorizationLegality::RuntimePointerCheck &RtCheck, unsigned &NumComparisons, ScalarEvolution *SE, - Loop *TheLoop) { + Loop *TheLoop, bool ShouldCheckStride) { // Find pointers with computable bounds. We are going to use this information // to place a runtime bound check. 
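convertPointerToIntegerType() above now widens i8/i16 to i32 because a narrow counter type cannot represent the trip count of even modestly sized loops, forcing SCEV to assume wrapping. A plain-C++ illustration of the wrap-around being avoided:

    #include <cassert>
    #include <stdint.h>

    int main() {
      uint8_t Narrow = 0;
      for (int i = 0; i < 300; ++i)
        ++Narrow;                  // an i8 counter wraps before 300 iterations
      assert(Narrow == 300 - 256); // 44: the count is unrepresentable in i8
      return 0;
    }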
unsigned NumReadPtrChecks = 0; @@ -3003,7 +3283,10 @@ bool AccessAnalysis::canCheckPtrAtRT( else ++NumReadPtrChecks; - if (hasComputableBounds(SE, Ptr)) { + if (hasComputableBounds(SE, Ptr) && + // When we run after a failing dependency check we have to make sure we + // don't have wrapping pointers. + (!ShouldCheckStride || isStridedPtr(SE, DL, Ptr, TheLoop) == 1)) { // The id of the dependence set. unsigned DepId; @@ -3019,7 +3302,7 @@ bool AccessAnalysis::canCheckPtrAtRT( RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId); - DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr <<"\n"); + DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr << '\n'); } else { CanDoRT = false; } @@ -3027,9 +3310,36 @@ bool AccessAnalysis::canCheckPtrAtRT( if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2) NumComparisons = 0; // Only one dependence set. - else + else { NumComparisons = (NumWritePtrChecks * (NumReadPtrChecks + NumWritePtrChecks - 1)); + } + + // If the pointers that we would use for the bounds comparison have different + // address spaces, assume the values aren't directly comparable, so we can't + // use them for the runtime check. We also have to assume they could + // overlap. In the future there should be metadata for whether address spaces + // are disjoint. + unsigned NumPointers = RtCheck.Pointers.size(); + for (unsigned i = 0; i < NumPointers; ++i) { + for (unsigned j = i + 1; j < NumPointers; ++j) { + // Only need to check pointers between two different dependency sets. + if (RtCheck.DependencySetId[i] == RtCheck.DependencySetId[j]) + continue; + + Value *PtrI = RtCheck.Pointers[i]; + Value *PtrJ = RtCheck.Pointers[j]; + + unsigned ASi = PtrI->getType()->getPointerAddressSpace(); + unsigned ASj = PtrJ->getType()->getPointerAddressSpace(); + if (ASi != ASj) { + DEBUG(dbgs() << "LV: Runtime check would require comparison between" + " different address spaces\n"); + return false; + } + } + } + return CanDoRT; } @@ -3084,7 +3394,7 @@ void AccessAnalysis::processMemAccesses(bool UseDeferred) { !isa<Argument>(UnderlyingObj)) && !isIdentifiedObject(UnderlyingObj))) { DEBUG(dbgs() << "LV: Found an unidentified " << - (IsWrite ? "write" : "read" ) << " ptr:" << *UnderlyingObj << + (IsWrite ? "write" : "read" ) << " ptr: " << *UnderlyingObj << "\n"); IsRTCheckNeeded = (IsRTCheckNeeded || !isIdentifiedObject(UnderlyingObj) || @@ -3158,8 +3468,9 @@ public: typedef PointerIntPair<Value *, 1, bool> MemAccessInfo; typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet; - MemoryDepChecker(ScalarEvolution *Se, DataLayout *Dl, const Loop *L) : - SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0) {} + MemoryDepChecker(ScalarEvolution *Se, DataLayout *Dl, const Loop *L) + : SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0), + ShouldRetryWithRuntimeCheck(false) {} /// \brief Register the location (instructions are given increasing numbers) /// of a write access. @@ -3189,6 +3500,10 @@ public: /// the accesses safely with. unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; } + /// \brief In same cases when the dependency check fails we can still + /// vectorize the loop with a dynamic array access check. + bool shouldRetryWithRuntimeCheck() { return ShouldRetryWithRuntimeCheck; } + private: ScalarEvolution *SE; DataLayout *DL; @@ -3206,6 +3521,10 @@ private: // We can access this many bytes in parallel safely. unsigned MaxSafeDepDistBytes; + /// \brief If we see a non constant dependence distance we can still try to + /// vectorize this loop with runtime checks. 
+ bool ShouldRetryWithRuntimeCheck; + /// \brief Check whether there is a plausible dependence between the two /// accesses. /// @@ -3403,6 +3722,7 @@ bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist); if (!C) { DEBUG(dbgs() << "LV: Dependence because of non constant distance\n"); + ShouldRetryWithRuntimeCheck = true; return true; } @@ -3428,7 +3748,7 @@ bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, if (Val == 0) { if (ATy == BTy) return false; - DEBUG(dbgs() << "LV: Zero dependence difference but different types"); + DEBUG(dbgs() << "LV: Zero dependence difference but different types\n"); return true; } @@ -3437,7 +3757,7 @@ bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, // Positive distance bigger than max vectorization factor. if (ATy != BTy) { DEBUG(dbgs() << - "LV: ReadWrite-Write positive dependency with different types"); + "LV: ReadWrite-Write positive dependency with different types\n"); return false; } @@ -3454,7 +3774,7 @@ bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, 2*TypeByteSize > MaxSafeDepDistBytes || Distance < TypeByteSize * ForcedUnroll * ForcedFactor) { DEBUG(dbgs() << "LV: Failure because of Positive distance " - << Val.getSExtValue() << "\n"); + << Val.getSExtValue() << '\n'); return true; } @@ -3467,7 +3787,7 @@ bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, return true; DEBUG(dbgs() << "LV: Positive distance " << Val.getSExtValue() << - " with max VF=" << MaxSafeDepDistBytes/TypeByteSize << "\n"); + " with max VF = " << MaxSafeDepDistBytes / TypeByteSize << '\n'); return false; } @@ -3571,8 +3891,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() { Stores.push_back(St); DepChecker.addAccess(St); } - } // next instr. - } // next block. + } // Next instr. + } // Next block. // Now we have two lists that hold the loads and the stores. // Next, we find the pointers that they use. @@ -3619,7 +3939,6 @@ bool LoopVectorizationLegality::canVectorizeMemory() { return true; } - SmallPtrSet<Value *, 16> ReadOnlyPtr; for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) { LoadInst *LD = cast<LoadInst>(*I); Value* Ptr = LD->getPointerOperand(); @@ -3667,7 +3986,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { if (NumComparisons == 0 && NeedRTCheck) NeedRTCheck = false; - // Check that we did not collect too many pointers or found a unsizeable + // Check that we did not collect too many pointers or found an unsizeable // pointer. if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) { PtrRtCheck.reset(); @@ -3693,9 +4012,32 @@ bool LoopVectorizationLegality::canVectorizeMemory() { CanVecMem = DepChecker.areDepsSafe(DependentAccesses, Accesses.getDependenciesToCheck()); MaxSafeDepDistBytes = DepChecker.getMaxSafeDepDistBytes(); + + if (!CanVecMem && DepChecker.shouldRetryWithRuntimeCheck()) { + DEBUG(dbgs() << "LV: Retrying with memory checks\n"); + NeedRTCheck = true; + + // Clear the dependency checks. We assume they are not needed. + Accesses.resetDepChecks(); + + PtrRtCheck.reset(); + PtrRtCheck.Need = true; + + CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, + TheLoop, true); + // Check that we did not collect too many pointers or found an unsizeable + // pointer. 
+ if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) { + DEBUG(dbgs() << "LV: Can't vectorize with memory checks\n"); + PtrRtCheck.reset(); + return false; + } + + CanVecMem = true; + } } - DEBUG(dbgs() << "LV: We "<< (NeedRTCheck ? "" : "don't") << + DEBUG(dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") << " need a runtime memory check.\n"); return CanVecMem; @@ -3839,6 +4181,12 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, if (ExitInstruction != 0 || Cur == Phi) return false; + // The instruction used by an outside user must be the last instruction + // before we feed back to the reduction phi. Otherwise, we lose VF-1 + // operations on the value. + if (std::find(Phi->op_begin(), Phi->op_end(), Cur) == Phi->op_end()) + return false; + ExitInstruction = Cur; continue; } @@ -4045,6 +4393,14 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB, if (it->mayWriteToMemory() || it->mayThrow()) return false; + // Check that we don't have a constant expression that can trap as operand. + for (Instruction::op_iterator OI = it->op_begin(), OE = it->op_end(); + OI != OE; ++OI) { + if (Constant *C = dyn_cast<Constant>(*OI)) + if (C->canTrap()) + return false; + } + // The instructions below can trap. switch (it->getOpcode()) { default: continue; @@ -4071,7 +4427,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, // Find the trip count. unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch()); - DEBUG(dbgs() << "LV: Found trip count:"<<TC<<"\n"); + DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); unsigned WidestType = getWidestType(); unsigned WidestRegister = TTI.getRegisterBitWidth(true); @@ -4082,7 +4438,8 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, WidestRegister : MaxSafeDepDist); unsigned MaxVectorSize = WidestRegister / WidestType; DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n"); - DEBUG(dbgs() << "LV: The Widest register is:" << WidestRegister << "bits.\n"); + DEBUG(dbgs() << "LV: The Widest register is: " + << WidestRegister << " bits.\n"); if (MaxVectorSize == 0) { DEBUG(dbgs() << "LV: The target has no vector registers.\n"); @@ -4118,7 +4475,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, if (UserVF != 0) { assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); - DEBUG(dbgs() << "LV: Using user VF "<<UserVF<<".\n"); + DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); Factor.Width = UserVF; return Factor; @@ -4126,13 +4483,13 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, float Cost = expectedCost(1); unsigned Width = 1; - DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n"); + DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)Cost << ".\n"); for (unsigned i=2; i <= VF; i*=2) { // Notice that the vector loop needs to be executed less times, so // we need to divide the cost of the vector loops by the width of // the vector elements. float VectorCost = expectedCost(i) / (float)i; - DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " << + DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " << (int)VectorCost << ".\n"); if (VectorCost < Cost) { Cost = VectorCost; @@ -4256,8 +4613,20 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, else if (UF < 1) UF = 1; - if (Legal->getReductionVars()->size()) { - DEBUG(dbgs() << "LV: Unrolling because of reductions.
\n"); + bool HasReductions = Legal->getReductionVars()->size(); + + // Decide if we want to unroll if we decided that it is legal to vectorize + // but not profitable. + if (VF == 1) { + if (TheLoop->getNumBlocks() > 1 || !HasReductions || + LoopCost > SmallLoopCost) + return 1; + + return UF; + } + + if (HasReductions) { + DEBUG(dbgs() << "LV: Unrolling because of reductions.\n"); return UF; } @@ -4265,14 +4634,14 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, // We assume that the cost overhead is 1 and we use the cost model // to estimate the cost of the loop and unroll until the cost of the // loop overhead is about 5% of the cost of the loop. - DEBUG(dbgs() << "LV: Loop cost is "<< LoopCost <<" \n"); - if (LoopCost < 20) { - DEBUG(dbgs() << "LV: Unrolling to reduce branch cost. \n"); - unsigned NewUF = 20/LoopCost + 1; + DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); + if (LoopCost < SmallLoopCost) { + DEBUG(dbgs() << "LV: Unrolling to reduce branch cost.\n"); + unsigned NewUF = SmallLoopCost / (LoopCost + 1); return std::min(NewUF, UF); } - DEBUG(dbgs() << "LV: Not Unrolling. \n"); + DEBUG(dbgs() << "LV: Not Unrolling.\n"); return 1; } @@ -4373,16 +4742,16 @@ LoopVectorizationCostModel::calculateRegisterUsage() { MaxUsage = std::max(MaxUsage, OpenIntervals.size()); DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " << - OpenIntervals.size() <<"\n"); + OpenIntervals.size() << '\n'); // Add the current instruction to the list of open intervals. OpenIntervals.insert(I); } unsigned Invariant = LoopInvariants.size(); - DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << " \n"); - DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << " \n"); - DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << " \n"); + DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n'); + DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n'); + DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n'); R.LoopInvariantRegs = Invariant; R.MaxLocalUsers = MaxUsage; @@ -4406,8 +4775,8 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { unsigned C = getInstructionCost(it, VF); BlockCost += C; - DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF " << - VF << " For instruction: "<< *it << "\n"); + DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " << + VF << " For instruction: " << *it << '\n'); } // We assume that if-converted blocks have a 50% chance of being executed. @@ -4669,13 +5038,16 @@ char LoopVectorize::ID = 0; static const char lv_name[] = "Loop Vectorization"; INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) namespace llvm { - Pass *createLoopVectorizePass() { - return new LoopVectorize(); + Pass *createLoopVectorizePass(bool NoUnrolling) { + return new LoopVectorize(NoUnrolling); } } @@ -4690,3 +5062,96 @@ bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { return false; } + + +void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr) { + assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); + // Holds vector parameters or scalars, in case of uniform vals. 
+ SmallVector<VectorParts, 4> Params; + + setDebugLocFromInst(Builder, Instr); + + // Find all of the vectorized parameters. + for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { + Value *SrcOp = Instr->getOperand(op); + + // If we are accessing the old induction variable, use the new one. + if (SrcOp == OldInduction) { + Params.push_back(getVectorValue(SrcOp)); + continue; + } + + // Try using previously calculated values. + Instruction *SrcInst = dyn_cast<Instruction>(SrcOp); + + // If the src is an instruction that appeared earlier in the basic block + // then it should already be vectorized. + if (SrcInst && OrigLoop->contains(SrcInst)) { + assert(WidenMap.has(SrcInst) && "Source operand is unavailable"); + // The parameter is a vector value from earlier. + Params.push_back(WidenMap.get(SrcInst)); + } else { + // The parameter is a scalar from outside the loop. Maybe even a constant. + VectorParts Scalars; + Scalars.append(UF, SrcOp); + Params.push_back(Scalars); + } + } + + assert(Params.size() == Instr->getNumOperands() && + "Invalid number of operands"); + + // Does this instruction return a value ? + bool IsVoidRetTy = Instr->getType()->isVoidTy(); + + Value *UndefVec = IsVoidRetTy ? 0 : + UndefValue::get(Instr->getType()); + // Create a new entry in the WidenMap and initialize it to Undef or Null. + VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); + + // For each vector unroll 'part': + for (unsigned Part = 0; Part < UF; ++Part) { + // For each scalar that we create: + + Instruction *Cloned = Instr->clone(); + if (!IsVoidRetTy) + Cloned->setName(Instr->getName() + ".cloned"); + // Replace the operands of the cloned instructions with extracted scalars. + for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { + Value *Op = Params[op][Part]; + Cloned->setOperand(op, Op); + } + + // Place the cloned scalar in the new loop. + Builder.Insert(Cloned); + + // If the original scalar returns a value we need to place it in a vector + // so that future users will be able to use it. + if (!IsVoidRetTy) + VecResults[Part] = Cloned; + } +} + +void +InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr, + LoopVectorizationLegality*) { + return scalarizeInstruction(Instr); +} + +Value *InnerLoopUnroller::reverseVector(Value *Vec) { + return Vec; +} + +Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { + return V; +} + +Value *InnerLoopUnroller::getConsecutiveVector(Value* Val, int StartIdx, + bool Negate) { + // When unrolling and the VF is 1, we only need to add a simple scalar. 
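+  // For example (a sketch): with an i64 induction value, StartIdx == 1 and
+  // Negate == false, this emits "add i64 %val, 1" named "induction" instead
+  // of the <VF x i64> step vector the widening vectorizer would build.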
+ Type *ITy = Val->getType(); + assert(!ITy->isVectorTy() && "Val must be a scalar"); + Constant *C = ConstantInt::get(ITy, StartIdx, Negate); + return Builder.CreateAdd(Val, C, "induction"); +} + diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 9312b4b..c72b51f 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -25,8 +25,8 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/Verifier.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/DataLayout.h" @@ -49,34 +49,24 @@ static cl::opt<int> SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number ")); + +static cl::opt<bool> +ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden, + cl::desc("Attempt to vectorize horizontal reductions")); + +static cl::opt<bool> ShouldStartVectorizeHorAtStore( + "slp-vectorize-hor-store", cl::init(false), cl::Hidden, + cl::desc( + "Attempt to vectorize horizontal reductions feeding into a store")); + namespace { static const unsigned MinVecRegSize = 128; static const unsigned RecursionMaxDepth = 12; -/// RAII pattern to save the insertion point of the IR builder. -class BuilderLocGuard { -public: - BuilderLocGuard(IRBuilder<> &B) : Builder(B), Loc(B.GetInsertPoint()), - DbgLoc(B.getCurrentDebugLocation()) {} - ~BuilderLocGuard() { - Builder.SetCurrentDebugLocation(DbgLoc); - if (Loc) - Builder.SetInsertPoint(Loc); - } - -private: - // Prevent copying. - BuilderLocGuard(const BuilderLocGuard &); - BuilderLocGuard &operator=(const BuilderLocGuard &); - IRBuilder<> &Builder; - AssertingVH<Instruction> Loc; - DebugLoc DbgLoc; -}; - -/// A helper class for numbering instructions in multible blocks. -/// Numbers starts at zero for each basic block. +/// A helper class for numbering instructions in multiple blocks. +/// Numbers start at zero for each basic block. struct BlockNumbering { BlockNumbering(BasicBlock *Bb) : BB(Bb), Valid(false) {} @@ -173,6 +163,37 @@ static unsigned getSameOpcode(ArrayRef<Value *> VL) { return Opcode; } +/// \returns \p I after propagating metadata from \p VL. +static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) { + Instruction *I0 = cast<Instruction>(VL[0]); + SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata; + I0->getAllMetadataOtherThanDebugLoc(Metadata); + + for (unsigned i = 0, n = Metadata.size(); i != n; ++i) { + unsigned Kind = Metadata[i].first; + MDNode *MD = Metadata[i].second; + + for (int i = 1, e = VL.size(); MD && i != e; i++) { + Instruction *I = cast<Instruction>(VL[i]); + MDNode *IMD = I->getMetadata(Kind); + + switch (Kind) { + default: + MD = 0; // Remove unknown metadata + break; + case LLVMContext::MD_tbaa: + MD = MDNode::getMostGenericTBAA(MD, IMD); + break; + case LLVMContext::MD_fpmath: + MD = MDNode::getMostGenericFPMath(MD, IMD); + break; + } + } + I->setMetadata(Kind, MD); + } + return I; +} + /// \returns The type that all of the values in \p VL have or null if there /// are different types. 
static Type* getSameType(ArrayRef<Value *> VL) { @@ -216,6 +237,104 @@ static bool CanReuseExtract(ArrayRef<Value *> VL) { return true; } +static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, + SmallVectorImpl<Value *> &Left, + SmallVectorImpl<Value *> &Right) { + + SmallVector<Value *, 16> OrigLeft, OrigRight; + + bool AllSameOpcodeLeft = true; + bool AllSameOpcodeRight = true; + for (unsigned i = 0, e = VL.size(); i != e; ++i) { + Instruction *I = cast<Instruction>(VL[i]); + Value *V0 = I->getOperand(0); + Value *V1 = I->getOperand(1); + + OrigLeft.push_back(V0); + OrigRight.push_back(V1); + + Instruction *I0 = dyn_cast<Instruction>(V0); + Instruction *I1 = dyn_cast<Instruction>(V1); + + // Check whether all operands on one side have the same opcode. In this case + // we want to preserve the original order and not make things worse by + // reordering. + AllSameOpcodeLeft = I0; + AllSameOpcodeRight = I1; + + if (i && AllSameOpcodeLeft) { + if(Instruction *P0 = dyn_cast<Instruction>(OrigLeft[i-1])) { + if(P0->getOpcode() != I0->getOpcode()) + AllSameOpcodeLeft = false; + } else + AllSameOpcodeLeft = false; + } + if (i && AllSameOpcodeRight) { + if(Instruction *P1 = dyn_cast<Instruction>(OrigRight[i-1])) { + if(P1->getOpcode() != I1->getOpcode()) + AllSameOpcodeRight = false; + } else + AllSameOpcodeRight = false; + } + + // Sort two opcodes. In the code below we try to preserve the ability to use + // broadcast of values instead of individual inserts. + // vl1 = load + // vl2 = phi + // vr1 = load + // vr2 = vr1 + //    = vl1 x vr1 + //    = vl2 x vr2 + // If we just sorted according to opcode we would leave the first line + // intact but we would swap vl2 with vr2 because opcode(phi) > opcode(load). + //    = vl1 x vr1 + //    = vr2 x vl2 + // Because vr2 and vr1 are from the same load we lose the opportunity of a + // broadcast for the packed right side in the backend: we have [vr1, vl2] + // instead of [vr1, vr2=vr1]. + if (I0 && I1) { + if(!i && I0->getOpcode() > I1->getOpcode()) { + Left.push_back(I1); + Right.push_back(I0); + } else if (i && I0->getOpcode() > I1->getOpcode() && Right[i-1] != I1) { + // Try not to destroy a broadcast for no apparent benefit. + Left.push_back(I1); + Right.push_back(I0); + } else if (i && I0->getOpcode() == I1->getOpcode() && Right[i-1] == I0) { + // Try to preserve broadcasts. + Left.push_back(I1); + Right.push_back(I0); + } else if (i && I0->getOpcode() == I1->getOpcode() && Left[i-1] == I1) { + // Try to preserve broadcasts. + Left.push_back(I1); + Right.push_back(I0); + } else { + Left.push_back(I0); + Right.push_back(I1); + } + continue; + } + // One opcode, put the instruction on the right. + if (I0) { + Left.push_back(V1); + Right.push_back(I0); + continue; + } + Left.push_back(V0); + Right.push_back(V1); + } + + bool LeftBroadcast = isSplat(Left); + bool RightBroadcast = isSplat(Right); + + // Don't reorder if the operands were good to begin with. + if (!(LeftBroadcast || RightBroadcast) && + (AllSameOpcodeRight || AllSameOpcodeLeft)) { + Left = OrigLeft; + Right = OrigRight; + } +} + /// Bottom Up SLP Vectorizer. class BoUpSLP { public: @@ -238,17 +357,20 @@ public: } /// \brief Vectorize the tree that starts with the elements in \p VL. - void vectorizeTree(); + /// Returns the vectorized root. + Value *vectorizeTree(); /// \returns the vectorization cost of the subtree that starts at \p VL. /// A negative number means that this is profitable. int getTreeCost(); - /// Construct a vectorizable tree that starts at \p Roots.
- void buildTree(ArrayRef<Value *> Roots); + /// Construct a vectorizable tree that starts at \p Roots and is possibly + /// used by a reduction of \p RdxOps. + void buildTree(ArrayRef<Value *> Roots, ValueSet *RdxOps = 0); /// Clear the internal data structures that are created by 'buildTree'. void deleteTree() { + RdxOps = 0; VectorizableTree.clear(); ScalarToTreeEntry.clear(); MustGather.clear(); @@ -278,7 +400,7 @@ private: /// \returns the pointer to the vectorized value if \p VL is already /// vectorized, or NULL. They may happen in cycles. - Value *alreadyVectorized(ArrayRef<Value *> VL); + Value *alreadyVectorized(ArrayRef<Value *> VL) const; /// \brief Take the pointer operand from the Load/Store instruction. /// \returns NULL if this is not a valid Load/Store instruction. @@ -305,26 +427,31 @@ private: /// \returns the pointer to the barrier instruction if we can't sink. Value *getSinkBarrier(Instruction *Src, Instruction *Dst); - /// \returns the index of the last instrucion in the BB from \p VL. + /// \returns the index of the last instruction in the BB from \p VL. int getLastIndex(ArrayRef<Value *> VL); - /// \returns the Instrucion in the bundle \p VL. + /// \returns the Instruction in the bundle \p VL. Instruction *getLastInstruction(ArrayRef<Value *> VL); + /// \brief Set the Builder insert point to one after the last instruction in + /// the bundle. + void setInsertPointAfterBundle(ArrayRef<Value *> VL); + /// \returns a vector from a collection of scalars in \p VL. Value *Gather(ArrayRef<Value *> VL, VectorType *Ty); + /// \returns whether the VectorizableTree is fully vectorizable and will + /// be beneficial even when the tree height is tiny. + bool isFullyVectorizableTinyTree(); + struct TreeEntry { TreeEntry() : Scalars(), VectorizedValue(0), LastScalarIndex(0), NeedToGather(0) {} /// \returns true if the scalars in VL are equal to this entry. - bool isSame(ArrayRef<Value *> VL) { + bool isSame(ArrayRef<Value *> VL) const { assert(VL.size() == Scalars.size() && "Invalid size"); - for (int i = 0, e = VL.size(); i != e; ++i) - if (VL[i] != Scalars[i]) - return false; - return true; + return std::equal(VL.begin(), VL.end(), Scalars.begin()); } /// A vector of scalars. @@ -393,10 +520,15 @@ private: /// Holds all of the instructions that we gathered. SetVector<Instruction *> GatherSeq; + /// A list of blocks that we are going to CSE. + SmallSet<BasicBlock *, 8> CSEBlocks; /// Numbers instructions in different blocks. DenseMap<BasicBlock *, BlockNumbering> BlocksNumbers; + /// Reduction operators. + ValueSet *RdxOps; + // Analysis and block reference. Function *F; ScalarEvolution *SE; @@ -409,8 +541,9 @@ private: IRBuilder<> Builder; }; -void BoUpSLP::buildTree(ArrayRef<Value *> Roots) { +void BoUpSLP::buildTree(ArrayRef<Value *> Roots, ValueSet *Rdx) { deleteTree(); + RdxOps = Rdx; if (!getSameType(Roots)) return; buildTree_rec(Roots, 0); @@ -431,18 +564,20 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots) { UE = Scalar->use_end(); User != UE; ++User) { DEBUG(dbgs() << "SLP: Checking user:" << **User << ".\n"); - bool Gathered = MustGather.count(*User); - // Skip in-tree scalars that become vectors.
- if (ScalarToTreeEntry.count(*User) && !Gathered) { + if (ScalarToTreeEntry.count(*User)) { DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << **User << ".\n"); int Idx = ScalarToTreeEntry[*User]; (void) Idx; assert(!VectorizableTree[Idx].NeedToGather && "Bad state"); continue; } + Instruction *UserInst = dyn_cast<Instruction>(*User); + if (!UserInst) + continue; - if (!isa<Instruction>(*User)) + // Ignore uses that are part of the reduction. + if (Rdx && std::find(Rdx->begin(), Rdx->end(), UserInst) != Rdx->end()) continue; DEBUG(dbgs() << "SLP: Need to extract:" << **User << " from lane " << @@ -574,6 +709,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { continue; } + // This user is part of the reduction. + if (RdxOps && RdxOps->count(User)) + continue; + // Make sure that we can schedule this unknown user. BlockNumbering &BN = BlocksNumbers[BB]; int UserIndex = BN.getIndex(User); @@ -635,6 +774,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { switch (Opcode) { case Instruction::PHI: { PHINode *PH = dyn_cast<PHINode>(VL0); + + // Check for terminator values (e.g. invoke). + for (unsigned j = 0; j < VL.size(); ++j) + for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { + TerminatorInst *Term = dyn_cast<TerminatorInst>(cast<PHINode>(VL[j])->getIncomingValue(i)); + if (Term) { + DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n"); + newTreeEntry(VL, false); + return; + } + } + newTreeEntry(VL, true); DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); @@ -658,13 +809,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { } case Instruction::Load: { // Check if the loads are consecutive or of we need to swizzle them. - for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) - if (!isConsecutiveAccess(VL[i], VL[i + 1])) { + for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) { + LoadInst *L = cast<LoadInst>(VL[i]); + if (!L->isSimple() || !isConsecutiveAccess(VL[i], VL[i + 1])) { newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Need to swizzle loads.\n"); return; } - + } newTreeEntry(VL, true); DEBUG(dbgs() << "SLP: added a vector of loads.\n"); return; @@ -753,6 +905,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { newTreeEntry(VL, true); DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); + // Sort operands of the instructions so that each side is more likely to + // have the same opcode. + if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) { + ValueList Left, Right; + reorderInputsAccordingToOpcode(VL, Left, Right); + buildTree_rec(Left, Depth + 1); + buildTree_rec(Right, Depth + 1); + return; + } + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. @@ -874,9 +1036,24 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty()); VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy); } else { - ScalarCost = VecTy->getNumElements() * - TTI->getArithmeticInstrCost(Opcode, ScalarTy); - VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy); + // Certain instructions can be cheaper to vectorize if they have a + // constant second vector operand. + TargetTransformInfo::OperandValueKind Op1VK = + TargetTransformInfo::OK_AnyValue; + TargetTransformInfo::OperandValueKind Op2VK = + TargetTransformInfo::OK_UniformConstantValue; + + // Check whether all second operands are constant. 
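+      // For example (a sketch): a bundle of "shl i32 %x, 3" instructions
+      // keeps Op2VK at OK_UniformConstantValue, which many targets cost
+      // more cheaply than the OK_AnyValue case of "shl i32 %x, %y".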
+ for (unsigned i = 0; i < VL.size(); ++i) + if (!isa<ConstantInt>(cast<Instruction>(VL[i])->getOperand(1))) { + Op2VK = TargetTransformInfo::OK_AnyValue; + break; + } + + ScalarCost = + VecTy->getNumElements() * + TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK); + VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK); } return VecCost - ScalarCost; } @@ -884,14 +1061,14 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { // Cost of wide load - cost of scalar loads. int ScalarLdCost = VecTy->getNumElements() * TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); - int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); + int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0); return VecLdCost - ScalarLdCost; } case Instruction::Store: { // We know that we can merge the stores. Calculate the cost. int ScalarStCost = VecTy->getNumElements() * TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0); - int VecStCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0); + int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0); return VecStCost - ScalarStCost; } default: @@ -899,19 +1076,32 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { } } +bool BoUpSLP::isFullyVectorizableTinyTree() { + DEBUG(dbgs() << "SLP: Check whether the tree with height " << + VectorizableTree.size() << " is fully vectorizable.\n"); + + // We only handle trees of height 2. + if (VectorizableTree.size() != 2) + return false; + + // Gathering cost would be too much for tiny trees. + if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather) + return false; + + return true; +} + int BoUpSLP::getTreeCost() { int Cost = 0; DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << VectorizableTree.size() << ".\n"); - // Don't vectorize tiny trees. Small load/store chains or consecutive stores - // of constants will be vectoried in SelectionDAG in MergeConsecutiveStores. - // The SelectionDAG vectorizer can only handle pairs (trees of height = 2). - if (VectorizableTree.size() < 3) { + // We only vectorize tiny trees if they are fully vectorizable. + if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) { if (!VectorizableTree.size()) { assert(!ExternalUses.size() && "We should not have any external users"); } - return 0; + return INT_MAX; } unsigned BundleWidth = VectorizableTree[0].Scalars.size(); @@ -992,63 +1182,29 @@ bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) { if (PtrA == PtrB || PtrA->getType() != PtrB->getType()) return false; - // Calculate a constant offset from the base pointer without using SCEV - // in the supported cases. - // TODO: Add support for the case where one of the pointers is a GEP that - // uses the other pointer. - GetElementPtrInst *GepA = dyn_cast<GetElementPtrInst>(PtrA); - GetElementPtrInst *GepB = dyn_cast<GetElementPtrInst>(PtrB); - - unsigned BW = DL->getPointerSizeInBits(ASA); + unsigned PtrBitWidth = DL->getPointerSizeInBits(ASA); Type *Ty = cast<PointerType>(PtrA->getType())->getElementType(); - int64_t Sz = DL->getTypeStoreSize(Ty); + APInt Size(PtrBitWidth, DL->getTypeStoreSize(Ty)); - // Check if PtrA is the base and PtrB is a constant offset.
- if (GepB && GepB->getPointerOperand() == PtrA) { - APInt Offset(BW, 0); - if (GepB->accumulateConstantOffset(*DL, Offset)) - return Offset.getSExtValue() == Sz; - return false; - } + APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0); + PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetA); + PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetB); - // Check if PtrB is the base and PtrA is a constant offset. - if (GepA && GepA->getPointerOperand() == PtrB) { - APInt Offset(BW, 0); - if (GepA->accumulateConstantOffset(*DL, Offset)) - return Offset.getSExtValue() == -Sz; - return false; - } + APInt OffsetDelta = OffsetB - OffsetA; - // If both pointers are GEPs: - if (GepA && GepB) { - // Check that they have the same base pointer and number of indices. - if (GepA->getPointerOperand() != GepB->getPointerOperand() || - GepA->getNumIndices() != GepB->getNumIndices()) - return false; + // Check if they are based on the same pointer. That makes the offsets + // sufficient. + if (PtrA == PtrB) + return OffsetDelta == Size; - // Try to strip the geps. This makes SCEV faster. - // Make sure that all of the indices except for the last are identical. - int LastIdx = GepA->getNumIndices(); - for (int i = 0; i < LastIdx - 1; i++) { - if (GepA->getOperand(i+1) != GepB->getOperand(i+1)) - return false; - } - - PtrA = GepA->getOperand(LastIdx); - PtrB = GepB->getOperand(LastIdx); - Sz = 1; - } - - ConstantInt *CA = dyn_cast<ConstantInt>(PtrA); - ConstantInt *CB = dyn_cast<ConstantInt>(PtrB); - if (CA && CB) { - return (CA->getSExtValue() + Sz == CB->getSExtValue()); - } + // Compute the necessary base pointer delta to have the necessary final delta + // equal to the size. + APInt BaseDelta = Size - OffsetDelta; - // Calculate the distance. + // Otherwise compute the distance with SCEV between the base pointers. const SCEV *PtrSCEVA = SE->getSCEV(PtrA); const SCEV *PtrSCEVB = SE->getSCEV(PtrB); - const SCEV *C = SE->getConstant(PtrSCEVA->getType(), Sz); + const SCEV *C = SE->getConstant(BaseDelta); const SCEV *X = SE->getAddExpr(PtrSCEVA, C); return X == PtrSCEVB; } @@ -1102,6 +1258,15 @@ Instruction *BoUpSLP::getLastInstruction(ArrayRef<Value *> VL) { return I; } +void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) { + Instruction *VL0 = cast<Instruction>(VL[0]); + Instruction *LastInst = getLastInstruction(VL); + BasicBlock::iterator NextInst = LastInst; + ++NextInst; + Builder.SetInsertPoint(VL0->getParent(), NextInst); + Builder.SetCurrentDebugLocation(VL0->getDebugLoc()); +} + Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) { Value *Vec = UndefValue::get(Ty); // Generate the 'InsertElement' instruction. @@ -1109,6 +1274,7 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) { Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i)); if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) { GatherSeq.insert(Insrt); + CSEBlocks.insert(Insrt->getParent()); // Add to our 'need-to-extract' list. 
if (ScalarToTreeEntry.count(VL[i])) { @@ -1132,10 +1298,12 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) { return Vec; } -Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) { - if (ScalarToTreeEntry.count(VL[0])) { - int Idx = ScalarToTreeEntry[VL[0]]; - TreeEntry *En = &VectorizableTree[Idx]; +Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const { + SmallDenseMap<Value*, int>::const_iterator Entry + = ScalarToTreeEntry.find(VL[0]); + if (Entry != ScalarToTreeEntry.end()) { + int Idx = Entry->second; + const TreeEntry *En = &VectorizableTree[Idx]; if (En->isSame(VL) && En->VectorizedValue) return En->VectorizedValue; } @@ -1159,38 +1327,48 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { } Value *BoUpSLP::vectorizeTree(TreeEntry *E) { - BuilderLocGuard Guard(Builder); + IRBuilder<>::InsertPointGuard Guard(Builder); if (E->VectorizedValue) { DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); return E->VectorizedValue; } - Type *ScalarTy = E->Scalars[0]->getType(); - if (StoreInst *SI = dyn_cast<StoreInst>(E->Scalars[0])) + Instruction *VL0 = cast<Instruction>(E->Scalars[0]); + Type *ScalarTy = VL0->getType(); + if (StoreInst *SI = dyn_cast<StoreInst>(VL0)) ScalarTy = SI->getValueOperand()->getType(); VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size()); if (E->NeedToGather) { + setInsertPointAfterBundle(E->Scalars); return Gather(E->Scalars, VecTy); } - Instruction *VL0 = cast<Instruction>(E->Scalars[0]); unsigned Opcode = VL0->getOpcode(); assert(Opcode == getSameOpcode(E->Scalars) && "Invalid opcode"); switch (Opcode) { case Instruction::PHI: { PHINode *PH = dyn_cast<PHINode>(VL0); - Builder.SetInsertPoint(PH->getParent()->getFirstInsertionPt()); + Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); E->VectorizedValue = NewPhi; + // PHINodes may have multiple entries from the same block. We want to + // visit every block once. + SmallSet<BasicBlock*, 4> VisitedBBs; + for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { ValueList Operands; BasicBlock *IBB = PH->getIncomingBlock(i); + if (!VisitedBBs.insert(IBB)) { + NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB); + continue; + } + // Prepare the operand vector. 
for (unsigned j = 0; j < E->Scalars.size(); ++j) Operands.push_back(cast<PHINode>(E->Scalars[j])-> @@ -1231,8 +1409,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { for (int i = 0, e = E->Scalars.size(); i < e; ++i) INVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); - Builder.SetInsertPoint(getLastInstruction(E->Scalars)); - Builder.SetCurrentDebugLocation(VL0->getDebugLoc()); + setInsertPointAfterBundle(E->Scalars); Value *InVec = vectorizeTree(INVL); @@ -1252,8 +1429,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { RHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); } - Builder.SetInsertPoint(getLastInstruction(E->Scalars)); - Builder.SetCurrentDebugLocation(VL0->getDebugLoc()); + setInsertPointAfterBundle(E->Scalars); Value *L = vectorizeTree(LHSV); Value *R = vectorizeTree(RHSV); @@ -1279,8 +1455,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { FalseVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(2)); } - Builder.SetInsertPoint(getLastInstruction(E->Scalars)); - Builder.SetCurrentDebugLocation(VL0->getDebugLoc()); + setInsertPointAfterBundle(E->Scalars); Value *Cond = vectorizeTree(CondVec); Value *True = vectorizeTree(TrueVec); @@ -1288,7 +1463,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (Value *V = alreadyVectorized(E->Scalars)) return V; - + Value *V = Builder.CreateSelect(Cond, True, False); E->VectorizedValue = V; return V; @@ -1312,13 +1487,15 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::Or: case Instruction::Xor: { ValueList LHSVL, RHSVL; - for (int i = 0, e = E->Scalars.size(); i < e; ++i) { - LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); - RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); - } + if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) + reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL); + else + for (int i = 0, e = E->Scalars.size(); i < e; ++i) { + LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); + RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); + } - Builder.SetInsertPoint(getLastInstruction(E->Scalars)); - Builder.SetCurrentDebugLocation(VL0->getDebugLoc()); + setInsertPointAfterBundle(E->Scalars); Value *LHS = vectorizeTree(LHSVL); Value *RHS = vectorizeTree(RHSVL); @@ -1333,41 +1510,46 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { BinaryOperator *BinOp = cast<BinaryOperator>(VL0); Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS); E->VectorizedValue = V; + + if (Instruction *I = dyn_cast<Instruction>(V)) + return propagateMetadata(I, E->Scalars); + return V; } case Instruction::Load: { // Loads are inserted at the head of the tree because we don't want to // sink them all the way down past store instructions. 
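 // A sketch of the sequence emitted below for a bundle of four consecutive
 // i32 loads (3.4-era IR syntax):
 //   %vptr = bitcast i32* %p to <4 x i32>*
 //   %vld  = load <4 x i32>* %vptr
 // with the scalar loads' alignment and metadata (e.g. TBAA) carried over
 // via propagateMetadata.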
- Builder.SetInsertPoint(getLastInstruction(E->Scalars)); - Builder.SetCurrentDebugLocation(VL0->getDebugLoc()); + setInsertPointAfterBundle(E->Scalars); LoadInst *LI = cast<LoadInst>(VL0); - Value *VecPtr = - Builder.CreateBitCast(LI->getPointerOperand(), VecTy->getPointerTo()); + unsigned AS = LI->getPointerAddressSpace(); + + Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(), + VecTy->getPointerTo(AS)); unsigned Alignment = LI->getAlignment(); LI = Builder.CreateLoad(VecPtr); LI->setAlignment(Alignment); E->VectorizedValue = LI; - return LI; + return propagateMetadata(LI, E->Scalars); } case Instruction::Store: { StoreInst *SI = cast<StoreInst>(VL0); unsigned Alignment = SI->getAlignment(); + unsigned AS = SI->getPointerAddressSpace(); ValueList ValueOp; for (int i = 0, e = E->Scalars.size(); i < e; ++i) ValueOp.push_back(cast<StoreInst>(E->Scalars[i])->getValueOperand()); - Builder.SetInsertPoint(getLastInstruction(E->Scalars)); - Builder.SetCurrentDebugLocation(VL0->getDebugLoc()); + setInsertPointAfterBundle(E->Scalars); Value *VecValue = vectorizeTree(ValueOp); - Value *VecPtr = - Builder.CreateBitCast(SI->getPointerOperand(), VecTy->getPointerTo()); + Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(), + VecTy->getPointerTo(AS)); StoreInst *S = Builder.CreateStore(VecValue, VecPtr); S->setAlignment(Alignment); E->VectorizedValue = S; - return S; + return propagateMetadata(S, E->Scalars); } default: llvm_unreachable("unknown inst"); @@ -1375,7 +1557,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return 0; } -void BoUpSLP::vectorizeTree() { +Value *BoUpSLP::vectorizeTree() { Builder.SetInsertPoint(F->getEntryBlock().begin()); vectorizeTree(&VectorizableTree[0]); @@ -1407,6 +1589,7 @@ void BoUpSLP::vectorizeTree() { if (PHINode *PN = dyn_cast<PHINode>(Vec)) { Builder.SetInsertPoint(PN->getParent()->getFirstInsertionPt()); Value *Ex = Builder.CreateExtractElement(Vec, Lane); + CSEBlocks.insert(PN->getParent()); User->replaceUsesOfWith(Scalar, Ex); } else if (isa<Instruction>(Vec)){ if (PHINode *PH = dyn_cast<PHINode>(User)) { @@ -1414,17 +1597,20 @@ void BoUpSLP::vectorizeTree() { if (PH->getIncomingValue(i) == Scalar) { Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator()); Value *Ex = Builder.CreateExtractElement(Vec, Lane); + CSEBlocks.insert(PH->getIncomingBlock(i)); PH->setOperand(i, Ex); } } } else { Builder.SetInsertPoint(cast<Instruction>(User)); Value *Ex = Builder.CreateExtractElement(Vec, Lane); + CSEBlocks.insert(cast<Instruction>(User)->getParent()); User->replaceUsesOfWith(Scalar, Ex); } } else { Builder.SetInsertPoint(F->getEntryBlock().begin()); Value *Ex = Builder.CreateExtractElement(Vec, Lane); + CSEBlocks.insert(&F->getEntryBlock()); User->replaceUsesOfWith(Scalar, Ex); } @@ -1450,9 +1636,10 @@ void BoUpSLP::vectorizeTree() { for (Value::use_iterator User = Scalar->use_begin(), UE = Scalar->use_end(); User != UE; ++User) { DEBUG(dbgs() << "SLP: \tvalidating user:" << **User << ".\n"); - assert(!MustGather.count(*User) && - "Replacing gathered value with undef"); - assert(ScalarToTreeEntry.count(*User) && + + assert((ScalarToTreeEntry.count(*User) || + // It is legal to replace the reduction users by undef. 
+ (RdxOps && RdxOps->count(*User))) && "Replacing out-of-tree value with undef"); } Value *Undef = UndefValue::get(Ty); @@ -1467,8 +1654,20 @@ void BoUpSLP::vectorizeTree() { BlocksNumbers[it].forget(); } Builder.ClearInsertionPoint(); + + return VectorizableTree[0].VectorizedValue; } +class DTCmp { + const DominatorTree *DT; + +public: + DTCmp(const DominatorTree *DT) : DT(DT) {} + bool operator()(const BasicBlock *A, const BasicBlock *B) const { + return DT->properlyDominates(A, B); + } +}; + void BoUpSLP::optimizeGatherSequence() { DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size() << " gather sequences instructions.\n"); @@ -1504,45 +1703,48 @@ void BoUpSLP::optimizeGatherSequence() { Insert->moveBefore(PreHeader->getTerminator()); } + // Sort blocks by domination. This ensures we visit a block after all blocks + // dominating it are visited. + SmallVector<BasicBlock *, 8> CSEWorkList(CSEBlocks.begin(), CSEBlocks.end()); + std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(), DTCmp(DT)); + // Perform O(N^2) search over the gather sequences and merge identical // instructions. TODO: We can further optimize this scan if we split the // instructions into different buckets based on the insert lane. - SmallPtrSet<Instruction*, 16> Visited; - SmallVector<Instruction*, 16> ToRemove; - ReversePostOrderTraversal<Function*> RPOT(F); - for (ReversePostOrderTraversal<Function*>::rpo_iterator I = RPOT.begin(), - E = RPOT.end(); I != E; ++I) { + SmallVector<Instruction *, 16> Visited; + for (SmallVectorImpl<BasicBlock *>::iterator I = CSEWorkList.begin(), + E = CSEWorkList.end(); + I != E; ++I) { + assert((I == CSEWorkList.begin() || !DT->dominates(*I, *llvm::prior(I))) && + "Worklist not sorted properly!"); BasicBlock *BB = *I; - // For all instructions in the function: - for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { - Instruction *In = it; - if ((!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In)) || - !GatherSeq.count(In)) + // For all instructions in blocks containing gather sequences: + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) { + Instruction *In = it++; + if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In)) continue; // Check if we can replace this instruction with any of the // visited instructions. - for (SmallPtrSet<Instruction*, 16>::iterator v = Visited.begin(), - ve = Visited.end(); v != ve; ++v) { + for (SmallVectorImpl<Instruction *>::iterator v = Visited.begin(), + ve = Visited.end(); + v != ve; ++v) { if (In->isIdenticalTo(*v) && DT->dominates((*v)->getParent(), In->getParent())) { In->replaceAllUsesWith(*v); - ToRemove.push_back(In); + In->eraseFromParent(); In = 0; break; } } - if (In) - Visited.insert(In); + if (In) { + assert(std::find(Visited.begin(), Visited.end(), In) == Visited.end()); + Visited.push_back(In); + } } } - - // Erase all of the instructions that we RAUWed. - for (SmallVectorImpl<Instruction *>::iterator v = ToRemove.begin(), - ve = ToRemove.end(); v != ve; ++v) { - assert((*v)->getNumUses() == 0 && "Can't remove instructions with uses"); - (*v)->eraseFromParent(); - } + CSEBlocks.clear(); + GatherSeq.clear(); } /// The SLPVectorizer Pass. @@ -1575,14 +1777,18 @@ struct SLPVectorizer : public FunctionPass { StoreRefs.clear(); bool Changed = false; + // If the target claims to have no vector registers don't attempt + // vectorization. + if (!TTI->getNumberOfRegisters(true)) + return false; + // Must have DataLayout. We can't require it because some tests run w/o // triple. 
if (!DL) return false; // Don't vectorize when the attribute NoImplicitFloat is used. - if (F.getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::NoImplicitFloat)) + if (F.hasFnAttribute(Attribute::NoImplicitFloat)) return false; DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n"); @@ -1661,6 +1867,21 @@ private: StoreListMap StoreRefs; }; +/// \brief Check that the Values in the slice of the VL array are still existent +/// in the WeakVH array. +/// Vectorization of part of the VL array may cause later values in the VL array +/// to become invalid. We track when this has happened in the WeakVH array. +static bool hasValueBeenRAUWed(ArrayRef<Value *> &VL, + SmallVectorImpl<WeakVH> &VH, + unsigned SliceBegin, + unsigned SliceSize) { + for (unsigned i = SliceBegin; i < SliceBegin + SliceSize; ++i) + if (VH[i] != VL[i]) + return true; + + return false; +} + bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold, BoUpSLP &R) { unsigned ChainLen = Chain.size(); @@ -1673,11 +1894,19 @@ bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain, if (!isPowerOf2_32(Sz) || VF < 2) return false; + // Keep track of values that were deleted by vectorizing in the loop below. + SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end()); + bool Changed = false; // Look for profitable vectorizable trees at all offsets, starting at zero. for (unsigned i = 0, e = ChainLen; i < e; ++i) { if (i + VF > e) break; + + // Check that a previous iteration of this loop did not delete the Value. + if (hasValueBeenRAUWed(Chain, TrackValues, i, VF)) + continue; + DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i << "\n"); ArrayRef<Value *> Operands = Chain.slice(i, VF); @@ -1697,7 +1926,7 @@ bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain, } } - return Changed; + return Changed; } bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores, @@ -1764,15 +1993,17 @@ unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) { if (!SI) continue; + // Don't touch volatile stores. + if (!SI->isSimple()) + continue; + // Check that the pointer points to scalars. Type *Ty = SI->getValueOperand()->getType(); if (Ty->isAggregateType() || Ty->isVectorTy()) return 0; - // Find the base of the GEP. - Value *Ptr = SI->getPointerOperand(); - if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) - Ptr = GEP->getPointerOperand(); + // Find the base pointer. + Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), DL); // Save the store locations. StoreRefs[Ptr].push_back(SI); @@ -1797,28 +2028,61 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R) { // Check that all of the parts are scalar instructions of the same type. Instruction *I0 = dyn_cast<Instruction>(VL[0]); if (!I0) - return 0; + return false; unsigned Opcode0 = I0->getOpcode(); + Type *Ty0 = I0->getType(); + unsigned Sz = DL->getTypeSizeInBits(Ty0); + unsigned VF = MinVecRegSize / Sz; + for (int i = 0, e = VL.size(); i < e; ++i) { Type *Ty = VL[i]->getType(); if (Ty->isAggregateType() || Ty->isVectorTy()) - return 0; + return false; Instruction *Inst = dyn_cast<Instruction>(VL[i]); if (!Inst || Inst->getOpcode() != Opcode0) - return 0; + return false; } - R.buildTree(VL); - int Cost = R.getTreeCost(); + bool Changed = false; - if (Cost >= -SLPCostThreshold) - return false; + // Keep track of values that were deleted by vectorizing in the loop below.
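+  // A WeakVH is nulled when its value is deleted and follows RAUW, so
+  // comparing it against the original Value in hasValueBeenRAUWed() detects
+  // slices invalidated by an earlier, overlapping vectorization.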
+ SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end()); - DEBUG(dbgs() << "SLP: Vectorizing pair at cost:" << Cost << ".\n"); - R.vectorizeTree(); - return true; + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + unsigned OpsWidth = 0; + + if (i + VF > e) + OpsWidth = e - i; + else + OpsWidth = VF; + + if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2) + break; + + // Check that a previous iteration of this loop did not delete the Value. + if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth)) + continue; + + DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations " + << "\n"); + ArrayRef<Value *> Ops = VL.slice(i, OpsWidth); + + R.buildTree(Ops); + int Cost = R.getTreeCost(); + + if (Cost < -SLPCostThreshold) { + DEBUG(dbgs() << "SLP: Vectorizing pair at cost:" << Cost << ".\n"); + R.vectorizeTree(); + + // Move to the next bundle. + i += VF - 1; + Changed = true; + } + } + + return Changed; } bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) { @@ -1861,30 +2125,405 @@ bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) { return 0; } +/// \brief Generate a shuffle mask to be used in a reduction tree. +/// +/// \param VecLen The length of the vector to be reduced. +/// \param NumEltsToRdx The number of elements that should be reduced in the +/// vector. +/// \param IsPairwise Whether the reduction is a pairwise or splitting +/// reduction. A pairwise reduction will generate a mask of +/// <0,2,...> or <1,3,..> while a splitting reduction will generate +/// <2,3, undef,undef> for a vector of 4 and NumElts = 2. +/// \param IsLeft True will generate a mask of even elements, odd otherwise. +static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx, + bool IsPairwise, bool IsLeft, + IRBuilder<> &Builder) { + assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask"); + + SmallVector<Constant *, 32> ShuffleMask( + VecLen, UndefValue::get(Builder.getInt32Ty())); + + if (IsPairwise) + // Build a mask of 0, 2, ... (left) or 1, 3, ... (right). + for (unsigned i = 0; i != NumEltsToRdx; ++i) + ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft); + else + // Move the upper half of the vector to the lower half. + for (unsigned i = 0; i != NumEltsToRdx; ++i) + ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i); + + return ConstantVector::get(ShuffleMask); +} + + +/// Model horizontal reductions. +/// +/// A horizontal reduction is a tree of reduction operations (currently add and +/// fadd) that has operations that can be put into a vector as its leaf. +/// For example, this tree: +/// +/// mul mul mul mul +/// \ / \ / +/// + + +/// \ / +/// + +/// This tree has "mul" as its reduced values and "+" as its reduction +/// operations. A reduction might be feeding into a store or a binary operation +/// feeding a phi. +/// ... +/// \ / +/// + +/// | +/// phi += +/// +/// Or: +/// ... +/// \ / +/// + +/// | +/// *p = +/// +class HorizontalReduction { + SmallPtrSet<Value *, 16> ReductionOps; + SmallVector<Value *, 32> ReducedVals; + + BinaryOperator *ReductionRoot; + PHINode *ReductionPHI; + + /// The opcode of the reduction. + unsigned ReductionOpcode; + /// The opcode of the values we perform a reduction on. + unsigned ReducedValueOpcode; + /// The width of one full horizontal reduction operation. + unsigned ReduxWidth; + /// Should we model this reduction as a pairwise reduction tree or a tree that + /// splits the vector in halves and adds those halves. 
+ bool IsPairwiseReduction; + +public: + HorizontalReduction() + : ReductionRoot(0), ReductionPHI(0), ReductionOpcode(0), + ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {} + + /// \brief Try to find a reduction tree. + bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B, + DataLayout *DL) { + assert((!Phi || + std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) && + "This phi needs to use the binary operator"); + + // We could have an initial reduction that is not an add. + // r *= v1 + v2 + v3 + v4 + // In such a case start looking for a tree rooted in the first '+'. + if (Phi) { + if (B->getOperand(0) == Phi) { + Phi = 0; + B = dyn_cast<BinaryOperator>(B->getOperand(1)); + } else if (B->getOperand(1) == Phi) { + Phi = 0; + B = dyn_cast<BinaryOperator>(B->getOperand(0)); + } + } + + if (!B) + return false; + + Type *Ty = B->getType(); + if (Ty->isVectorTy()) + return false; + + ReductionOpcode = B->getOpcode(); + ReducedValueOpcode = 0; + ReduxWidth = MinVecRegSize / DL->getTypeSizeInBits(Ty); + ReductionRoot = B; + ReductionPHI = Phi; + + if (ReduxWidth < 4) + return false; + + // We currently only support adds. + if (ReductionOpcode != Instruction::Add && + ReductionOpcode != Instruction::FAdd) + return false; + + // Post order traverse the reduction tree starting at B. We only handle true + // trees containing only binary operators. + SmallVector<std::pair<BinaryOperator *, unsigned>, 32> Stack; + Stack.push_back(std::make_pair(B, 0)); + while (!Stack.empty()) { + BinaryOperator *TreeN = Stack.back().first; + unsigned EdgeToVist = Stack.back().second++; + bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode; + + // Only handle trees in the current basic block. + if (TreeN->getParent() != B->getParent()) + return false; + + // Each tree node needs to have one user except for the ultimate + // reduction. + if (!TreeN->hasOneUse() && TreeN != B) + return false; + + // Postorder visit. + if (EdgeToVist == 2 || IsReducedValue) { + if (IsReducedValue) { + // Make sure that the opcodes of the operations that we are going to + // reduce match. + if (!ReducedValueOpcode) + ReducedValueOpcode = TreeN->getOpcode(); + else if (ReducedValueOpcode != TreeN->getOpcode()) + return false; + ReducedVals.push_back(TreeN); + } else { + // We need to be able to reassociate the adds. + if (!TreeN->isAssociative()) + return false; + ReductionOps.insert(TreeN); + } + // Retract. + Stack.pop_back(); + continue; + } + + // Visit left or right. + Value *NextV = TreeN->getOperand(EdgeToVist); + BinaryOperator *Next = dyn_cast<BinaryOperator>(NextV); + if (Next) + Stack.push_back(std::make_pair(Next, 0)); + else if (NextV != Phi) + return false; + } + return true; + } + + /// \brief Attempt to vectorize the tree found by + /// matchAssociativeReduction. + bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) { + if (ReducedVals.empty()) + return false; + + unsigned NumReducedVals = ReducedVals.size(); + if (NumReducedVals < ReduxWidth) + return false; + + Value *VectorizedTree = 0; + IRBuilder<> Builder(ReductionRoot); + FastMathFlags Unsafe; + Unsafe.setUnsafeAlgebra(); + Builder.SetFastMathFlags(Unsafe); + unsigned i = 0; + + for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) { + ArrayRef<Value *> ValsToReduce(&ReducedVals[i], ReduxWidth); + V.buildTree(ValsToReduce, &ReductionOps); + + // Estimate cost.
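+      // A sketch of the accounting: getTreeCost() returns (vector - scalar)
+      // cost for the feeding tree and getReductionCost() adds the
+      // (vector - scalar) cost of the reduction itself, so we keep
+      // vectorizing only while the sum stays below -slp-threshold.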
+ int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]); + if (Cost >= -SLPCostThreshold) + break; + + DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost + << ". (HorRdx)\n"); + + // Vectorize a tree. + DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc(); + Value *VectorizedRoot = V.vectorizeTree(); + + // Emit a reduction. + Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder); + if (VectorizedTree) { + Builder.SetCurrentDebugLocation(Loc); + VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree, + ReducedSubTree, "bin.rdx"); + } else + VectorizedTree = ReducedSubTree; + } + + if (VectorizedTree) { + // Finish the reduction. + for (; i < NumReducedVals; ++i) { + Builder.SetCurrentDebugLocation( + cast<Instruction>(ReducedVals[i])->getDebugLoc()); + VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree, + ReducedVals[i]); + } + // Update users. + if (ReductionPHI) { + assert(ReductionRoot != NULL && "Need a reduction operation"); + ReductionRoot->setOperand(0, VectorizedTree); + ReductionRoot->setOperand(1, ReductionPHI); + } else + ReductionRoot->replaceAllUsesWith(VectorizedTree); + } + return VectorizedTree != 0; + } + +private: + + /// \brief Calculate the cost of a reduction. + int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) { + Type *ScalarTy = FirstReducedVal->getType(); + Type *VecTy = VectorType::get(ScalarTy, ReduxWidth); + + int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true); + int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false); + + IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost; + int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost; + + int ScalarReduxCost = + ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy); + + DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost + << " for reduction that starts with " << *FirstReducedVal + << " (It is a " + << (IsPairwiseReduction ? "pairwise" : "splitting") + << " reduction)\n"); + + return VecReduxCost - ScalarReduxCost; + } + + static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L, + Value *R, const Twine &Name = "") { + if (Opcode == Instruction::FAdd) + return Builder.CreateFAdd(L, R, Name); + return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name); + } + + /// \brief Emit a horizontal reduction of the vectorized value.
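+  /// A sketch of one splitting step for a <4 x float> fadd reduction:
+  ///   %rdx.shuf = shufflevector <4 x float> %v, <4 x float> undef,
+  ///                             <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  ///   %bin.rdx  = fadd <4 x float> %v, %rdx.shuf
+  /// followed by one more shuffle/add pair and an extractelement of lane 0.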
+  Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
+    assert(VectorizedValue && "Need to have a vectorized tree node");
+    assert(isPowerOf2_32(ReduxWidth) &&
+           "We only handle power-of-two reductions for now");
+
+    Value *TmpVec = VectorizedValue;
+    for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
+      if (IsPairwiseReduction) {
+        Value *LeftMask =
+            createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
+        Value *RightMask =
+            createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
+
+        Value *LeftShuf = Builder.CreateShuffleVector(
+            TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
+        Value *RightShuf = Builder.CreateShuffleVector(
+            TmpVec, UndefValue::get(TmpVec->getType()), RightMask,
+            "rdx.shuf.r");
+        TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
+                             "bin.rdx");
+      } else {
+        Value *UpperHalf =
+            createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
+        Value *Shuf = Builder.CreateShuffleVector(
+            TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
+        TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
+      }
+    }
+
+    // The result is in the first element of the vector.
+    return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+  }
+};
+
+/// \brief Recognize construction of vectors like
+///  %ra = insertelement <4 x float> undef, float %s0, i32 0
+///  %rb = insertelement <4 x float> %ra, float %s1, i32 1
+///  %rc = insertelement <4 x float> %rb, float %s2, i32 2
+///  %rd = insertelement <4 x float> %rc, float %s3, i32 3
+///
+/// Returns true if it matches.
+static bool findBuildVector(InsertElementInst *IE,
+                            SmallVectorImpl<Value *> &Ops) {
+  if (!isa<UndefValue>(IE->getOperand(0)))
+    return false;
+
+  while (true) {
+    Ops.push_back(IE->getOperand(1));
+
+    if (IE->use_empty())
+      return false;
+
+    InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->use_back());
+    if (!NextUse)
+      return true;
+
+    // If this isn't the final use, make sure the next insertelement is the
+    // only use. It's OK if the final constructed vector is used multiple
+    // times.
+    if (!IE->hasOneUse())
+      return false;
+
+    IE = NextUse;
+  }
+
+  return false;
+}
+
+static bool PhiTypeSorterFunc(Value *V, Value *V2) {
+  return V->getType() < V2->getType();
+}
+
 bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
   bool Changed = false;
   SmallVector<Value *, 4> Incoming;
-  // Collect the incoming values from the PHIs.
-  for (BasicBlock::iterator instr = BB->begin(), ie = BB->end(); instr != ie;
-       ++instr) {
-    PHINode *P = dyn_cast<PHINode>(instr);
-
-    if (!P)
-      break;
+  SmallSet<Value *, 16> VisitedInstrs;
+
+  bool HaveVectorizedPhiNodes = true;
+  while (HaveVectorizedPhiNodes) {
+    HaveVectorizedPhiNodes = false;
+
+    // Collect the incoming values from the PHIs.
+    Incoming.clear();
+    for (BasicBlock::iterator instr = BB->begin(), ie = BB->end(); instr != ie;
+         ++instr) {
+      PHINode *P = dyn_cast<PHINode>(instr);
+      if (!P)
+        break;
 
-    // Stop constructing the list when you reach a different type.
-    if (Incoming.size() && P->getType() != Incoming[0]->getType()) {
-      Changed |= tryToVectorizeList(Incoming, R);
-      Incoming.clear();
+      if (!VisitedInstrs.count(P))
+        Incoming.push_back(P);
     }
 
-    Incoming.push_back(P);
+    // Sort by type.
+    std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);
+
+    // Try to vectorize elements based on their type.
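+    // The stable sort above makes PHIs of the same type adjacent, so each
+    // maximal same-type run found below forms one candidate list for
+    // tryToVectorizeList.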
+    for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
+                                           E = Incoming.end();
+         IncIt != E;) {
+
+      // Look for the next elements with the same type.
+      SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
+      while (SameTypeIt != E &&
+             (*SameTypeIt)->getType() == (*IncIt)->getType()) {
+        VisitedInstrs.insert(*SameTypeIt);
+        ++SameTypeIt;
+      }
+
+      // Try to vectorize them.
+      unsigned NumElts = (SameTypeIt - IncIt);
+      DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs ("
+                   << NumElts << ")\n");
+      if (NumElts > 1 &&
+          tryToVectorizeList(ArrayRef<Value *>(IncIt, NumElts), R)) {
+        // Success: start over, because instructions might have been changed.
+        HaveVectorizedPhiNodes = true;
+        Changed = true;
+        break;
+      }
+
+      // Start over at the next instruction of a different type (or the end).
+      IncIt = SameTypeIt;
+    }
   }
 
-  if (Incoming.size() > 1)
-    Changed |= tryToVectorizeList(Incoming, R);
+  VisitedInstrs.clear();
+
+  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
+    // We may go through BB multiple times, so skip instructions we have
+    // already visited.
+    if (!VisitedInstrs.insert(it))
+      continue;
 
-  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
     if (isa<DbgInfoIntrinsic>(it))
       continue;
 
@@ -1902,24 +2541,86 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
     if (!BI)
       continue;
 
-    Value *Inst = BI->getOperand(0);
+    // Try to match and vectorize a horizontal reduction.
+    HorizontalReduction HorRdx;
+    if (ShouldVectorizeHor &&
+        HorRdx.matchAssociativeReduction(P, BI, DL) &&
+        HorRdx.tryToReduce(R, TTI)) {
+      Changed = true;
+      it = BB->begin();
+      e = BB->end();
+      continue;
+    }
+
+    Value *Inst = BI->getOperand(0);
     if (Inst == P)
       Inst = BI->getOperand(1);
 
-    Changed |= tryToVectorize(dyn_cast<BinaryOperator>(Inst), R);
+    if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
+      // We would like to start over, since some instructions are deleted
+      // and the iterator may be invalidated.
+      Changed = true;
+      it = BB->begin();
+      e = BB->end();
+      continue;
+    }
+
     continue;
   }
 
+    // Try to vectorize horizontal reductions feeding into a store.
+    if (ShouldStartVectorizeHorAtStore)
+      if (StoreInst *SI = dyn_cast<StoreInst>(it))
+        if (BinaryOperator *BinOp =
+                dyn_cast<BinaryOperator>(SI->getValueOperand())) {
+          HorizontalReduction HorRdx;
+          if ((HorRdx.matchAssociativeReduction(0, BinOp, DL) &&
+               HorRdx.tryToReduce(R, TTI)) ||
+              tryToVectorize(BinOp, R)) {
+            Changed = true;
+            it = BB->begin();
+            e = BB->end();
+            continue;
+          }
+        }
+
     // Try to vectorize trees that start at compare instructions.
     if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
       if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
-        Changed |= true;
+        Changed = true;
+        // We would like to start over, since some instructions are deleted
+        // and the iterator may be invalidated.
+        it = BB->begin();
+        e = BB->end();
+        continue;
+      }
+
+      for (int i = 0; i < 2; ++i) {
+        if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
+          if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
+            Changed = true;
+            // We would like to start over, since some instructions are deleted
+            // and the iterator may be invalidated.
+            it = BB->begin();
+            e = BB->end();
+          }
+        }
+      }
+      continue;
+    }
+
+    // Try to vectorize trees that start at insertelement instructions.
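+    // findBuildVector (above) collects the scalar operands of an
+    // insertelement chain into Ops; those scalars form the bundle handed to
+    // tryToVectorizeList.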
+    if (InsertElementInst *IE = dyn_cast<InsertElementInst>(it)) {
+      SmallVector<Value *, 8> Ops;
+      if (!findBuildVector(IE, Ops))
+        continue;
+
+      if (tryToVectorizeList(Ops, R)) {
+        Changed = true;
+        it = BB->begin();
+        e = BB->end();
       }
 
-    for (int i = 0; i < 2; ++i)
-      if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i)))
-        Changed |=
-            tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R);
+      continue;
     }
   }
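For context, a minimal sketch (not part of this patch; names are illustrative) of the kind of scalar code the new horizontal-reduction matching is aimed at. The leaves of the reduction tree must themselves be binary operators of a single opcode (the fmuls below), the chain must be rooted in a PHI or a store, and the FAdd path needs fast-math, since tryToReduce sets unsafe-algebra flags on the builder:

    // Hypothetical input; with fast-math and horizontal SLP vectorization
    // enabled (e.g. -mllvm -slp-vectorize-hor, assuming that flag backs
    // ShouldVectorizeHor), the fadd/fmul tree in the loop body can become one
    // vector fmul feeding the shuffle/fadd sequence emitted by emitReduction.
    float dot4(const float *A, const float *B, int N) {
      float R = 0.f;                                 // Reduction PHI.
      for (int i = 0; i < N; i += 4)                 // fadd tree whose reduced
        R += A[i] * B[i] + A[i + 1] * B[i + 1] +     // values are the fmuls.
             A[i + 2] * B[i + 2] + A[i + 3] * B[i + 3];
      return R;
    }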