Diffstat (limited to 'lib/Transforms')
124 files changed, 19313 insertions, 5594 deletions
diff --git a/lib/Transforms/Hello/Hello.cpp b/lib/Transforms/Hello/Hello.cpp index b0e22de..9f2343b 100644 --- a/lib/Transforms/Hello/Hello.cpp +++ b/lib/Transforms/Hello/Hello.cpp @@ -13,10 +13,10 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "hello" +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/Function.h" #include "llvm/Pass.h" -#include "llvm/Function.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/Statistic.h" using namespace llvm; STATISTIC(HelloCounter, "Counts number of functions greeted"); diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp index b94dd69..385544a 100644 --- a/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -31,21 +31,21 @@ #define DEBUG_TYPE "argpromotion" #include "llvm/Transforms/IPO.h" -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Module.h" -#include "llvm/CallGraphSCCPass.h" -#include "llvm/Instructions.h" -#include "llvm/LLVMContext.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CallGraph.h" -#include "llvm/Support/CallSite.h" +#include "llvm/Analysis/CallGraphSCCPass.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" #include "llvm/Support/CFG.h" +#include "llvm/Support/CallSite.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/StringExtras.h" #include <set> using namespace llvm; @@ -153,7 +153,8 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) { SmallPtrSet<Argument*, 8> ArgsToPromote; SmallPtrSet<Argument*, 8> ByValArgsToTransform; for (unsigned i = 0; i != PointerArgs.size(); ++i) { - bool isByVal = F->paramHasAttr(PointerArgs[i].second+1, Attribute::ByVal); + bool isByVal=F->getAttributes(). + hasAttribute(PointerArgs[i].second+1, Attribute::ByVal); Argument *PtrArg = PointerArgs[i].first; Type *AgTy = cast<PointerType>(PtrArg->getType())->getElementType(); @@ -510,15 +511,17 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, // what the new GEP/Load instructions we are inserting look like. std::map<IndicesVector, LoadInst*> OriginalLoads; - // Attributes - Keep track of the parameter attributes for the arguments + // Attribute - Keep track of the parameter attributes for the arguments // that we are *not* promoting. For the ones that we do promote, the parameter // attributes are lost SmallVector<AttributeWithIndex, 8> AttributesVec; - const AttrListPtr &PAL = F->getAttributes(); + const AttributeSet &PAL = F->getAttributes(); // Add any return attributes. 
- if (Attributes attrs = PAL.getRetAttributes()) - AttributesVec.push_back(AttributeWithIndex::get(0, attrs)); + Attribute attrs = PAL.getRetAttributes(); + if (attrs.hasAttributes()) + AttributesVec.push_back(AttributeWithIndex::get(AttributeSet::ReturnIndex, + attrs)); // First, determine the new argument list unsigned ArgIndex = 1; @@ -534,7 +537,8 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, } else if (!ArgsToPromote.count(I)) { // Unchanged argument Params.push_back(I->getType()); - if (Attributes attrs = PAL.getParamAttributes(ArgIndex)) + Attribute attrs = PAL.getParamAttributes(ArgIndex); + if (attrs.hasAttributes()) AttributesVec.push_back(AttributeWithIndex::get(Params.size(), attrs)); } else if (I->use_empty()) { // Dead argument (which are always marked as promotable) @@ -587,19 +591,13 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, } // Add any function attributes. - if (Attributes attrs = PAL.getFnAttributes()) - AttributesVec.push_back(AttributeWithIndex::get(~0, attrs)); + attrs = PAL.getFnAttributes(); + if (attrs.hasAttributes()) + AttributesVec.push_back(AttributeWithIndex::get(AttributeSet::FunctionIndex, + attrs)); Type *RetTy = FTy->getReturnType(); - // Work around LLVM bug PR56: the CWriter cannot emit varargs functions which - // have zero fixed arguments. - bool ExtraArgHack = false; - if (Params.empty() && FTy->isVarArg()) { - ExtraArgHack = true; - Params.push_back(Type::getInt32Ty(F->getContext())); - } - // Construct the new function type using the new arguments. FunctionType *NFTy = FunctionType::get(RetTy, Params, FTy->isVarArg()); @@ -613,7 +611,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, // Recompute the parameter attributes list based on the new arguments for // the function. - NF->setAttributes(AttrListPtr::get(AttributesVec)); + NF->setAttributes(AttributeSet::get(F->getContext(), AttributesVec)); AttributesVec.clear(); F->getParent()->getFunctionList().insert(F, NF); @@ -638,11 +636,13 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, CallSite CS(F->use_back()); assert(CS.getCalledFunction() == F); Instruction *Call = CS.getInstruction(); - const AttrListPtr &CallPAL = CS.getAttributes(); + const AttributeSet &CallPAL = CS.getAttributes(); // Add any return attributes. - if (Attributes attrs = CallPAL.getRetAttributes()) - AttributesVec.push_back(AttributeWithIndex::get(0, attrs)); + Attribute attrs = CallPAL.getRetAttributes(); + if (attrs.hasAttributes()) + AttributesVec.push_back(AttributeWithIndex::get(AttributeSet::ReturnIndex, + attrs)); // Loop over the operands, inserting GEP and loads in the caller as // appropriate. @@ -653,7 +653,8 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, if (!ArgsToPromote.count(I) && !ByValArgsToTransform.count(I)) { Args.push_back(*AI); // Unmodified argument - if (Attributes Attrs = CallPAL.getParamAttributes(ArgIndex)) + Attribute Attrs = CallPAL.getParamAttributes(ArgIndex); + if (Attrs.hasAttributes()) AttributesVec.push_back(AttributeWithIndex::get(Args.size(), Attrs)); } else if (ByValArgsToTransform.count(I)) { @@ -711,30 +712,32 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, } } - if (ExtraArgHack) - Args.push_back(Constant::getNullValue(Type::getInt32Ty(F->getContext()))); - // Push any varargs arguments on the list. 
for (; AI != CS.arg_end(); ++AI, ++ArgIndex) { Args.push_back(*AI); - if (Attributes Attrs = CallPAL.getParamAttributes(ArgIndex)) + Attribute Attrs = CallPAL.getParamAttributes(ArgIndex); + if (Attrs.hasAttributes()) AttributesVec.push_back(AttributeWithIndex::get(Args.size(), Attrs)); } // Add any function attributes. - if (Attributes attrs = CallPAL.getFnAttributes()) - AttributesVec.push_back(AttributeWithIndex::get(~0, attrs)); + attrs = CallPAL.getFnAttributes(); + if (attrs.hasAttributes()) + AttributesVec.push_back(AttributeWithIndex::get(AttributeSet::FunctionIndex, + attrs)); Instruction *New; if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) { New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(), Args, "", Call); cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv()); - cast<InvokeInst>(New)->setAttributes(AttrListPtr::get(AttributesVec)); + cast<InvokeInst>(New)->setAttributes(AttributeSet::get(II->getContext(), + AttributesVec)); } else { New = CallInst::Create(NF, Args, "", Call); cast<CallInst>(New)->setCallingConv(CS.getCallingConv()); - cast<CallInst>(New)->setAttributes(AttrListPtr::get(AttributesVec)); + cast<CallInst>(New)->setAttributes(AttributeSet::get(New->getContext(), + AttributesVec)); if (cast<CallInst>(Call)->isTailCall()) cast<CallInst>(New)->setTailCall(); } @@ -870,16 +873,9 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, } // Increment I2 past all of the arguments added for this promoted pointer. - for (unsigned i = 0, e = ArgIndices.size(); i != e; ++i) - ++I2; + std::advance(I2, ArgIndices.size()); } - // Notify the alias analysis implementation that we inserted a new argument. - if (ExtraArgHack) - AA.copyValue(Constant::getNullValue(Type::getInt32Ty(F->getContext())), - NF->arg_begin()); - - // Tell the alias analysis that the old function is about to disappear. AA.replaceWithNewValue(F, NF); diff --git a/lib/Transforms/IPO/BarrierNoopPass.cpp b/lib/Transforms/IPO/BarrierNoopPass.cpp new file mode 100644 index 0000000..2e32240 --- /dev/null +++ b/lib/Transforms/IPO/BarrierNoopPass.cpp @@ -0,0 +1,47 @@ +//===- BarrierNoopPass.cpp - A barrier pass for the pass manager ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// NOTE: DO NOT USE THIS IF AVOIDABLE +// +// This pass is a nonce pass intended to allow manipulation of the implicitly +// nesting pass manager. For example, it can be used to cause a CGSCC pass +// manager to be closed prior to running a new collection of function passes. +// +// FIXME: This is a huge HACK. This should be removed when the pass manager's +// nesting is made explicit instead of implicit. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Pass.h" +#include "llvm/Transforms/IPO.h" +using namespace llvm; + +namespace { +/// \brief A nonce module pass used to place a barrier in a pass manager. +/// +/// There is no mechanism for ending a CGSCC pass manager once one is started. +/// This prevents extension points from having clear deterministic ordering +/// when they are phrased as non-module passes. +class BarrierNoop : public ModulePass { +public: + static char ID; // Pass identification. 
+ + BarrierNoop() : ModulePass(ID) { + initializeBarrierNoopPass(*PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) { return false; } +}; +} + +ModulePass *llvm::createBarrierNoopPass() { return new BarrierNoop(); } + +char BarrierNoop::ID = 0; +INITIALIZE_PASS(BarrierNoop, "barrier", "A No-Op Barrier Pass", + false, false) diff --git a/lib/Transforms/IPO/CMakeLists.txt b/lib/Transforms/IPO/CMakeLists.txt index 3f6b1de..90c1c33 100644 --- a/lib/Transforms/IPO/CMakeLists.txt +++ b/lib/Transforms/IPO/CMakeLists.txt @@ -1,5 +1,6 @@ add_llvm_library(LLVMipo ArgumentPromotion.cpp + BarrierNoopPass.cpp ConstantMerge.cpp DeadArgumentElimination.cpp ExtractGV.cpp diff --git a/lib/Transforms/IPO/ConstantMerge.cpp b/lib/Transforms/IPO/ConstantMerge.cpp index d8fae8a..8336d3a 100644 --- a/lib/Transforms/IPO/ConstantMerge.cpp +++ b/lib/Transforms/IPO/ConstantMerge.cpp @@ -19,15 +19,15 @@ #define DEBUG_TYPE "constmerge" #include "llvm/Transforms/IPO.h" -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Module.h" -#include "llvm/Pass.h" -#include "llvm/Target/TargetData.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" using namespace llvm; STATISTIC(NumMerged, "Number of global constants merged"); @@ -50,7 +50,7 @@ namespace { // alignment to a concrete value. unsigned getAlignment(GlobalVariable *GV) const; - const TargetData *TD; + const DataLayout *TD; }; } @@ -98,7 +98,7 @@ unsigned ConstantMerge::getAlignment(GlobalVariable *GV) const { } bool ConstantMerge::runOnModule(Module &M) { - TD = getAnalysisIfAvailable<TargetData>(); + TD = getAnalysisIfAvailable<DataLayout>(); // Find all the globals that are marked "used". These cannot be merged. SmallPtrSet<const GlobalValue*, 8> UsedGlobals; @@ -107,7 +107,7 @@ bool ConstantMerge::runOnModule(Module &M) { // Map unique <constants, has-unknown-alignment> pairs to globals. We don't // want to merge globals of unknown alignment with those of explicit - // alignment. If we have TargetData, we always know the alignment. + // alignment. If we have DataLayout, we always know the alignment. DenseMap<PointerIntPair<Constant*, 1, bool>, GlobalVariable*> CMap; // Replacements - This vector contains a list of replacements to perform. 
diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp index fd23a93..ff040e7 100644 --- a/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -19,20 +19,23 @@ #define DEBUG_TYPE "deadargelim" #include "llvm/Transforms/IPO.h" -#include "llvm/CallingConv.h" -#include "llvm/Constant.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/LLVMContext.h" -#include "llvm/Module.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/DIBuilder.h" +#include "llvm/DebugInfo.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" #include "llvm/Pass.h" #include "llvm/Support/CallSite.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/StringExtras.h" #include <map> #include <set> using namespace llvm; @@ -121,6 +124,15 @@ namespace { typedef SmallVector<RetOrArg, 5> UseVector; + // Map each LLVM function to corresponding metadata with debug info. If + // the function is replaced with another one, we should patch the pointer + // to LLVM function in metadata. + // As the code generation for module is finished (and DIBuilder is + // finalized) we assume that subprogram descriptors won't be changed, and + // they are stored in map for short duration anyway. + typedef DenseMap<Function*, DISubprogram> FunctionDIMap; + FunctionDIMap FunctionDIs; + protected: // DAH uses this to specify a different ID. explicit DAE(char &ID) : ModulePass(ID) {} @@ -141,6 +153,7 @@ namespace { unsigned RetValNum = 0); Liveness SurveyUses(const Value *V, UseVector &MaybeLiveUses); + void CollectFunctionDIs(Module &M); void SurveyFunction(const Function &F); void MarkValue(const RetOrArg &RA, Liveness L, const UseVector &MaybeLiveUses); @@ -180,6 +193,33 @@ INITIALIZE_PASS(DAH, "deadarghaX0r", ModulePass *llvm::createDeadArgEliminationPass() { return new DAE(); } ModulePass *llvm::createDeadArgHackingPass() { return new DAH(); } +/// CollectFunctionDIs - Map each function in the module to its debug info +/// descriptor. +void DAE::CollectFunctionDIs(Module &M) { + FunctionDIs.clear(); + + for (Module::named_metadata_iterator I = M.named_metadata_begin(), + E = M.named_metadata_end(); I != E; ++I) { + NamedMDNode &NMD = *I; + for (unsigned MDIndex = 0, MDNum = NMD.getNumOperands(); + MDIndex < MDNum; ++MDIndex) { + MDNode *Node = NMD.getOperand(MDIndex); + if (!DIDescriptor(Node).isCompileUnit()) + continue; + DICompileUnit CU(Node); + const DIArray &SPs = CU.getSubprograms(); + for (unsigned SPIndex = 0, SPNum = SPs.getNumElements(); + SPIndex < SPNum; ++SPIndex) { + DISubprogram SP(SPs.getElement(SPIndex)); + if (!SP.Verify()) + continue; + if (Function *F = SP.getFunction()) + FunctionDIs[F] = SP; + } + } + } +} + /// DeleteDeadVarargs - If this is an function that takes a ... list, and if /// llvm.vastart is never called, the varargs list is dead for the function. 
bool DAE::DeleteDeadVarargs(Function &Fn) { @@ -231,14 +271,16 @@ bool DAE::DeleteDeadVarargs(Function &Fn) { Args.assign(CS.arg_begin(), CS.arg_begin() + NumArgs); // Drop any attributes that were on the vararg arguments. - AttrListPtr PAL = CS.getAttributes(); + AttributeSet PAL = CS.getAttributes(); if (!PAL.isEmpty() && PAL.getSlot(PAL.getNumSlots() - 1).Index > NumArgs) { SmallVector<AttributeWithIndex, 8> AttributesVec; for (unsigned i = 0; PAL.getSlot(i).Index <= NumArgs; ++i) AttributesVec.push_back(PAL.getSlot(i)); - if (Attributes FnAttrs = PAL.getFnAttributes()) - AttributesVec.push_back(AttributeWithIndex::get(~0, FnAttrs)); - PAL = AttrListPtr::get(AttributesVec); + Attribute FnAttrs = PAL.getFnAttributes(); + if (FnAttrs.hasAttributes()) + AttributesVec.push_back(AttributeWithIndex::get(AttributeSet::FunctionIndex, + FnAttrs)); + PAL = AttributeSet::get(Fn.getContext(), AttributesVec); } Instruction *New; @@ -284,6 +326,11 @@ bool DAE::DeleteDeadVarargs(Function &Fn) { I2->takeName(I); } + // Patch the pointer to LLVM function in debug info descriptor. + FunctionDIMap::iterator DI = FunctionDIs.find(&Fn); + if (DI != FunctionDIs.end()) + DI->second.replaceFunction(NF); + // Finally, nuke the old function. Fn.eraseFromParent(); return true; @@ -651,11 +698,11 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { // Set up to build a new list of parameter attributes. SmallVector<AttributeWithIndex, 8> AttributesVec; - const AttrListPtr &PAL = F->getAttributes(); + const AttributeSet &PAL = F->getAttributes(); // The existing function return attributes. - Attributes RAttrs = PAL.getRetAttributes(); - Attributes FnAttrs = PAL.getFnAttributes(); + Attribute RAttrs = PAL.getRetAttributes(); + Attribute FnAttrs = PAL.getFnAttributes(); // Find out the new return value. @@ -717,13 +764,17 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { // here. Currently, this should not be possible, but special handling might be // required when new return value attributes are added. if (NRetTy->isVoidTy()) - RAttrs &= ~Attribute::typeIncompatible(NRetTy); + RAttrs = + Attribute::get(NRetTy->getContext(), AttrBuilder(RAttrs). + removeAttributes(Attribute::typeIncompatible(NRetTy))); else - assert((RAttrs & Attribute::typeIncompatible(NRetTy)) == 0 - && "Return attributes no longer compatible?"); + assert(!AttrBuilder(RAttrs). + hasAttributes(Attribute::typeIncompatible(NRetTy)) && + "Return attributes no longer compatible?"); - if (RAttrs) - AttributesVec.push_back(AttributeWithIndex::get(0, RAttrs)); + if (RAttrs.hasAttributes()) + AttributesVec.push_back(AttributeWithIndex::get(AttributeSet::ReturnIndex, + RAttrs)); // Remember which arguments are still alive. SmallVector<bool, 10> ArgAlive(FTy->getNumParams(), false); @@ -740,7 +791,8 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { // Get the original parameter attributes (skipping the first one, that is // for the return value. 
- if (Attributes Attrs = PAL.getParamAttributes(i + 1)) + Attribute Attrs = PAL.getParamAttributes(i + 1); + if (Attrs.hasAttributes()) AttributesVec.push_back(AttributeWithIndex::get(Params.size(), Attrs)); } else { ++NumArgumentsEliminated; @@ -749,11 +801,12 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { } } - if (FnAttrs != Attribute::None) - AttributesVec.push_back(AttributeWithIndex::get(~0, FnAttrs)); + if (FnAttrs.hasAttributes()) + AttributesVec.push_back(AttributeWithIndex::get(AttributeSet::FunctionIndex, + FnAttrs)); // Reconstruct the AttributesList based on the vector we constructed. - AttrListPtr NewPAL = AttrListPtr::get(AttributesVec); + AttributeSet NewPAL = AttributeSet::get(F->getContext(), AttributesVec); // Create the new function type based on the recomputed parameters. FunctionType *NFTy = FunctionType::get(NRetTy, Params, FTy->isVarArg()); @@ -780,15 +833,18 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { Instruction *Call = CS.getInstruction(); AttributesVec.clear(); - const AttrListPtr &CallPAL = CS.getAttributes(); + const AttributeSet &CallPAL = CS.getAttributes(); // The call return attributes. - Attributes RAttrs = CallPAL.getRetAttributes(); - Attributes FnAttrs = CallPAL.getFnAttributes(); + Attribute RAttrs = CallPAL.getRetAttributes(); + Attribute FnAttrs = CallPAL.getFnAttributes(); // Adjust in case the function was changed to return void. - RAttrs &= ~Attribute::typeIncompatible(NF->getReturnType()); - if (RAttrs) - AttributesVec.push_back(AttributeWithIndex::get(0, RAttrs)); + RAttrs = + Attribute::get(NF->getContext(), AttrBuilder(RAttrs). + removeAttributes(Attribute::typeIncompatible(NF->getReturnType()))); + if (RAttrs.hasAttributes()) + AttributesVec.push_back(AttributeWithIndex::get(AttributeSet::ReturnIndex, + RAttrs)); // Declare these outside of the loops, so we can reuse them for the second // loop, which loops the varargs. @@ -800,22 +856,25 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { if (ArgAlive[i]) { Args.push_back(*I); // Get original parameter attributes, but skip return attributes. - if (Attributes Attrs = CallPAL.getParamAttributes(i + 1)) + Attribute Attrs = CallPAL.getParamAttributes(i + 1); + if (Attrs.hasAttributes()) AttributesVec.push_back(AttributeWithIndex::get(Args.size(), Attrs)); } // Push any varargs arguments on the list. Don't forget their attributes. for (CallSite::arg_iterator E = CS.arg_end(); I != E; ++I, ++i) { Args.push_back(*I); - if (Attributes Attrs = CallPAL.getParamAttributes(i + 1)) + Attribute Attrs = CallPAL.getParamAttributes(i + 1); + if (Attrs.hasAttributes()) AttributesVec.push_back(AttributeWithIndex::get(Args.size(), Attrs)); } - if (FnAttrs != Attribute::None) - AttributesVec.push_back(AttributeWithIndex::get(~0, FnAttrs)); + if (FnAttrs.hasAttributes()) + AttributesVec.push_back(AttributeWithIndex::get(AttributeSet::FunctionIndex, + FnAttrs)); // Reconstruct the AttributesList based on the vector we constructed. - AttrListPtr NewCallPAL = AttrListPtr::get(AttributesVec); + AttributeSet NewCallPAL = AttributeSet::get(F->getContext(), AttributesVec); Instruction *New; if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) { @@ -952,6 +1011,11 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { BB->getInstList().erase(RI); } + // Patch the pointer to LLVM function in debug info descriptor. + FunctionDIMap::iterator DI = FunctionDIs.find(F); + if (DI != FunctionDIs.end()) + DI->second.replaceFunction(NF); + // Now that the old function is dead, delete it. 
F->eraseFromParent(); @@ -961,6 +1025,9 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { bool DAE::runOnModule(Module &M) { bool Changed = false; + // Collect debug info descriptors for functions. + CollectFunctionDIs(M); + // First pass: Do a simple check to see if any functions can have their "..." // removed. We can do this if they never call va_start. This loop cannot be // fused with the next loop, because deleting a function invalidates diff --git a/lib/Transforms/IPO/ExtractGV.cpp b/lib/Transforms/IPO/ExtractGV.cpp index 4c7f0ed..8a6bfc6 100644 --- a/lib/Transforms/IPO/ExtractGV.cpp +++ b/lib/Transforms/IPO/ExtractGV.cpp @@ -11,13 +11,13 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Instructions.h" -#include "llvm/LLVMContext.h" -#include "llvm/Module.h" -#include "llvm/Pass.h" -#include "llvm/Constants.h" #include "llvm/Transforms/IPO.h" #include "llvm/ADT/SetVector.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" #include <algorithm> using namespace llvm; @@ -51,32 +51,75 @@ namespace { // Visit the GlobalVariables. for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E; ++I) { - if (deleteStuff == (bool)Named.count(I) && !I->isDeclaration()) { - I->setInitializer(0); - } else { + bool Delete = + deleteStuff == (bool)Named.count(I) && !I->isDeclaration(); + if (!Delete) { if (I->hasAvailableExternallyLinkage()) continue; if (I->getName() == "llvm.global_ctors") continue; } - if (I->hasLocalLinkage()) + bool Local = I->hasLocalLinkage(); + if (Local) I->setVisibility(GlobalValue::HiddenVisibility); - I->setLinkage(GlobalValue::ExternalLinkage); + + if (Local || Delete) + I->setLinkage(GlobalValue::ExternalLinkage); + + if (Delete) + I->setInitializer(0); } // Visit the Functions. for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { - if (deleteStuff == (bool)Named.count(I) && !I->isDeclaration()) { - I->deleteBody(); - } else { + bool Delete = + deleteStuff == (bool)Named.count(I) && !I->isDeclaration(); + if (!Delete) { if (I->hasAvailableExternallyLinkage()) continue; } - if (I->hasLocalLinkage()) + bool Local = I->hasLocalLinkage(); + if (Local) I->setVisibility(GlobalValue::HiddenVisibility); - I->setLinkage(GlobalValue::ExternalLinkage); + + if (Local || Delete) + I->setLinkage(GlobalValue::ExternalLinkage); + + if (Delete) + I->deleteBody(); + } + + // Visit the Aliases. 
+ for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end(); + I != E;) { + Module::alias_iterator CurI = I; + ++I; + + if (CurI->hasLocalLinkage()) { + CurI->setVisibility(GlobalValue::HiddenVisibility); + CurI->setLinkage(GlobalValue::ExternalLinkage); + } + + if (deleteStuff == (bool)Named.count(CurI)) { + Type *Ty = CurI->getType()->getElementType(); + + CurI->removeFromParent(); + llvm::Value *Declaration; + if (FunctionType *FTy = dyn_cast<FunctionType>(Ty)) { + Declaration = Function::Create(FTy, GlobalValue::ExternalLinkage, + CurI->getName(), &M); + + } else { + Declaration = + new GlobalVariable(M, Ty, false, GlobalValue::ExternalLinkage, + 0, CurI->getName()); + + } + CurI->replaceAllUsesWith(Declaration); + delete CurI; + } } return true; diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp index f3f6228..e9bc4ad 100644 --- a/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/lib/Transforms/IPO/FunctionAttrs.cpp @@ -20,17 +20,17 @@ #define DEBUG_TYPE "functionattrs" #include "llvm/Transforms/IPO.h" -#include "llvm/CallGraphSCCPass.h" -#include "llvm/GlobalVariable.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/LLVMContext.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/CallGraph.h" -#include "llvm/Analysis/CaptureTracking.h" #include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/UniqueVector.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/CallGraphSCCPass.h" +#include "llvm/Analysis/CaptureTracking.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/Support/InstIterator.h" using namespace llvm; @@ -212,10 +212,17 @@ bool FunctionAttrs::AddReadAttrs(const CallGraphSCC &SCC) { MadeChange = true; // Clear out any existing attributes. - F->removeAttribute(~0, Attribute::ReadOnly | Attribute::ReadNone); + AttrBuilder B; + B.addAttribute(Attribute::ReadOnly) + .addAttribute(Attribute::ReadNone); + F->removeAttribute(AttributeSet::FunctionIndex, + Attribute::get(F->getContext(), B)); // Add in the new attribute. - F->addAttribute(~0, ReadsMemory? Attribute::ReadOnly : Attribute::ReadNone); + B.clear(); + B.addAttribute(ReadsMemory ? Attribute::ReadOnly : Attribute::ReadNone); + F->addAttribute(AttributeSet::FunctionIndex, + Attribute::get(F->getContext(), B)); if (ReadsMemory) ++NumReadOnly; @@ -276,8 +283,6 @@ namespace { void tooManyUses() { Captured = true; } - bool shouldExplore(Use *U) { return true; } - bool captured(Use *U) { CallSite CS(U->getUser()); if (!CS.getInstruction()) { Captured = true; return true; } @@ -352,6 +357,9 @@ bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) { ArgumentGraph AG; + AttrBuilder B; + B.addAttribute(Attribute::NoCapture); + // Check each function in turn, determining which pointer arguments are not // captured. 
for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { @@ -373,7 +381,7 @@ bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) { for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); A != E; ++A) { if (A->getType()->isPointerTy() && !A->hasNoCaptureAttr()) { - A->addAttr(Attribute::NoCapture); + A->addAttr(Attribute::get(F->getContext(), B)); ++NumNoCapture; Changed = true; } @@ -388,7 +396,7 @@ bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) { if (!Tracker.Captured) { if (Tracker.Uses.empty()) { // If it's trivially not captured, mark it nocapture now. - A->addAttr(Attribute::NoCapture); + A->addAttr(Attribute::get(F->getContext(), B)); ++NumNoCapture; Changed = true; } else { @@ -421,7 +429,9 @@ bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) { // eg. "void f(int* x) { if (...) f(x); }" if (ArgumentSCC[0]->Uses.size() == 1 && ArgumentSCC[0]->Uses[0] == ArgumentSCC[0]) { - ArgumentSCC[0]->Definition->addAttr(Attribute::NoCapture); + ArgumentSCC[0]-> + Definition-> + addAttr(Attribute::get(ArgumentSCC[0]->Definition->getContext(), B)); ++NumNoCapture; Changed = true; } @@ -463,7 +473,7 @@ bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) { for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) { Argument *A = ArgumentSCC[i]->Definition; - A->addAttr(Attribute::NoCapture); + A->addAttr(Attribute::get(A->getContext(), B)); ++NumNoCapture; Changed = true; } @@ -476,13 +486,13 @@ bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) { /// or a pointer that doesn't alias any other pointer visible to the caller. bool FunctionAttrs::IsFunctionMallocLike(Function *F, SmallPtrSet<Function*, 8> &SCCNodes) const { - UniqueVector<Value *> FlowsToReturn; + SmallSetVector<Value *, 8> FlowsToReturn; for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) if (ReturnInst *Ret = dyn_cast<ReturnInst>(I->getTerminator())) FlowsToReturn.insert(Ret->getReturnValue()); for (unsigned i = 0; i != FlowsToReturn.size(); ++i) { - Value *RetVal = FlowsToReturn[i+1]; // UniqueVector[0] is reserved. 
+ Value *RetVal = FlowsToReturn[i]; if (Constant *C = dyn_cast<Constant>(RetVal)) { if (!C->isNullValue() && !isa<UndefValue>(C)) diff --git a/lib/Transforms/IPO/GlobalDCE.cpp b/lib/Transforms/IPO/GlobalDCE.cpp index 18c1c7b..dc99492 100644 --- a/lib/Transforms/IPO/GlobalDCE.cpp +++ b/lib/Transforms/IPO/GlobalDCE.cpp @@ -17,11 +17,11 @@ #define DEBUG_TYPE "globaldce" #include "llvm/Transforms/IPO.h" -#include "llvm/Constants.h" -#include "llvm/Module.h" -#include "llvm/Pass.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" using namespace llvm; STATISTIC(NumAliases , "Number of global aliases removed"); diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp index b888e95..abd37c2 100644 --- a/lib/Transforms/IPO/GlobalOpt.cpp +++ b/lib/Transforms/IPO/GlobalOpt.cpp @@ -15,29 +15,29 @@ #define DEBUG_TYPE "globalopt" #include "llvm/Transforms/IPO.h" -#include "llvm/CallingConv.h" -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/Module.h" -#include "llvm/Operator.h" -#include "llvm/Pass.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/MemoryBuiltins.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/Pass.h" #include "llvm/Support/CallSite.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GetElementPtrTypeIterator.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/Target/TargetLibraryInfo.h" #include <algorithm> using namespace llvm; @@ -83,7 +83,7 @@ namespace { const GlobalStatus &GS); bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn); - TargetData *TD; + DataLayout *TD; TargetLibraryInfo *TLI; }; } @@ -148,17 +148,13 @@ struct GlobalStatus { /// an instruction (e.g. a constant expr or GV initializer). bool HasNonInstructionUser; - /// HasPHIUser - Set to true if this global has a user that is a PHI node. - bool HasPHIUser; - /// AtomicOrdering - Set to the strongest atomic ordering requirement. AtomicOrdering Ordering; GlobalStatus() : isCompared(false), isLoaded(false), StoredType(NotStored), StoredOnceValue(0), AccessingFunction(0), HasMultipleAccessingFunctions(false), - HasNonInstructionUser(false), HasPHIUser(false), - Ordering(NotAtomic) {} + HasNonInstructionUser(false), Ordering(NotAtomic) {} }; } @@ -200,11 +196,11 @@ static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS, const User *U = *UI; if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) { GS.HasNonInstructionUser = true; - + // If the result of the constantexpr isn't pointer type, then we won't // know to expect it in various places. Just reject early. 
if (!isa<PointerType>(CE->getType())) return true; - + if (AnalyzeGlobal(CE, GS, PHIUsers)) return true; } else if (const Instruction *I = dyn_cast<Instruction>(U)) { if (!GS.HasMultipleAccessingFunctions) { @@ -225,6 +221,7 @@ static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS, // Don't hack on volatile stores. if (SI->isVolatile()) return true; + GS.Ordering = StrongerOrdering(GS.Ordering, SI->getOrdering()); // If this is a direct store to the global (i.e., the global is a scalar @@ -234,6 +231,14 @@ static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS, if (const GlobalVariable *GV = dyn_cast<GlobalVariable>( SI->getOperand(1))) { Value *StoredVal = SI->getOperand(0); + + if (Constant *C = dyn_cast<Constant>(StoredVal)) { + if (C->isThreadDependent()) { + // The stored value changes between threads; don't track it. + return true; + } + } + if (StoredVal == GV->getInitializer()) { if (GS.StoredType < GlobalStatus::isInitializerStored) GS.StoredType = GlobalStatus::isInitializerStored; @@ -265,7 +270,6 @@ static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS, // have to be careful about infinite recursion. if (PHIUsers.insert(PN)) // Not already visited. if (AnalyzeGlobal(I, GS, PHIUsers)) return true; - GS.HasPHIUser = true; } else if (isa<CmpInst>(I)) { GS.isCompared = true; } else if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(I)) { @@ -464,7 +468,7 @@ static bool CleanupPointerRootUsers(GlobalVariable *GV, /// quick scan over the use list to clean up the easy and obvious cruft. This /// returns true if it made a change. static bool CleanupConstantGlobalUsers(Value *V, Constant *Init, - TargetData *TD, TargetLibraryInfo *TLI) { + DataLayout *TD, TargetLibraryInfo *TLI) { bool Changed = false; for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;) { User *U = *UI++; @@ -656,7 +660,7 @@ static bool GlobalUsersSafeToSRA(GlobalValue *GV) { /// behavior of the program in a more fine-grained way. We have determined that /// this transformation is safe already. We return the first global variable we /// insert so that the caller can reprocess it. -static GlobalVariable *SRAGlobal(GlobalVariable *GV, const TargetData &TD) { +static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &TD) { // Make sure this global only has simple uses that we can SRA. if (!GlobalUsersSafeToSRA(GV)) return 0; @@ -932,7 +936,7 @@ static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) { /// if the loaded value is dynamically null, then we know that they cannot be /// reachable with a null optimize away the load. static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV, - TargetData *TD, + DataLayout *TD, TargetLibraryInfo *TLI) { bool Changed = false; @@ -962,7 +966,9 @@ static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV, // If we get here we could have other crazy uses that are transitively // loaded. assert((isa<PHINode>(GlobalUser) || isa<SelectInst>(GlobalUser) || - isa<ConstantExpr>(GlobalUser) || isa<CmpInst>(GlobalUser)) && + isa<ConstantExpr>(GlobalUser) || isa<CmpInst>(GlobalUser) || + isa<BitCastInst>(GlobalUser) || + isa<GetElementPtrInst>(GlobalUser)) && "Only expect load and stores!"); } } @@ -994,7 +1000,7 @@ static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV, /// ConstantPropUsersOf - Walk the use list of V, constant folding all of the /// instructions that are foldable. 
static void ConstantPropUsersOf(Value *V, - TargetData *TD, TargetLibraryInfo *TLI) { + DataLayout *TD, TargetLibraryInfo *TLI) { for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; ) if (Instruction *I = dyn_cast<Instruction>(*UI++)) if (Constant *NewC = ConstantFoldInstruction(I, TD, TLI)) { @@ -1017,7 +1023,7 @@ static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy, ConstantInt *NElements, - TargetData *TD, + DataLayout *TD, TargetLibraryInfo *TLI) { DEBUG(errs() << "PROMOTING GLOBAL: " << *GV << " CALL = " << *CI << '\n'); @@ -1466,7 +1472,7 @@ static void RewriteUsesOfLoadForHeapSRoA(LoadInst *Load, /// PerformHeapAllocSRoA - CI is an allocation of an array of structures. Break /// it up into multiple allocations of arrays of the fields. static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI, - Value *NElems, TargetData *TD, + Value *NElems, DataLayout *TD, const TargetLibraryInfo *TLI) { DEBUG(dbgs() << "SROA HEAP ALLOC: " << *GV << " MALLOC = " << *CI << '\n'); Type *MAT = getMallocAllocatedType(CI, TLI); @@ -1658,7 +1664,7 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, Type *AllocTy, AtomicOrdering Ordering, Module::global_iterator &GVI, - TargetData *TD, + DataLayout *TD, TargetLibraryInfo *TLI) { if (!TD) return false; @@ -1757,7 +1763,7 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, static bool OptimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, AtomicOrdering Ordering, Module::global_iterator &GVI, - TargetData *TD, TargetLibraryInfo *TLI) { + DataLayout *TD, TargetLibraryInfo *TLI) { // Ignore no-op GEPs and bitcasts. StoredOnceVal = StoredOnceVal->stripPointerCasts(); @@ -2000,7 +2006,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, ++NumMarked; return true; } else if (!GV->getInitializer()->getType()->isSingleValueType()) { - if (TargetData *TD = getAnalysisIfAvailable<TargetData>()) + if (DataLayout *TD = getAnalysisIfAvailable<DataLayout>()) if (GlobalVariable *FirstNewGV = SRAGlobal(GV, *TD)) { GVI = FirstNewGV; // Don't skip the newly produced globals! return true; @@ -2059,25 +2065,26 @@ static void ChangeCalleesToFastCall(Function *F) { } } -static AttrListPtr StripNest(const AttrListPtr &Attrs) { +static AttributeSet StripNest(LLVMContext &C, const AttributeSet &Attrs) { for (unsigned i = 0, e = Attrs.getNumSlots(); i != e; ++i) { - if ((Attrs.getSlot(i).Attrs & Attribute::Nest) == 0) + if (!Attrs.getSlot(i).Attrs.hasAttribute(Attribute::Nest)) continue; // There can be only one. - return Attrs.removeAttr(Attrs.getSlot(i).Index, Attribute::Nest); + return Attrs.removeAttr(C, Attrs.getSlot(i).Index, + Attribute::get(C, Attribute::Nest)); } return Attrs; } static void RemoveNestAttribute(Function *F) { - F->setAttributes(StripNest(F->getAttributes())); + F->setAttributes(StripNest(F->getContext(), F->getAttributes())); for (Value::use_iterator UI = F->use_begin(), E = F->use_end(); UI != E;++UI){ if (isa<BlockAddress>(*UI)) continue; CallSite User(cast<Instruction>(*UI)); - User.setAttributes(StripNest(User.getAttributes())); + User.setAttributes(StripNest(F->getContext(), User.getAttributes())); } } @@ -2145,7 +2152,7 @@ bool GlobalOpt::OptimizeGlobalVars(Module &M) { GlobalVariable *GlobalOpt::FindGlobalCtors(Module &M) { GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors"); if (GV == 0) return 0; - + // Verify that the initializer is simple enough for us to handle. 
We are // only allowed to optimize the initializer if it is unique. if (!GV->hasUniqueInitializer()) return 0; @@ -2251,10 +2258,10 @@ static GlobalVariable *InstallGlobalCtors(GlobalVariable *GCL, } -static inline bool +static inline bool isSimpleEnoughValueToCommit(Constant *C, SmallPtrSet<Constant*, 8> &SimpleConstants, - const TargetData *TD); + const DataLayout *TD); /// isSimpleEnoughValueToCommit - Return true if the specified constant can be @@ -2267,13 +2274,13 @@ isSimpleEnoughValueToCommit(Constant *C, /// time. static bool isSimpleEnoughValueToCommitHelper(Constant *C, SmallPtrSet<Constant*, 8> &SimpleConstants, - const TargetData *TD) { + const DataLayout *TD) { // Simple integer, undef, constant aggregate zero, global addresses, etc are // all supported. if (C->getNumOperands() == 0 || isa<BlockAddress>(C) || isa<GlobalValue>(C)) return true; - + // Aggregate values are safe if all their elements are. if (isa<ConstantArray>(C) || isa<ConstantStruct>(C) || isa<ConstantVector>(C)) { @@ -2284,7 +2291,7 @@ static bool isSimpleEnoughValueToCommitHelper(Constant *C, } return true; } - + // We don't know exactly what relocations are allowed in constant expressions, // so we allow &global+constantoffset, which is safe and uniformly supported // across targets. @@ -2302,14 +2309,14 @@ static bool isSimpleEnoughValueToCommitHelper(Constant *C, TD->getTypeSizeInBits(CE->getOperand(0)->getType())) return false; return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, TD); - + // GEP is fine if it is simple + constant offset. case Instruction::GetElementPtr: for (unsigned i = 1, e = CE->getNumOperands(); i != e; ++i) if (!isa<ConstantInt>(CE->getOperand(i))) return false; return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, TD); - + case Instruction::Add: // We allow simple+cst. if (!isa<ConstantInt>(CE->getOperand(1))) @@ -2319,10 +2326,10 @@ static bool isSimpleEnoughValueToCommitHelper(Constant *C, return false; } -static inline bool +static inline bool isSimpleEnoughValueToCommit(Constant *C, SmallPtrSet<Constant*, 8> &SimpleConstants, - const TargetData *TD) { + const DataLayout *TD) { // If we already checked this constant, we win. if (!SimpleConstants.insert(C)) return true; // Check the constant. @@ -2367,7 +2374,7 @@ static bool isSimpleEnoughPointerToCommit(Constant *C) { return false; return ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE); - + // A constantexpr bitcast from a pointer to another pointer is a no-op, // and we know how to evaluate it by moving the bitcast from the pointer // operand to the value operand. @@ -2378,7 +2385,7 @@ static bool isSimpleEnoughPointerToCommit(Constant *C) { return cast<GlobalVariable>(CE->getOperand(0))->hasUniqueInitializer(); } } - + return false; } @@ -2408,7 +2415,7 @@ static Constant *EvaluateStoreInto(Constant *Init, Constant *Val, // Return the modified struct. return ConstantStruct::get(STy, Elts); } - + ConstantInt *CI = cast<ConstantInt>(Addr->getOperand(OpNo)); SequentialType *InitTy = cast<SequentialType>(Init->getType()); @@ -2453,7 +2460,7 @@ namespace { /// Once an evaluation call fails, the evaluation object should not be reused. class Evaluator { public: - Evaluator(const TargetData *TD, const TargetLibraryInfo *TLI) + Evaluator(const DataLayout *TD, const TargetLibraryInfo *TLI) : TD(TD), TLI(TLI) { ValueStack.push_back(new DenseMap<Value*, Constant*>); } @@ -2534,7 +2541,7 @@ private: /// simple enough to live in a static initializer of a global. 
SmallPtrSet<Constant*, 8> SimpleConstants; - const TargetData *TD; + const DataLayout *TD; const TargetLibraryInfo *TLI; }; @@ -2585,23 +2592,23 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, if (!isSimpleEnoughPointerToCommit(Ptr)) // If this is too complex for us to commit, reject it. return false; - + Constant *Val = getVal(SI->getOperand(0)); // If this might be too difficult for the backend to handle (e.g. the addr // of one global variable divided by another) then we can't commit it. if (!isSimpleEnoughValueToCommit(Val, SimpleConstants, TD)) return false; - + if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) if (CE->getOpcode() == Instruction::BitCast) { // If we're evaluating a store through a bitcast, then we need // to pull the bitcast off the pointer type and push it onto the // stored value. Ptr = CE->getOperand(0); - + Type *NewTy = cast<PointerType>(Ptr->getType())->getElementType(); - + // In order to push the bitcast onto the stored value, a bitcast // from NewTy to Val's type must be legal. If it's not, we can try // introspecting NewTy to find a legal conversion. @@ -2626,12 +2633,12 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, return false; } } - + // If we found compatible types, go ahead and push the bitcast // onto the stored value. Val = ConstantExpr::getBitCast(Val, NewTy); } - + MutatedMemory[Ptr] = Val; } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(CurInst)) { InstResult = ConstantExpr::get(BO->getOpcode(), @@ -2793,7 +2800,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, if (!CurInst->use_empty()) { if (ConstantExpr *CE = dyn_cast<ConstantExpr>(InstResult)) InstResult = ConstantFoldConstantExpression(CE, TD, TLI); - + setVal(CurInst, InstResult); } @@ -2872,14 +2879,14 @@ bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal, /// EvaluateStaticConstructor - Evaluate static constructors in the function, if /// we can. Return true if we can, false otherwise. -static bool EvaluateStaticConstructor(Function *F, const TargetData *TD, +static bool EvaluateStaticConstructor(Function *F, const DataLayout *TD, const TargetLibraryInfo *TLI) { // Call the function. Evaluator Eval(TD, TLI); Constant *RetValDummy; bool EvalSuccess = Eval.EvaluateFunction(F, RetValDummy, SmallVector<Constant*, 0>()); - + if (EvalSuccess) { // We succeeded at evaluation: commit the result. DEBUG(dbgs() << "FULLY EVALUATED GLOBAL CTOR FUNCTION '" @@ -2999,13 +3006,13 @@ static Function *FindCXAAtExit(Module &M, TargetLibraryInfo *TLI) { return 0; Function *Fn = M.getFunction(TLI->getName(LibFunc::cxa_atexit)); - + if (!Fn) return 0; FunctionType *FTy = Fn->getFunctionType(); - - // Checking that the function has the right return type, the right number of + + // Checking that the function has the right return type, the right number of // parameters and that they all have pointer types should be enough. if (!FTy->getReturnType()->isIntegerTy() || FTy->getNumParams() != 3 || @@ -3080,7 +3087,7 @@ bool GlobalOpt::OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) { // and remove them. bool Changed = false; - for (Function::use_iterator I = CXAAtExitFn->use_begin(), + for (Function::use_iterator I = CXAAtExitFn->use_begin(), E = CXAAtExitFn->use_end(); I != E;) { // We're only interested in calls. 
Theoretically, we could handle invoke // instructions as well, but neither llvm-gcc nor clang generate invokes @@ -3089,7 +3096,7 @@ bool GlobalOpt::OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) { if (!CI) continue; - Function *DtorFn = + Function *DtorFn = dyn_cast<Function>(CI->getArgOperand(0)->stripPointerCasts()); if (!DtorFn) continue; @@ -3113,7 +3120,7 @@ bool GlobalOpt::OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) { bool GlobalOpt::runOnModule(Module &M) { bool Changed = false; - TD = getAnalysisIfAvailable<TargetData>(); + TD = getAnalysisIfAvailable<DataLayout>(); TLI = &getAnalysis<TargetLibraryInfo>(); // Try to find the llvm.globalctors list. diff --git a/lib/Transforms/IPO/IPConstantPropagation.cpp b/lib/Transforms/IPO/IPConstantPropagation.cpp index d757e1f..4ac1dfc 100644 --- a/lib/Transforms/IPO/IPConstantPropagation.cpp +++ b/lib/Transforms/IPO/IPConstantPropagation.cpp @@ -17,14 +17,14 @@ #define DEBUG_TYPE "ipconstprop" #include "llvm/Transforms/IPO.h" -#include "llvm/Constants.h" -#include "llvm/Instructions.h" -#include "llvm/Module.h" -#include "llvm/Pass.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" #include "llvm/Support/CallSite.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/SmallVector.h" using namespace llvm; STATISTIC(NumArgumentsProped, "Number of args turned into constants"); diff --git a/lib/Transforms/IPO/IPO.cpp b/lib/Transforms/IPO/IPO.cpp index 6233922..5d563d8 100644 --- a/lib/Transforms/IPO/IPO.cpp +++ b/lib/Transforms/IPO/IPO.cpp @@ -1,4 +1,4 @@ -//===-- Scalar.cpp --------------------------------------------------------===// +//===-- IPO.cpp -----------------------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -95,7 +95,10 @@ void LLVMAddIPSCCPPass(LLVMPassManagerRef PM) { } void LLVMAddInternalizePass(LLVMPassManagerRef PM, unsigned AllButMain) { - unwrap(PM)->add(createInternalizePass(AllButMain != 0)); + std::vector<const char *> Export; + if (AllButMain) + Export.push_back("main"); + unwrap(PM)->add(createInternalizePass(Export)); } void LLVMAddStripDeadPrototypesPass(LLVMPassManagerRef PM) { diff --git a/lib/Transforms/IPO/InlineAlways.cpp b/lib/Transforms/IPO/InlineAlways.cpp index 664ddf6..2971803 100644 --- a/lib/Transforms/IPO/InlineAlways.cpp +++ b/lib/Transforms/IPO/InlineAlways.cpp @@ -13,18 +13,18 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "inline" -#include "llvm/CallingConv.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/Module.h" -#include "llvm/Type.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/InlineCost.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" #include "llvm/Support/CallSite.h" -#include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/InlinerPass.h" -#include "llvm/Target/TargetData.h" -#include "llvm/ADT/SmallPtrSet.h" using namespace llvm; @@ -32,6 +32,7 @@ namespace { // AlwaysInliner only inlines functions that are mark as "always inline". class AlwaysInliner : public Inliner { + InlineCostAnalyzer CA; public: // Use extremely low threshold. 
AlwaysInliner() : Inliner(ID, -2000000000, /*InsertLifetime*/true) { @@ -43,6 +44,10 @@ namespace { } static char ID; // Pass identification, replacement for typeid virtual InlineCost getInlineCost(CallSite CS); + + using llvm::Pass::doInitialization; + using llvm::Pass::doFinalization; + virtual bool doFinalization(CallGraph &CG) { return removeDeadFunctions(CG, /*AlwaysInlineOnly=*/true); } @@ -63,35 +68,6 @@ Pass *llvm::createAlwaysInlinerPass(bool InsertLifetime) { return new AlwaysInliner(InsertLifetime); } -/// \brief Minimal filter to detect invalid constructs for inlining. -static bool isInlineViable(Function &F) { - bool ReturnsTwice = F.hasFnAttr(Attribute::ReturnsTwice); - for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) { - // Disallow inlining of functions which contain an indirect branch. - if (isa<IndirectBrInst>(BI->getTerminator())) - return false; - - for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE; - ++II) { - CallSite CS(II); - if (!CS) - continue; - - // Disallow recursive calls. - if (&F == CS.getCalledFunction()) - return false; - - // Disallow calls which expose returns-twice to a function not previously - // attributed as such. - if (!ReturnsTwice && CS.isCall() && - cast<CallInst>(CS.getInstruction())->canReturnTwice()) - return false; - } - } - - return true; -} - /// \brief Get the inline cost for the always-inliner. /// /// The always inliner *only* handles functions which are marked with the @@ -106,27 +82,22 @@ static bool isInlineViable(Function &F) { /// likely not worth it in practice. InlineCost AlwaysInliner::getInlineCost(CallSite CS) { Function *Callee = CS.getCalledFunction(); - // We assume indirect calls aren't calling an always-inline function. - if (!Callee) return InlineCost::getNever(); - - // We can't inline calls to external functions. - // FIXME: We shouldn't even get here. - if (Callee->isDeclaration()) return InlineCost::getNever(); - - // Return never for anything not marked as always inline. - if (!Callee->hasFnAttr(Attribute::AlwaysInline)) - return InlineCost::getNever(); - // Do some minimal analysis to preclude non-viable functions. - if (!isInlineViable(*Callee)) - return InlineCost::getNever(); + // Only inline direct calls to functions with always-inline attributes + // that are viable for inlining. FIXME: We shouldn't even get here for + // declarations. + if (Callee && !Callee->isDeclaration() && + Callee->getAttributes().hasAttribute(AttributeSet::FunctionIndex, + Attribute::AlwaysInline) && + CA.isInlineViable(*Callee)) + return InlineCost::getAlways(); - // Otherwise, force inlining. - return InlineCost::getAlways(); + return InlineCost::getNever(); } // doInitialization - Initializes the vector of functions that have not // been annotated with the "always inline" attribute. 
bool AlwaysInliner::doInitialization(CallGraph &CG) { + CA.setDataLayout(getAnalysisIfAvailable<DataLayout>()); return false; } diff --git a/lib/Transforms/IPO/InlineSimple.cpp b/lib/Transforms/IPO/InlineSimple.cpp index 50038d8..9682923 100644 --- a/lib/Transforms/IPO/InlineSimple.cpp +++ b/lib/Transforms/IPO/InlineSimple.cpp @@ -12,17 +12,17 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "inline" -#include "llvm/CallingConv.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/Module.h" -#include "llvm/Type.h" +#include "llvm/Transforms/IPO.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/InlineCost.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" #include "llvm/Support/CallSite.h" -#include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/InlinerPass.h" -#include "llvm/Target/TargetData.h" using namespace llvm; @@ -42,6 +42,7 @@ namespace { InlineCost getInlineCost(CallSite CS) { return CA.getInlineCost(CS, getInlineThreshold(CS)); } + using llvm::Pass::doInitialization; virtual bool doInitialization(CallGraph &CG); }; } @@ -62,7 +63,7 @@ Pass *llvm::createFunctionInliningPass(int Threshold) { // doInitialization - Initializes the vector of functions that have been // annotated with the noinline attribute. bool SimpleInliner::doInitialization(CallGraph &CG) { - CA.setTargetData(getAnalysisIfAvailable<TargetData>()); + CA.setDataLayout(getAnalysisIfAvailable<DataLayout>()); return false; } diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp index 69a22fb..2187a2a 100644 --- a/lib/Transforms/IPO/Inliner.cpp +++ b/lib/Transforms/IPO/Inliner.cpp @@ -14,22 +14,22 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "inline" -#include "llvm/Module.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" +#include "llvm/Transforms/IPO/InlinerPass.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/InlineCost.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetLibraryInfo.h" -#include "llvm/Transforms/IPO/InlinerPass.h" -#include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/Local.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" #include "llvm/Support/CallSite.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/Statistic.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; STATISTIC(NumInlined, "Number of functions inlined"); @@ -64,8 +64,8 @@ Inliner::Inliner(char &ID, int Threshold, bool InsertLifetime) /// getAnalysisUsage - For this class, we declare that we require and preserve /// the call graph. If the derived class implements this method, it should /// always explicitly call the implementation here. 
-void Inliner::getAnalysisUsage(AnalysisUsage &Info) const { - CallGraphSCCPass::getAnalysisUsage(Info); +void Inliner::getAnalysisUsage(AnalysisUsage &AU) const { + CallGraphSCCPass::getAnalysisUsage(AU); } @@ -93,10 +93,13 @@ static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI, // If the inlined function had a higher stack protection level than the // calling function, then bump up the caller's stack protection level. - if (Callee->hasFnAttr(Attribute::StackProtectReq)) + if (Callee->getAttributes().hasAttribute(AttributeSet::FunctionIndex, + Attribute::StackProtectReq)) Caller->addFnAttr(Attribute::StackProtectReq); - else if (Callee->hasFnAttr(Attribute::StackProtect) && - !Caller->hasFnAttr(Attribute::StackProtectReq)) + else if (Callee->getAttributes().hasAttribute(AttributeSet::FunctionIndex, + Attribute::StackProtect) && + !Caller->getAttributes().hasAttribute(AttributeSet::FunctionIndex, + Attribute::StackProtectReq)) Caller->addFnAttr(Attribute::StackProtect); // Look at all of the allocas that we inlined through this call site. If we @@ -209,15 +212,21 @@ unsigned Inliner::getInlineThreshold(CallSite CS) const { // would decrease the threshold. Function *Caller = CS.getCaller(); bool OptSize = Caller && !Caller->isDeclaration() && - Caller->hasFnAttr(Attribute::OptimizeForSize); - if (!(InlineLimit.getNumOccurrences() > 0) && OptSize && OptSizeThreshold < thres) + Caller->getAttributes().hasAttribute(AttributeSet::FunctionIndex, + Attribute::OptimizeForSize); + if (!(InlineLimit.getNumOccurrences() > 0) && OptSize && + OptSizeThreshold < thres) thres = OptSizeThreshold; - // Listen to the inlinehint attribute when it would increase the threshold. + // Listen to the inlinehint attribute when it would increase the threshold + // and the caller does not need to minimize its size. Function *Callee = CS.getCalledFunction(); bool InlineHint = Callee && !Callee->isDeclaration() && - Callee->hasFnAttr(Attribute::InlineHint); - if (InlineHint && HintThreshold > thres) + Callee->getAttributes().hasAttribute(AttributeSet::FunctionIndex, + Attribute::InlineHint); + if (InlineHint && HintThreshold > thres + && !Caller->getAttributes().hasAttribute(AttributeSet::FunctionIndex, + Attribute::MinSize)) thres = HintThreshold; return thres; @@ -339,7 +348,7 @@ static bool InlineHistoryIncludes(Function *F, int InlineHistoryID, bool Inliner::runOnSCC(CallGraphSCC &SCC) { CallGraph &CG = getAnalysis<CallGraph>(); - const TargetData *TD = getAnalysisIfAvailable<TargetData>(); + const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); const TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); SmallPtrSet<Function*, 8> SCCFunctions; @@ -532,7 +541,9 @@ bool Inliner::removeDeadFunctions(CallGraph &CG, bool AlwaysInlineOnly) { // Handle the case when this function is called and we only want to care // about always-inline functions. This is a bit of a hack to share code // between here and the InlineAlways pass. 
- if (AlwaysInlineOnly && !F->hasFnAttr(Attribute::AlwaysInline)) + if (AlwaysInlineOnly && + !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, + Attribute::AlwaysInline)) continue; // If the only remaining users of the function are dead constants, remove diff --git a/lib/Transforms/IPO/Internalize.cpp b/lib/Transforms/IPO/Internalize.cpp index fb5869e..70d55b0 100644 --- a/lib/Transforms/IPO/Internalize.cpp +++ b/lib/Transforms/IPO/Internalize.cpp @@ -7,21 +7,21 @@ // //===----------------------------------------------------------------------===// // -// This pass loops over all of the functions in the input module, looking for a -// main function. If a main function is found, all other functions and all -// global variables with initializers are marked as internal. +// This pass loops over all of the functions and variables in the input module. +// If the function or variable is not in the list of external names given to +// the pass it is marked as internal. // //===----------------------------------------------------------------------===// #define DEBUG_TYPE "internalize" -#include "llvm/Analysis/CallGraph.h" #include "llvm/Transforms/IPO.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/IR/Module.h" #include "llvm/Pass.h" -#include "llvm/Module.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/Statistic.h" #include <fstream> #include <set> using namespace llvm; @@ -45,13 +45,10 @@ APIList("internalize-public-api-list", cl::value_desc("list"), namespace { class InternalizePass : public ModulePass { std::set<std::string> ExternalNames; - /// If no api symbols were specified and a main function is defined, - /// assume the main function is the only API - bool AllButMain; public: static char ID; // Pass identification, replacement for typeid - explicit InternalizePass(bool AllButMain = true); - explicit InternalizePass(const std::vector <const char *>& exportList); + explicit InternalizePass(); + explicit InternalizePass(ArrayRef<const char *> exportList); void LoadFile(const char *Filename); virtual bool runOnModule(Module &M); @@ -66,8 +63,8 @@ char InternalizePass::ID = 0; INITIALIZE_PASS(InternalizePass, "internalize", "Internalize Global Symbols", false, false) -InternalizePass::InternalizePass(bool AllButMain) - : ModulePass(ID), AllButMain(AllButMain){ +InternalizePass::InternalizePass() + : ModulePass(ID) { initializeInternalizePassPass(*PassRegistry::getPassRegistry()); if (!APIFile.empty()) // If a filename is specified, use it. LoadFile(APIFile.c_str()); @@ -75,10 +72,10 @@ InternalizePass::InternalizePass(bool AllButMain) ExternalNames.insert(APIList.begin(), APIList.end()); } -InternalizePass::InternalizePass(const std::vector<const char *>&exportList) - : ModulePass(ID), AllButMain(false){ +InternalizePass::InternalizePass(ArrayRef<const char *> exportList) + : ModulePass(ID){ initializeInternalizePassPass(*PassRegistry::getPassRegistry()); - for(std::vector<const char *>::const_iterator itr = exportList.begin(); + for(ArrayRef<const char *>::const_iterator itr = exportList.begin(); itr != exportList.end(); itr++) { ExternalNames.insert(*itr); } @@ -103,23 +100,6 @@ void InternalizePass::LoadFile(const char *Filename) { bool InternalizePass::runOnModule(Module &M) { CallGraph *CG = getAnalysisIfAvailable<CallGraph>(); CallGraphNode *ExternalNode = CG ? 
CG->getExternalCallingNode() : 0; - - if (ExternalNames.empty()) { - // Return if we're not in 'all but main' mode and have no external api - if (!AllButMain) - return false; - // If no list or file of symbols was specified, check to see if there is a - // "main" symbol defined in the module. If so, use it, otherwise do not - // internalize the module, it must be a library or something. - // - Function *MainFunc = M.getFunction("main"); - if (MainFunc == 0 || MainFunc->isDeclaration()) - return false; // No main found, must be a library... - - // Preserve main, internalize all else. - ExternalNames.insert(MainFunc->getName()); - } - bool Changed = false; // Never internalize functions which code-gen might insert. @@ -189,10 +169,10 @@ bool InternalizePass::runOnModule(Module &M) { return Changed; } -ModulePass *llvm::createInternalizePass(bool AllButMain) { - return new InternalizePass(AllButMain); +ModulePass *llvm::createInternalizePass() { + return new InternalizePass(); } -ModulePass *llvm::createInternalizePass(const std::vector <const char *> &el) { +ModulePass *llvm::createInternalizePass(ArrayRef<const char *> el) { return new InternalizePass(el); } diff --git a/lib/Transforms/IPO/LoopExtractor.cpp b/lib/Transforms/IPO/LoopExtractor.cpp index 97d7cdc..8282a8e 100644 --- a/lib/Transforms/IPO/LoopExtractor.cpp +++ b/lib/Transforms/IPO/LoopExtractor.cpp @@ -16,16 +16,16 @@ #define DEBUG_TYPE "loop-extract" #include "llvm/Transforms/IPO.h" -#include "llvm/Instructions.h" -#include "llvm/Module.h" -#include "llvm/Pass.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/CodeExtractor.h" -#include "llvm/ADT/Statistic.h" #include <fstream> #include <set> using namespace llvm; diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp index 9f70f66..892100f 100644 --- a/lib/Transforms/IPO/MergeFunctions.cpp +++ b/lib/Transforms/IPO/MergeFunctions.cpp @@ -45,25 +45,25 @@ #define DEBUG_TYPE "mergefunc" #include "llvm/Transforms/IPO.h" -#include "llvm/Constants.h" -#include "llvm/IRBuilder.h" -#include "llvm/InlineAsm.h" -#include "llvm/Instructions.h" -#include "llvm/LLVMContext.h" -#include "llvm/Module.h" -#include "llvm/Operator.h" -#include "llvm/Pass.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/Pass.h" #include "llvm/Support/CallSite.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetData.h" #include <vector> using namespace llvm; @@ -92,19 +92,19 @@ static unsigned profileFunction(const Function *F) { namespace { /// ComparableFunction - A struct that pairs together functions with a -/// TargetData so that we can keep them together as elements in the DenseSet. +/// DataLayout so that we can keep them together as elements in the DenseSet. 
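With the AllButMain mode gone, a client that previously relied on "internalize everything except main" now passes an explicit export list, exactly as the populateLTOPassManager hunk further down does. A sketch of the new caller-side usage follows; the function and variable names are illustrative.

  #include "llvm/PassManager.h"
  #include "llvm/Transforms/IPO.h"

  static void addInternalize(llvm::PassManager &PM) {
    // Everything not named here is given internal linkage; the old behaviour
    // of implicitly preserving only "main" now has to be spelled out.
    const char *Exports[] = { "main" };
    PM.add(llvm::createInternalizePass(Exports)); // binds to ArrayRef<const char *>
  }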
class ComparableFunction { public: static const ComparableFunction EmptyKey; static const ComparableFunction TombstoneKey; - static TargetData * const LookupOnly; + static DataLayout * const LookupOnly; - ComparableFunction(Function *Func, TargetData *TD) + ComparableFunction(Function *Func, DataLayout *TD) : Func(Func), Hash(profileFunction(Func)), TD(TD) {} Function *getFunc() const { return Func; } unsigned getHash() const { return Hash; } - TargetData *getTD() const { return TD; } + DataLayout *getTD() const { return TD; } // Drops AssertingVH reference to the function. Outside of debug mode, this // does nothing. @@ -120,13 +120,13 @@ private: AssertingVH<Function> Func; unsigned Hash; - TargetData *TD; + DataLayout *TD; }; const ComparableFunction ComparableFunction::EmptyKey = ComparableFunction(0); const ComparableFunction ComparableFunction::TombstoneKey = ComparableFunction(1); -TargetData *const ComparableFunction::LookupOnly = (TargetData*)(-1); +DataLayout *const ComparableFunction::LookupOnly = (DataLayout*)(-1); } @@ -150,12 +150,12 @@ namespace llvm { namespace { /// FunctionComparator - Compares two functions to determine whether or not -/// they will generate machine code with the same behaviour. TargetData is +/// they will generate machine code with the same behaviour. DataLayout is /// used if available. The comparator always fails conservatively (erring on the /// side of claiming that two functions are different). class FunctionComparator { public: - FunctionComparator(const TargetData *TD, const Function *F1, + FunctionComparator(const DataLayout *TD, const Function *F1, const Function *F2) : F1(F1), F2(F2), TD(TD) {} @@ -190,7 +190,7 @@ private: // The two functions undergoing comparison. const Function *F1, *F2; - const TargetData *TD; + const DataLayout *TD; DenseMap<const Value *, const Value *> id_map; DenseSet<const Value *> seen_values; @@ -346,13 +346,11 @@ bool FunctionComparator::isEquivalentGEP(const GEPOperator *GEP1, const GEPOperator *GEP2) { // When we have target data, we can reduce the GEP down to the value in bytes // added to the address. - if (TD && GEP1->hasAllConstantIndices() && GEP2->hasAllConstantIndices()) { - SmallVector<Value *, 8> Indices1(GEP1->idx_begin(), GEP1->idx_end()); - SmallVector<Value *, 8> Indices2(GEP2->idx_begin(), GEP2->idx_end()); - uint64_t Offset1 = TD->getIndexedOffset(GEP1->getPointerOperandType(), - Indices1); - uint64_t Offset2 = TD->getIndexedOffset(GEP2->getPointerOperandType(), - Indices2); + unsigned BitWidth = TD ? TD->getPointerSizeInBits() : 1; + APInt Offset1(BitWidth, 0), Offset2(BitWidth, 0); + if (TD && + GEP1->accumulateConstantOffset(*TD, Offset1) && + GEP2->accumulateConstantOffset(*TD, Offset2)) { return Offset1 == Offset2; } @@ -591,8 +589,8 @@ private: /// to modify it. FnSetType FnSet; - /// TargetData for more accurate GEP comparisons. May be NULL. - TargetData *TD; + /// DataLayout for more accurate GEP comparisons. May be NULL. + DataLayout *TD; /// Whether or not the target supports global aliases. 
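The isEquivalentGEP change above replaces the old getIndexedOffset computation with GEPOperator::accumulateConstantOffset, which succeeds only when every index is constant and accumulates the result into a pointer-width APInt. The same comparison pattern, pulled out as a standalone sketch (the helper name is illustrative):

  #include "llvm/ADT/APInt.h"
  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/Operator.h"

  // True only when both GEPs fold to the same constant byte offset.
  static bool sameConstantOffset(const llvm::DataLayout &DL,
                                 const llvm::GEPOperator *A,
                                 const llvm::GEPOperator *B) {
    unsigned BitWidth = DL.getPointerSizeInBits();
    llvm::APInt OffA(BitWidth, 0), OffB(BitWidth, 0);
    return A->accumulateConstantOffset(DL, OffA) &&
           B->accumulateConstantOffset(DL, OffB) &&
           OffA == OffB;
  }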
bool HasGlobalAliases; @@ -609,7 +607,7 @@ ModulePass *llvm::createMergeFunctionsPass() { bool MergeFunctions::runOnModule(Module &M) { bool Changed = false; - TD = getAnalysisIfAvailable<TargetData>(); + TD = getAnalysisIfAvailable<DataLayout>(); for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { if (!I->isDeclaration() && !I->hasAvailableExternallyLinkage()) diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp index 9c9910b..fa518cb 100644 --- a/lib/Transforms/IPO/PartialInlining.cpp +++ b/lib/Transforms/IPO/PartialInlining.cpp @@ -14,14 +14,14 @@ #define DEBUG_TYPE "partialinlining" #include "llvm/Transforms/IPO.h" -#include "llvm/Instructions.h" -#include "llvm/Module.h" -#include "llvm/Pass.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/Dominators.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/CFG.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/CodeExtractor.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Support/CFG.h" using namespace llvm; STATISTIC(NumPartialInlined, "Number of functions partially inlined"); diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index 43b4ab5..6dc1773 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -14,32 +14,36 @@ #include "llvm/Transforms/IPO/PassManagerBuilder.h" - #include "llvm-c/Transforms/PassManagerBuilder.h" - -#include "llvm/PassManager.h" -#include "llvm/DefaultPasses.h" -#include "llvm/PassManager.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/Passes.h" #include "llvm/Analysis/Verifier.h" +#include "llvm/PassManager.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/ManagedStatic.h" #include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Vectorize.h" -#include "llvm/Transforms/IPO.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/Support/ManagedStatic.h" using namespace llvm; static cl::opt<bool> -RunVectorization("vectorize", cl::desc("Run vectorization passes")); +RunLoopVectorization("vectorize-loops", + cl::desc("Run the Loop vectorization passes")); + +static cl::opt<bool> +RunBBVectorization("vectorize", cl::desc("Run the BB vectorization passes")); static cl::opt<bool> UseGVNAfterVectorization("use-gvn-after-vectorization", cl::init(false), cl::Hidden, cl::desc("Run GVN instead of Early CSE after vectorization passes")); +static cl::opt<bool> UseNewSROA("use-new-sroa", + cl::init(true), cl::Hidden, + cl::desc("Enable the new, experimental SROA pass")); + PassManagerBuilder::PassManagerBuilder() { OptLevel = 2; SizeLevel = 0; @@ -48,7 +52,8 @@ PassManagerBuilder::PassManagerBuilder() { DisableSimplifyLibCalls = false; DisableUnitAtATime = false; DisableUnrollLoops = false; - Vectorize = RunVectorization; + Vectorize = RunBBVectorization; + LoopVectorize = RunLoopVectorization; } PassManagerBuilder::~PassManagerBuilder() { @@ -100,7 +105,10 @@ void PassManagerBuilder::populateFunctionPassManager(FunctionPassManager &FPM) { addInitialAliasAnalysisPasses(FPM); FPM.add(createCFGSimplificationPass()); - FPM.add(createScalarReplAggregatesPass()); + if (UseNewSROA) + FPM.add(createSROAPass()); + else + FPM.add(createScalarReplAggregatesPass()); FPM.add(createEarlyCSEPass()); FPM.add(createLowerExpectIntrinsicPass()); } @@ -112,6 +120,14 @@ void 
PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { MPM.add(Inliner); Inliner = 0; } + + // FIXME: This is a HACK! The inliner pass above implicitly creates a CGSCC + // pass manager, but we don't want to add extensions into that pass manager. + // To prevent this we must insert a no-op module pass to reset the pass + // manager to get the same behavior as EP_OptimizerLast in non-O0 builds. + if (!GlobalExtensions->empty() || !Extensions.empty()) + MPM.add(createBarrierNoopPass()); + addExtensionsToPM(EP_EnabledOnOptLevel0, MPM); return; } @@ -147,7 +163,10 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { // Start of function pass. // Break up aggregate allocas, using SSAUpdater. - MPM.add(createScalarReplAggregatesPass(-1, false)); + if (UseNewSROA) + MPM.add(createSROAPass(/*RequiresDomTree*/ false)); + else + MPM.add(createScalarReplAggregatesPass(-1, false)); MPM.add(createEarlyCSEPass()); // Catch trivial redundancies if (!DisableSimplifyLibCalls) MPM.add(createSimplifyLibCallsPass()); // Library Call Optimizations @@ -166,6 +185,10 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars MPM.add(createLoopIdiomPass()); // Recognize idioms like memset. MPM.add(createLoopDeletionPass()); // Delete dead loops + + if (LoopVectorize && OptLevel > 2) + MPM.add(createLoopVectorizePass()); + if (!DisableUnrollLoops) MPM.add(createLoopUnrollPass()); // Unroll small loops addExtensionsToPM(EP_LoopOptimizerEnd, MPM); @@ -201,13 +224,12 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { // FIXME: We shouldn't bother with this anymore. MPM.add(createStripDeadPrototypesPass()); // Get rid of dead prototypes - // GlobalOpt already deletes dead functions and globals, at -O3 try a + // GlobalOpt already deletes dead functions and globals, at -O2 try a // late pass of GlobalDCE. It is capable of deleting dead cycles. - if (OptLevel > 2) + if (OptLevel > 1) { MPM.add(createGlobalDCEPass()); // Remove dead fns and globals. - - if (OptLevel > 1) MPM.add(createConstantMergePass()); // Merge dup global constants + } } addExtensionsToPM(EP_OptimizerLast, MPM); } @@ -222,8 +244,11 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM, // Now that composite has been compiled, scan through the module, looking // for a main function. If main is defined, mark all other functions // internal. - if (Internalize) - PM.add(createInternalizePass(true)); + if (Internalize) { + std::vector<const char*> E; + E.push_back("main"); + PM.add(createInternalizePass(E)); + } // Propagate constants at call sites into the functions they call. This // opens opportunities for globalopt (and inlining) by substituting function @@ -265,7 +290,10 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM, PM.add(createInstructionCombiningPass()); PM.add(createJumpThreadingPass()); // Break up allocas - PM.add(createScalarReplAggregatesPass()); + if (UseNewSROA) + PM.add(createSROAPass()); + else + PM.add(createScalarReplAggregatesPass()); // Run a few AA driven optimizations here and now, to cleanup the code. PM.add(createFunctionAttrsPass()); // Add nocapture. 
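For a front end driving these pipelines, the knobs added above surface in two places: the -use-new-sroa and -vectorize-loops cl::opt flags, and the new LoopVectorize field on PassManagerBuilder, which only takes effect together with OptLevel > 2. A minimal sketch of client code; the pass-manager variables are illustrative.

  #include "llvm/PassManager.h"
  #include "llvm/Transforms/IPO/PassManagerBuilder.h"

  static void buildPipeline(llvm::PassManager &MPM,
                            llvm::FunctionPassManager &FPM) {
    llvm::PassManagerBuilder PMB;
    PMB.OptLevel = 3;          // loop vectorization above is gated on > 2
    PMB.LoopVectorize = true;  // new field, seeded from -vectorize-loops
    PMB.populateFunctionPassManager(FPM);
    PMB.populateModulePassManager(MPM);
  }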
@@ -289,7 +317,7 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM, PM.add(createGlobalDCEPass()); } -LLVMPassManagerBuilderRef LLVMPassManagerBuilderCreate(void) { +LLVMPassManagerBuilderRef LLVMPassManagerBuilderCreate() { PassManagerBuilder *PMB = new PassManagerBuilder(); return wrap(PMB); } diff --git a/lib/Transforms/IPO/PruneEH.cpp b/lib/Transforms/IPO/PruneEH.cpp index c8cc8fd..d872f0c 100644 --- a/lib/Transforms/IPO/PruneEH.cpp +++ b/lib/Transforms/IPO/PruneEH.cpp @@ -16,16 +16,16 @@ #define DEBUG_TYPE "prune-eh" #include "llvm/Transforms/IPO.h" -#include "llvm/CallGraphSCCPass.h" -#include "llvm/Constants.h" -#include "llvm/Function.h" -#include "llvm/LLVMContext.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/Analysis/CallGraph.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/CallGraphSCCPass.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/Support/CFG.h" #include <algorithm> using namespace llvm; @@ -137,16 +137,18 @@ bool PruneEH::runOnSCC(CallGraphSCC &SCC) { // If the SCC doesn't unwind or doesn't throw, note this fact. if (!SCCMightUnwind || !SCCMightReturn) for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { - Attributes NewAttributes = Attribute::None; + AttrBuilder NewAttributes; if (!SCCMightUnwind) - NewAttributes |= Attribute::NoUnwind; + NewAttributes.addAttribute(Attribute::NoUnwind); if (!SCCMightReturn) - NewAttributes |= Attribute::NoReturn; + NewAttributes.addAttribute(Attribute::NoReturn); Function *F = (*I)->getFunction(); - const AttrListPtr &PAL = F->getAttributes(); - const AttrListPtr &NPAL = PAL.addAttr(~0, NewAttributes); + const AttributeSet &PAL = F->getAttributes(); + const AttributeSet &NPAL = PAL.addAttr(F->getContext(), ~0, + Attribute::get(F->getContext(), + NewAttributes)); if (PAL != NPAL) { MadeChange = true; F->setAttributes(NPAL); diff --git a/lib/Transforms/IPO/StripDeadPrototypes.cpp b/lib/Transforms/IPO/StripDeadPrototypes.cpp index b5f09ec..f00830a 100644 --- a/lib/Transforms/IPO/StripDeadPrototypes.cpp +++ b/lib/Transforms/IPO/StripDeadPrototypes.cpp @@ -16,9 +16,9 @@ #define DEBUG_TYPE "strip-dead-prototypes" #include "llvm/Transforms/IPO.h" -#include "llvm/Pass.h" -#include "llvm/Module.h" #include "llvm/ADT/Statistic.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" using namespace llvm; STATISTIC(NumDeadPrototypes, "Number of dead prototypes removed"); diff --git a/lib/Transforms/IPO/StripSymbols.cpp b/lib/Transforms/IPO/StripSymbols.cpp index 80bfc1c..5f8681f 100644 --- a/lib/Transforms/IPO/StripSymbols.cpp +++ b/lib/Transforms/IPO/StripSymbols.cpp @@ -21,17 +21,17 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/IPO.h" -#include "llvm/Constants.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/DebugInfo.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Instructions.h" -#include "llvm/Module.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/TypeFinder.h" +#include "llvm/IR/ValueSymbolTable.h" #include "llvm/Pass.h" -#include "llvm/TypeFinder.h" -#include "llvm/ValueSymbolTable.h" #include 
"llvm/Transforms/Utils/Local.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallPtrSet.h" using namespace llvm; namespace { diff --git a/lib/Transforms/InstCombine/InstCombine.h b/lib/Transforms/InstCombine/InstCombine.h index 0d5ef90..959daa2 100644 --- a/lib/Transforms/InstCombine/InstCombine.h +++ b/lib/Transforms/InstCombine/InstCombine.h @@ -11,17 +11,18 @@ #define INSTCOMBINE_INSTCOMBINE_H #include "InstCombineWorklist.h" -#include "llvm/IRBuilder.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/Operator.h" -#include "llvm/Pass.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Support/InstVisitor.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Operator.h" +#include "llvm/InstVisitor.h" +#include "llvm/Pass.h" #include "llvm/Support/TargetFolder.h" +#include "llvm/Transforms/Utils/SimplifyLibCalls.h" namespace llvm { class CallSite; - class TargetData; + class DataLayout; class TargetLibraryInfo; class DbgDeclareInst; class MemIntrinsic; @@ -71,9 +72,11 @@ public: class LLVM_LIBRARY_VISIBILITY InstCombiner : public FunctionPass, public InstVisitor<InstCombiner, Instruction*> { - TargetData *TD; + DataLayout *TD; TargetLibraryInfo *TLI; bool MadeIRChange; + LibCallSimplifier *Simplifier; + bool MinimizeSize; public: /// Worklist - All of the instructions that need to be simplified. InstCombineWorklist Worklist; @@ -85,6 +88,7 @@ public: static char ID; // Pass identification, replacement for typeid InstCombiner() : FunctionPass(ID), TD(0), Builder(0) { + MinimizeSize = false; initializeInstCombinerPass(*PassRegistry::getPassRegistry()); } @@ -95,7 +99,7 @@ public: virtual void getAnalysisUsage(AnalysisUsage &AU) const; - TargetData *getTargetData() const { return TD; } + DataLayout *getDataLayout() const { return TD; } TargetLibraryInfo *getTargetLibraryInfo() const { return TLI; } @@ -112,6 +116,8 @@ public: Instruction *visitSub(BinaryOperator &I); Instruction *visitFSub(BinaryOperator &I); Instruction *visitMul(BinaryOperator &I); + Value *foldFMulConst(Instruction *FMulOrDiv, ConstantFP *C, + Instruction *InsertBefore); Instruction *visitFMul(BinaryOperator &I); Instruction *visitURem(BinaryOperator &I); Instruction *visitSRem(BinaryOperator &I); @@ -218,7 +224,7 @@ private: Type *Ty); Instruction *visitCallSite(CallSite CS); - Instruction *tryOptimizeCall(CallInst *CI, const TargetData *TD); + Instruction *tryOptimizeCall(CallInst *CI, const DataLayout *TD); bool transformConstExprCastCall(CallSite CS); Instruction *transformCallThroughTrampoline(CallSite CS, IntrinsicInst *Tramp); @@ -325,6 +331,11 @@ private: bool SimplifyDemandedBits(Use &U, APInt DemandedMask, APInt& KnownZero, APInt& KnownOne, unsigned Depth=0); + /// Helper routine of SimplifyDemandedUseBits. It tries to simplify demanded + /// bit for "r1 = shr x, c1; r2 = shl r1, c2" instruction sequence. + Value *SimplifyShrShlDemandedBits(Instruction *Lsr, Instruction *Sftl, + APInt DemandedMask, APInt &KnownZero, + APInt &KnownOne); /// SimplifyDemandedInstructionBits - Inst is an integer instruction that /// SimplifyDemandedBits knows about. See if the instruction has any @@ -365,6 +376,10 @@ private: Value *EvaluateInDifferentType(Value *V, Type *Ty, bool isSigned); + + /// Descale - Return a value X such that Val = X * Scale, or null if none. If + /// the multiplication is known not to overflow then NoSignedWrap is set. 
+ Value *Descale(Value *Val, APInt Scale, bool &NoSignedWrap); }; diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 99b62f8..f07c58d 100644 --- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -13,16 +13,719 @@ #include "InstCombine.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Target/TargetData.h" +#include "llvm/IR/DataLayout.h" #include "llvm/Support/GetElementPtrTypeIterator.h" #include "llvm/Support/PatternMatch.h" using namespace llvm; using namespace PatternMatch; +namespace { + + /// Class representing coefficient of floating-point addend. + /// This class needs to be highly efficient, which is especially true for + /// the constructor. As of I write this comment, the cost of the default + /// constructor is merely 4-byte-store-zero (Assuming compiler is able to + /// perform write-merging). + /// + class FAddendCoef { + public: + // The constructor has to initialize a APFloat, which is uncessary for + // most addends which have coefficient either 1 or -1. So, the constructor + // is expensive. In order to avoid the cost of the constructor, we should + // reuse some instances whenever possible. The pre-created instances + // FAddCombine::Add[0-5] embodies this idea. + // + FAddendCoef() : IsFp(false), BufHasFpVal(false), IntVal(0) {} + ~FAddendCoef(); + + void set(short C) { + assert(!insaneIntVal(C) && "Insane coefficient"); + IsFp = false; IntVal = C; + } + + void set(const APFloat& C); + + void negate(); + + bool isZero() const { return isInt() ? !IntVal : getFpVal().isZero(); } + Value *getValue(Type *) const; + + // If possible, don't define operator+/operator- etc because these + // operators inevitably call FAddendCoef's constructor which is not cheap. + void operator=(const FAddendCoef &A); + void operator+=(const FAddendCoef &A); + void operator-=(const FAddendCoef &A); + void operator*=(const FAddendCoef &S); + + bool isOne() const { return isInt() && IntVal == 1; } + bool isTwo() const { return isInt() && IntVal == 2; } + bool isMinusOne() const { return isInt() && IntVal == -1; } + bool isMinusTwo() const { return isInt() && IntVal == -2; } + + private: + bool insaneIntVal(int V) { return V > 4 || V < -4; } + APFloat *getFpValPtr(void) + { return reinterpret_cast<APFloat*>(&FpValBuf.buffer[0]); } + + const APFloat &getFpVal(void) const { + assert(IsFp && BufHasFpVal && "Incorret state"); + return *reinterpret_cast<const APFloat*>(&FpValBuf.buffer[0]); + } + + APFloat &getFpVal(void) + { assert(IsFp && BufHasFpVal && "Incorret state"); return *getFpValPtr(); } + + bool isInt() const { return !IsFp; } + + private: + + bool IsFp; + + // True iff FpValBuf contains an instance of APFloat. + bool BufHasFpVal; + + // The integer coefficient of an individual addend is either 1 or -1, + // and we try to simplify at most 4 addends from neighboring at most + // two instructions. So the range of <IntVal> falls in [-4, 4]. APInt + // is overkill of this end. + short IntVal; + + AlignedCharArrayUnion<APFloat> FpValBuf; + }; + + /// FAddend is used to represent floating-point addend. An addend is + /// represented as <C, V>, where the V is a symbolic value, and C is a + /// constant coefficient. A constant addend is represented as <C, 0>. 
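To make the <C, V> encoding concrete, here is a hand-worked trace of the decomposition and folding performed by the FAddend/FAddCombine routines defined below, assuming fast-math (unsafe algebra) is set and each intermediate value has a single use; the value names are illustrative.

  //   I = fsub (fmul x, 3.0), (fadd x, y)            ; i.e. 3*x - (x + y)
  //   drillValueDownOneStep(I)        -> <1, fmul x 3.0>, <-1, fadd x y>
  //   drillAddendDownOneStep on each  -> <3, x>   and   <-1, x>, <-1, y>
  //   simplifyFAdd folds the x terms  -> <2, x>, <-1, y>
  //   createNaryFAdd emits two instructions in place of the original three:
  //   t = fadd x, x ; result = fsub t, y             ; i.e. 2*x - y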
+ /// + class FAddend { + public: + FAddend() { Val = 0; } + + Value *getSymVal (void) const { return Val; } + const FAddendCoef &getCoef(void) const { return Coeff; } + + bool isConstant() const { return Val == 0; } + bool isZero() const { return Coeff.isZero(); } + + void set(short Coefficient, Value *V) { Coeff.set(Coefficient), Val = V; } + void set(const APFloat& Coefficient, Value *V) + { Coeff.set(Coefficient); Val = V; } + void set(const ConstantFP* Coefficient, Value *V) + { Coeff.set(Coefficient->getValueAPF()); Val = V; } + + void negate() { Coeff.negate(); } + + /// Drill down the U-D chain one step to find the definition of V, and + /// try to break the definition into one or two addends. + static unsigned drillValueDownOneStep(Value* V, FAddend &A0, FAddend &A1); + + /// Similar to FAddend::drillDownOneStep() except that the value being + /// splitted is the addend itself. + unsigned drillAddendDownOneStep(FAddend &Addend0, FAddend &Addend1) const; + + void operator+=(const FAddend &T) { + assert((Val == T.Val) && "Symbolic-values disagree"); + Coeff += T.Coeff; + } + + private: + void Scale(const FAddendCoef& ScaleAmt) { Coeff *= ScaleAmt; } + + // This addend has the value of "Coeff * Val". + Value *Val; + FAddendCoef Coeff; + }; + + /// FAddCombine is the class for optimizing an unsafe fadd/fsub along + /// with its neighboring at most two instructions. + /// + class FAddCombine { + public: + FAddCombine(InstCombiner::BuilderTy *B) : Builder(B), Instr(0) {} + Value *simplify(Instruction *FAdd); + + private: + typedef SmallVector<const FAddend*, 4> AddendVect; + + Value *simplifyFAdd(AddendVect& V, unsigned InstrQuota); + + /// Convert given addend to a Value + Value *createAddendVal(const FAddend &A, bool& NeedNeg); + + /// Return the number of instructions needed to emit the N-ary addition. + unsigned calcInstrNumber(const AddendVect& Vect); + Value *createFSub(Value *Opnd0, Value *Opnd1); + Value *createFAdd(Value *Opnd0, Value *Opnd1); + Value *createFMul(Value *Opnd0, Value *Opnd1); + Value *createFNeg(Value *V); + Value *createNaryFAdd(const AddendVect& Opnds, unsigned InstrQuota); + void createInstPostProc(Instruction *NewInst); + + InstCombiner::BuilderTy *Builder; + Instruction *Instr; + + private: + // Debugging stuff are clustered here. + #ifndef NDEBUG + unsigned CreateInstrNum; + void initCreateInstNum() { CreateInstrNum = 0; } + void incCreateInstNum() { CreateInstrNum++; } + #else + void initCreateInstNum() {} + void incCreateInstNum() {} + #endif + }; +} + +//===----------------------------------------------------------------------===// +// +// Implementation of +// {FAddendCoef, FAddend, FAddition, FAddCombine}. +// +//===----------------------------------------------------------------------===// +FAddendCoef::~FAddendCoef() { + if (BufHasFpVal) + getFpValPtr()->~APFloat(); +} + +void FAddendCoef::set(const APFloat& C) { + APFloat *P = getFpValPtr(); + + if (isInt()) { + // As the buffer is meanless byte stream, we cannot call + // APFloat::operator=(). 
+ new(P) APFloat(C); + } else + *P = C; + + IsFp = BufHasFpVal = true; +} + +void FAddendCoef::operator=(const FAddendCoef& That) { + if (That.isInt()) + set(That.IntVal); + else + set(That.getFpVal()); +} + +void FAddendCoef::operator+=(const FAddendCoef &That) { + enum APFloat::roundingMode RndMode = APFloat::rmNearestTiesToEven; + if (isInt() == That.isInt()) { + if (isInt()) + IntVal += That.IntVal; + else + getFpVal().add(That.getFpVal(), RndMode); + return; + } + + if (isInt()) { + const APFloat &T = That.getFpVal(); + set(T); + getFpVal().add(APFloat(T.getSemantics(), IntVal), RndMode); + return; + } + + APFloat &T = getFpVal(); + T.add(APFloat(T.getSemantics(), That.IntVal), RndMode); +} + +void FAddendCoef::operator-=(const FAddendCoef &That) { + enum APFloat::roundingMode RndMode = APFloat::rmNearestTiesToEven; + if (isInt() == That.isInt()) { + if (isInt()) + IntVal -= That.IntVal; + else + getFpVal().subtract(That.getFpVal(), RndMode); + return; + } + + if (isInt()) { + const APFloat &T = That.getFpVal(); + set(T); + getFpVal().subtract(APFloat(T.getSemantics(), IntVal), RndMode); + return; + } + + APFloat &T = getFpVal(); + T.subtract(APFloat(T.getSemantics(), IntVal), RndMode); +} + +void FAddendCoef::operator*=(const FAddendCoef &That) { + if (That.isOne()) + return; + + if (That.isMinusOne()) { + negate(); + return; + } + + if (isInt() && That.isInt()) { + int Res = IntVal * (int)That.IntVal; + assert(!insaneIntVal(Res) && "Insane int value"); + IntVal = Res; + return; + } + + const fltSemantics &Semantic = + isInt() ? That.getFpVal().getSemantics() : getFpVal().getSemantics(); + + if (isInt()) + set(APFloat(Semantic, IntVal)); + APFloat &F0 = getFpVal(); + + if (That.isInt()) + F0.multiply(APFloat(Semantic, That.IntVal), APFloat::rmNearestTiesToEven); + else + F0.multiply(That.getFpVal(), APFloat::rmNearestTiesToEven); + + return; +} + +void FAddendCoef::negate() { + if (isInt()) + IntVal = 0 - IntVal; + else + getFpVal().changeSign(); +} + +Value *FAddendCoef::getValue(Type *Ty) const { + return isInt() ? + ConstantFP::get(Ty, float(IntVal)) : + ConstantFP::get(Ty->getContext(), getFpVal()); +} + +// The definition of <Val> Addends +// ========================================= +// A + B <1, A>, <1,B> +// A - B <1, A>, <1,B> +// 0 - B <-1, B> +// C * A, <C, A> +// A + C <1, A> <C, NULL> +// 0 +/- 0 <0, NULL> (corner case) +// +// Legend: A and B are not constant, C is constant +// +unsigned FAddend::drillValueDownOneStep + (Value *Val, FAddend &Addend0, FAddend &Addend1) { + Instruction *I = 0; + if (Val == 0 || !(I = dyn_cast<Instruction>(Val))) + return 0; + + unsigned Opcode = I->getOpcode(); + + if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub) { + ConstantFP *C0, *C1; + Value *Opnd0 = I->getOperand(0); + Value *Opnd1 = I->getOperand(1); + if ((C0 = dyn_cast<ConstantFP>(Opnd0)) && C0->isZero()) + Opnd0 = 0; + + if ((C1 = dyn_cast<ConstantFP>(Opnd1)) && C1->isZero()) + Opnd1 = 0; + + if (Opnd0) { + if (!C0) + Addend0.set(1, Opnd0); + else + Addend0.set(C0, 0); + } + + if (Opnd1) { + FAddend &Addend = Opnd0 ? Addend1 : Addend0; + if (!C1) + Addend.set(1, Opnd1); + else + Addend.set(C1, 0); + if (Opcode == Instruction::FSub) + Addend.negate(); + } + + if (Opnd0 || Opnd1) + return Opnd0 && Opnd1 ? 2 : 1; + + // Both operands are zero. Weird! 
+ Addend0.set(APFloat(C0->getValueAPF().getSemantics()), 0); + return 1; + } + + if (I->getOpcode() == Instruction::FMul) { + Value *V0 = I->getOperand(0); + Value *V1 = I->getOperand(1); + if (ConstantFP *C = dyn_cast<ConstantFP>(V0)) { + Addend0.set(C, V1); + return 1; + } + + if (ConstantFP *C = dyn_cast<ConstantFP>(V1)) { + Addend0.set(C, V0); + return 1; + } + } + + return 0; +} + +// Try to break *this* addend into two addends. e.g. Suppose this addend is +// <2.3, V>, and V = X + Y, by calling this function, we obtain two addends, +// i.e. <2.3, X> and <2.3, Y>. +// +unsigned FAddend::drillAddendDownOneStep + (FAddend &Addend0, FAddend &Addend1) const { + if (isConstant()) + return 0; + + unsigned BreakNum = FAddend::drillValueDownOneStep(Val, Addend0, Addend1); + if (!BreakNum || Coeff.isOne()) + return BreakNum; + + Addend0.Scale(Coeff); + + if (BreakNum == 2) + Addend1.Scale(Coeff); + + return BreakNum; +} + +Value *FAddCombine::simplify(Instruction *I) { + assert(I->hasUnsafeAlgebra() && "Should be in unsafe mode"); + + // Currently we are not able to handle vector type. + if (I->getType()->isVectorTy()) + return 0; + + assert((I->getOpcode() == Instruction::FAdd || + I->getOpcode() == Instruction::FSub) && "Expect add/sub"); + + // Save the instruction before calling other member-functions. + Instr = I; + + FAddend Opnd0, Opnd1, Opnd0_0, Opnd0_1, Opnd1_0, Opnd1_1; + + unsigned OpndNum = FAddend::drillValueDownOneStep(I, Opnd0, Opnd1); + + // Step 1: Expand the 1st addend into Opnd0_0 and Opnd0_1. + unsigned Opnd0_ExpNum = 0; + unsigned Opnd1_ExpNum = 0; + + if (!Opnd0.isConstant()) + Opnd0_ExpNum = Opnd0.drillAddendDownOneStep(Opnd0_0, Opnd0_1); + + // Step 2: Expand the 2nd addend into Opnd1_0 and Opnd1_1. + if (OpndNum == 2 && !Opnd1.isConstant()) + Opnd1_ExpNum = Opnd1.drillAddendDownOneStep(Opnd1_0, Opnd1_1); + + // Step 3: Try to optimize Opnd0_0 + Opnd0_1 + Opnd1_0 + Opnd1_1 + if (Opnd0_ExpNum && Opnd1_ExpNum) { + AddendVect AllOpnds; + AllOpnds.push_back(&Opnd0_0); + AllOpnds.push_back(&Opnd1_0); + if (Opnd0_ExpNum == 2) + AllOpnds.push_back(&Opnd0_1); + if (Opnd1_ExpNum == 2) + AllOpnds.push_back(&Opnd1_1); + + // Compute instruction quota. We should save at least one instruction. + unsigned InstQuota = 0; + + Value *V0 = I->getOperand(0); + Value *V1 = I->getOperand(1); + InstQuota = ((!isa<Constant>(V0) && V0->hasOneUse()) && + (!isa<Constant>(V1) && V1->hasOneUse())) ? 2 : 1; + + if (Value *R = simplifyFAdd(AllOpnds, InstQuota)) + return R; + } + + if (OpndNum != 2) { + // The input instruction is : "I=0.0 +/- V". If the "V" were able to be + // splitted into two addends, say "V = X - Y", the instruction would have + // been optimized into "I = Y - X" in the previous steps. + // + const FAddendCoef &CE = Opnd0.getCoef(); + return CE.isOne() ? 
Opnd0.getSymVal() : 0; + } + + // step 4: Try to optimize Opnd0 + Opnd1_0 [+ Opnd1_1] + if (Opnd1_ExpNum) { + AddendVect AllOpnds; + AllOpnds.push_back(&Opnd0); + AllOpnds.push_back(&Opnd1_0); + if (Opnd1_ExpNum == 2) + AllOpnds.push_back(&Opnd1_1); + + if (Value *R = simplifyFAdd(AllOpnds, 1)) + return R; + } + + // step 5: Try to optimize Opnd1 + Opnd0_0 [+ Opnd0_1] + if (Opnd0_ExpNum) { + AddendVect AllOpnds; + AllOpnds.push_back(&Opnd1); + AllOpnds.push_back(&Opnd0_0); + if (Opnd0_ExpNum == 2) + AllOpnds.push_back(&Opnd0_1); + + if (Value *R = simplifyFAdd(AllOpnds, 1)) + return R; + } + + return 0; +} + +Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) { + + unsigned AddendNum = Addends.size(); + assert(AddendNum <= 4 && "Too many addends"); + + // For saving intermediate results; + unsigned NextTmpIdx = 0; + FAddend TmpResult[3]; + + // Points to the constant addend of the resulting simplified expression. + // If the resulting expr has constant-addend, this constant-addend is + // desirable to reside at the top of the resulting expression tree. Placing + // constant close to supper-expr(s) will potentially reveal some optimization + // opportunities in super-expr(s). + // + const FAddend *ConstAdd = 0; + + // Simplified addends are placed <SimpVect>. + AddendVect SimpVect; + + // The outer loop works on one symbolic-value at a time. Suppose the input + // addends are : <a1, x>, <b1, y>, <a2, x>, <c1, z>, <b2, y>, ... + // The symbolic-values will be processed in this order: x, y, z. + // + for (unsigned SymIdx = 0; SymIdx < AddendNum; SymIdx++) { + + const FAddend *ThisAddend = Addends[SymIdx]; + if (!ThisAddend) { + // This addend was processed before. + continue; + } + + Value *Val = ThisAddend->getSymVal(); + unsigned StartIdx = SimpVect.size(); + SimpVect.push_back(ThisAddend); + + // The inner loop collects addends sharing same symbolic-value, and these + // addends will be later on folded into a single addend. Following above + // example, if the symbolic value "y" is being processed, the inner loop + // will collect two addends "<b1,y>" and "<b2,Y>". These two addends will + // be later on folded into "<b1+b2, y>". + // + for (unsigned SameSymIdx = SymIdx + 1; + SameSymIdx < AddendNum; SameSymIdx++) { + const FAddend *T = Addends[SameSymIdx]; + if (T && T->getSymVal() == Val) { + // Set null such that next iteration of the outer loop will not process + // this addend again. + Addends[SameSymIdx] = 0; + SimpVect.push_back(T); + } + } + + // If multiple addends share same symbolic value, fold them together. + if (StartIdx + 1 != SimpVect.size()) { + FAddend &R = TmpResult[NextTmpIdx ++]; + R = *SimpVect[StartIdx]; + for (unsigned Idx = StartIdx + 1; Idx < SimpVect.size(); Idx++) + R += *SimpVect[Idx]; + + // Pop all addends being folded and push the resulting folded addend. + SimpVect.resize(StartIdx); + if (Val != 0) { + if (!R.isZero()) { + SimpVect.push_back(&R); + } + } else { + // Don't push constant addend at this time. It will be the last element + // of <SimpVect>. + ConstAdd = &R; + } + } + } + + assert((NextTmpIdx <= sizeof(TmpResult)/sizeof(TmpResult[0]) + 1) && + "out-of-bound access"); + + if (ConstAdd) + SimpVect.push_back(ConstAdd); + + Value *Result; + if (!SimpVect.empty()) + Result = createNaryFAdd(SimpVect, InstrQuota); + else { + // The addition is folded to 0.0. 
+ Result = ConstantFP::get(Instr->getType(), 0.0); + } + + return Result; +} + +Value *FAddCombine::createNaryFAdd + (const AddendVect &Opnds, unsigned InstrQuota) { + assert(!Opnds.empty() && "Expect at least one addend"); + + // Step 1: Check if the # of instructions needed exceeds the quota. + // + unsigned InstrNeeded = calcInstrNumber(Opnds); + if (InstrNeeded > InstrQuota) + return 0; + + initCreateInstNum(); + + // step 2: Emit the N-ary addition. + // Note that at most three instructions are involved in Fadd-InstCombine: the + // addition in question, and at most two neighboring instructions. + // The resulting optimized addition should have at least one less instruction + // than the original addition expression tree. This implies that the resulting + // N-ary addition has at most two instructions, and we don't need to worry + // about tree-height when constructing the N-ary addition. + + Value *LastVal = 0; + bool LastValNeedNeg = false; + + // Iterate the addends, creating fadd/fsub using adjacent two addends. + for (AddendVect::const_iterator I = Opnds.begin(), E = Opnds.end(); + I != E; I++) { + bool NeedNeg; + Value *V = createAddendVal(**I, NeedNeg); + if (!LastVal) { + LastVal = V; + LastValNeedNeg = NeedNeg; + continue; + } + + if (LastValNeedNeg == NeedNeg) { + LastVal = createFAdd(LastVal, V); + continue; + } + + if (LastValNeedNeg) + LastVal = createFSub(V, LastVal); + else + LastVal = createFSub(LastVal, V); + + LastValNeedNeg = false; + } + + if (LastValNeedNeg) { + LastVal = createFNeg(LastVal); + } + + #ifndef NDEBUG + assert(CreateInstrNum == InstrNeeded && + "Inconsistent in instruction numbers"); + #endif + + return LastVal; +} + +Value *FAddCombine::createFSub + (Value *Opnd0, Value *Opnd1) { + Value *V = Builder->CreateFSub(Opnd0, Opnd1); + createInstPostProc(cast<Instruction>(V)); + return V; +} + +Value *FAddCombine::createFNeg(Value *V) { + Value *Zero = cast<Value>(ConstantFP::get(V->getType(), 0.0)); + return createFSub(Zero, V); +} + +Value *FAddCombine::createFAdd + (Value *Opnd0, Value *Opnd1) { + Value *V = Builder->CreateFAdd(Opnd0, Opnd1); + createInstPostProc(cast<Instruction>(V)); + return V; +} + +Value *FAddCombine::createFMul(Value *Opnd0, Value *Opnd1) { + Value *V = Builder->CreateFMul(Opnd0, Opnd1); + createInstPostProc(cast<Instruction>(V)); + return V; +} + +void FAddCombine::createInstPostProc(Instruction *NewInstr) { + NewInstr->setDebugLoc(Instr->getDebugLoc()); + + // Keep track of the number of instruction created. + incCreateInstNum(); + + // Propagate fast-math flags + NewInstr->setFastMathFlags(Instr->getFastMathFlags()); +} + +// Return the number of instruction needed to emit the N-ary addition. +// NOTE: Keep this function in sync with createAddendVal(). +unsigned FAddCombine::calcInstrNumber(const AddendVect &Opnds) { + unsigned OpndNum = Opnds.size(); + unsigned InstrNeeded = OpndNum - 1; + + // The number of addends in the form of "(-1)*x". + unsigned NegOpndNum = 0; + + // Adjust the number of instructions needed to emit the N-ary add. + for (AddendVect::const_iterator I = Opnds.begin(), E = Opnds.end(); + I != E; I++) { + const FAddend *Opnd = *I; + if (Opnd->isConstant()) + continue; + + const FAddendCoef &CE = Opnd->getCoef(); + if (CE.isMinusOne() || CE.isMinusTwo()) + NegOpndNum++; + + // Let the addend be "c * x". If "c == +/-1", the value of the addend + // is immediately available; otherwise, it needs exactly one instruction + // to evaluate the value. 
+ if (!CE.isMinusOne() && !CE.isOne()) + InstrNeeded++; + } + if (NegOpndNum == OpndNum) + InstrNeeded++; + return InstrNeeded; +} + +// Input Addend Value NeedNeg(output) +// ================================================================ +// Constant C C false +// <+/-1, V> V coefficient is -1 +// <2/-2, V> "fadd V, V" coefficient is -2 +// <C, V> "fmul V, C" false +// +// NOTE: Keep this function in sync with FAddCombine::calcInstrNumber. +Value *FAddCombine::createAddendVal + (const FAddend &Opnd, bool &NeedNeg) { + const FAddendCoef &Coeff = Opnd.getCoef(); + + if (Opnd.isConstant()) { + NeedNeg = false; + return Coeff.getValue(Instr->getType()); + } + + Value *OpndVal = Opnd.getSymVal(); + + if (Coeff.isMinusOne() || Coeff.isOne()) { + NeedNeg = Coeff.isMinusOne(); + return OpndVal; + } + + if (Coeff.isTwo() || Coeff.isMinusTwo()) { + NeedNeg = Coeff.isMinusTwo(); + return createFAdd(OpndVal, OpndVal); + } + + NeedNeg = false; + return createFMul(OpndVal, Coeff.getValue(Instr->getType())); +} + /// AddOne - Add one to a ConstantInt. static Constant *AddOne(Constant *C) { return ConstantExpr::getAdd(C, ConstantInt::get(C->getType(), 1)); } + /// SubOne - Subtract one from a ConstantInt. static Constant *SubOne(ConstantInt *C) { return ConstantInt::get(C->getContext(), C->getValue()-1); @@ -37,10 +740,10 @@ static Constant *SubOne(ConstantInt *C) { static inline Value *dyn_castFoldableMul(Value *V, ConstantInt *&CST) { if (!V->hasOneUse() || !V->getType()->isIntegerTy()) return 0; - + Instruction *I = dyn_cast<Instruction>(V); if (I == 0) return 0; - + if (I->getOpcode() == Instruction::Mul) if ((CST = dyn_cast<ConstantInt>(I->getOperand(1)))) return I->getOperand(0); @@ -64,22 +767,22 @@ static inline Value *dyn_castFoldableMul(Value *V, ConstantInt *&CST) { bool InstCombiner::WillNotOverflowSignedAdd(Value *LHS, Value *RHS) { // There are different heuristics we can use for this. Here are some simple // ones. - - // Add has the property that adding any two 2's complement numbers can only + + // Add has the property that adding any two 2's complement numbers can only // have one carry bit which can change a sign. As such, if LHS and RHS each // have at least two sign bits, we know that the addition of the two values // will sign extend fine. if (ComputeNumSignBits(LHS) > 1 && ComputeNumSignBits(RHS) > 1) return true; - - + + // If one of the operands only has one non-zero bit, and if the other operand // has a known-zero bit in a more significant place than it (not including the // sign bit) the ripple may go up to and fill the zero, but won't change the // sign. For example, (X & ~4) + 1. - + // TODO: Implement. - + return false; } @@ -100,7 +803,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { const APInt &Val = CI->getValue(); if (Val.isSignBit()) return BinaryOperator::CreateXor(LHS, RHS); - + // See if SimplifyDemandedBits can simplify this. 
This handles stuff like // (X & 254)+1 -> (X&254)|1 if (SimplifyDemandedInstructionBits(I)) @@ -110,7 +813,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { if (ZExtInst *ZI = dyn_cast<ZExtInst>(LHS)) if (ZI->getSrcTy()->isIntegerTy(1)) return SelectInst::Create(ZI->getOperand(0), AddOne(CI), CI); - + Value *XorLHS = 0; ConstantInt *XorRHS = 0; if (match(LHS, m_Xor(m_Value(XorLHS), m_ConstantInt(XorRHS)))) { uint32_t TySizeBits = I.getType()->getScalarSizeInBits(); @@ -124,13 +827,13 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { else if (XorRHS->getValue().isPowerOf2()) ExtendAmt = TySizeBits - XorRHS->getValue().logBase2() - 1; } - + if (ExtendAmt) { APInt Mask = APInt::getHighBitsSet(TySizeBits, ExtendAmt); if (!MaskedValueIsZero(XorLHS, Mask)) ExtendAmt = 0; } - + if (ExtendAmt) { Constant *ShAmt = ConstantInt::get(I.getType(), ExtendAmt); Value *NewShl = Builder->CreateShl(XorLHS, ShAmt, "sext"); @@ -175,7 +878,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { Value *NewAdd = Builder->CreateAdd(LHSV, RHSV, "sum"); return BinaryOperator::CreateNeg(NewAdd); } - + return BinaryOperator::CreateSub(RHS, LHSV); } @@ -209,7 +912,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { APInt RHSKnownOne(IT->getBitWidth(), 0); APInt RHSKnownZero(IT->getBitWidth(), 0); ComputeMaskedBits(RHS, RHSKnownZero, RHSKnownOne); - + // No bits in common -> bitwise or. if ((LHSKnownZero|RHSKnownZero).isAllOnesValue()) return BinaryOperator::CreateOr(LHS, RHS); @@ -251,7 +954,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { // See if all bits from the first bit set in the Add RHS up are included // in the mask. First, get the rightmost bit. const APInt &AddRHSV = CRHS->getValue(); - + // Form a mask of all bits from the lowest bit added through the top. APInt AddRHSHighBits(~((AddRHSV & -AddRHSV)-1)); @@ -289,7 +992,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { if (match(FV, m_Zero()) && match(TV, m_Sub(m_Value(N), m_Specific(A)))) // Fold the add into the true select value. return SelectInst::Create(SI->getCondition(), N, A); - + if (match(TV, m_Zero()) && match(FV, m_Sub(m_Value(N), m_Specific(A)))) // Fold the add into the false select value. return SelectInst::Create(SI->getCondition(), A, N); @@ -301,18 +1004,18 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { if (SExtInst *LHSConv = dyn_cast<SExtInst>(LHS)) { // (add (sext x), cst) --> (sext (add x, cst')) if (ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS)) { - Constant *CI = + Constant *CI = ConstantExpr::getTrunc(RHSC, LHSConv->getOperand(0)->getType()); if (LHSConv->hasOneUse() && ConstantExpr::getSExt(CI, I.getType()) == RHSC && WillNotOverflowSignedAdd(LHSConv->getOperand(0), CI)) { // Insert the new, smaller add. - Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0), + Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0), CI, "addconv"); return new SExtInst(NewAdd, I.getType()); } } - + // (add (sext x), (sext y)) --> (sext (add int x, y)) if (SExtInst *RHSConv = dyn_cast<SExtInst>(RHS)) { // Only do this if x/y have the same type, if at last one of them has a @@ -323,7 +1026,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { WillNotOverflowSignedAdd(LHSConv->getOperand(0), RHSConv->getOperand(0))) { // Insert the new integer add. 
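As an aside, a worked i8 instance of the "no bits in common" fold shown earlier in this function, with illustrative operands:

  //   LHS = and x, 0x0F   -> LHSKnownZero = 0xF0
  //   RHS = and y, 0xF0   -> RHSKnownZero = 0x0F
  //   (LHSKnownZero | RHSKnownZero) is all ones, so no carry can occur and
  //   the add is rewritten to:  or (and x, 0x0F), (and y, 0xF0)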
- Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0), + Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0), RHSConv->getOperand(0), "addconv"); return new SExtInst(NewAdd, I.getType()); } @@ -351,18 +1054,12 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { bool Changed = SimplifyAssociativeOrCommutative(I); Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); - if (Constant *RHSC = dyn_cast<Constant>(RHS)) { - // X + 0 --> X - if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHSC)) { - if (CFP->isExactlyValue(ConstantFP::getNegativeZero - (I.getType())->getValueAPF())) - return ReplaceInstUsesWith(I, LHS); - } + if (Value *V = SimplifyFAddInst(LHS, RHS, I.getFastMathFlags(), TD)) + return ReplaceInstUsesWith(I, V); - if (isa<PHINode>(LHS)) - if (Instruction *NV = FoldOpIntoPhi(I)) - return NV; - } + if (isa<Constant>(RHS) && isa<PHINode>(LHS)) + if (Instruction *NV = FoldOpIntoPhi(I)) + return NV; // -A + B --> B - A // -A + -B --> -(A + B) @@ -374,11 +1071,6 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { if (Value *V = dyn_castFNegVal(RHS)) return BinaryOperator::CreateFSub(LHS, V); - // Check for X+0.0. Simplify it to X if we know X is not -0.0. - if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHS)) - if (CFP->getValueAPF().isPosZero() && CannotBeNegativeZero(LHS)) - return ReplaceInstUsesWith(I, LHS); - // Check for (fadd double (sitofp x), y), see if we can merge this into an // integer add followed by a promotion. if (SIToFPInst *LHSConv = dyn_cast<SIToFPInst>(LHS)) { @@ -388,7 +1080,7 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { // requires a constant pool load, and generally allows the add to be better // instcombined. if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHS)) { - Constant *CI = + Constant *CI = ConstantExpr::getFPToSI(CFP, LHSConv->getOperand(0)->getType()); if (LHSConv->hasOneUse() && ConstantExpr::getSIToFP(CI, I.getType()) == CFP && @@ -399,7 +1091,7 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { return new SIToFPInst(NewAdd, I.getType()); } } - + // (fadd double (sitofp x), (sitofp y)) --> (sitofp (add int x, y)) if (SIToFPInst *RHSConv = dyn_cast<SIToFPInst>(RHS)) { // Only do this if x/y have the same type, if at last one of them has a @@ -410,13 +1102,18 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { WillNotOverflowSignedAdd(LHSConv->getOperand(0), RHSConv->getOperand(0))) { // Insert the new integer add. - Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0), + Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0), RHSConv->getOperand(0),"addconv"); return new SIToFPInst(NewAdd, I.getType()); } } } - + + if (I.hasUnsafeAlgebra()) { + if (Value *V = FAddCombine(Builder).simplify(&I)) + return ReplaceInstUsesWith(I, V); + } + return Changed ? &I : 0; } @@ -428,7 +1125,7 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS, Type *Ty) { assert(TD && "Must have target data info for this"); - + // If LHS is a gep based on RHS or RHS is a gep based on LHS, we can optimize // this. bool Swapped = false; @@ -451,7 +1148,7 @@ Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS, } } } - + if (GEPOperator *RHSGEP = dyn_cast<GEPOperator>(RHS)) { // X - (gep X, ...) 
if (RHSGEP->getOperand(0) == LHS) { @@ -467,16 +1164,16 @@ Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS, } } } - + // Avoid duplicating the arithmetic if GEP2 has non-constant indices and // multiple users. if (GEP1 == 0 || (GEP2 != 0 && !GEP2->hasAllConstantIndices() && !GEP2->hasOneUse())) return 0; - + // Emit the offset of the GEP and an intptr_t. Value *Result = EmitGEPOffset(GEP1); - + // If we had a constant expression GEP on the other side offsetting the // pointer, subtract it from the offset we have. if (GEP2) { @@ -517,7 +1214,7 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { // Replace (-1 - A) with (~A). if (match(Op0, m_AllOnes())) return BinaryOperator::CreateNot(Op1); - + if (ConstantInt *C = dyn_cast<ConstantInt>(Op0)) { // C - ~X == X + (1+C) Value *X = 0; @@ -553,18 +1250,18 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { return &I; } - + { Value *Y; // X-(X+Y) == -Y X-(Y+X) == -Y if (match(Op1, m_Add(m_Specific(Op0), m_Value(Y))) || match(Op1, m_Add(m_Value(Y), m_Specific(Op0)))) return BinaryOperator::CreateNeg(Y); - + // (X-Y)-X == -Y if (match(Op0, m_Sub(m_Specific(Op1), m_Value(Y)))) return BinaryOperator::CreateNeg(Y); } - + if (Op1->hasOneUse()) { Value *X = 0, *Y = 0, *Z = 0; Constant *C = 0; @@ -581,7 +1278,7 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { match(Op1, m_And(m_Specific(Op0), m_Value(Y)))) return BinaryOperator::CreateAnd(Op0, Builder->CreateNot(Y, Y->getName() + ".not")); - + // 0 - (X sdiv C) -> (X sdiv -C) if (match(Op1, m_SDiv(m_Value(X), m_Constant(C))) && match(Op0, m_Zero())) @@ -604,14 +1301,14 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { C = ConstantExpr::getSub(One, ConstantExpr::getShl(One, CI)); return BinaryOperator::CreateMul(Op0, C); } - + // X - A*-B -> X + A*B // X - -A*B -> X + A*B Value *A, *B; if (match(Op1, m_Mul(m_Value(A), m_Neg(m_Value(B)))) || match(Op1, m_Mul(m_Neg(m_Value(A)), m_Value(B)))) return BinaryOperator::CreateAdd(Op0, Builder->CreateMul(A, B)); - + // X - A*CI -> X + A*-CI // X - CI*A -> X + A*-CI if (match(Op1, m_Mul(m_Value(A), m_ConstantInt(CI))) || @@ -630,7 +1327,7 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { if (X == dyn_castFoldableMul(Op1, C2)) return BinaryOperator::CreateMul(X, ConstantExpr::getSub(C1, C2)); } - + // Optimize pointer differences into the same array into a size. Consider: // &A[10] - &A[0]: we should compile this to "10". if (TD) { @@ -639,23 +1336,31 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { match(Op1, m_PtrToInt(m_Value(RHSOp)))) if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType())) return ReplaceInstUsesWith(I, Res); - + // trunc(p)-trunc(q) -> trunc(p-q) if (match(Op0, m_Trunc(m_PtrToInt(m_Value(LHSOp)))) && match(Op1, m_Trunc(m_PtrToInt(m_Value(RHSOp))))) if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType())) return ReplaceInstUsesWith(I, Res); } - + return 0; } Instruction *InstCombiner::visitFSub(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + if (Value *V = SimplifyFSubInst(Op0, Op1, I.getFastMathFlags(), TD)) + return ReplaceInstUsesWith(I, V); + // If this is a 'B = x-(-A)', change to B = x+A... 
if (Value *V = dyn_castFNegVal(Op1)) return BinaryOperator::CreateFAdd(Op0, V); + if (I.hasUnsafeAlgebra()) { + if (Value *V = FAddCombine(Builder).simplify(&I)) + return ReplaceInstUsesWith(I, V); + } + return 0; } diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 7d0af0d..c1e60d4 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -12,11 +12,11 @@ //===----------------------------------------------------------------------===// #include "InstCombine.h" -#include "llvm/Intrinsics.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Transforms/Utils/CmpInstAnalysis.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/Support/ConstantRange.h" #include "llvm/Support/PatternMatch.h" +#include "llvm/Transforms/Utils/CmpInstAnalysis.h" using namespace llvm; using namespace PatternMatch; @@ -36,15 +36,15 @@ static inline bool isFreeToInvert(Value *V) { // ~(~(X)) -> X. if (BinaryOperator::isNot(V)) return true; - + // Constants can be considered to be not'ed values. if (isa<ConstantInt>(V)) return true; - + // Compares can be inverted if they have a single use. if (CmpInst *CI = dyn_cast<CmpInst>(V)) return CI->hasOneUse(); - + return false; } @@ -56,7 +56,7 @@ static inline Value *dyn_castNotVal(Value *V) { if (!isFreeToInvert(Operand)) return Operand; } - + // Constants can be considered to be not'ed values... if (ConstantInt *C = dyn_cast<ConstantInt>(V)) return ConstantInt::get(C->getType(), ~C->getValue()); @@ -91,7 +91,7 @@ static unsigned getFCmpCode(FCmpInst::Predicate CC, bool &isOrdered) { } /// getNewICmpValue - This is the complement of getICmpCode, which turns an -/// opcode and two operands into either a constant true or false, or a brand +/// opcode and two operands into either a constant true or false, or a brand /// new ICmp instruction. The sign is passed in to determine which kind /// of predicate to use in the new icmp instruction. static Value *getNewICmpValue(bool Sign, unsigned Code, Value *LHS, Value *RHS, @@ -118,7 +118,7 @@ static Value *getFCmpValue(bool isordered, unsigned code, case 4: Pred = isordered ? FCmpInst::FCMP_OLT : FCmpInst::FCMP_ULT; break; case 5: Pred = isordered ? FCmpInst::FCMP_ONE : FCmpInst::FCMP_UNE; break; case 6: Pred = isordered ? FCmpInst::FCMP_OLE : FCmpInst::FCMP_ULE; break; - case 7: + case 7: if (!isordered) return ConstantInt::getTrue(LHS->getContext()); Pred = FCmpInst::FCMP_ORD; break; } @@ -154,7 +154,7 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op, Or->takeName(Op); return BinaryOperator::CreateAnd(Or, AndRHS); } - + ConstantInt *TogetherCI = dyn_cast<ConstantInt>(Together); if (TogetherCI && !TogetherCI->isZero()){ // (X | C1) & C2 --> (X & (C2^(C1&C2))) | C1 @@ -166,7 +166,7 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op, return BinaryOperator::CreateOr(And, OpRHS); } } - + break; case Instruction::Add: if (Op->hasOneUse()) { @@ -215,7 +215,7 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op, if (CI->getValue() == ShlMask) // Masking out bits that the shift already masks. return ReplaceInstUsesWith(TheAnd, Op); // No need for the and. - + if (CI != AndRHS) { // Reducing bits set in and. TheAnd.setOperand(1, CI); return &TheAnd; @@ -236,7 +236,7 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op, if (CI->getValue() == ShrMask) // Masking out bits that the shift already masks. 
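A worked i8 instance of the shl case of OptAndOp above, with illustrative constants:

  //   and (shl X, 3), 0xF8 : ShlMask == 0xF8 == AndRHS, the shift already
  //                          cleared those bits, so the 'and' is deleted.
  //   and (shl X, 3), 0x39 : the constant is tightened to 0x39 & 0xF8 = 0x38,
  //                          the only bits the shifted value can still carry.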
return ReplaceInstUsesWith(TheAnd, Op); - + if (CI != AndRHS) { TheAnd.setOperand(1, CI); // Reduce bits set in and cst. return &TheAnd; @@ -269,22 +269,22 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op, /// InsertRangeTest - Emit a computation of: (V >= Lo && V < Hi) if Inside is /// true, otherwise (V < Lo || V >= Hi). In practice, we emit the more efficient -/// (V-Lo) <u Hi-Lo. This method expects that Lo <= Hi. isSigned indicates +/// (V-Lo) \<u Hi-Lo. This method expects that Lo <= Hi. isSigned indicates /// whether to treat the V, Lo and HI as signed or not. IB is the location to /// insert new instructions. Value *InstCombiner::InsertRangeTest(Value *V, Constant *Lo, Constant *Hi, bool isSigned, bool Inside) { - assert(cast<ConstantInt>(ConstantExpr::getICmp((isSigned ? + assert(cast<ConstantInt>(ConstantExpr::getICmp((isSigned ? ICmpInst::ICMP_SLE:ICmpInst::ICMP_ULE), Lo, Hi))->getZExtValue() && "Lo is not <= Hi in range emission code!"); - + if (Inside) { if (Lo == Hi) // Trivially false. return ConstantInt::getFalse(V->getContext()); // V >= Min && V < Hi --> V < Hi if (cast<ConstantInt>(Lo)->isMinValue(isSigned)) { - ICmpInst::Predicate pred = (isSigned ? + ICmpInst::Predicate pred = (isSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT); return Builder->CreateICmp(pred, V, Hi); } @@ -302,7 +302,7 @@ Value *InstCombiner::InsertRangeTest(Value *V, Constant *Lo, Constant *Hi, // V < Min || V >= Hi -> V > Hi-1 Hi = SubOne(cast<ConstantInt>(Hi)); if (cast<ConstantInt>(Lo)->isMinValue(isSigned)) { - ICmpInst::Predicate pred = (isSigned ? + ICmpInst::Predicate pred = (isSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT); return Builder->CreateICmp(pred, V, Hi); } @@ -327,14 +327,14 @@ static bool isRunOfOnes(ConstantInt *Val, uint32_t &MB, uint32_t &ME) { // look for the first zero bit after the run of ones MB = BitWidth - ((V - 1) ^ V).countLeadingZeros(); // look for the first non-zero bit - ME = V.getActiveBits(); + ME = V.getActiveBits(); return true; } /// FoldLogicalPlusAnd - This is part of an expression (LHS +/- RHS) & Mask, /// where isSub determines whether the operator is a sub. If we can fold one of /// the following xforms: -/// +/// /// ((A & N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == Mask /// ((A | N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == 0 /// ((A ^ N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == 0 @@ -355,8 +355,8 @@ Value *InstCombiner::FoldLogicalPlusAnd(Value *LHS, Value *RHS, case Instruction::And: if (ConstantExpr::getAnd(N, Mask) == Mask) { // If the AndRHS is a power of two minus one (0+1+), this is simple. - if ((Mask->getValue().countLeadingZeros() + - Mask->getValue().countPopulation()) == + if ((Mask->getValue().countLeadingZeros() + + Mask->getValue().countPopulation()) == Mask->getValue().getBitWidth()) break; @@ -375,33 +375,33 @@ Value *InstCombiner::FoldLogicalPlusAnd(Value *LHS, Value *RHS, case Instruction::Or: case Instruction::Xor: // If the AndRHS is a power of two minus one (0+1+), and N&Mask == 0 - if ((Mask->getValue().countLeadingZeros() + + if ((Mask->getValue().countLeadingZeros() + Mask->getValue().countPopulation()) == Mask->getValue().getBitWidth() && ConstantExpr::getAnd(N, Mask)->isNullValue()) break; return 0; } - + if (isSub) return Builder->CreateSub(LHSI->getOperand(0), RHS, "fold"); return Builder->CreateAdd(LHSI->getOperand(0), RHS, "fold"); } /// enum for classifying (icmp eq (A & B), C) and (icmp ne (A & B), C) -/// One of A and B is considered the mask, the other the value. 
This is -/// described as the "AMask" or "BMask" part of the enum. If the enum +/// One of A and B is considered the mask, the other the value. This is +/// described as the "AMask" or "BMask" part of the enum. If the enum /// contains only "Mask", then both A and B can be considered masks. /// If A is the mask, then it was proven, that (A & C) == C. This /// is trivial if C == A, or C == 0. If both A and C are constants, this /// proof is also easy. /// For the following explanations we assume that A is the mask. -/// The part "AllOnes" declares, that the comparison is true only +/// The part "AllOnes" declares, that the comparison is true only /// if (A & B) == A, or all bits of A are set in B. /// Example: (icmp eq (A & 3), 3) -> FoldMskICmp_AMask_AllOnes -/// The part "AllZeroes" declares, that the comparison is true only +/// The part "AllZeroes" declares, that the comparison is true only /// if (A & B) == 0, or all bits of A are cleared in B. /// Example: (icmp eq (A & 3), 0) -> FoldMskICmp_Mask_AllZeroes -/// The part "Mixed" declares, that (A & B) == C and C might or might not +/// The part "Mixed" declares, that (A & B) == C and C might or might not /// contain any number of one bits and zero bits. /// Example: (icmp eq (A & 3), 1) -> FoldMskICmp_AMask_Mixed /// The Part "Not" means, that in above descriptions "==" should be replaced @@ -425,16 +425,16 @@ enum MaskedICmpType { /// return the set of pattern classes (from MaskedICmpType) /// that (icmp SCC (A & B), C) satisfies -static unsigned getTypeOfMaskedICmp(Value* A, Value* B, Value* C, +static unsigned getTypeOfMaskedICmp(Value* A, Value* B, Value* C, ICmpInst::Predicate SCC) { ConstantInt *ACst = dyn_cast<ConstantInt>(A); ConstantInt *BCst = dyn_cast<ConstantInt>(B); ConstantInt *CCst = dyn_cast<ConstantInt>(C); bool icmp_eq = (SCC == ICmpInst::ICMP_EQ); - bool icmp_abit = (ACst != 0 && !ACst->isZero() && + bool icmp_abit = (ACst != 0 && !ACst->isZero() && ACst->getValue().isPowerOf2()); - bool icmp_bbit = (BCst != 0 && !BCst->isZero() && + bool icmp_bbit = (BCst != 0 && !BCst->isZero() && BCst->getValue().isPowerOf2()); unsigned result = 0; if (CCst != 0 && CCst->isZero()) { @@ -449,12 +449,12 @@ static unsigned getTypeOfMaskedICmp(Value* A, Value* B, Value* C, FoldMskICmp_BMask_NotMixed)); if (icmp_abit) result |= (icmp_eq ? (FoldMskICmp_AMask_NotAllOnes | - FoldMskICmp_AMask_NotMixed) + FoldMskICmp_AMask_NotMixed) : (FoldMskICmp_AMask_AllOnes | FoldMskICmp_AMask_Mixed)); if (icmp_bbit) result |= (icmp_eq ? (FoldMskICmp_BMask_NotAllOnes | - FoldMskICmp_BMask_NotMixed) + FoldMskICmp_BMask_NotMixed) : (FoldMskICmp_BMask_AllOnes | FoldMskICmp_BMask_Mixed)); return result; @@ -469,26 +469,23 @@ static unsigned getTypeOfMaskedICmp(Value* A, Value* B, Value* C, FoldMskICmp_AMask_NotMixed) : (FoldMskICmp_Mask_AllZeroes | FoldMskICmp_AMask_Mixed)); - } - else if (ACst != 0 && CCst != 0 && - ConstantExpr::getAnd(ACst, CCst) == CCst) { + } else if (ACst != 0 && CCst != 0 && + ConstantExpr::getAnd(ACst, CCst) == CCst) { result |= (icmp_eq ? FoldMskICmp_AMask_Mixed : FoldMskICmp_AMask_NotMixed); } - if (B == C) - { + if (B == C) { result |= (icmp_eq ? (FoldMskICmp_BMask_AllOnes | FoldMskICmp_BMask_Mixed) : (FoldMskICmp_BMask_NotAllOnes | FoldMskICmp_BMask_NotMixed)); if (icmp_bbit) result |= (icmp_eq ? 
(FoldMskICmp_Mask_NotAllZeroes | - FoldMskICmp_BMask_NotMixed) + FoldMskICmp_BMask_NotMixed) : (FoldMskICmp_Mask_AllZeroes | FoldMskICmp_BMask_Mixed)); - } - else if (BCst != 0 && CCst != 0 && - ConstantExpr::getAnd(BCst, CCst) == CCst) { + } else if (BCst != 0 && CCst != 0 && + ConstantExpr::getAnd(BCst, CCst) == CCst) { result |= (icmp_eq ? FoldMskICmp_BMask_Mixed : FoldMskICmp_BMask_NotMixed); } @@ -531,7 +528,7 @@ static bool decomposeBitTestICmp(const ICmpInst *I, ICmpInst::Predicate &Pred, /// handle (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) /// return the set of pattern classes (from MaskedICmpType) /// that both LHS and RHS satisfy -static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A, +static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A, Value*& B, Value*& C, Value*& D, Value*& E, ICmpInst *LHS, ICmpInst *RHS, @@ -542,10 +539,10 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A, if (LHS->getOperand(0)->getType()->isVectorTy()) return 0; // Here comes the tricky part: - // LHS might be of the form L11 & L12 == X, X == L21 & L22, + // LHS might be of the form L11 & L12 == X, X == L21 & L22, // and L11 & L12 == L21 & L22. The same goes for RHS. // Now we must find those components L** and R**, that are equal, so - // that we can extract the parameters A, B, C, D, and E for the canonical + // that we can extract the parameters A, B, C, D, and E for the canonical // above. Value *L1 = LHS->getOperand(0); Value *L2 = LHS->getOperand(1); @@ -610,14 +607,11 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A, if (L11 == A) { B = L12; C = L2; - } - else if (L12 == A) { + } else if (L12 == A) { B = L11; C = L2; - } - else if (L21 == A) { + } else if (L21 == A) { B = L22; C = L1; - } - else if (L22 == A) { + } else if (L22 == A) { B = L21; C = L1; } @@ -643,32 +637,32 @@ static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, mask >>= 1; // treat "Not"-states as normal states if (mask & FoldMskICmp_Mask_AllZeroes) { - // (icmp eq (A & B), 0) & (icmp eq (A & D), 0) + // (icmp eq (A & B), 0) & (icmp eq (A & D), 0) // -> (icmp eq (A & (B|D)), 0) Value* newOr = Builder->CreateOr(B, D); Value* newAnd = Builder->CreateAnd(A, newOr); // we can't use C as zero, because we might actually handle - // (icmp ne (A & B), B) & (icmp ne (A & D), D) + // (icmp ne (A & B), B) & (icmp ne (A & D), D) // with B and D, having a single bit set Value* zero = Constant::getNullValue(A->getType()); return Builder->CreateICmp(NEWCC, newAnd, zero); } - else if (mask & FoldMskICmp_BMask_AllOnes) { - // (icmp eq (A & B), B) & (icmp eq (A & D), D) + if (mask & FoldMskICmp_BMask_AllOnes) { + // (icmp eq (A & B), B) & (icmp eq (A & D), D) // -> (icmp eq (A & (B|D)), (B|D)) Value* newOr = Builder->CreateOr(B, D); Value* newAnd = Builder->CreateAnd(A, newOr); return Builder->CreateICmp(NEWCC, newAnd, newOr); - } - else if (mask & FoldMskICmp_AMask_AllOnes) { - // (icmp eq (A & B), A) & (icmp eq (A & D), A) + } + if (mask & FoldMskICmp_AMask_AllOnes) { + // (icmp eq (A & B), A) & (icmp eq (A & D), A) // -> (icmp eq (A & (B&D)), A) Value* newAnd1 = Builder->CreateAnd(B, D); Value* newAnd = Builder->CreateAnd(A, newAnd1); return Builder->CreateICmp(NEWCC, newAnd, A); } - else if (mask & FoldMskICmp_BMask_Mixed) { - // (icmp eq (A & B), C) & (icmp eq (A & D), E) + if (mask & FoldMskICmp_BMask_Mixed) { + // (icmp eq (A & B), C) & (icmp eq (A & D), E) // We already know that B & C == C && D & E == E. 
// If we can prove that (B & D) & (C ^ E) == 0, that is, the bits of // C and E, which are shared by both the mask B and the mask D, don't @@ -680,7 +674,7 @@ static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, ConstantInt *DCst = dyn_cast<ConstantInt>(D); if (DCst == 0) return 0; // we can't simply use C and E, because we might actually handle - // (icmp ne (A & B), B) & (icmp eq (A & D), D) + // (icmp ne (A & B), B) & (icmp eq (A & D), D) // with B and D, having a single bit set ConstantInt *CCst = dyn_cast<ConstantInt>(C); @@ -727,13 +721,13 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { // handle (roughly): (icmp eq (A & B), C) & (icmp eq (A & D), E) if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, ICmpInst::ICMP_EQ, Builder)) return V; - + // This only handles icmp of constants: (icmp1 A, C1) & (icmp2 B, C2). Value *Val = LHS->getOperand(0), *Val2 = RHS->getOperand(0); ConstantInt *LHSCst = dyn_cast<ConstantInt>(LHS->getOperand(1)); ConstantInt *RHSCst = dyn_cast<ConstantInt>(RHS->getOperand(1)); if (LHSCst == 0 || RHSCst == 0) return 0; - + if (LHSCst == RHSCst && LHSCC == RHSCC) { // (icmp ult A, C) & (icmp ult B, C) --> (icmp ult (A|B), C) // where C is a power of 2 @@ -742,7 +736,7 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { Value *NewOr = Builder->CreateOr(Val, Val2); return Builder->CreateICmp(LHSCC, NewOr, LHSCst); } - + // (icmp eq A, 0) & (icmp eq B, 0) --> (icmp eq (A|B), 0) if (LHSCC == ICmpInst::ICMP_EQ && LHSCst->isZero()) { Value *NewOr = Builder->CreateOr(Val, Val2); @@ -759,14 +753,13 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { ConstantInt *AndCst, *SmallCst = 0, *BigCst = 0; // (trunc x) == C1 & (and x, CA) == C2 + // (and x, CA) == C2 & (trunc x) == C1 if (match(Val2, m_Trunc(m_Value(V))) && match(Val, m_And(m_Specific(V), m_ConstantInt(AndCst)))) { SmallCst = RHSCst; BigCst = LHSCst; - } - // (and x, CA) == C2 & (trunc x) == C1 - else if (match(Val, m_Trunc(m_Value(V))) && - match(Val2, m_And(m_Specific(V), m_ConstantInt(AndCst)))) { + } else if (match(Val, m_Trunc(m_Value(V))) && + match(Val2, m_And(m_Specific(V), m_ConstantInt(AndCst)))) { SmallCst = LHSCst; BigCst = RHSCst; } @@ -789,7 +782,7 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { // From here on, we only handle: // (icmp1 A, C1) & (icmp2 A, C2) --> something simpler. if (Val != Val2) return 0; - + // ICMP_[US][GL]E X, CST is folded to ICMP_[US][GL]T elsewhere. if (LHSCC == ICmpInst::ICMP_UGE || LHSCC == ICmpInst::ICMP_ULE || RHSCC == ICmpInst::ICMP_UGE || RHSCC == ICmpInst::ICMP_ULE || @@ -799,9 +792,9 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { // Make a constant range that's the intersection of the two icmp ranges. // If the intersection is empty, we know that the result is false. - ConstantRange LHSRange = + ConstantRange LHSRange = ConstantRange::makeICmpRegion(LHSCC, LHSCst->getValue()); - ConstantRange RHSRange = + ConstantRange RHSRange = ConstantRange::makeICmpRegion(RHSCC, RHSCst->getValue()); if (LHSRange.intersectWith(RHSRange).isEmptySet()) @@ -810,16 +803,16 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { // We can't fold (ugt x, C) & (sgt x, C2). if (!PredicatesFoldable(LHSCC, RHSCC)) return 0; - + // Ensure that the larger constant is on the RHS. 
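// [Editorial sketch, not part of the patch] foldLogOpOfMaskedICmps above merges two
// masked compares into one. The three set identities it relies on can be checked
// exhaustively on small unsigned values; the code below is illustrative only.
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t A = 0; A < 16; ++A)
    for (uint32_t B = 0; B < 16; ++B)
      for (uint32_t D = 0; D < 16; ++D) {
        // Mask_AllZeroes: (A&B)==0 && (A&D)==0  <=>  (A&(B|D))==0
        assert((((A & B) == 0) && ((A & D) == 0)) == ((A & (B | D)) == 0));
        // BMask_AllOnes: (A&B)==B && (A&D)==D  <=>  (A&(B|D))==(B|D)
        assert((((A & B) == B) && ((A & D) == D)) == ((A & (B | D)) == (B | D)));
        // AMask_AllOnes: (A&B)==A && (A&D)==A  <=>  (A&(B&D))==A
        assert((((A & B) == A) && ((A & D) == A)) == ((A & (B & D)) == A));
      }
  return 0;
}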
bool ShouldSwap; if (CmpInst::isSigned(LHSCC) || - (ICmpInst::isEquality(LHSCC) && + (ICmpInst::isEquality(LHSCC) && CmpInst::isSigned(RHSCC))) ShouldSwap = LHSCst->getValue().sgt(RHSCst->getValue()); else ShouldSwap = LHSCst->getValue().ugt(RHSCst->getValue()); - + if (ShouldSwap) { std::swap(LHS, RHS); std::swap(LHSCst, RHSCst); @@ -829,8 +822,8 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { // At this point, we know we have two icmp instructions // comparing a value against two constants and and'ing the result // together. Because of the above check, we know that we only have - // icmp eq, icmp ne, icmp [su]lt, and icmp [SU]gt here. We also know - // (from the icmp folding check above), that the two constants + // icmp eq, icmp ne, icmp [su]lt, and icmp [SU]gt here. We also know + // (from the icmp folding check above), that the two constants // are not equal and that the larger constant is on the RHS assert(LHSCst != RHSCst && "Compares not folded above?"); @@ -932,7 +925,7 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { } break; } - + return 0; } @@ -951,7 +944,7 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { return ConstantInt::getFalse(LHS->getContext()); return Builder->CreateFCmpORD(LHS->getOperand(0), RHS->getOperand(0)); } - + // Handle vector zeros. This occurs because the canonical form of // "fcmp ord x,x" is "fcmp ord x, 0". if (isa<ConstantAggregateZero>(LHS->getOperand(1)) && @@ -959,18 +952,18 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { return Builder->CreateFCmpORD(LHS->getOperand(0), RHS->getOperand(0)); return 0; } - + Value *Op0LHS = LHS->getOperand(0), *Op0RHS = LHS->getOperand(1); Value *Op1LHS = RHS->getOperand(0), *Op1RHS = RHS->getOperand(1); FCmpInst::Predicate Op0CC = LHS->getPredicate(), Op1CC = RHS->getPredicate(); - - + + if (Op0LHS == Op1RHS && Op0RHS == Op1LHS) { // Swap RHS operands to match LHS. Op1CC = FCmpInst::getSwappedPredicate(Op1CC); std::swap(Op1LHS, Op1RHS); } - + if (Op0LHS == Op1LHS && Op0RHS == Op1RHS) { // Simplify (fcmp cc0 x, y) & (fcmp cc1 x, y). if (Op0CC == Op1CC) @@ -981,7 +974,7 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { return RHS; if (Op1CC == FCmpInst::FCMP_TRUE) return LHS; - + bool Op0Ordered; bool Op1Ordered; unsigned Op0Pred = getFCmpCode(Op0CC, Op0Ordered); @@ -1001,7 +994,7 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { return LHS; if (Op0Ordered && (Op0Ordered == Op1Ordered)) return RHS; - + // uno && oeq -> uno && (ord && eq) -> false if (!Op0Ordered) return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0); @@ -1025,10 +1018,10 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { if (Value *V = SimplifyUsingDistributiveLaws(I)) return ReplaceInstUsesWith(I, V); - // See if we can simplify any instructions used by the instruction whose sole + // See if we can simplify any instructions used by the instruction whose sole // purpose is to compute bits we don't care about. if (SimplifyDemandedInstructionBits(I)) - return &I; + return &I; if (ConstantInt *AndRHS = dyn_cast<ConstantInt>(Op1)) { const APInt &AndRHSMask = AndRHS->getValue(); @@ -1043,7 +1036,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { case Instruction::Or: { // If the mask is only needed on one incoming arm, push it up. if (!Op0I->hasOneUse()) break; - + APInt NotAndRHS(~AndRHSMask); if (MaskedValueIsZero(Op0LHS, NotAndRHS)) { // Not masking anything out for the LHS, move to RHS. 
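// [Editorial sketch, not part of the patch] Two of the FoldAndOfICmps cases above,
// checked exhaustively on 6-bit unsigned values. The power-of-two restriction on C
// matters for the first fold; the code is illustrative only, not LLVM API.
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t A = 0; A < 64; ++A)
    for (uint32_t B = 0; B < 64; ++B) {
      // (icmp eq A, 0) & (icmp eq B, 0)  -->  (icmp eq (A|B), 0)
      assert(((A == 0) && (B == 0)) == ((A | B) == 0));
      // (icmp ult A, C) & (icmp ult B, C) --> (icmp ult (A|B), C), C a power of two
      for (uint32_t C = 1; C <= 32; C <<= 1)
        assert(((A < C) && (B < C)) == ((A | B) < C));
    }
  return 0;
}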
@@ -1103,12 +1096,12 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { } break; } - + if (ConstantInt *Op0CI = dyn_cast<ConstantInt>(Op0I->getOperand(1))) if (Instruction *Res = OptAndOp(Op0I, Op0CI, AndRHS, I)) return Res; } - + // If this is an integer truncation, and if the source is an 'and' with // immediate, transform it. This frequently occurs for bitfield accesses. { @@ -1116,7 +1109,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { if (match(Op0, m_Trunc(m_And(m_Value(X), m_ConstantInt(YC))))) { // Change: and (trunc (and X, YC) to T), C2 // into : and (trunc X to T), trunc(YC) & C2 - // This will fold the two constants together, which may allow + // This will fold the two constants together, which may allow // other simplifications. Value *NewCast = Builder->CreateTrunc(X, I.getType(), "and.shrunk"); Constant *C3 = ConstantExpr::getTrunc(YC, I.getType()); @@ -1143,7 +1136,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { I.getName()+".demorgan"); return BinaryOperator::CreateNot(Or); } - + { Value *A = 0, *B = 0, *C = 0, *D = 0; // (A|B) & ~(A&B) -> A^B @@ -1151,13 +1144,13 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { match(Op1, m_Not(m_And(m_Value(C), m_Value(D)))) && ((A == C && B == D) || (A == D && B == C))) return BinaryOperator::CreateXor(A, B); - + // ~(A&B) & (A|B) -> A^B if (match(Op1, m_Or(m_Value(A), m_Value(B))) && match(Op0, m_Not(m_And(m_Value(C), m_Value(D)))) && ((A == C && B == D) || (A == D && B == C))) return BinaryOperator::CreateXor(A, B); - + // A&(A^B) => A & ~B { Value *tmpOp0 = Op0; @@ -1193,19 +1186,19 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { match(Op1, m_Or(m_Value(A), m_Not(m_Specific(Op0))))) return BinaryOperator::CreateAnd(A, Op0); } - + if (ICmpInst *RHS = dyn_cast<ICmpInst>(Op1)) if (ICmpInst *LHS = dyn_cast<ICmpInst>(Op0)) if (Value *Res = FoldAndOfICmps(LHS, RHS)) return ReplaceInstUsesWith(I, Res); - + // If and'ing two fcmp, try combine them into one. if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0))) if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1))) if (Value *Res = FoldAndOfFCmps(LHS, RHS)) return ReplaceInstUsesWith(I, Res); - - + + // fold (and (cast A), (cast B)) -> (cast (and A, B)) if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) if (CastInst *Op1C = dyn_cast<CastInst>(Op1)) { @@ -1214,21 +1207,21 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { SrcTy == Op1C->getOperand(0)->getType() && SrcTy->isIntOrIntVectorTy()) { Value *Op0COp = Op0C->getOperand(0), *Op1COp = Op1C->getOperand(0); - + // Only do this if the casts both really cause code to be generated. if (ShouldOptimizeCast(Op0C->getOpcode(), Op0COp, I.getType()) && ShouldOptimizeCast(Op1C->getOpcode(), Op1COp, I.getType())) { Value *NewOp = Builder->CreateAnd(Op0COp, Op1COp, I.getName()); return CastInst::Create(Op0C->getOpcode(), NewOp, I.getType()); } - + // If this is and(cast(icmp), cast(icmp)), try to fold this even if the // cast is otherwise not optimizable. This happens for vector sexts. if (ICmpInst *RHS = dyn_cast<ICmpInst>(Op1COp)) if (ICmpInst *LHS = dyn_cast<ICmpInst>(Op0COp)) if (Value *Res = FoldAndOfICmps(LHS, RHS)) return CastInst::Create(Op0C->getOpcode(), Res, I.getType()); - + // If this is and(cast(fcmp), cast(fcmp)), try to fold this even if the // cast is otherwise not optimizable. This happens for vector sexts. 
if (FCmpInst *RHS = dyn_cast<FCmpInst>(Op1COp)) @@ -1237,17 +1230,17 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { return CastInst::Create(Op0C->getOpcode(), Res, I.getType()); } } - + // (X >> Z) & (Y >> Z) -> (X&Y) >> Z for all shifts. if (BinaryOperator *SI1 = dyn_cast<BinaryOperator>(Op1)) { if (BinaryOperator *SI0 = dyn_cast<BinaryOperator>(Op0)) - if (SI0->isShift() && SI0->getOpcode() == SI1->getOpcode() && + if (SI0->isShift() && SI0->getOpcode() == SI1->getOpcode() && SI0->getOperand(1) == SI1->getOperand(1) && (SI0->hasOneUse() || SI1->hasOneUse())) { Value *NewOp = Builder->CreateAnd(SI0->getOperand(0), SI1->getOperand(0), SI0->getName()); - return BinaryOperator::Create(SI1->getOpcode(), NewOp, + return BinaryOperator::Create(SI1->getOpcode(), NewOp, SI1->getOperand(1)); } } @@ -1288,11 +1281,11 @@ static bool CollectBSwapParts(Value *V, int OverallLeftShift, uint32_t ByteMask, CollectBSwapParts(I->getOperand(1), OverallLeftShift, ByteMask, ByteValues); } - + // If this is a logical shift by a constant multiple of 8, recurse with // OverallLeftShift and ByteMask adjusted. if (I->isLogicalShift() && isa<ConstantInt>(I->getOperand(1))) { - unsigned ShAmt = + unsigned ShAmt = cast<ConstantInt>(I->getOperand(1))->getLimitedValue(~0U); // Ensure the shift amount is defined and of a byte value. if ((ShAmt & 7) || (ShAmt > 8*ByteValues.size())) @@ -1313,7 +1306,7 @@ static bool CollectBSwapParts(Value *V, int OverallLeftShift, uint32_t ByteMask, if (OverallLeftShift >= (int)ByteValues.size()) return true; if (OverallLeftShift <= -(int)ByteValues.size()) return true; - return CollectBSwapParts(I->getOperand(0), OverallLeftShift, ByteMask, + return CollectBSwapParts(I->getOperand(0), OverallLeftShift, ByteMask, ByteValues); } @@ -1325,20 +1318,20 @@ static bool CollectBSwapParts(Value *V, int OverallLeftShift, uint32_t ByteMask, unsigned NumBytes = ByteValues.size(); APInt Byte(I->getType()->getPrimitiveSizeInBits(), 255); const APInt &AndMask = cast<ConstantInt>(I->getOperand(1))->getValue(); - + for (unsigned i = 0; i != NumBytes; ++i, Byte <<= 8) { // If this byte is masked out by a later operation, we don't care what // the and mask is. if ((ByteMask & (1 << i)) == 0) continue; - + // If the AndMask is all zeros for this byte, clear the bit. APInt MaskB = AndMask & Byte; if (MaskB == 0) { ByteMask &= ~(1U << i); continue; } - + // If the AndMask is not all ones for this byte, it's not a bytezap. if (MaskB != Byte) return true; @@ -1346,11 +1339,11 @@ static bool CollectBSwapParts(Value *V, int OverallLeftShift, uint32_t ByteMask, // Otherwise, this byte is kept. } - return CollectBSwapParts(I->getOperand(0), OverallLeftShift, ByteMask, + return CollectBSwapParts(I->getOperand(0), OverallLeftShift, ByteMask, ByteValues); } } - + // Okay, we got to something that isn't a shift, 'or' or 'and'. This must be // the input value to the bswap. Some observations: 1) if more than one byte // is demanded from this input, then it could not be successfully assembled @@ -1358,7 +1351,7 @@ static bool CollectBSwapParts(Value *V, int OverallLeftShift, uint32_t ByteMask, // their ultimate destination. if (!isPowerOf2_32(ByteMask)) return true; unsigned InputByteNo = CountTrailingZeros_32(ByteMask); - + // 2) The input and ultimate destinations must line up: if byte 3 of an i32 // is demanded, it needs to go into byte 0 of the result. This means that the // byte needs to be shifted until it lands in the right byte bucket. 
The @@ -1368,7 +1361,7 @@ static bool CollectBSwapParts(Value *V, int OverallLeftShift, uint32_t ByteMask, unsigned DestByteNo = InputByteNo + OverallLeftShift; if (ByteValues.size()-1-DestByteNo != InputByteNo) return true; - + // If the destination byte value is already defined, the values are or'd // together, which isn't a bswap (unless it's an or of the same bits). if (ByteValues[DestByteNo] && ByteValues[DestByteNo] != V) @@ -1381,25 +1374,25 @@ static bool CollectBSwapParts(Value *V, int OverallLeftShift, uint32_t ByteMask, /// If so, insert the new bswap intrinsic and return it. Instruction *InstCombiner::MatchBSwap(BinaryOperator &I) { IntegerType *ITy = dyn_cast<IntegerType>(I.getType()); - if (!ITy || ITy->getBitWidth() % 16 || + if (!ITy || ITy->getBitWidth() % 16 || // ByteMask only allows up to 32-byte values. - ITy->getBitWidth() > 32*8) + ITy->getBitWidth() > 32*8) return 0; // Can only bswap pairs of bytes. Can't do vectors. - + /// ByteValues - For each byte of the result, we keep track of which value /// defines each byte. SmallVector<Value*, 8> ByteValues; ByteValues.resize(ITy->getBitWidth()/8); - + // Try to find all the pieces corresponding to the bswap. uint32_t ByteMask = ~0U >> (32-ByteValues.size()); if (CollectBSwapParts(&I, 0, ByteMask, ByteValues)) return 0; - + // Check to see if all of the bytes come from the same value. Value *V = ByteValues[0]; if (V == 0) return 0; // Didn't find a byte? Must be zero. - + // Check to make sure that all of the bytes come from the same value. for (unsigned i = 1, e = ByteValues.size(); i != e; ++i) if (ByteValues[i] != V) @@ -1425,7 +1418,7 @@ static Instruction *MatchSelectFromAndOr(Value *A, Value *B, return SelectInst::Create(Cond, C, B); if (match(D, m_SExt(m_Not(m_Specific(Cond))))) return SelectInst::Create(Cond, C, B); - + // ((cond?-1:0)&C) | ((cond?0:-1)&D) -> cond ? C : D. if (match(B, m_Not(m_SExt(m_Specific(Cond))))) return SelectInst::Create(Cond, C, D); @@ -1483,33 +1476,33 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) { // From here on, we only handle: // (icmp1 A, C1) | (icmp2 A, C2) --> something simpler. if (Val != Val2) return 0; - + // ICMP_[US][GL]E X, CST is folded to ICMP_[US][GL]T elsewhere. if (LHSCC == ICmpInst::ICMP_UGE || LHSCC == ICmpInst::ICMP_ULE || RHSCC == ICmpInst::ICMP_UGE || RHSCC == ICmpInst::ICMP_ULE || LHSCC == ICmpInst::ICMP_SGE || LHSCC == ICmpInst::ICMP_SLE || RHSCC == ICmpInst::ICMP_SGE || RHSCC == ICmpInst::ICMP_SLE) return 0; - + // We can't fold (ugt x, C) | (sgt x, C2). if (!PredicatesFoldable(LHSCC, RHSCC)) return 0; - + // Ensure that the larger constant is on the RHS. bool ShouldSwap; if (CmpInst::isSigned(LHSCC) || - (ICmpInst::isEquality(LHSCC) && + (ICmpInst::isEquality(LHSCC) && CmpInst::isSigned(RHSCC))) ShouldSwap = LHSCst->getValue().sgt(RHSCst->getValue()); else ShouldSwap = LHSCst->getValue().ugt(RHSCst->getValue()); - + if (ShouldSwap) { std::swap(LHS, RHS); std::swap(LHSCst, RHSCst); std::swap(LHSCC, RHSCC); } - + // At this point, we know we have two icmp instructions // comparing a value against two constants and or'ing the result // together. 
Because of the above check, we know that we only have @@ -1531,6 +1524,20 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) { AddCST = ConstantExpr::getSub(AddOne(RHSCst), LHSCst); return Builder->CreateICmpULT(Add, AddCST); } + + if (LHS->getOperand(0) == RHS->getOperand(0)) { + // if LHSCst and RHSCst differ only by one bit: + // (A == C1 || A == C2) -> (A & ~(C1 ^ C2)) == C1 + assert(LHSCst->getValue().ule(LHSCst->getValue())); + + APInt Xor = LHSCst->getValue() ^ RHSCst->getValue(); + if (Xor.isPowerOf2()) { + Value *NegCst = Builder->getInt(~Xor); + Value *And = Builder->CreateAnd(LHS->getOperand(0), NegCst); + return Builder->CreateICmp(ICmpInst::ICMP_EQ, And, LHSCst); + } + } + break; // (X == 13 | X == 15) -> no change case ICmpInst::ICMP_UGT: // (X == 13 | X u> 14) -> no change case ICmpInst::ICMP_SGT: // (X == 13 | X s> 14) -> no change @@ -1632,7 +1639,7 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) { /// function. Value *InstCombiner::FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { if (LHS->getPredicate() == FCmpInst::FCMP_UNO && - RHS->getPredicate() == FCmpInst::FCMP_UNO && + RHS->getPredicate() == FCmpInst::FCMP_UNO && LHS->getOperand(0)->getType() == RHS->getOperand(0)->getType()) { if (ConstantFP *LHSC = dyn_cast<ConstantFP>(LHS->getOperand(1))) if (ConstantFP *RHSC = dyn_cast<ConstantFP>(RHS->getOperand(1))) { @@ -1640,25 +1647,25 @@ Value *InstCombiner::FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { // true. if (LHSC->getValueAPF().isNaN() || RHSC->getValueAPF().isNaN()) return ConstantInt::getTrue(LHS->getContext()); - + // Otherwise, no need to compare the two constants, compare the // rest. return Builder->CreateFCmpUNO(LHS->getOperand(0), RHS->getOperand(0)); } - + // Handle vector zeros. This occurs because the canonical form of // "fcmp uno x,x" is "fcmp uno x, 0". if (isa<ConstantAggregateZero>(LHS->getOperand(1)) && isa<ConstantAggregateZero>(RHS->getOperand(1))) return Builder->CreateFCmpUNO(LHS->getOperand(0), RHS->getOperand(0)); - + return 0; } - + Value *Op0LHS = LHS->getOperand(0), *Op0RHS = LHS->getOperand(1); Value *Op1LHS = RHS->getOperand(0), *Op1RHS = RHS->getOperand(1); FCmpInst::Predicate Op0CC = LHS->getPredicate(), Op1CC = RHS->getPredicate(); - + if (Op0LHS == Op1RHS && Op0RHS == Op1LHS) { // Swap RHS operands to match LHS. Op1CC = FCmpInst::getSwappedPredicate(Op1CC); @@ -1692,7 +1699,7 @@ Value *InstCombiner::FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { /// ((A | B) & C1) | (B & C2) /// /// into: -/// +/// /// (A & C1) | B /// /// when the XOR of the two constants is "all ones" (-1). @@ -1727,7 +1734,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { if (Value *V = SimplifyUsingDistributiveLaws(I)) return ReplaceInstUsesWith(I, V); - // See if we can simplify any instructions used by the instruction whose sole + // See if we can simplify any instructions used by the instruction whose sole // purpose is to compute bits we don't care about. 
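// [Editorial sketch, not part of the patch] The new FoldOrOfICmps case above turns
// (A == C1 || A == C2) into ((A & ~(C1^C2)) == C1) when C1 and C2 differ in exactly
// one bit. (The assert added by the patch compares LHSCst with itself; the intended
// precondition is LHSCst <= RHSCst, which the swap earlier in the function ensures.)
// The brute-force check below is illustrative only.
#include <cassert>
#include <cstdint>

static bool isPowerOf2(uint32_t V) { return V != 0 && (V & (V - 1)) == 0; }

int main() {
  // Pairs differing in exactly one bit, smaller constant first.
  const uint32_t Pairs[][2] = {{13, 15}, {8, 24}, {0, 4}, {33, 97}};
  for (const auto &P : Pairs) {
    uint32_t C1 = P[0], C2 = P[1], Xor = C1 ^ C2;
    assert(C1 < C2 && isPowerOf2(Xor));
    for (uint32_t A = 0; A < 256; ++A)
      assert(((A == C1) || (A == C2)) == ((A & ~Xor) == C1));
  }
  return 0;
}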
if (SimplifyDemandedInstructionBits(I)) return &I; @@ -1741,7 +1748,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { Op0->hasOneUse()) { Value *Or = Builder->CreateOr(X, RHS); Or->takeName(Op0); - return BinaryOperator::CreateAnd(Or, + return BinaryOperator::CreateAnd(Or, ConstantInt::get(I.getContext(), RHS->getValue() | C1->getValue())); } @@ -1778,7 +1785,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { if (Instruction *BSwap = MatchBSwap(I)) return BSwap; } - + // (X^C)|Y -> (X|Y)^C iff Y&C == 0 if (Op0->hasOneUse() && match(Op0, m_Xor(m_Value(A), m_ConstantInt(C1))) && @@ -1827,7 +1834,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { return ReplaceInstUsesWith(I, B); } } - + if ((C1->getValue() & C2->getValue()) == 0) { // ((V | N) & C1) | (V & C2) --> (V|N) & (C1|C2) // iff (C1&C2) == 0 and (N&~C1) == 0 @@ -1844,7 +1851,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { return BinaryOperator::CreateAnd(B, ConstantInt::get(B->getContext(), C1->getValue()|C2->getValue())); - + // ((V|C3)&C1) | ((V|C4)&C2) --> (V|C3|C4)&(C1|C2) // iff (C1&C2) == 0 and (C3&~C1) == 0 and (C4&~C2) == 0. ConstantInt *C3 = 0, *C4 = 0; @@ -1904,16 +1911,16 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { if (Ret) return Ret; } } - + // (X >> Z) | (Y >> Z) -> (X|Y) >> Z for all shifts. if (BinaryOperator *SI1 = dyn_cast<BinaryOperator>(Op1)) { if (BinaryOperator *SI0 = dyn_cast<BinaryOperator>(Op0)) - if (SI0->isShift() && SI0->getOpcode() == SI1->getOpcode() && + if (SI0->isShift() && SI0->getOpcode() == SI1->getOpcode() && SI0->getOperand(1) == SI1->getOperand(1) && (SI0->hasOneUse() || SI1->hasOneUse())) { Value *NewOp = Builder->CreateOr(SI0->getOperand(0), SI1->getOperand(0), SI0->getName()); - return BinaryOperator::Create(SI1->getOpcode(), NewOp, + return BinaryOperator::Create(SI1->getOpcode(), NewOp, SI1->getOperand(1)); } } @@ -1975,13 +1982,13 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { if (ICmpInst *LHS = dyn_cast<ICmpInst>(I.getOperand(0))) if (Value *Res = FoldOrOfICmps(LHS, RHS)) return ReplaceInstUsesWith(I, Res); - + // (fcmp uno x, c) | (fcmp uno y, c) -> (fcmp uno x, y) if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0))) if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1))) if (Value *Res = FoldOrOfFCmps(LHS, RHS)) return ReplaceInstUsesWith(I, Res); - + // fold (or (cast A), (cast B)) -> (cast (or A, B)) if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) { CastInst *Op1C = dyn_cast<CastInst>(Op1); @@ -1999,14 +2006,14 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { Value *NewOp = Builder->CreateOr(Op0COp, Op1COp, I.getName()); return CastInst::Create(Op0C->getOpcode(), NewOp, I.getType()); } - + // If this is or(cast(icmp), cast(icmp)), try to fold this even if the // cast is otherwise not optimizable. This happens for vector sexts. if (ICmpInst *RHS = dyn_cast<ICmpInst>(Op1COp)) if (ICmpInst *LHS = dyn_cast<ICmpInst>(Op0COp)) if (Value *Res = FoldOrOfICmps(LHS, RHS)) return CastInst::Create(Op0C->getOpcode(), Res, I.getType()); - + // If this is or(cast(fcmp), cast(fcmp)), try to fold this even if the // cast is otherwise not optimizable. This happens for vector sexts. if (FCmpInst *RHS = dyn_cast<FCmpInst>(Op1COp)) @@ -2035,7 +2042,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { Inner->takeName(Op0); return BinaryOperator::CreateOr(Inner, C1); } - + return Changed ? 
&I : 0; } @@ -2050,7 +2057,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { if (Value *V = SimplifyUsingDistributiveLaws(I)) return ReplaceInstUsesWith(I, V); - // See if we can simplify any instructions used by the instruction whose sole + // See if we can simplify any instructions used by the instruction whose sole // purpose is to compute bits we don't care about. if (SimplifyDemandedInstructionBits(I)) return &I; @@ -2058,7 +2065,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { // Is this a ~ operation? if (Value *NotOp = dyn_castNotVal(&I)) { if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(NotOp)) { - if (Op0I->getOpcode() == Instruction::And || + if (Op0I->getOpcode() == Instruction::And || Op0I->getOpcode() == Instruction::Or) { // ~(~X & Y) --> (X | ~Y) - De Morgan's Law // ~(~X | Y) === (X & ~Y) - De Morgan's Law @@ -2072,10 +2079,10 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { return BinaryOperator::CreateOr(Op0NotVal, NotY); return BinaryOperator::CreateAnd(Op0NotVal, NotY); } - + // ~(X & Y) --> (~X | ~Y) - De Morgan's Law // ~(X | Y) === (~X & ~Y) - De Morgan's Law - if (isFreeToInvert(Op0I->getOperand(0)) && + if (isFreeToInvert(Op0I->getOperand(0)) && isFreeToInvert(Op0I->getOperand(1))) { Value *NotX = Builder->CreateNot(Op0I->getOperand(0), "notlhs"); @@ -2093,8 +2100,8 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { } } } - - + + if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) { if (RHS->isOne() && Op0->hasOneUse()) // xor (cmp A, B), true = not (cmp A, B) = !cmp A, B @@ -2109,7 +2116,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { if (CI->hasOneUse() && Op0C->hasOneUse()) { Instruction::CastOps Opcode = Op0C->getOpcode(); if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) && - (RHS == ConstantExpr::getCast(Opcode, + (RHS == ConstantExpr::getCast(Opcode, ConstantInt::getTrue(I.getContext()), Op0C->getDestTy()))) { CI->setPredicate(CI->getInversePredicate()); @@ -2128,7 +2135,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { ConstantInt::get(I.getType(), 1)); return BinaryOperator::CreateAdd(Op0I->getOperand(1), ConstantRHS); } - + if (ConstantInt *Op0CI = dyn_cast<ConstantInt>(Op0I->getOperand(1))) { if (Op0I->getOpcode() == Instruction::Add) { // ~(X-c) --> (-c-1)-X @@ -2152,13 +2159,34 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { // Anything in both C1 and C2 is known to be zero, remove it from // NewRHS. Constant *CommonBits = ConstantExpr::getAnd(Op0CI, RHS); - NewRHS = ConstantExpr::getAnd(NewRHS, + NewRHS = ConstantExpr::getAnd(NewRHS, ConstantExpr::getNot(CommonBits)); Worklist.Add(Op0I); I.setOperand(0, Op0I->getOperand(0)); I.setOperand(1, NewRHS); return &I; } + } else if (Op0I->getOpcode() == Instruction::LShr) { + // ((X^C1) >> C2) ^ C3 -> (X>>C2) ^ ((C1>>C2)^C3) + // E1 = "X ^ C1" + BinaryOperator *E1; + ConstantInt *C1; + if (Op0I->hasOneUse() && + (E1 = dyn_cast<BinaryOperator>(Op0I->getOperand(0))) && + E1->getOpcode() == Instruction::Xor && + (C1 = dyn_cast<ConstantInt>(E1->getOperand(1)))) { + // fold (C1 >> C2) ^ C3 + ConstantInt *C2 = Op0CI, *C3 = RHS; + APInt FoldConst = C1->getValue().lshr(C2->getValue()); + FoldConst ^= C3->getValue(); + // Prepare the two operands. 
+ Value *Opnd0 = Builder->CreateLShr(E1->getOperand(0), C2); + Opnd0->takeName(Op0I); + cast<Instruction>(Opnd0)->setDebugLoc(I.getDebugLoc()); + Value *FoldVal = ConstantInt::get(Opnd0->getType(), FoldConst); + + return BinaryOperator::CreateXor(Opnd0, FoldVal); + } } } } @@ -2184,7 +2212,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { I.swapOperands(); // Simplified below. std::swap(Op0, Op1); } - } else if (match(Op1I, m_And(m_Value(A), m_Value(B))) && + } else if (match(Op1I, m_And(m_Value(A), m_Value(B))) && Op1I->hasOneUse()){ if (A == Op0) { // A^(A&B) -> A^(B&A) Op1I->swapOperands(); @@ -2196,7 +2224,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { } } } - + BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0); if (Op0I) { Value *A, *B; @@ -2206,7 +2234,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { std::swap(A, B); if (B == Op1) // (A|B)^B == A & ~B return BinaryOperator::CreateAnd(A, Builder->CreateNot(Op1)); - } else if (match(Op0I, m_And(m_Value(A), m_Value(B))) && + } else if (match(Op0I, m_And(m_Value(A), m_Value(B))) && Op0I->hasOneUse()){ if (A == Op1) // (A&B)^A -> (B&A)^A std::swap(A, B); @@ -2216,31 +2244,31 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { } } } - + // (X >> Z) ^ (Y >> Z) -> (X^Y) >> Z for all shifts. - if (Op0I && Op1I && Op0I->isShift() && - Op0I->getOpcode() == Op1I->getOpcode() && + if (Op0I && Op1I && Op0I->isShift() && + Op0I->getOpcode() == Op1I->getOpcode() && Op0I->getOperand(1) == Op1I->getOperand(1) && (Op0I->hasOneUse() || Op1I->hasOneUse())) { Value *NewOp = Builder->CreateXor(Op0I->getOperand(0), Op1I->getOperand(0), Op0I->getName()); - return BinaryOperator::Create(Op1I->getOpcode(), NewOp, + return BinaryOperator::Create(Op1I->getOpcode(), NewOp, Op1I->getOperand(1)); } - + if (Op0I && Op1I) { Value *A, *B, *C, *D; // (A & B)^(A | B) -> A ^ B if (match(Op0I, m_And(m_Value(A), m_Value(B))) && match(Op1I, m_Or(m_Value(C), m_Value(D)))) { - if ((A == C && B == D) || (A == D && B == C)) + if ((A == C && B == D) || (A == D && B == C)) return BinaryOperator::CreateXor(A, B); } // (A | B)^(A & B) -> A ^ B if (match(Op0I, m_Or(m_Value(A), m_Value(B))) && match(Op1I, m_And(m_Value(C), m_Value(D)))) { - if ((A == C && B == D) || (A == D && B == C)) + if ((A == C && B == D) || (A == D && B == C)) return BinaryOperator::CreateXor(A, B); } } @@ -2257,7 +2285,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1); unsigned Code = getICmpCode(LHS) ^ getICmpCode(RHS); bool isSigned = LHS->isSigned() || RHS->isSigned(); - return ReplaceInstUsesWith(I, + return ReplaceInstUsesWith(I, getNewICmpValue(isSigned, Code, Op0, Op1, Builder)); } @@ -2270,9 +2298,9 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { Type *SrcTy = Op0C->getOperand(0)->getType(); if (SrcTy == Op1C->getOperand(0)->getType() && SrcTy->isIntegerTy() && // Only do this if the casts both really cause code to be generated. 
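// [Editorial sketch, not part of the patch] The ((X^C1) >> C2) ^ C3 fold above works
// because a logical shift right distributes over xor, so the constant part can be
// evaluated up front. A brute check on unsigned 32-bit values, illustrative only:
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t C1 = 0xDEADBEEFu, C3 = 0x12345678u;
  for (uint32_t C2 = 0; C2 < 32; ++C2)
    for (uint32_t X : {0u, 1u, 0x0F0F0F0Fu, 0x80000001u, 0xFFFFFFFFu})
      assert((((X ^ C1) >> C2) ^ C3) == ((X >> C2) ^ ((C1 >> C2) ^ C3)));

  // A related fold visible above: (A & B) ^ (A | B) -> A ^ B.
  for (uint32_t A = 0; A < 16; ++A)
    for (uint32_t B = 0; B < 16; ++B)
      assert(((A & B) ^ (A | B)) == (A ^ B));
  return 0;
}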
- ShouldOptimizeCast(Op0C->getOpcode(), Op0C->getOperand(0), + ShouldOptimizeCast(Op0C->getOpcode(), Op0C->getOperand(0), I.getType()) && - ShouldOptimizeCast(Op1C->getOpcode(), Op1C->getOperand(0), + ShouldOptimizeCast(Op1C->getOpcode(), Op1C->getOperand(0), I.getType())) { Value *NewOp = Builder->CreateXor(Op0C->getOperand(0), Op1C->getOperand(0), I.getName()); diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index b12fc01..d17879b 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -12,12 +12,17 @@ //===----------------------------------------------------------------------===// #include "InstCombine.h" -#include "llvm/Support/CallSite.h" -#include "llvm/Target/TargetData.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/Support/CallSite.h" +#include "llvm/Support/PatternMatch.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; +using namespace PatternMatch; + +STATISTIC(NumSimplified, "Number of library calls simplified"); /// getPromotedType - Return the specified type promoted as it would be to pass /// though a va_arg area. @@ -29,6 +34,26 @@ static Type *getPromotedType(Type *Ty) { return Ty; } +/// reduceToSingleValueType - Given an aggregate type which ultimately holds a +/// single scalar element, like {{{type}}} or [1 x type], return type. +static Type *reduceToSingleValueType(Type *T) { + while (!T->isSingleValueType()) { + if (StructType *STy = dyn_cast<StructType>(T)) { + if (STy->getNumElements() == 1) + T = STy->getElementType(0); + else + break; + } else if (ArrayType *ATy = dyn_cast<ArrayType>(T)) { + if (ATy->getNumElements() == 1) + T = ATy->getElementType(); + else + break; + } else + break; + } + + return T; +} Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), TD); @@ -74,35 +99,37 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { // dest address will be promotable. See if we can find a better type than the // integer datatype. Value *StrippedDest = MI->getArgOperand(0)->stripPointerCasts(); + MDNode *CopyMD = 0; if (StrippedDest != MI->getArgOperand(0)) { Type *SrcETy = cast<PointerType>(StrippedDest->getType()) ->getElementType(); if (TD && SrcETy->isSized() && TD->getTypeStoreSize(SrcETy) == Size) { // The SrcETy might be something like {{{double}}} or [1 x double]. Rip // down through these levels if so. - while (!SrcETy->isSingleValueType()) { - if (StructType *STy = dyn_cast<StructType>(SrcETy)) { - if (STy->getNumElements() == 1) - SrcETy = STy->getElementType(0); - else - break; - } else if (ArrayType *ATy = dyn_cast<ArrayType>(SrcETy)) { - if (ATy->getNumElements() == 1) - SrcETy = ATy->getElementType(); - else - break; - } else - break; - } + SrcETy = reduceToSingleValueType(SrcETy); if (SrcETy->isSingleValueType()) { NewSrcPtrTy = PointerType::get(SrcETy, SrcAddrSp); NewDstPtrTy = PointerType::get(SrcETy, DstAddrSp); + + // If the memcpy has metadata describing the members, see if we can + // get the TBAA tag describing our copy. 
+ if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) { + if (M->getNumOperands() == 3 && + M->getOperand(0) && + isa<ConstantInt>(M->getOperand(0)) && + cast<ConstantInt>(M->getOperand(0))->isNullValue() && + M->getOperand(1) && + isa<ConstantInt>(M->getOperand(1)) && + cast<ConstantInt>(M->getOperand(1))->getValue() == Size && + M->getOperand(2) && + isa<MDNode>(M->getOperand(2))) + CopyMD = cast<MDNode>(M->getOperand(2)); + } } } } - // If the memcpy/memmove provides better alignment info than we can // infer, use it. SrcAlign = std::max(SrcAlign, CopyAlign); @@ -112,8 +139,12 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { Value *Dest = Builder->CreateBitCast(MI->getArgOperand(0), NewDstPtrTy); LoadInst *L = Builder->CreateLoad(Src, MI->isVolatile()); L->setAlignment(SrcAlign); + if (CopyMD) + L->setMetadata(LLVMContext::MD_tbaa, CopyMD); StoreInst *S = Builder->CreateStore(L, Dest, MI->isVolatile()); S->setAlignment(DstAlign); + if (CopyMD) + S->setMetadata(LLVMContext::MD_tbaa, CopyMD); // Set the size of the copy to 0, it will be deleted on the next iteration. MI->setArgOperand(2, Constant::getNullValue(MemOpLength->getType())); @@ -247,25 +278,25 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return ReplaceInstUsesWith(CI, ConstantInt::get(CI.getType(), Size)); return 0; } - case Intrinsic::bswap: + case Intrinsic::bswap: { + Value *IIOperand = II->getArgOperand(0); + Value *X = 0; + // bswap(bswap(x)) -> x - if (IntrinsicInst *Operand = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) - if (Operand->getIntrinsicID() == Intrinsic::bswap) - return ReplaceInstUsesWith(CI, Operand->getArgOperand(0)); + if (match(IIOperand, m_BSwap(m_Value(X)))) + return ReplaceInstUsesWith(CI, X); // bswap(trunc(bswap(x))) -> trunc(lshr(x, c)) - if (TruncInst *TI = dyn_cast<TruncInst>(II->getArgOperand(0))) { - if (IntrinsicInst *Operand = dyn_cast<IntrinsicInst>(TI->getOperand(0))) - if (Operand->getIntrinsicID() == Intrinsic::bswap) { - unsigned C = Operand->getType()->getPrimitiveSizeInBits() - - TI->getType()->getPrimitiveSizeInBits(); - Value *CV = ConstantInt::get(Operand->getType(), C); - Value *V = Builder->CreateLShr(Operand->getArgOperand(0), CV); - return new TruncInst(V, TI->getType()); - } + if (match(IIOperand, m_Trunc(m_BSwap(m_Value(X))))) { + unsigned C = X->getType()->getPrimitiveSizeInBits() - + IIOperand->getType()->getPrimitiveSizeInBits(); + Value *CV = ConstantInt::get(X->getType(), C); + Value *V = Builder->CreateLShr(X, CV); + return new TruncInst(V, IIOperand->getType()); } - break; + } + case Intrinsic::powi: if (ConstantInt *Power = dyn_cast<ConstantInt>(II->getArgOperand(1))) { // powi(x, 0) -> 1.0 @@ -664,7 +695,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (Splat->isOne()) { if (Zext) return CastInst::CreateZExtOrBitCast(Arg0, II->getType()); - // else + // else return CastInst::CreateSExtOrBitCast(Arg0, II->getType()); } } @@ -731,7 +762,7 @@ Instruction *InstCombiner::visitInvokeInst(InvokeInst &II) { /// passed through the varargs area, we can eliminate the use of the cast. 
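// [Editorial sketch, not part of the patch] The two llvm.bswap folds above, restated
// with portable stand-in byte-swap helpers (these are not the intrinsics, only an
// illustration of bswap(bswap(x)) == x and bswap(trunc(bswap(x))) == trunc(x >> c),
// where c = 64 - 32 for an i64-to-i32 truncation).
#include <cassert>
#include <cstdint>

static uint32_t bswap32(uint32_t V) {
  return (V >> 24) | ((V >> 8) & 0xFF00u) | ((V << 8) & 0xFF0000u) | (V << 24);
}
static uint64_t bswap64(uint64_t V) {
  return (static_cast<uint64_t>(bswap32(static_cast<uint32_t>(V))) << 32) |
         bswap32(static_cast<uint32_t>(V >> 32));
}

int main() {
  for (uint64_t X : {0x0123456789ABCDEFull, 0ull, ~0ull, 42ull}) {
    // bswap(bswap(x)) -> x
    assert(bswap64(bswap64(X)) == X);
    // bswap(trunc(bswap(x))) -> trunc(lshr(x, 32))
    assert(bswap32(static_cast<uint32_t>(bswap64(X))) ==
           static_cast<uint32_t>(X >> 32));
  }
  return 0;
}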
static bool isSafeToEliminateVarargsCast(const CallSite CS, const CastInst * const CI, - const TargetData * const TD, + const DataLayout * const TD, const int ix) { if (!CI->isLosslessCast()) return false; @@ -752,49 +783,19 @@ static bool isSafeToEliminateVarargsCast(const CallSite CS, return true; } -namespace { -class InstCombineFortifiedLibCalls : public SimplifyFortifiedLibCalls { - InstCombiner *IC; -protected: - void replaceCall(Value *With) { - NewInstruction = IC->ReplaceInstUsesWith(*CI, With); - } - bool isFoldable(unsigned SizeCIOp, unsigned SizeArgOp, bool isString) const { - if (CI->getArgOperand(SizeCIOp) == CI->getArgOperand(SizeArgOp)) - return true; - if (ConstantInt *SizeCI = - dyn_cast<ConstantInt>(CI->getArgOperand(SizeCIOp))) { - if (SizeCI->isAllOnesValue()) - return true; - if (isString) { - uint64_t Len = GetStringLength(CI->getArgOperand(SizeArgOp)); - // If the length is 0 we don't know how long it is and so we can't - // remove the check. - if (Len == 0) return false; - return SizeCI->getZExtValue() >= Len; - } - if (ConstantInt *Arg = dyn_cast<ConstantInt>( - CI->getArgOperand(SizeArgOp))) - return SizeCI->getZExtValue() >= Arg->getZExtValue(); - } - return false; - } -public: - InstCombineFortifiedLibCalls(InstCombiner *IC) : IC(IC), NewInstruction(0) { } - Instruction *NewInstruction; -}; -} // end anonymous namespace - // Try to fold some different type of calls here. // Currently we're only working with the checking functions, memcpy_chk, // mempcpy_chk, memmove_chk, memset_chk, strcpy_chk, stpcpy_chk, strncpy_chk, // strcat_chk and strncat_chk. -Instruction *InstCombiner::tryOptimizeCall(CallInst *CI, const TargetData *TD) { +Instruction *InstCombiner::tryOptimizeCall(CallInst *CI, const DataLayout *TD) { if (CI->getCalledFunction() == 0) return 0; - InstCombineFortifiedLibCalls Simplifier(this); - Simplifier.fold(CI, TD, TLI); - return Simplifier.NewInstruction; + if (Value *With = Simplifier->optimizeCall(CI)) { + ++NumSimplified; + return CI->use_empty() ? CI : ReplaceInstUsesWith(*CI, With); + } + + return 0; } static IntrinsicInst *FindInitTrampolineFromAlloca(Value *TrampMem) { @@ -900,7 +901,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) { new StoreInst(ConstantInt::getTrue(Callee->getContext()), UndefValue::get(Type::getInt1PtrTy(Callee->getContext())), OldCall); - // If OldCall dues not return void then replaceAllUsesWith undef. + // If OldCall does not return void then replaceAllUsesWith undef. // This allows ValueHandlers and custom metadata to adjust itself. if (!OldCall->getType()->isVoidTy()) ReplaceInstUsesWith(*OldCall, UndefValue::get(OldCall->getType())); @@ -961,7 +962,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) { Changed = true; } - // Try to optimize the call if possible, we require TargetData for most of + // Try to optimize the call if possible, we require DataLayout for most of // this. None of these calls are seen as possibly dead so go ahead and // delete the instruction now. if (CallInst *CI = dyn_cast<CallInst>(CS.getInstruction())) { @@ -983,7 +984,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { if (Callee == 0) return false; Instruction *Caller = CS.getInstruction(); - const AttrListPtr &CallerPAL = CS.getAttributes(); + const AttributeSet &CallerPAL = CS.getAttributes(); // Okay, this is a cast from a function to a different type. 
Unless doing so // would cause a type conversion of one of our arguments, change this call to @@ -1013,8 +1014,8 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { return false; // Cannot transform this return value. if (!CallerPAL.isEmpty() && !Caller->use_empty()) { - Attributes RAttrs = CallerPAL.getRetAttributes(); - if (RAttrs & Attribute::typeIncompatible(NewRetTy)) + AttrBuilder RAttrs = CallerPAL.getRetAttributes(); + if (RAttrs.hasAttributes(Attribute::typeIncompatible(NewRetTy))) return false; // Attribute not compatible with transformed value. } @@ -1043,13 +1044,14 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { if (!CastInst::isCastable(ActTy, ParamTy)) return false; // Cannot transform this parameter value. - Attributes Attrs = CallerPAL.getParamAttributes(i + 1); - if (Attrs & Attribute::typeIncompatible(ParamTy)) + Attribute Attrs = CallerPAL.getParamAttributes(i + 1); + if (AttrBuilder(Attrs). + hasAttributes(Attribute::typeIncompatible(ParamTy))) return false; // Attribute not compatible with transformed value. // If the parameter is passed as a byval argument, then we have to have a // sized type and the sized type has to have the same size as the old type. - if (ParamTy != ActTy && (Attrs & Attribute::ByVal)) { + if (ParamTy != ActTy && Attrs.hasAttribute(Attribute::ByVal)) { PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy); if (ParamPTy == 0 || !ParamPTy->getElementType()->isSized() || TD == 0) return false; @@ -1100,8 +1102,9 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { for (unsigned i = CallerPAL.getNumSlots(); i; --i) { if (CallerPAL.getSlot(i - 1).Index <= FT->getNumParams()) break; - Attributes PAttrs = CallerPAL.getSlot(i - 1).Attrs; - if (PAttrs & Attribute::VarArgsIncompatible) + Attribute PAttrs = CallerPAL.getSlot(i - 1).Attrs; + // Check if it has an attribute that's incompatible with varargs. + if (PAttrs.hasAttribute(Attribute::StructRet)) return false; } @@ -1114,15 +1117,17 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { attrVec.reserve(NumCommonArgs); // Get any return attributes. - Attributes RAttrs = CallerPAL.getRetAttributes(); + AttrBuilder RAttrs = CallerPAL.getRetAttributes(); // If the return value is not being used, the type may not be compatible // with the existing attributes. Wipe out any problematic attributes. - RAttrs &= ~Attribute::typeIncompatible(NewRetTy); + RAttrs.removeAttributes(Attribute::typeIncompatible(NewRetTy)); // Add the new return attributes. - if (RAttrs) - attrVec.push_back(AttributeWithIndex::get(0, RAttrs)); + if (RAttrs.hasAttributes()) + attrVec.push_back( + AttributeWithIndex::get(AttributeSet::ReturnIndex, + Attribute::get(FT->getContext(), RAttrs))); AI = CS.arg_begin(); for (unsigned i = 0; i != NumCommonArgs; ++i, ++AI) { @@ -1136,7 +1141,8 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { } // Add any parameter attributes. - if (Attributes PAttrs = CallerPAL.getParamAttributes(i + 1)) + Attribute PAttrs = CallerPAL.getParamAttributes(i + 1); + if (PAttrs.hasAttributes()) attrVec.push_back(AttributeWithIndex::get(i + 1, PAttrs)); } @@ -1147,10 +1153,8 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { // If we are removing arguments to the function, emit an obnoxious warning. 
if (FT->getNumParams() < NumActualArgs) { - if (!FT->isVarArg()) { - errs() << "WARNING: While resolving call to function '" - << Callee->getName() << "' arguments were dropped!\n"; - } else { + // TODO: if (!FT->isVarArg()) this call may be unreachable. PR14722 + if (FT->isVarArg()) { // Add all of the arguments in their promoted form to the arg list. for (unsigned i = FT->getNumParams(); i != NumActualArgs; ++i, ++AI) { Type *PTy = getPromotedType((*AI)->getType()); @@ -1164,19 +1168,23 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { } // Add any parameter attributes. - if (Attributes PAttrs = CallerPAL.getParamAttributes(i + 1)) + Attribute PAttrs = CallerPAL.getParamAttributes(i + 1); + if (PAttrs.hasAttributes()) attrVec.push_back(AttributeWithIndex::get(i + 1, PAttrs)); } } } - if (Attributes FnAttrs = CallerPAL.getFnAttributes()) - attrVec.push_back(AttributeWithIndex::get(~0, FnAttrs)); + Attribute FnAttrs = CallerPAL.getFnAttributes(); + if (FnAttrs.hasAttributes()) + attrVec.push_back(AttributeWithIndex::get(AttributeSet::FunctionIndex, + FnAttrs)); if (NewRetTy->isVoidTy()) Caller->setName(""); // Void type should not have a name. - const AttrListPtr &NewCallerPAL = AttrListPtr::get(attrVec); + const AttributeSet &NewCallerPAL = AttributeSet::get(Callee->getContext(), + attrVec); Instruction *NC; if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) { @@ -1236,7 +1244,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS, Value *Callee = CS.getCalledValue(); PointerType *PTy = cast<PointerType>(Callee->getType()); FunctionType *FTy = cast<FunctionType>(PTy->getElementType()); - const AttrListPtr &Attrs = CS.getAttributes(); + const AttributeSet &Attrs = CS.getAttributes(); // If the call already has the 'nest' attribute somewhere then give up - // otherwise 'nest' would occur twice after splicing in the chain. @@ -1250,16 +1258,16 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS, PointerType *NestFPTy = cast<PointerType>(NestF->getType()); FunctionType *NestFTy = cast<FunctionType>(NestFPTy->getElementType()); - const AttrListPtr &NestAttrs = NestF->getAttributes(); + const AttributeSet &NestAttrs = NestF->getAttributes(); if (!NestAttrs.isEmpty()) { unsigned NestIdx = 1; Type *NestTy = 0; - Attributes NestAttr = Attribute::None; + Attribute NestAttr; // Look for a parameter marked with the 'nest' attribute. for (FunctionType::param_iterator I = NestFTy->param_begin(), E = NestFTy->param_end(); I != E; ++NestIdx, ++I) - if (NestAttrs.paramHasAttr(NestIdx, Attribute::Nest)) { + if (NestAttrs.getParamAttributes(NestIdx).hasAttribute(Attribute::Nest)){ // Record the parameter type and any other attributes. NestTy = *I; NestAttr = NestAttrs.getParamAttributes(NestIdx); @@ -1278,8 +1286,10 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS, // mean appending it. Likewise for attributes. // Add any result attributes. - if (Attributes Attr = Attrs.getRetAttributes()) - NewAttrs.push_back(AttributeWithIndex::get(0, Attr)); + Attribute Attr = Attrs.getRetAttributes(); + if (Attr.hasAttributes()) + NewAttrs.push_back(AttributeWithIndex::get(AttributeSet::ReturnIndex, + Attr)); { unsigned Idx = 1; @@ -1299,7 +1309,8 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS, // Add the original argument and attributes. 
NewArgs.push_back(*I); - if (Attributes Attr = Attrs.getParamAttributes(Idx)) + Attr = Attrs.getParamAttributes(Idx); + if (Attr.hasAttributes()) NewAttrs.push_back (AttributeWithIndex::get(Idx + (Idx >= NestIdx), Attr)); @@ -1308,8 +1319,10 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS, } // Add any function attributes. - if (Attributes Attr = Attrs.getFnAttributes()) - NewAttrs.push_back(AttributeWithIndex::get(~0, Attr)); + Attr = Attrs.getFnAttributes(); + if (Attr.hasAttributes()) + NewAttrs.push_back(AttributeWithIndex::get(AttributeSet::FunctionIndex, + Attr)); // The trampoline may have been bitcast to a bogus type (FTy). // Handle this by synthesizing a new function type, equal to FTy @@ -1348,7 +1361,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS, NestF->getType() == PointerType::getUnqual(NewFTy) ? NestF : ConstantExpr::getBitCast(NestF, PointerType::getUnqual(NewFTy)); - const AttrListPtr &NewPAL = AttrListPtr::get(NewAttrs); + const AttributeSet &NewPAL = AttributeSet::get(FTy->getContext(), NewAttrs); Instruction *NewCaller; if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) { diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp index 555b442..5af4442 100644 --- a/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -13,9 +13,9 @@ #include "InstCombine.h" #include "llvm/Analysis/ConstantFolding.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/IR/DataLayout.h" #include "llvm/Support/PatternMatch.h" +#include "llvm/Target/TargetLibraryInfo.h" using namespace llvm; using namespace PatternMatch; @@ -78,7 +78,7 @@ static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale, /// try to eliminate the cast by moving the type information into the alloc. Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI) { - // This requires TargetData to get the alloca alignment and size information. + // This requires DataLayout to get the alloca alignment and size information. if (!TD) return 0; PointerType *PTy = cast<PointerType>(CI.getType()); @@ -229,7 +229,7 @@ isEliminableCastPair( const CastInst *CI, ///< The first cast instruction unsigned opcode, ///< The opcode of the second cast instruction Type *DstTy, ///< The target type for the second cast instruction - TargetData *TD ///< The target data for pointer size + DataLayout *TD ///< The target data for pointer size ) { Type *SrcTy = CI->getOperand(0)->getType(); // A from above @@ -238,17 +238,20 @@ isEliminableCastPair( // Get the opcodes of the two Cast instructions Instruction::CastOps firstOp = Instruction::CastOps(CI->getOpcode()); Instruction::CastOps secondOp = Instruction::CastOps(opcode); - + Type *SrcIntPtrTy = TD && SrcTy->isPtrOrPtrVectorTy() ? + TD->getIntPtrType(SrcTy) : 0; + Type *MidIntPtrTy = TD && MidTy->isPtrOrPtrVectorTy() ? + TD->getIntPtrType(MidTy) : 0; + Type *DstIntPtrTy = TD && DstTy->isPtrOrPtrVectorTy() ? + TD->getIntPtrType(DstTy) : 0; unsigned Res = CastInst::isEliminableCastPair(firstOp, secondOp, SrcTy, MidTy, - DstTy, - TD ? TD->getIntPtrType(CI->getContext()) : 0); - + DstTy, SrcIntPtrTy, MidIntPtrTy, + DstIntPtrTy); + // We don't want to form an inttoptr or ptrtoint that converts to an integer // type that differs from the pointer size. 
- if ((Res == Instruction::IntToPtr && - (!TD || SrcTy != TD->getIntPtrType(CI->getContext()))) || - (Res == Instruction::PtrToInt && - (!TD || DstTy != TD->getIntPtrType(CI->getContext())))) + if ((Res == Instruction::IntToPtr && SrcTy != DstIntPtrTy) || + (Res == Instruction::PtrToInt && DstTy != SrcIntPtrTy)) Res = 0; return Instruction::CastOps(Res); @@ -1334,17 +1337,15 @@ Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) { // GEP computes a constant offset, see if we can convert these three // instructions into fewer. This typically happens with unions and other // non-type-safe code. + APInt Offset(TD ? TD->getPointerSizeInBits() : 1, 0); if (TD && GEP->hasOneUse() && isa<BitCastInst>(GEP->getOperand(0)) && - GEP->hasAllConstantIndices()) { - SmallVector<Value*, 8> Ops(GEP->idx_begin(), GEP->idx_end()); - int64_t Offset = TD->getIndexedOffset(GEP->getPointerOperandType(), Ops); - + GEP->accumulateConstantOffset(*TD, Offset)) { // Get the base pointer input of the bitcast, and the type it points to. Value *OrigBase = cast<BitCastInst>(GEP->getOperand(0))->getOperand(0); Type *GEPIdxTy = cast<PointerType>(OrigBase->getType())->getElementType(); SmallVector<Value*, 8> NewIndices; - if (FindElementAtOffset(GEPIdxTy, Offset, NewIndices)) { + if (FindElementAtOffset(GEPIdxTy, Offset.getSExtValue(), NewIndices)) { // If we were able to index down into an element, create the GEP // and bitcast the result. This eliminates one bitcast, potentially // two. diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index c3fc18c..40e559e 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -12,15 +12,15 @@ //===----------------------------------------------------------------------===// #include "InstCombine.h" -#include "llvm/IntrinsicInst.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/MemoryBuiltins.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/ConstantRange.h" #include "llvm/Support/GetElementPtrTypeIterator.h" #include "llvm/Support/PatternMatch.h" +#include "llvm/Target/TargetLibraryInfo.h" using namespace llvm; using namespace PatternMatch; @@ -474,7 +474,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, /// If we can't emit an optimized form for this expression, this returns null. /// static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) { - TargetData &TD = *IC.getTargetData(); + DataLayout &TD = *IC.getDataLayout(); gep_type_iterator GTI = gep_type_begin(GEP); // Check to see if this gep only has a single variable index. If so, and if @@ -1226,6 +1226,16 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, ICI.setOperand(0, NewAnd); return &ICI; } + + // Replace ((X & AndCST) > RHSV) with ((X & AndCST) != 0), if any + // bit set in (X & AndCST) will produce a result greater than RHSV. + if (ICI.getPredicate() == ICmpInst::ICMP_UGT) { + unsigned NTZ = AndCST->getValue().countTrailingZeros(); + if ((NTZ < AndCST->getBitWidth()) && + APInt::getOneBitSet(AndCST->getBitWidth(), NTZ).ugt(RHSV)) + return new ICmpInst(ICmpInst::ICMP_NE, LHSI, + Constant::getNullValue(RHS->getType())); + } } // Try to optimize things like "A[i]&42 == 0" to index computations. 
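// [Editorial sketch, not part of the patch: a standalone check of the
//  icmp-ugt fold added in the hunk above. When the lowest set bit of AndCST
//  is already greater than RHSV, any nonzero value of (X & AndCST) exceeds
//  RHSV, so the unsigned compare collapses to a plain != 0 test. The
//  constants and helper names below are illustrative only.]
#include <cassert>
#include <cstdint>

static bool originalForm(uint32_t X, uint32_t AndCST, uint32_t RHSV) {
  return (X & AndCST) > RHSV;                 // ((X & AndCST) >u RHSV)
}

static bool foldedForm(uint32_t X, uint32_t AndCST) {
  return (X & AndCST) != 0;                   // ((X & AndCST) != 0)
}

int main() {
  const uint32_t AndCST = 0x70;               // lowest set bit is 0x10
  const uint32_t RHSV   = 0x0c;               // 0x10 > 0x0c, so the fold applies
  for (uint32_t X = 0; X < 0x10000; ++X)
    assert(originalForm(X, AndCST, RHSV) == foldedForm(X, AndCST));
  return 0;
}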
@@ -2356,8 +2366,25 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { // Try not to increase register pressure. BO0->hasOneUse() && BO1->hasOneUse()) { // Determine Y and Z in the form icmp (X+Y), (X+Z). - Value *Y = (A == C || A == D) ? B : A; - Value *Z = (C == A || C == B) ? D : C; + Value *Y, *Z; + if (A == C) { + // C + B == C + D -> B == D + Y = B; + Z = D; + } else if (A == D) { + // D + B == C + D -> B == C + Y = B; + Z = C; + } else if (B == C) { + // A + C == C + D -> A == D + Y = A; + Z = D; + } else { + assert(B == D); + // A + D == C + D -> A == C + Y = A; + Z = C; + } return new ICmpInst(Pred, Y, Z); } @@ -2895,10 +2922,6 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) { if (!RHSF) break; - // We can't convert a PPC double double. - if (RHSF->getType()->isPPC_FP128Ty()) - break; - const fltSemantics *Sem; // FIXME: This shouldn't be here. if (LHSExt->getSrcTy()->isHalfTy()) @@ -2911,6 +2934,8 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) { Sem = &APFloat::IEEEquad; else if (LHSExt->getSrcTy()->isX86_FP80Ty()) Sem = &APFloat::x87DoubleExtended; + else if (LHSExt->getSrcTy()->isPPC_FP128Ty()) + Sem = &APFloat::PPCDoubleDouble; else break; diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 6ecb4c5..337cfe3 100644 --- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -12,12 +12,12 @@ //===----------------------------------------------------------------------===// #include "InstCombine.h" -#include "llvm/IntrinsicInst.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/Loads.h" -#include "llvm/Target/TargetData.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/ADT/Statistic.h" using namespace llvm; STATISTIC(NumDeadStore, "Number of dead stores eliminated"); @@ -150,25 +150,6 @@ isOnlyCopiedFromConstantGlobal(AllocaInst *AI, return 0; } -/// getPointeeAlignment - Compute the minimum alignment of the value pointed -/// to by the given pointer. -static unsigned getPointeeAlignment(Value *V, const TargetData &TD) { - if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) - if (CE->getOpcode() == Instruction::BitCast || - (CE->getOpcode() == Instruction::GetElementPtr && - cast<GEPOperator>(CE)->hasAllZeroIndices())) - return getPointeeAlignment(CE->getOperand(0), TD); - - if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) - if (!GV->isDeclaration()) - return TD.getPreferredAlignment(GV); - - if (PointerType *PT = dyn_cast<PointerType>(V->getType())) - return TD.getABITypeAlignment(PT->getElementType()); - - return 0; -} - Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { // Ensure that the alloca array size argument has type intptr_t, so that // any casting is exposed early. @@ -246,12 +227,16 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { return &AI; } + // If the alignment of the entry block alloca is 0 (unspecified), + // assign it the preferred alignment. + if (EntryAI->getAlignment() == 0) + EntryAI->setAlignment( + TD->getPrefTypeAlignment(EntryAI->getAllocatedType())); // Replace this zero-sized alloca with the one at the start of the entry // block after ensuring that the address will be aligned enough for both // types. 
- unsigned MaxAlign = - std::max(TD->getPrefTypeAlignment(EntryAI->getAllocatedType()), - TD->getPrefTypeAlignment(AI.getAllocatedType())); + unsigned MaxAlign = std::max(EntryAI->getAlignment(), + AI.getAlignment()); EntryAI->setAlignment(MaxAlign); if (AI.getType() != EntryAI->getType()) return new BitCastInst(EntryAI, AI.getType()); @@ -260,26 +245,30 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { } } - // Check to see if this allocation is only modified by a memcpy/memmove from - // a constant global whose alignment is equal to or exceeds that of the - // allocation. If this is the case, we can change all users to use - // the constant global instead. This is commonly produced by the CFE by - // constructs like "void foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A' - // is only subsequently read. - SmallVector<Instruction *, 4> ToDelete; - if (MemTransferInst *Copy = isOnlyCopiedFromConstantGlobal(&AI, ToDelete)) { - if (AI.getAlignment() <= getPointeeAlignment(Copy->getSource(), *TD)) { - DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n'); - DEBUG(dbgs() << " memcpy = " << *Copy << '\n'); - for (unsigned i = 0, e = ToDelete.size(); i != e; ++i) - EraseInstFromFunction(*ToDelete[i]); - Constant *TheSrc = cast<Constant>(Copy->getSource()); - Instruction *NewI - = ReplaceInstUsesWith(AI, ConstantExpr::getBitCast(TheSrc, - AI.getType())); - EraseInstFromFunction(*Copy); - ++NumGlobalCopies; - return NewI; + if (AI.getAlignment()) { + // Check to see if this allocation is only modified by a memcpy/memmove from + // a constant global whose alignment is equal to or exceeds that of the + // allocation. If this is the case, we can change all users to use + // the constant global instead. This is commonly produced by the CFE by + // constructs like "void foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A' + // is only subsequently read. + SmallVector<Instruction *, 4> ToDelete; + if (MemTransferInst *Copy = isOnlyCopiedFromConstantGlobal(&AI, ToDelete)) { + unsigned SourceAlign = getOrEnforceKnownAlignment(Copy->getSource(), + AI.getAlignment(), TD); + if (AI.getAlignment() <= SourceAlign) { + DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n'); + DEBUG(dbgs() << " memcpy = " << *Copy << '\n'); + for (unsigned i = 0, e = ToDelete.size(); i != e; ++i) + EraseInstFromFunction(*ToDelete[i]); + Constant *TheSrc = cast<Constant>(Copy->getSource()); + Instruction *NewI + = ReplaceInstUsesWith(AI, ConstantExpr::getBitCast(TheSrc, + AI.getType())); + EraseInstFromFunction(*Copy); + ++NumGlobalCopies; + return NewI; + } } } @@ -291,7 +280,7 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { /// InstCombineLoadCast - Fold 'load (cast P)' -> cast (load P)' when possible. static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI, - const TargetData *TD) { + const DataLayout *TD) { User *CI = cast<User>(LI.getOperand(0)); Value *CastOp = CI->getOperand(0); @@ -321,14 +310,14 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI, SrcPTy = SrcTy->getElementType(); } - if (IC.getTargetData() && + if (IC.getDataLayout() && (SrcPTy->isIntegerTy() || SrcPTy->isPointerTy() || SrcPTy->isVectorTy()) && // Do not allow turning this into a load of an integer, which is then // casted to a pointer, this pessimizes pointer analysis a lot. 
(SrcPTy->isPointerTy() == LI.getType()->isPointerTy()) && - IC.getTargetData()->getTypeSizeInBits(SrcPTy) == - IC.getTargetData()->getTypeSizeInBits(DestPTy)) { + IC.getDataLayout()->getTypeSizeInBits(SrcPTy) == + IC.getDataLayout()->getTypeSizeInBits(DestPTy)) { // Okay, we are casting from one integer or pointer type to another of // the same size. Instead of casting the pointer before the load, cast @@ -506,11 +495,11 @@ static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) { // If the pointers point into different address spaces or if they point to // values with different sizes, we can't do the transformation. - if (!IC.getTargetData() || + if (!IC.getDataLayout() || SrcTy->getAddressSpace() != cast<PointerType>(CI->getType())->getAddressSpace() || - IC.getTargetData()->getTypeSizeInBits(SrcPTy) != - IC.getTargetData()->getTypeSizeInBits(DestPTy)) + IC.getDataLayout()->getTypeSizeInBits(SrcPTy) != + IC.getDataLayout()->getTypeSizeInBits(DestPTy)) return 0; // Okay, we are casting from one integer or pointer type to another of @@ -813,6 +802,13 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) { InsertNewInstBefore(NewSI, *BBI); NewSI->setDebugLoc(OtherStore->getDebugLoc()); + // If the two stores had the same TBAA tag, preserve it. + if (MDNode *TBAATag = SI.getMetadata(LLVMContext::MD_tbaa)) + if ((TBAATag = MDNode::getMostGenericTBAA(TBAATag, + OtherStore->getMetadata(LLVMContext::MD_tbaa)))) + NewSI->setMetadata(LLVMContext::MD_tbaa, TBAATag); + + // Nuke the old stores. EraseInstFromFunction(SI); EraseInstFromFunction(*OtherStore); diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index 2a7182f..d0f4392 100644 --- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -13,8 +13,8 @@ //===----------------------------------------------------------------------===// #include "InstCombine.h" -#include "llvm/IntrinsicInst.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/PatternMatch.h" using namespace llvm; using namespace PatternMatch; @@ -37,7 +37,7 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC) { if (match(V, m_LShr(m_OneUse(m_Shl(m_Value(PowerOf2), m_Value(A))), m_Value(B))) && // The "1" can be any value known to be a power of 2. - isPowerOfTwo(PowerOf2, IC.getTargetData())) { + isKnownToBeAPowerOfTwo(PowerOf2)) { A = IC.Builder->CreateSub(A, B); return IC.Builder->CreateShl(PowerOf2, A); } @@ -45,8 +45,7 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC) { // (PowerOfTwo >>u B) --> isExact since shifting out the result would make it // inexact. Similarly for <<. if (BinaryOperator *I = dyn_cast<BinaryOperator>(V)) - if (I->isLogicalShift() && - isPowerOfTwo(I->getOperand(0), IC.getTargetData())) { + if (I->isLogicalShift() && isKnownToBeAPowerOfTwo(I->getOperand(0))) { // We know that this is an exact/nuw shift and that the input is a // non-zero context as well. if (Value *V2 = simplifyValueKnownNonZero(I->getOperand(0), IC)) { @@ -252,24 +251,134 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { return Changed ? 
&I : 0; } +// +// Detect pattern: +// +// log2(Y*0.5) +// +// And check for corresponding fast math flags +// + +static void detectLog2OfHalf(Value *&Op, Value *&Y, IntrinsicInst *&Log2) { + + if (!Op->hasOneUse()) + return; + + IntrinsicInst *II = dyn_cast<IntrinsicInst>(Op); + if (!II) + return; + if (II->getIntrinsicID() != Intrinsic::log2 || !II->hasUnsafeAlgebra()) + return; + Log2 = II; + + Value *OpLog2Of = II->getArgOperand(0); + if (!OpLog2Of->hasOneUse()) + return; + + Instruction *I = dyn_cast<Instruction>(OpLog2Of); + if (!I) + return; + if (I->getOpcode() != Instruction::FMul || !I->hasUnsafeAlgebra()) + return; + + ConstantFP *CFP = dyn_cast<ConstantFP>(I->getOperand(0)); + if (CFP && CFP->isExactlyValue(0.5)) { + Y = I->getOperand(1); + return; + } + CFP = dyn_cast<ConstantFP>(I->getOperand(1)); + if (CFP && CFP->isExactlyValue(0.5)) + Y = I->getOperand(0); +} + +/// Helper function of InstCombiner::visitFMul(BinaryOperator(). It returns +/// true iff the given value is FMul or FDiv with one and only one operand +/// being a normal constant (i.e. not Zero/NaN/Infinity). +static bool isFMulOrFDivWithConstant(Value *V) { + Instruction *I = dyn_cast<Instruction>(V); + if (!I || (I->getOpcode() != Instruction::FMul && + I->getOpcode() != Instruction::FDiv)) + return false; + + ConstantFP *C0 = dyn_cast<ConstantFP>(I->getOperand(0)); + ConstantFP *C1 = dyn_cast<ConstantFP>(I->getOperand(1)); + + if (C0 && C1) + return false; + + return (C0 && C0->getValueAPF().isNormal()) || + (C1 && C1->getValueAPF().isNormal()); +} + +static bool isNormalFp(const ConstantFP *C) { + const APFloat &Flt = C->getValueAPF(); + return Flt.isNormal() && !Flt.isDenormal(); +} + +/// foldFMulConst() is a helper routine of InstCombiner::visitFMul(). +/// The input \p FMulOrDiv is a FMul/FDiv with one and only one operand +/// being a constant (i.e. isFMulOrFDivWithConstant(FMulOrDiv) == true). +/// This function is to simplify "FMulOrDiv * C" and returns the +/// resulting expression. Note that this function could return NULL in +/// case the constants cannot be folded into a normal floating-point. +/// +Value *InstCombiner::foldFMulConst(Instruction *FMulOrDiv, ConstantFP *C, + Instruction *InsertBefore) { + assert(isFMulOrFDivWithConstant(FMulOrDiv) && "V is invalid"); + + Value *Opnd0 = FMulOrDiv->getOperand(0); + Value *Opnd1 = FMulOrDiv->getOperand(1); + + ConstantFP *C0 = dyn_cast<ConstantFP>(Opnd0); + ConstantFP *C1 = dyn_cast<ConstantFP>(Opnd1); + + BinaryOperator *R = 0; + + // (X * C0) * C => X * (C0*C) + if (FMulOrDiv->getOpcode() == Instruction::FMul) { + Constant *F = ConstantExpr::getFMul(C1 ? C1 : C0, C); + if (isNormalFp(cast<ConstantFP>(F))) + R = BinaryOperator::CreateFMul(C1 ? 
Opnd0 : Opnd1, F); + } else { + if (C0) { + // (C0 / X) * C => (C0 * C) / X + ConstantFP *F = cast<ConstantFP>(ConstantExpr::getFMul(C0, C)); + if (isNormalFp(F)) + R = BinaryOperator::CreateFDiv(F, Opnd1); + } else { + // (X / C1) * C => X * (C/C1) if C/C1 is not a denormal + ConstantFP *F = cast<ConstantFP>(ConstantExpr::getFDiv(C, C1)); + if (isNormalFp(F)) { + R = BinaryOperator::CreateFMul(Opnd0, F); + } else { + // (X / C1) * C => X / (C1/C) + Constant *F = ConstantExpr::getFDiv(C1, C); + if (isNormalFp(cast<ConstantFP>(F))) + R = BinaryOperator::CreateFDiv(Opnd0, F); + } + } + } + + if (R) { + R->setHasUnsafeAlgebra(true); + InsertNewInstWith(R, *InsertBefore); + } + + return R; +} + Instruction *InstCombiner::visitFMul(BinaryOperator &I) { bool Changed = SimplifyAssociativeOrCommutative(I); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); - // Simplify mul instructions with a constant RHS. - if (Constant *Op1C = dyn_cast<Constant>(Op1)) { - if (ConstantFP *Op1F = dyn_cast<ConstantFP>(Op1C)) { - // "In IEEE floating point, x*1 is not equivalent to x for nans. However, - // ANSI says we can drop signals, so we can do this anyway." (from GCC) - if (Op1F->isExactlyValue(1.0)) - return ReplaceInstUsesWith(I, Op0); // Eliminate 'fmul double %X, 1.0' - } else if (ConstantDataVector *Op1V = dyn_cast<ConstantDataVector>(Op1C)) { - // As above, vector X*splat(1.0) -> X in all defined cases. - if (ConstantFP *F = dyn_cast_or_null<ConstantFP>(Op1V->getSplatValue())) - if (F->isExactlyValue(1.0)) - return ReplaceInstUsesWith(I, Op0); - } + if (isa<Constant>(Op0)) + std::swap(Op0, Op1); + + if (Value *V = SimplifyFMulInst(Op0, Op1, I.getFastMathFlags(), TD)) + return ReplaceInstUsesWith(I, V); + // Simplify mul instructions with a constant RHS. + if (isa<Constant>(Op1)) { // Try to fold constant mul into select arguments. if (SelectInst *SI = dyn_cast<SelectInst>(Op0)) if (Instruction *R = FoldOpIntoSelect(I, SI)) @@ -278,12 +387,120 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { if (isa<PHINode>(Op0)) if (Instruction *NV = FoldOpIntoPhi(I)) return NV; + + ConstantFP *C = dyn_cast<ConstantFP>(Op1); + if (C && I.hasUnsafeAlgebra() && C->getValueAPF().isNormal()) { + // Let MDC denote an expression in one of these forms: + // X * C, C/X, X/C, where C is a constant. + // + // Try to simplify "MDC * Constant" + if (isFMulOrFDivWithConstant(Op0)) { + Value *V = foldFMulConst(cast<Instruction>(Op0), C, &I); + if (V) + return ReplaceInstUsesWith(I, V); + } + + // (MDC +/- C1) * C2 => (MDC * C2) +/- (C1 * C2) + Instruction *FAddSub = dyn_cast<Instruction>(Op0); + if (FAddSub && + (FAddSub->getOpcode() == Instruction::FAdd || + FAddSub->getOpcode() == Instruction::FSub)) { + Value *Opnd0 = FAddSub->getOperand(0); + Value *Opnd1 = FAddSub->getOperand(1); + ConstantFP *C0 = dyn_cast<ConstantFP>(Opnd0); + ConstantFP *C1 = dyn_cast<ConstantFP>(Opnd1); + bool Swap = false; + if (C0) { + std::swap(C0, C1); + std::swap(Opnd0, Opnd1); + Swap = true; + } + + if (C1 && C1->getValueAPF().isNormal() && + isFMulOrFDivWithConstant(Opnd0)) { + Value *M0 = ConstantExpr::getFMul(C1, C); + Value *M1 = isNormalFp(cast<ConstantFP>(M0)) ? + foldFMulConst(cast<Instruction>(Opnd0), C, &I) : + 0; + if (M0 && M1) { + if (Swap && FAddSub->getOpcode() == Instruction::FSub) + std::swap(M0, M1); + + Value *R = (FAddSub->getOpcode() == Instruction::FAdd) ? 
+ BinaryOperator::CreateFAdd(M0, M1) : + BinaryOperator::CreateFSub(M0, M1); + Instruction *RI = cast<Instruction>(R); + RI->setHasUnsafeAlgebra(true); + return RI; + } + } + } + } } if (Value *Op0v = dyn_castFNegVal(Op0)) // -X * -Y = X*Y if (Value *Op1v = dyn_castFNegVal(Op1)) return BinaryOperator::CreateFMul(Op0v, Op1v); + // Under unsafe algebra do: + // X * log2(0.5*Y) = X*log2(Y) - X + if (I.hasUnsafeAlgebra()) { + Value *OpX = NULL; + Value *OpY = NULL; + IntrinsicInst *Log2; + detectLog2OfHalf(Op0, OpY, Log2); + if (OpY) { + OpX = Op1; + } else { + detectLog2OfHalf(Op1, OpY, Log2); + if (OpY) { + OpX = Op0; + } + } + // if pattern detected emit alternate sequence + if (OpX && OpY) { + Log2->setArgOperand(0, OpY); + Value *FMulVal = Builder->CreateFMul(OpX, Log2); + Instruction *FMul = cast<Instruction>(FMulVal); + FMul->copyFastMathFlags(Log2); + Instruction *FSub = BinaryOperator::CreateFSub(FMulVal, OpX); + FSub->copyFastMathFlags(Log2); + return FSub; + } + } + + // X * cond ? 1.0 : 0.0 => cond ? X : 0.0 + if (I.hasNoNaNs() && I.hasNoSignedZeros()) { + Value *V0 = I.getOperand(0); + Value *V1 = I.getOperand(1); + Value *Cond, *SLHS, *SRHS; + bool Match = false; + + if (match(V0, m_Select(m_Value(Cond), m_Value(SLHS), m_Value(SRHS)))) { + Match = true; + } else if (match(V1, m_Select(m_Value(Cond), m_Value(SLHS), + m_Value(SRHS)))) { + Match = true; + std::swap(V0, V1); + } + + if (Match) { + ConstantFP *C0 = dyn_cast<ConstantFP>(SLHS); + ConstantFP *C1 = dyn_cast<ConstantFP>(SRHS); + + if (C0 && C1 && + ((C0->isZero() && C1->isExactlyValue(1.0)) || + (C1->isZero() && C0->isExactlyValue(1.0)))) { + Value *T; + if (C0->isZero()) + T = Builder->CreateSelect(Cond, SLHS, V1); + else + T = Builder->CreateSelect(Cond, V1, SRHS); + return ReplaceInstUsesWith(I, T); + } + } + } + return Changed ? 
&I : 0; } @@ -477,7 +694,8 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) { if (match(Op1, m_Shl(m_Power2(CI), m_Value(N))) || match(Op1, m_ZExt(m_Shl(m_Power2(CI), m_Value(N))))) { if (*CI != 1) - N = Builder->CreateAdd(N, ConstantInt::get(I.getType(),CI->logBase2())); + N = Builder->CreateAdd(N, + ConstantInt::get(N->getType(), CI->logBase2())); if (ZExtInst *Z = dyn_cast<ZExtInst>(Op1)) N = Builder->CreateZExt(N, Z->getDestTy()); if (I.isExact()) diff --git a/lib/Transforms/InstCombine/InstCombinePHI.cpp b/lib/Transforms/InstCombine/InstCombinePHI.cpp index 664546c..b0a998c 100644 --- a/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -12,10 +12,10 @@ //===----------------------------------------------------------------------===// #include "InstCombine.h" -#include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Target/TargetData.h" -#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/IR/DataLayout.h" using namespace llvm; /// FoldPHIArgBinOpIntoPHI - If we have something like phi [add (a,b), add(a,c)] diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp index 291e800..a262d71 100644 --- a/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -12,9 +12,9 @@ //===----------------------------------------------------------------------===// #include "InstCombine.h" -#include "llvm/Support/PatternMatch.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Support/PatternMatch.h" using namespace llvm; using namespace PatternMatch; @@ -287,7 +287,7 @@ Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal, /// SimplifyWithOpReplaced - See if V simplifies when its operand Op is /// replaced with RepOp. static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, - const TargetData *TD, + const DataLayout *TD, const TargetLibraryInfo *TLI) { // Trivial replacement. if (V == Op) @@ -333,6 +333,10 @@ static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, // All operands were constants, fold it. if (ConstOps.size() == I->getNumOperands()) { + if (CmpInst *C = dyn_cast<CmpInst>(I)) + return ConstantFoldCompareInstOperands(C->getPredicate(), ConstOps[0], + ConstOps[1], TD, TLI); + if (LoadInst *LI = dyn_cast<LoadInst>(I)) if (!LI->isVolatile()) return ConstantFoldLoadFromConstPtr(ConstOps[0], TD); @@ -903,7 +907,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { return &SI; } - if (VectorType* VecTy = dyn_cast<VectorType>(SI.getType())) { + if (VectorType *VecTy = dyn_cast<VectorType>(SI.getType())) { unsigned VWidth = VecTy->getNumElements(); APInt UndefElts(VWidth, 0); APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth)); @@ -912,6 +916,28 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { return ReplaceInstUsesWith(SI, V); return &SI; } + + if (ConstantVector *CV = dyn_cast<ConstantVector>(CondVal)) { + // Form a shufflevector instruction. + SmallVector<Constant *, 8> Mask(VWidth); + Type *Int32Ty = Type::getInt32Ty(CV->getContext()); + for (unsigned i = 0; i != VWidth; ++i) { + Constant *Elem = cast<Constant>(CV->getOperand(i)); + if (ConstantInt *E = dyn_cast<ConstantInt>(Elem)) + Mask[i] = ConstantInt::get(Int32Ty, i + (E->isZero() ? 
VWidth : 0)); + else if (isa<UndefValue>(Elem)) + Mask[i] = UndefValue::get(Int32Ty); + else + return 0; + } + Constant *MaskVal = ConstantVector::get(Mask); + Value *V = Builder->CreateShuffleVector(TrueVal, FalseVal, MaskVal); + return ReplaceInstUsesWith(SI, V); + } + + if (isa<ConstantAggregateZero>(CondVal)) { + return ReplaceInstUsesWith(SI, FalseVal); + } } return 0; diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp index 4bb2403..8cf76e5 100644 --- a/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -12,9 +12,9 @@ //===----------------------------------------------------------------------===// #include "InstCombine.h" -#include "llvm/IntrinsicInst.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/PatternMatch.h" using namespace llvm; using namespace PatternMatch; @@ -49,7 +49,7 @@ Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) { I.setOperand(1, Rem); return &I; } - + return 0; } @@ -70,10 +70,10 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift, // We can always evaluate constants shifted. if (isa<Constant>(V)) return true; - + Instruction *I = dyn_cast<Instruction>(V); if (!I) return false; - + // If this is the opposite shift, we can directly reuse the input of the shift // if the needed bits are already zero in the input. This allows us to reuse // the value which means that we don't care if the shift has multiple uses. @@ -95,14 +95,14 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift, return CanEvaluateTruncated(I->getOperand(0), Ty); } #endif - + } } - + // We can't mutate something that has multiple uses: doing so would // require duplicating the instruction in general, which isn't profitable. if (!I->hasOneUse()) return false; - + switch (I->getOpcode()) { default: return false; case Instruction::And: @@ -111,7 +111,7 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift, // Bitwise operators can all arbitrarily be arbitrarily evaluated shifted. return CanEvaluateShifted(I->getOperand(0), NumBits, isLeftShift, IC) && CanEvaluateShifted(I->getOperand(1), NumBits, isLeftShift, IC); - + case Instruction::Shl: { // We can often fold the shift into shifts-by-a-constant. CI = dyn_cast<ConstantInt>(I->getOperand(1)); @@ -119,10 +119,10 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift, // We can always fold shl(c1)+shl(c2) -> shl(c1+c2). if (isLeftShift) return true; - + // We can always turn shl(c)+shr(c) -> and(c2). if (CI->getValue() == NumBits) return true; - + unsigned TypeWidth = I->getType()->getScalarSizeInBits(); // We can turn shl(c1)+shr(c2) -> shl(c3)+and(c4), but it isn't @@ -133,20 +133,20 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift, APInt::getLowBitsSet(TypeWidth, NumBits) << LowBits)) return true; } - + return false; } case Instruction::LShr: { // We can often fold the shift into shifts-by-a-constant. CI = dyn_cast<ConstantInt>(I->getOperand(1)); if (CI == 0) return false; - + // We can always fold lshr(c1)+lshr(c2) -> lshr(c1+c2). if (!isLeftShift) return true; - + // We can always turn lshr(c)+shl(c) -> and(c2). 
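// [Editorial sketch, not part of the patch: the identity behind the comment
//  above, checked on uint32_t. A logical shift right followed by a shift
//  left by the same amount only clears the low bits, i.e. it is an AND with
//  a mask. The shift amount and sample values are illustrative only.]
#include <cassert>
#include <cstdint>

int main() {
  const unsigned C = 5;
  for (uint32_t i = 0; i < 0x10000; ++i) {
    uint32_t v = i * 0x9E3779B9u;                    // arbitrary sample values
    assert(((v >> C) << C) == (v & ~((1u << C) - 1)));
  }
  return 0;
}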
if (CI->getValue() == NumBits) return true; - + unsigned TypeWidth = I->getType()->getScalarSizeInBits(); // We can always turn lshr(c1)+shl(c2) -> lshr(c3)+and(c4), but it isn't @@ -157,7 +157,7 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift, APInt::getLowBitsSet(TypeWidth, NumBits) << LowBits)) return true; } - + return false; } case Instruction::Select: { @@ -175,7 +175,7 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift, return false; return true; } - } + } } /// GetShiftedValue - When CanEvaluateShifted returned true for an expression, @@ -190,11 +190,11 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift, V = IC.Builder->CreateLShr(C, NumBits); // If we got a constantexpr back, try to simplify it with TD info. if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) - V = ConstantFoldConstantExpression(CE, IC.getTargetData(), + V = ConstantFoldConstantExpression(CE, IC.getDataLayout(), IC.getTargetLibraryInfo()); return V; } - + Instruction *I = cast<Instruction>(V); IC.Worklist.Add(I); @@ -207,7 +207,7 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift, I->setOperand(0, GetShiftedValue(I->getOperand(0), NumBits,isLeftShift,IC)); I->setOperand(1, GetShiftedValue(I->getOperand(1), NumBits,isLeftShift,IC)); return I; - + case Instruction::Shl: { BinaryOperator *BO = cast<BinaryOperator>(I); unsigned TypeWidth = BO->getType()->getScalarSizeInBits(); @@ -227,7 +227,7 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift, BO->setHasNoSignedWrap(false); return I; } - + // We turn shl(c)+lshr(c) -> and(c2) if the input doesn't already have // zeros. if (CI->getValue() == NumBits) { @@ -240,7 +240,7 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift, } return V; } - + // We turn shl(c1)+shr(c2) -> shl(c3)+and(c4), but only when we know that // the and won't be needed. assert(CI->getZExtValue() > NumBits); @@ -255,19 +255,19 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift, unsigned TypeWidth = BO->getType()->getScalarSizeInBits(); // We only accept shifts-by-a-constant in CanEvaluateShifted. ConstantInt *CI = cast<ConstantInt>(BO->getOperand(1)); - + // We can always fold lshr(c1)+lshr(c2) -> lshr(c1+c2). if (!isLeftShift) { // If this is oversized composite shift, then unsigned shifts get 0. unsigned NewShAmt = NumBits+CI->getZExtValue(); if (NewShAmt >= TypeWidth) return Constant::getNullValue(BO->getType()); - + BO->setOperand(1, ConstantInt::get(BO->getType(), NewShAmt)); BO->setIsExact(false); return I; } - + // We turn lshr(c)+shl(c) -> and(c2) if the input doesn't already have // zeros. if (CI->getValue() == NumBits) { @@ -280,7 +280,7 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift, } return V; } - + // We turn lshr(c1)+shl(c2) -> lshr(c3)+and(c4), but only when we know that // the and won't be needed. 
assert(CI->getZExtValue() > NumBits); @@ -289,7 +289,7 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift, BO->setIsExact(false); return BO; } - + case Instruction::Select: I->setOperand(1, GetShiftedValue(I->getOperand(1), NumBits,isLeftShift,IC)); I->setOperand(2, GetShiftedValue(I->getOperand(2), NumBits,isLeftShift,IC)); @@ -304,7 +304,7 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift, NumBits, isLeftShift, IC)); return PN; } - } + } } @@ -312,24 +312,24 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift, Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, BinaryOperator &I) { bool isLeftShift = I.getOpcode() == Instruction::Shl; - - + + // See if we can propagate this shift into the input, this covers the trivial // cast of lshr(shl(x,c1),c2) as well as other more complex cases. if (I.getOpcode() != Instruction::AShr && CanEvaluateShifted(Op0, Op1->getZExtValue(), isLeftShift, *this)) { DEBUG(dbgs() << "ICE: GetShiftedValue propagating shift through expression" " to eliminate shift:\n IN: " << *Op0 << "\n SH: " << I <<"\n"); - - return ReplaceInstUsesWith(I, + + return ReplaceInstUsesWith(I, GetShiftedValue(Op0, Op1->getZExtValue(), isLeftShift, *this)); } - - - // See if we can simplify any instructions used by the instruction whose sole + + + // See if we can simplify any instructions used by the instruction whose sole // purpose is to compute bits we don't care about. uint32_t TypeBits = Op0->getType()->getScalarSizeInBits(); - + // shl i32 X, 32 = 0 and srl i8 Y, 9 = 0, ... just don't eliminate // a signed shift. // @@ -340,14 +340,14 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, I.setOperand(1, ConstantInt::get(I.getType(), TypeBits-1)); return &I; } - + // ((X*C1) << C2) == (X * (C1 << C2)) if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op0)) if (BO->getOpcode() == Instruction::Mul && isLeftShift) if (Constant *BOOp = dyn_cast<Constant>(BO->getOperand(1))) return BinaryOperator::CreateMul(BO->getOperand(0), ConstantExpr::getShl(BOOp, Op1)); - + // Try to fold constant and into select arguments. if (SelectInst *SI = dyn_cast<SelectInst>(Op0)) if (Instruction *R = FoldOpIntoSelect(I, SI)) @@ -355,7 +355,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, if (isa<PHINode>(Op0)) if (Instruction *NV = FoldOpIntoPhi(I)) return NV; - + // Fold shift2(trunc(shift1(x,c1)), c2) -> trunc(shift2(shift1(x,c1),c2)) if (TruncInst *TI = dyn_cast<TruncInst>(Op0)) { Instruction *TrOp = dyn_cast<Instruction>(TI->getOperand(0)); @@ -364,7 +364,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, // require that the input operand is a shift-by-constant so that we have // confidence that the shifts will get folded together. We could do this // xform in more cases, but it is unlikely to be profitable. - if (TrOp && I.isLogicalShift() && TrOp->isShift() && + if (TrOp && I.isLogicalShift() && TrOp->isShift() && isa<ConstantInt>(TrOp->getOperand(1))) { // Okay, we'll do this xform. Make the shift of shift. 
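// [Editorial sketch, not part of the patch: the trunc/shift reassociation the
//  surrounding code performs, checked for one shl-then-lshr case. Doing the
//  outer shift in the wide type and modelling the truncation with a mask lets
//  the trunc sink below both shifts. The widths and shift amounts are
//  illustrative only.]
#include <cassert>
#include <cstdint>

int main() {
  const unsigned C1 = 5, C2 = 3;                     // inner shl, outer lshr
  for (uint32_t x = 0; x < 0x100000; x += 97) {
    uint16_t viaTrunc = static_cast<uint16_t>(
        static_cast<uint16_t>(x << C1) >> C2);       // trunc first, then lshr
    uint16_t viaWide = static_cast<uint16_t>(
        ((x << C1) >> C2) & (0xFFFFu >> C2));        // wide lshr, mask, then trunc
    assert(viaTrunc == viaWide);
  }
  return 0;
}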
Constant *ShAmt = ConstantExpr::getZExt(Op1, TrOp->getType()); @@ -378,7 +378,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, unsigned SrcSize = TrOp->getType()->getScalarSizeInBits(); unsigned DstSize = TI->getType()->getScalarSizeInBits(); APInt MaskV(APInt::getLowBitsSet(SrcSize, DstSize)); - + // The mask we constructed says what the trunc would do if occurring // between the shifts. We want to know the effect *after* the second // shift. We know that it is a logical shift by a constant, so adjust the @@ -399,7 +399,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, return new TruncInst(And, I.getType()); } } - + if (Op0->hasOneUse()) { if (BinaryOperator *Op0BO = dyn_cast<BinaryOperator>(Op0)) { // Turn ((X >> C) + Y) << C -> (X + (Y << C)) & (~0 << C) @@ -425,14 +425,13 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, return BinaryOperator::CreateAnd(X, ConstantInt::get(I.getContext(), APInt::getHighBitsSet(TypeBits, TypeBits-Op1Val))); } - + // Turn (Y + ((X >> C) & CC)) << C -> ((X & (CC << C)) + (Y << C)) Value *Op0BOOp1 = Op0BO->getOperand(1); if (isLeftShift && Op0BOOp1->hasOneUse() && - match(Op0BOOp1, - m_And(m_Shr(m_Value(V1), m_Specific(Op1)), - m_ConstantInt(CC))) && - cast<BinaryOperator>(Op0BOOp1)->getOperand(0)->hasOneUse()) { + match(Op0BOOp1, + m_And(m_OneUse(m_Shr(m_Value(V1), m_Specific(Op1))), + m_ConstantInt(CC)))) { Value *YS = // (Y << C) Builder->CreateShl(Op0BO->getOperand(0), Op1, Op0BO->getName()); @@ -442,7 +441,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, return BinaryOperator::Create(Op0BO->getOpcode(), YS, XM); } } - + // FALL THROUGH. case Instruction::Sub: { // Turn ((X >> C) + Y) << C -> (X + (Y << C)) & (~0 << C) @@ -458,34 +457,32 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, return BinaryOperator::CreateAnd(X, ConstantInt::get(I.getContext(), APInt::getHighBitsSet(TypeBits, TypeBits-Op1Val))); } - + // Turn (((X >> C)&CC) + Y) << C -> (X + (Y << C)) & (CC << C) if (isLeftShift && Op0BO->getOperand(0)->hasOneUse() && match(Op0BO->getOperand(0), - m_And(m_Shr(m_Value(V1), m_Value(V2)), - m_ConstantInt(CC))) && V2 == Op1 && - cast<BinaryOperator>(Op0BO->getOperand(0)) - ->getOperand(0)->hasOneUse()) { + m_And(m_OneUse(m_Shr(m_Value(V1), m_Value(V2))), + m_ConstantInt(CC))) && V2 == Op1) { Value *YS = // (Y << C) Builder->CreateShl(Op0BO->getOperand(1), Op1, Op0BO->getName()); // X & (CC << C) Value *XM = Builder->CreateAnd(V1, ConstantExpr::getShl(CC, Op1), V1->getName()+".mask"); - + return BinaryOperator::Create(Op0BO->getOpcode(), XM, YS); } - + break; } } - - + + // If the operand is an bitwise operator with a constant RHS, and the // shift is the only use, we can pull it out of the shift. if (ConstantInt *Op0C = dyn_cast<ConstantInt>(Op0BO->getOperand(1))) { bool isValid = true; // Valid only for And, Or, Xor bool highBitSet = false; // Transform if high bit of constant set? - + switch (Op0BO->getOpcode()) { default: isValid = false; break; // Do not perform transform! case Instruction::Add: @@ -499,7 +496,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, highBitSet = true; break; } - + // If this is a signed shift right, and the high bit is modified // by the logical operation, do not perform the transformation. 
// The highBitSet boolean indicates the value of the high bit of @@ -508,26 +505,26 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, // if (isValid && I.getOpcode() == Instruction::AShr) isValid = Op0C->getValue()[TypeBits-1] == highBitSet; - + if (isValid) { Constant *NewRHS = ConstantExpr::get(I.getOpcode(), Op0C, Op1); - + Value *NewShift = Builder->CreateBinOp(I.getOpcode(), Op0BO->getOperand(0), Op1); NewShift->takeName(Op0BO); - + return BinaryOperator::Create(Op0BO->getOpcode(), NewShift, NewRHS); } } } } - + // Find out if this is a shift of a shift by a constant. BinaryOperator *ShiftOp = dyn_cast<BinaryOperator>(Op0); if (ShiftOp && !ShiftOp->isShift()) ShiftOp = 0; - + if (ShiftOp && isa<ConstantInt>(ShiftOp->getOperand(1))) { // This is a constant shift of a constant shift. Be careful about hiding @@ -548,9 +545,9 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, assert(ShiftAmt2 != 0 && "Should have been simplified earlier"); if (ShiftAmt1 == 0) return 0; // Will be simplified in the future. Value *X = ShiftOp->getOperand(0); - + IntegerType *Ty = cast<IntegerType>(I.getType()); - + // Check for (X << c1) << c2 and (X >> c1) >> c2 if (I.getOpcode() == ShiftOp->getOpcode()) { uint32_t AmtSum = ShiftAmt1+ShiftAmt2; // Fold into one big shift. @@ -561,11 +558,11 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType())); AmtSum = TypeBits-1; // Saturate to 31 for i32 ashr. } - + return BinaryOperator::Create(I.getOpcode(), X, ConstantInt::get(Ty, AmtSum)); } - + if (ShiftAmt1 == ShiftAmt2) { // If we have ((X << C) >>u C), turn this into X & (-1 >>u C). if (I.getOpcode() == Instruction::LShr && @@ -605,7 +602,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, return NewLShr; } Value *Shift = Builder->CreateLShr(X, ShiftDiffCst); - + APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt2)); return BinaryOperator::CreateAnd(Shift, ConstantInt::get(I.getContext(),Mask)); @@ -653,12 +650,12 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, return NewShl; } Value *Shift = Builder->CreateShl(X, ShiftDiffCst); - + APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt2)); return BinaryOperator::CreateAnd(Shift, ConstantInt::get(I.getContext(),Mask)); } - + // We can't handle (X << C1) >>s C2, it shifts arbitrary bits in. However, // we can handle (X <<nsw C1) >>s C2 since it only shifts in sign bits. if (I.getOpcode() == Instruction::AShr && @@ -682,21 +679,21 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) { I.hasNoSignedWrap(), I.hasNoUnsignedWrap(), TD)) return ReplaceInstUsesWith(I, V); - + if (Instruction *V = commonShiftTransforms(I)) return V; - + if (ConstantInt *Op1C = dyn_cast<ConstantInt>(I.getOperand(1))) { unsigned ShAmt = Op1C->getZExtValue(); - + // If the shifted-out value is known-zero, then this is a NUW shift. - if (!I.hasNoUnsignedWrap() && + if (!I.hasNoUnsignedWrap() && MaskedValueIsZero(I.getOperand(0), APInt::getHighBitsSet(Op1C->getBitWidth(), ShAmt))) { I.setHasNoUnsignedWrap(); return &I; } - + // If the shifted out value is all signbits, this is a NSW shift. 
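// [Editorial sketch, not part of the patch: why "more sign bits than the
//  shift amount" makes a shl NSW. With at least ShAmt+1 sign bits, the bits
//  shifted out are copies of the sign, so the signed value cannot overflow.
//  Checked here exhaustively for i8 with ShAmt == 3; the type and amount are
//  illustrative only.]
#include <cassert>
#include <cstdint>

int main() {
  const int ShAmt = 3;
  // i8 values with at least ShAmt + 1 = 4 sign bits, i.e. the range [-16, 15].
  for (int x = -16; x <= 15; ++x) {
    int shifted = x * (1 << ShAmt);                      // the exact shl result
    assert(shifted >= INT8_MIN && shifted <= INT8_MAX);  // never wraps: NSW
  }
  // With only ShAmt sign bits the claim fails, e.g. x == 16 gives 128 > INT8_MAX.
  return 0;
}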
if (!I.hasNoSignedWrap() && ComputeNumSignBits(I.getOperand(0)) > ShAmt) { @@ -712,7 +709,7 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) { match(I.getOperand(1), m_Constant(C2))) return BinaryOperator::CreateShl(ConstantExpr::getShl(C1, C2), A); - return 0; + return 0; } Instruction *InstCombiner::visitLShr(BinaryOperator &I) { @@ -722,9 +719,9 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) { if (Instruction *R = commonShiftTransforms(I)) return R; - + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); - + if (ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) { unsigned ShAmt = Op1C->getZExtValue(); @@ -743,15 +740,15 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) { return new ZExtInst(Cmp, II->getType()); } } - + // If the shifted-out value is known-zero, then this is an exact shift. - if (!I.isExact() && + if (!I.isExact() && MaskedValueIsZero(Op0,APInt::getLowBitsSet(Op1C->getBitWidth(),ShAmt))){ I.setIsExact(); return &I; - } + } } - + return 0; } @@ -762,12 +759,12 @@ Instruction *InstCombiner::visitAShr(BinaryOperator &I) { if (Instruction *R = commonShiftTransforms(I)) return R; - + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); if (ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) { unsigned ShAmt = Op1C->getZExtValue(); - + // If the input is a SHL by the same constant (ashr (shl X, C), C), then we // have a sign-extend idiom. Value *X; @@ -791,23 +788,23 @@ Instruction *InstCombiner::visitAShr(BinaryOperator &I) { } // If the shifted-out value is known-zero, then this is an exact shift. - if (!I.isExact() && + if (!I.isExact() && MaskedValueIsZero(Op0,APInt::getLowBitsSet(Op1C->getBitWidth(),ShAmt))){ I.setIsExact(); return &I; } - } - + } + // See if we can turn a signed shr into an unsigned shr. if (MaskedValueIsZero(Op0, APInt::getSignBit(I.getType()->getScalarSizeInBits()))) return BinaryOperator::CreateLShr(Op0, Op1); - + // Arithmetic shifting an all-sign-bit value is a no-op. unsigned NumSignBits = ComputeNumSignBits(Op0); if (NumSignBits == Op0->getType()->getScalarSizeInBits()) return ReplaceInstUsesWith(I, Op0); - + return 0; } diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 54be8ed..8add1ea 100644 --- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -14,17 +14,18 @@ #include "InstCombine.h" -#include "llvm/Target/TargetData.h" -#include "llvm/IntrinsicInst.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Support/PatternMatch.h" using namespace llvm; +using namespace llvm::PatternMatch; - -/// ShrinkDemandedConstant - Check to see if the specified operand of the +/// ShrinkDemandedConstant - Check to see if the specified operand of the /// specified instruction is a constant integer. If so, check to see if there /// are any bits set in the constant that are not demanded. If so, shrink the /// constant and return true. 
-static bool ShrinkDemandedConstant(Instruction *I, unsigned OpNo, +static bool ShrinkDemandedConstant(Instruction *I, unsigned OpNo, APInt Demanded) { assert(I && "No instruction?"); assert(OpNo < I->getNumOperands() && "Operand index too large"); @@ -53,8 +54,8 @@ bool InstCombiner::SimplifyDemandedInstructionBits(Instruction &Inst) { unsigned BitWidth = Inst.getType()->getScalarSizeInBits(); APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); APInt DemandedMask(APInt::getAllOnesValue(BitWidth)); - - Value *V = SimplifyDemandedUseBits(&Inst, DemandedMask, + + Value *V = SimplifyDemandedUseBits(&Inst, DemandedMask, KnownZero, KnownOne, 0); if (V == 0) return false; if (V == &Inst) return true; @@ -65,7 +66,7 @@ bool InstCombiner::SimplifyDemandedInstructionBits(Instruction &Inst) { /// SimplifyDemandedBits - This form of SimplifyDemandedBits simplifies the /// specified instruction operand if possible, updating it in place. It returns /// true if it made any change and false otherwise. -bool InstCombiner::SimplifyDemandedBits(Use &U, APInt DemandedMask, +bool InstCombiner::SimplifyDemandedBits(Use &U, APInt DemandedMask, APInt &KnownZero, APInt &KnownOne, unsigned Depth) { Value *NewVal = SimplifyDemandedUseBits(U.get(), DemandedMask, @@ -86,7 +87,7 @@ bool InstCombiner::SimplifyDemandedBits(Use &U, APInt DemandedMask, /// to be one in the expression. KnownZero contains all the bits that are known /// to be zero in the expression. These are provided to potentially allow the /// caller (which might recursively be SimplifyDemandedBits itself) to simplify -/// the expression. KnownOne and KnownZero always follow the invariant that +/// the expression. KnownOne and KnownZero always follow the invariant that /// KnownOne & KnownZero == 0. That is, a bit can't be both 1 and 0. Note that /// the bits in KnownOne and KnownZero may only be accurate for those bits set /// in DemandedMask. Note also that the bitwidth of V, DemandedMask, KnownZero @@ -133,10 +134,10 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, return 0; return UndefValue::get(VTy); } - + if (Depth == 6) // Limit search depth. return 0; - + APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0); APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); @@ -158,61 +159,74 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // If either the LHS or the RHS are Zero, the result is zero. ComputeMaskedBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1); ComputeMaskedBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1); - + // If all of the demanded bits are known 1 on one side, return the other. // These bits cannot contribute to the result of the 'and' in this // context. - if ((DemandedMask & ~LHSKnownZero & RHSKnownOne) == + if ((DemandedMask & ~LHSKnownZero & RHSKnownOne) == (DemandedMask & ~LHSKnownZero)) return I->getOperand(0); - if ((DemandedMask & ~RHSKnownZero & LHSKnownOne) == + if ((DemandedMask & ~RHSKnownZero & LHSKnownOne) == (DemandedMask & ~RHSKnownZero)) return I->getOperand(1); - + // If all of the demanded bits in the inputs are known zeros, return zero. if ((DemandedMask & (RHSKnownZero|LHSKnownZero)) == DemandedMask) return Constant::getNullValue(VTy); - + } else if (I->getOpcode() == Instruction::Or) { // We can simplify (X|Y) -> X or Y in the user's context if we know that // only bits from X or Y are demanded. - + // If either the LHS or the RHS are One, the result is One. 
ComputeMaskedBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1); ComputeMaskedBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1); - + // If all of the demanded bits are known zero on one side, return the // other. These bits cannot contribute to the result of the 'or' in this // context. - if ((DemandedMask & ~LHSKnownOne & RHSKnownZero) == + if ((DemandedMask & ~LHSKnownOne & RHSKnownZero) == (DemandedMask & ~LHSKnownOne)) return I->getOperand(0); - if ((DemandedMask & ~RHSKnownOne & LHSKnownZero) == + if ((DemandedMask & ~RHSKnownOne & LHSKnownZero) == (DemandedMask & ~RHSKnownOne)) return I->getOperand(1); - + // If all of the potentially set bits on one side are known to be set on // the other side, just use the 'other' side. - if ((DemandedMask & (~RHSKnownZero) & LHSKnownOne) == + if ((DemandedMask & (~RHSKnownZero) & LHSKnownOne) == (DemandedMask & (~RHSKnownZero))) return I->getOperand(0); - if ((DemandedMask & (~LHSKnownZero) & RHSKnownOne) == + if ((DemandedMask & (~LHSKnownZero) & RHSKnownOne) == (DemandedMask & (~LHSKnownZero))) return I->getOperand(1); + } else if (I->getOpcode() == Instruction::Xor) { + // We can simplify (X^Y) -> X or Y in the user's context if we know that + // only bits from X or Y are demanded. + + ComputeMaskedBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1); + ComputeMaskedBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1); + + // If all of the demanded bits are known zero on one side, return the + // other. + if ((DemandedMask & RHSKnownZero) == DemandedMask) + return I->getOperand(0); + if ((DemandedMask & LHSKnownZero) == DemandedMask) + return I->getOperand(1); } - + // Compute the KnownZero/KnownOne bits to simplify things downstream. ComputeMaskedBits(I, KnownZero, KnownOne, Depth); return 0; } - + // If this is the root being simplified, allow it to have multiple uses, // just set the DemandedMask to all bits so that we can try to simplify the // operands. This allows visitTruncInst (for example) to simplify the // operand of a trunc without duplicating all the logic below. if (Depth == 0 && !V->hasOneUse()) DemandedMask = APInt::getAllOnesValue(BitWidth); - + switch (I->getOpcode()) { default: ComputeMaskedBits(I, KnownZero, KnownOne, Depth); @@ -224,26 +238,26 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, SimplifyDemandedBits(I->getOperandUse(0), DemandedMask & ~RHSKnownZero, LHSKnownZero, LHSKnownOne, Depth+1)) return I; - assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?"); - assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?"); + assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?"); + assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?"); // If all of the demanded bits are known 1 on one side, return the other. // These bits cannot contribute to the result of the 'and'. - if ((DemandedMask & ~LHSKnownZero & RHSKnownOne) == + if ((DemandedMask & ~LHSKnownZero & RHSKnownOne) == (DemandedMask & ~LHSKnownZero)) return I->getOperand(0); - if ((DemandedMask & ~RHSKnownZero & LHSKnownOne) == + if ((DemandedMask & ~RHSKnownZero & LHSKnownOne) == (DemandedMask & ~RHSKnownZero)) return I->getOperand(1); - + // If all of the demanded bits in the inputs are known zeros, return zero. if ((DemandedMask & (RHSKnownZero|LHSKnownZero)) == DemandedMask) return Constant::getNullValue(VTy); - + // If the RHS is a constant, see if we can simplify it. 
if (ShrinkDemandedConstant(I, 1, DemandedMask & ~LHSKnownZero)) return I; - + // Output known-1 bits are only known if set in both the LHS & RHS. KnownOne = RHSKnownOne & LHSKnownOne; // Output known-0 are known to be clear if zero in either the LHS | RHS. @@ -251,36 +265,36 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, break; case Instruction::Or: // If either the LHS or the RHS are One, the result is One. - if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, + if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, RHSKnownZero, RHSKnownOne, Depth+1) || - SimplifyDemandedBits(I->getOperandUse(0), DemandedMask & ~RHSKnownOne, + SimplifyDemandedBits(I->getOperandUse(0), DemandedMask & ~RHSKnownOne, LHSKnownZero, LHSKnownOne, Depth+1)) return I; - assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?"); - assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?"); - + assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?"); + assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?"); + // If all of the demanded bits are known zero on one side, return the other. // These bits cannot contribute to the result of the 'or'. - if ((DemandedMask & ~LHSKnownOne & RHSKnownZero) == + if ((DemandedMask & ~LHSKnownOne & RHSKnownZero) == (DemandedMask & ~LHSKnownOne)) return I->getOperand(0); - if ((DemandedMask & ~RHSKnownOne & LHSKnownZero) == + if ((DemandedMask & ~RHSKnownOne & LHSKnownZero) == (DemandedMask & ~RHSKnownOne)) return I->getOperand(1); // If all of the potentially set bits on one side are known to be set on // the other side, just use the 'other' side. - if ((DemandedMask & (~RHSKnownZero) & LHSKnownOne) == + if ((DemandedMask & (~RHSKnownZero) & LHSKnownOne) == (DemandedMask & (~RHSKnownZero))) return I->getOperand(0); - if ((DemandedMask & (~LHSKnownZero) & RHSKnownOne) == + if ((DemandedMask & (~LHSKnownZero) & RHSKnownOne) == (DemandedMask & (~LHSKnownZero))) return I->getOperand(1); - + // If the RHS is a constant, see if we can simplify it. if (ShrinkDemandedConstant(I, 1, DemandedMask)) return I; - + // Output known-0 bits are only known if clear in both the LHS & RHS. KnownZero = RHSKnownZero & LHSKnownZero; // Output known-1 are known to be set if set in either the LHS | RHS. @@ -289,34 +303,34 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, case Instruction::Xor: { if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, RHSKnownZero, RHSKnownOne, Depth+1) || - SimplifyDemandedBits(I->getOperandUse(0), DemandedMask, + SimplifyDemandedBits(I->getOperandUse(0), DemandedMask, LHSKnownZero, LHSKnownOne, Depth+1)) return I; - assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?"); - assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?"); - + assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?"); + assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?"); + // If all of the demanded bits are known zero on one side, return the other. // These bits cannot contribute to the result of the 'xor'. if ((DemandedMask & RHSKnownZero) == DemandedMask) return I->getOperand(0); if ((DemandedMask & LHSKnownZero) == DemandedMask) return I->getOperand(1); - + // If all of the demanded bits are known to be zero on one side or the // other, turn this into an *inclusive* or. // e.g. 
(A & C1)^(B & C2) -> (A & C1)|(B & C2) iff C1&C2 == 0 if ((DemandedMask & ~RHSKnownZero & ~LHSKnownZero) == 0) { - Instruction *Or = + Instruction *Or = BinaryOperator::CreateOr(I->getOperand(0), I->getOperand(1), I->getName()); return InsertNewInstWith(Or, *I); } - + // If all of the demanded bits on one side are known, and all of the set // bits on that side are also known to be set on the other side, turn this // into an AND, as we know the bits will be cleared. // e.g. (X | C1) ^ C2 --> (X | C1) & ~C2 iff (C1&C2) == C2 - if ((DemandedMask & (RHSKnownZero|RHSKnownOne)) == DemandedMask) { + if ((DemandedMask & (RHSKnownZero|RHSKnownOne)) == DemandedMask) { // all known if ((RHSKnownOne & LHSKnownOne) == RHSKnownOne) { Constant *AndC = Constant::getIntegerValue(VTy, @@ -325,12 +339,12 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, return InsertNewInstWith(And, *I); } } - + // If the RHS is a constant, see if we can simplify it. // FIXME: for XOR, we prefer to force bits to 1 if they will make a -1. if (ShrinkDemandedConstant(I, 1, DemandedMask)) return I; - + // If our LHS is an 'and' and if it has one use, and if any of the bits we // are flipping are known to be set, then the xor is just resetting those // bits to zero. We can just knock out bits from the 'and' and the 'xor', @@ -343,12 +357,12 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, ConstantInt *AndRHS = cast<ConstantInt>(LHSInst->getOperand(1)); ConstantInt *XorRHS = cast<ConstantInt>(I->getOperand(1)); APInt NewMask = ~(LHSKnownOne & RHSKnownOne & DemandedMask); - + Constant *AndC = ConstantInt::get(I->getType(), NewMask & AndRHS->getValue()); Instruction *NewAnd = BinaryOperator::CreateAnd(I->getOperand(0), AndC); InsertNewInstWith(NewAnd, *I); - + Constant *XorC = ConstantInt::get(I->getType(), NewMask & XorRHS->getValue()); Instruction *NewXor = BinaryOperator::CreateXor(NewAnd, XorC); @@ -364,17 +378,17 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, case Instruction::Select: if (SimplifyDemandedBits(I->getOperandUse(2), DemandedMask, RHSKnownZero, RHSKnownOne, Depth+1) || - SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, + SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, LHSKnownZero, LHSKnownOne, Depth+1)) return I; - assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?"); - assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?"); - + assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?"); + assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?"); + // If the operands are constants, see if we can simplify them. if (ShrinkDemandedConstant(I, 1, DemandedMask) || ShrinkDemandedConstant(I, 2, DemandedMask)) return I; - + // Only known if known in both the LHS and RHS. 
KnownOne = RHSKnownOne & LHSKnownOne; KnownZero = RHSKnownZero & LHSKnownZero; @@ -384,13 +398,13 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, DemandedMask = DemandedMask.zext(truncBf); KnownZero = KnownZero.zext(truncBf); KnownOne = KnownOne.zext(truncBf); - if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask, + if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask, KnownZero, KnownOne, Depth+1)) return I; DemandedMask = DemandedMask.trunc(BitWidth); KnownZero = KnownZero.trunc(BitWidth); KnownOne = KnownOne.trunc(BitWidth); - assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?"); + assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?"); break; } case Instruction::BitCast: @@ -413,12 +427,12 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask, KnownZero, KnownOne, Depth+1)) return I; - assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?"); + assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?"); break; case Instruction::ZExt: { // Compute the bits in the result that are not present in the input. unsigned SrcBitWidth =I->getOperand(0)->getType()->getScalarSizeInBits(); - + DemandedMask = DemandedMask.trunc(SrcBitWidth); KnownZero = KnownZero.trunc(SrcBitWidth); KnownOne = KnownOne.trunc(SrcBitWidth); @@ -428,7 +442,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, DemandedMask = DemandedMask.zext(BitWidth); KnownZero = KnownZero.zext(BitWidth); KnownOne = KnownOne.zext(BitWidth); - assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?"); + assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?"); // The top bits are known to be zero. KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth); break; @@ -436,8 +450,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, case Instruction::SExt: { // Compute the bits in the result that are not present in the input. unsigned SrcBitWidth =I->getOperand(0)->getType()->getScalarSizeInBits(); - - APInt InputDemandedBits = DemandedMask & + + APInt InputDemandedBits = DemandedMask & APInt::getLowBitsSet(BitWidth, SrcBitWidth); APInt NewBits(APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth)); @@ -445,7 +459,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // bit is demanded. if ((NewBits & DemandedMask) != 0) InputDemandedBits.setBit(SrcBitWidth-1); - + InputDemandedBits = InputDemandedBits.trunc(SrcBitWidth); KnownZero = KnownZero.trunc(SrcBitWidth); KnownOne = KnownOne.trunc(SrcBitWidth); @@ -455,8 +469,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, InputDemandedBits = InputDemandedBits.zext(BitWidth); KnownZero = KnownZero.zext(BitWidth); KnownOne = KnownOne.zext(BitWidth); - assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?"); - + assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?"); + // If the sign bit of the input is known set or clear, then we know the // top bits of the result. @@ -476,7 +490,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // are not demanded, then the add doesn't demand them from its input // either. unsigned NLZ = DemandedMask.countLeadingZeros(); - + // If there is a constant on the RHS, there are a variety of xformations // we can do. 
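The two xor rewrites used above, (A & C1) ^ (B & C2) -> (A & C1) | (B & C2) when C1 & C2 == 0 and (X | C1) ^ C2 -> (X | C1) & ~C2 when (C1 & C2) == C2, are plain bit identities. A minimal standalone C++ check of both, not part of the patch (the mask values are arbitrary illustrations):

    #include <cassert>
    #include <cstdint>

    int main() {
      // (A & C1) ^ (B & C2) == (A & C1) | (B & C2) whenever C1 & C2 == 0,
      // because the two masked values can never both have the same bit set.
      const uint8_t C1 = 0xF0, C2 = 0x0F;   // disjoint masks
      for (unsigned A = 0; A != 256; ++A)
        for (unsigned B = 0; B != 256; ++B)
          assert(((A & C1) ^ (B & C2)) == ((A & C1) | (B & C2)));

      // (X | D1) ^ D2 == (X | D1) & ~D2 whenever D2 is a subset of D1,
      // because every bit of D2 is already known to be one before the xor.
      const uint8_t D1 = 0xFC, D2 = 0x0C;   // D2 is a subset of D1
      for (unsigned X = 0; X != 256; ++X)
        assert((((X | D1) ^ D2) & 0xFFu) == (((X | D1) & (uint8_t)~D2) & 0xFFu));
      return 0;
    }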
if (ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1))) { @@ -484,13 +498,13 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // won't work if the RHS is zero. if (RHS->isZero()) break; - + // If the top bit of the output is demanded, demand everything from the // input. Otherwise, we demand all the input bits except NLZ top bits. APInt InDemandedBits(APInt::getLowBitsSet(BitWidth, BitWidth - NLZ)); // Find information about known zero/one bits in the input. - if (SimplifyDemandedBits(I->getOperandUse(0), InDemandedBits, + if (SimplifyDemandedBits(I->getOperandUse(0), InDemandedBits, LHSKnownZero, LHSKnownOne, Depth+1)) return I; @@ -498,11 +512,11 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // the constant. if (ShrinkDemandedConstant(I, 1, InDemandedBits)) return I; - + // Avoid excess work. if (LHSKnownZero == 0 && LHSKnownOne == 0) break; - + // Turn it into OR if input bits are zero. if ((LHSKnownZero & RHS->getValue()) == RHS->getValue()) { Instruction *Or = @@ -510,26 +524,26 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, I->getName()); return InsertNewInstWith(Or, *I); } - + // We can say something about the output known-zero and known-one bits, // depending on potential carries from the input constant and the // unknowns. For example if the LHS is known to have at most the 0x0F0F0 // bits set and the RHS constant is 0x01001, then we know we have a known // one mask of 0x00001 and a known zero mask of 0xE0F0E. - + // To compute this, we first compute the potential carry bits. These are // the bits which may be modified. I'm not aware of a better way to do // this scan. const APInt &RHSVal = RHS->getValue(); APInt CarryBits((~LHSKnownZero + RHSVal) ^ (~LHSKnownZero ^ RHSVal)); - + // Now that we know which bits have carries, compute the known-1/0 sets. - + // Bits are known one if they are known zero in one operand and one in the // other, and there is no input carry. - KnownOne = ((LHSKnownZero & RHSVal) | + KnownOne = ((LHSKnownZero & RHSVal) | (LHSKnownOne & ~RHSVal)) & ~CarryBits; - + // Bits are known zero if they are known zero in both operands and there // is no input carry. KnownZero = LHSKnownZero & ~RHSVal & ~CarryBits; @@ -580,17 +594,28 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, break; case Instruction::Shl: if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) { + { + Value *VarX; ConstantInt *C1; + if (match(I->getOperand(0), m_Shr(m_Value(VarX), m_ConstantInt(C1)))) { + Instruction *Shr = cast<Instruction>(I->getOperand(0)); + Value *R = SimplifyShrShlDemandedBits(Shr, I, DemandedMask, + KnownZero, KnownOne); + if (R) + return R; + } + } + uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1); APInt DemandedMaskIn(DemandedMask.lshr(ShiftAmt)); - + // If the shift is NUW/NSW, then it does demand the high bits. 
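The carry-bit reasoning in the Add case above can be checked with plain integer arithmetic. This sketch, which is not part of the patch, emulates the worked example from the comment (an i20 value whose LHS has at most the 0x0F0F0 bits set, added to the constant 0x01001) by masking 32-bit arithmetic down to 20 bits:

    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint32_t Mask = 0xFFFFF;             // pretend we are working on i20
      uint32_t LHSKnownZero = ~0x0F0F0u & Mask;  // LHS has at most bits 0x0F0F0 set
      uint32_t LHSKnownOne  = 0;                 // no bit of the LHS is known one
      uint32_t RHSVal       = 0x01001;           // the constant RHS of the add

      // Potential carry bits, computed exactly as in the transform above.
      uint32_t NotKZ = ~LHSKnownZero & Mask;
      uint32_t CarryBits = ((NotKZ + RHSVal) ^ (NotKZ ^ RHSVal)) & Mask;

      uint32_t KnownOne  = ((LHSKnownZero & RHSVal) | (LHSKnownOne & ~RHSVal))
                           & ~CarryBits & Mask;
      uint32_t KnownZero = LHSKnownZero & ~RHSVal & ~CarryBits & Mask;

      printf("KnownOne  = 0x%05X\n", (unsigned)KnownOne);   // expect 0x00001
      printf("KnownZero = 0x%05X\n", (unsigned)KnownZero);  // expect 0xE0F0E
      return 0;
    }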
ShlOperator *IOp = cast<ShlOperator>(I); if (IOp->hasNoSignedWrap()) DemandedMaskIn |= APInt::getHighBitsSet(BitWidth, ShiftAmt+1); else if (IOp->hasNoUnsignedWrap()) DemandedMaskIn |= APInt::getHighBitsSet(BitWidth, ShiftAmt); - - if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn, + + if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn, KnownZero, KnownOne, Depth+1)) return I; assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?"); @@ -605,15 +630,15 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // For a logical shift right if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) { uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1); - + // Unsigned shift right. APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt)); - + // If the shift is exact, then it does demand the low bits (and knows that // they are zero). if (cast<LShrOperator>(I)->isExact()) DemandedMaskIn |= APInt::getLowBitsSet(BitWidth, ShiftAmt); - + if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn, KnownZero, KnownOne, Depth+1)) return I; @@ -637,28 +662,28 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, Instruction *NewVal = BinaryOperator::CreateLShr( I->getOperand(0), I->getOperand(1), I->getName()); return InsertNewInstWith(NewVal, *I); - } + } // If the sign bit is the only bit demanded by this ashr, then there is no // need to do it, the shift doesn't change the high bit. if (DemandedMask.isSignBit()) return I->getOperand(0); - + if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) { uint32_t ShiftAmt = SA->getLimitedValue(BitWidth-1); - + // Signed shift right. APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt)); // If any of the "high bits" are demanded, we should set the sign bit as // demanded. if (DemandedMask.countLeadingZeros() <= ShiftAmt) DemandedMaskIn.setBit(BitWidth-1); - + // If the shift is exact, then it does demand the low bits (and knows that // they are zero). if (cast<AShrOperator>(I)->isExact()) DemandedMaskIn |= APInt::getLowBitsSet(BitWidth, ShiftAmt); - + if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn, KnownZero, KnownOne, Depth+1)) return I; @@ -667,15 +692,15 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, APInt HighBits(APInt::getHighBitsSet(BitWidth, ShiftAmt)); KnownZero = APIntOps::lshr(KnownZero, ShiftAmt); KnownOne = APIntOps::lshr(KnownOne, ShiftAmt); - + // Handle the sign bits. APInt SignBit(APInt::getSignBit(BitWidth)); // Adjust to where it is now in the mask. - SignBit = APIntOps::lshr(SignBit, ShiftAmt); - + SignBit = APIntOps::lshr(SignBit, ShiftAmt); + // If the input sign bit is known to be zero, or if none of the top bits // are demanded, turn this into an unsigned shift right. - if (BitWidth <= ShiftAmt || KnownZero[BitWidth-ShiftAmt-1] || + if (BitWidth <= ShiftAmt || KnownZero[BitWidth-ShiftAmt-1] || (HighBits & ~DemandedMask) == HighBits) { // Perform the logical shift right. 
BinaryOperator *NewVal = BinaryOperator::CreateLShr(I->getOperand(0), @@ -718,7 +743,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (LHSKnownOne[BitWidth-1] && ((LHSKnownOne & LowBits) != 0)) KnownOne |= ~LowBits; - assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?"); + assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?"); } } @@ -756,7 +781,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // just shift the input byte into position to eliminate the bswap. unsigned NLZ = DemandedMask.countLeadingZeros(); unsigned NTZ = DemandedMask.countTrailingZeros(); - + // Round NTZ down to the next byte. If we have 11 trailing zeros, then // we need all the bits down to bit 8. Likewise, round NLZ. If we // have 14 leading zeros, round to 8. @@ -766,7 +791,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (BitWidth-NLZ-NTZ == 8) { unsigned ResultBit = NTZ; unsigned InputBit = BitWidth-NTZ-8; - + // Replace this with either a left or right shift to get the byte into // the right place. Instruction *NewVal; @@ -779,7 +804,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, NewVal->takeName(I); return InsertNewInstWith(NewVal, *I); } - + // TODO: Could compute known zero/one bits based on the input. break; } @@ -792,7 +817,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, ComputeMaskedBits(V, KnownZero, KnownOne, Depth); break; } - + // If the client is only demanding bits that we know, return the known // constant. if ((DemandedMask & (KnownZero|KnownOne)) == DemandedMask) @@ -800,6 +825,81 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, return 0; } +/// Helper routine of SimplifyDemandedUseBits. It tries to simplify +/// "E1 = (X lsr C1) << C2", where the C1 and C2 are constant, into +/// "E2 = X << (C2 - C1)" or "E2 = X >> (C1 - C2)", depending on the sign +/// of "C2-C1". +/// +/// Suppose E1 and E2 are generally different in bits S={bm, bm+1, +/// ..., bn}, without considering the specific value X is holding. +/// This transformation is legal iff one of following conditions is hold: +/// 1) All the bit in S are 0, in this case E1 == E2. +/// 2) We don't care those bits in S, per the input DemandedMask. +/// 3) Combination of 1) and 2). Some bits in S are 0, and we don't care the +/// rest bits. +/// +/// Currently we only test condition 2). +/// +/// As with SimplifyDemandedUseBits, it returns NULL if the simplification was +/// not successful. +Value *InstCombiner::SimplifyShrShlDemandedBits(Instruction *Shr, + Instruction *Shl, APInt DemandedMask, APInt &KnownZero, APInt &KnownOne) { + + unsigned ShlAmt = cast<ConstantInt>(Shl->getOperand(1))->getZExtValue(); + unsigned ShrAmt = cast<ConstantInt>(Shr->getOperand(1))->getZExtValue(); + + KnownOne.clearAllBits(); + KnownZero = APInt::getBitsSet(KnownZero.getBitWidth(), 0, ShlAmt-1); + KnownZero &= DemandedMask; + + if (ShlAmt == 0 || ShrAmt == 0) + return 0; + + Value *VarX = Shr->getOperand(0); + Type *Ty = VarX->getType(); + + APInt BitMask1(APInt::getAllOnesValue(Ty->getIntegerBitWidth())); + APInt BitMask2(APInt::getAllOnesValue(Ty->getIntegerBitWidth())); + + bool isLshr = (Shr->getOpcode() == Instruction::LShr); + BitMask1 = isLshr ? (BitMask1.lshr(ShrAmt) << ShlAmt) : + (BitMask1.ashr(ShrAmt) << ShlAmt); + + if (ShrAmt <= ShlAmt) { + BitMask2 <<= (ShlAmt - ShrAmt); + } else { + BitMask2 = isLshr ? 
BitMask2.lshr(ShrAmt - ShlAmt): + BitMask2.ashr(ShrAmt - ShlAmt); + } + + // Check if condition-2 (see the comment to this function) is satified. + if ((BitMask1 & DemandedMask) == (BitMask2 & DemandedMask)) { + if (ShrAmt == ShlAmt) + return VarX; + + if (!Shr->hasOneUse()) + return 0; + + BinaryOperator *New; + if (ShrAmt < ShlAmt) { + Constant *Amt = ConstantInt::get(VarX->getType(), ShlAmt - ShrAmt); + New = BinaryOperator::CreateShl(VarX, Amt); + BinaryOperator *Orig = cast<BinaryOperator>(Shl); + New->setHasNoSignedWrap(Orig->hasNoSignedWrap()); + New->setHasNoUnsignedWrap(Orig->hasNoUnsignedWrap()); + } else { + Constant *Amt = ConstantInt::get(VarX->getType(), ShrAmt - ShlAmt); + New = isLshr ? BinaryOperator::CreateLShr(VarX, Amt) : + BinaryOperator::CreateAShr(VarX, Amt); + if (cast<BinaryOperator>(Shr)->isExact()) + New->setIsExact(true); + } + + return InsertNewInstWith(New, *Shl); + } + + return 0; +} /// SimplifyDemandedVectorElts - The specified value produces a vector with /// any number of elements. DemandedElts contains the set of elements that are @@ -821,14 +921,14 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, UndefElts = EltMask; return 0; } - + if (DemandedElts == 0) { // If nothing is demanded, provide undef. UndefElts = EltMask; return UndefValue::get(V->getType()); } UndefElts = 0; - + // Handle ConstantAggregateZero, ConstantVector, ConstantDataSequential. if (Constant *C = dyn_cast<Constant>(V)) { // Check if this is identity. If so, return 0 since we are not simplifying @@ -838,7 +938,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, Type *EltTy = cast<VectorType>(V->getType())->getElementType(); Constant *Undef = UndefValue::get(EltTy); - + SmallVector<Constant*, 16> Elts; for (unsigned i = 0; i != VWidth; ++i) { if (!DemandedElts[i]) { // If not demanded, set to undef. @@ -846,10 +946,10 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, UndefElts.setBit(i); continue; } - + Constant *Elt = C->getAggregateElement(i); if (Elt == 0) return 0; - + if (isa<UndefValue>(Elt)) { // Already undef. Elts.push_back(Undef); UndefElts.setBit(i); @@ -857,12 +957,12 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, Elts.push_back(Elt); } } - + // If we changed the constant, return it. Constant *NewCV = ConstantVector::get(Elts); return NewCV != C ? NewCV : 0; } - + // Limit search depth. if (Depth == 10) return 0; @@ -881,16 +981,16 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, // Conservatively assume that all elements are needed. DemandedElts = EltMask; } - + Instruction *I = dyn_cast<Instruction>(V); if (!I) return 0; // Only analyze instructions. - + bool MadeChange = false; APInt UndefElts2(VWidth, 0); Value *TmpV; switch (I->getOpcode()) { default: break; - + case Instruction::InsertElement: { // If this is a variable index, we don't know which element it overwrites. // demand exactly the same input as we produce. @@ -903,7 +1003,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; } break; } - + // If this is inserting an element that isn't demanded, remove this // insertelement. 
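As a concrete instance of the SimplifyShrShlDemandedBits transform introduced above: with a logical ShrAmt of 2 and a ShlAmt of 4 on i8, (X lshr 2) shl 4 and X shl (4 - 2) can only differ in bits 2 and 3, so the rewrite is legal whenever those bits are not demanded (condition 2). A brute-force check in plain C++, not part of the patch:

    #include <cassert>
    #include <cstdint>

    int main() {
      // Bits 2 and 3, the only bits where the two forms can disagree,
      // are left out of the demanded mask.
      const uint8_t DemandedMask = 0xF0;
      for (unsigned X = 0; X != 256; ++X) {
        uint8_t E1 = (uint8_t)(((uint8_t)X >> 2) << 4); // (X lshr 2) shl 4
        uint8_t E2 = (uint8_t)((uint8_t)X << 2);        // X shl (4 - 2)
        assert((E1 & DemandedMask) == (E2 & DemandedMask));
      }
      return 0;
    }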
unsigned IdxNo = Idx->getZExtValue(); @@ -911,7 +1011,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, Worklist.Add(I); return I->getOperand(0); } - + // Otherwise, the element inserted overwrites whatever was there, so the // input demanded set is simpler than the output set. APInt DemandedElts2 = DemandedElts; @@ -1007,7 +1107,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, TmpV = SimplifyDemandedVectorElts(I->getOperand(2), RightDemanded, UndefElts2, Depth+1); if (TmpV) { I->setOperand(2, TmpV); MadeChange = true; } - + // Output elements are undefined if both are undefined. UndefElts &= UndefElts2; break; @@ -1028,7 +1128,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, } else if (VWidth > InVWidth) { // Untested so far. break; - + // If there are more elements in the result than there are in the source, // then an input element is live if any of the corresponding output // elements are live. @@ -1040,7 +1140,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, } else { // Untested so far. break; - + // If there are more elements in the source than there are in the result, // then an input element is live if the corresponding output element is // live. @@ -1049,7 +1149,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, if (DemandedElts[InIdx/Ratio]) InputDemandedElts.setBit(InIdx); } - + // div/rem demand all inputs, because they don't want divide by zero. TmpV = SimplifyDemandedVectorElts(I->getOperand(0), InputDemandedElts, UndefElts2, Depth+1); @@ -1057,7 +1157,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, I->setOperand(0, TmpV); MadeChange = true; } - + UndefElts = UndefElts2; if (VWidth > InVWidth) { llvm_unreachable("Unimp"); @@ -1092,7 +1192,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, TmpV = SimplifyDemandedVectorElts(I->getOperand(1), DemandedElts, UndefElts2, Depth+1); if (TmpV) { I->setOperand(1, TmpV); MadeChange = true; } - + // Output elements are undefined if both are undefined. Consider things // like undef&0. The result is known zero, not undef. UndefElts &= UndefElts2; @@ -1103,13 +1203,13 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, UndefElts, Depth+1); if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; } break; - + case Instruction::Call: { IntrinsicInst *II = dyn_cast<IntrinsicInst>(I); if (!II) break; switch (II->getIntrinsicID()) { default: break; - + // Binary vector operations that work column-wise. A dest element is a // function of the corresponding input elements from the two inputs. case Intrinsic::x86_sse_sub_ss: @@ -1140,11 +1240,11 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, Value *LHS = II->getArgOperand(0); Value *RHS = II->getArgOperand(1); // Extract the element as scalars. 
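The insertelement rule near the top of this hunk (an insert into a lane that is never demanded can simply be dropped) has a direct scalar analogue. A small C++ sketch, not part of the patch, with arbitrary sample values:

    #include <cassert>

    int main() {
      // Model: %r = insertelement <4 x i32> %v, i32 %x, i32 2,
      // where lane 2 is never demanded by any user of %r.
      int V[4] = {10, 11, 12, 13};
      int X = 99;
      int R[4] = {V[0], V[1], X, V[3]};            // the insert into lane 2
      const bool Demanded[4] = {true, true, false, true};
      for (int i = 0; i != 4; ++i)
        if (Demanded[i])
          assert(R[i] == V[i]);  // every demanded lane is unchanged, so %r
                                 // can be replaced by %v and the insert removed
      return 0;
    }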
- LHS = InsertNewInstWith(ExtractElementInst::Create(LHS, + LHS = InsertNewInstWith(ExtractElementInst::Create(LHS, ConstantInt::get(Type::getInt32Ty(I->getContext()), 0U)), *II); RHS = InsertNewInstWith(ExtractElementInst::Create(RHS, ConstantInt::get(Type::getInt32Ty(I->getContext()), 0U)), *II); - + switch (II->getIntrinsicID()) { default: llvm_unreachable("Case stmts out of sync!"); case Intrinsic::x86_sse_sub_ss: @@ -1158,7 +1258,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, II->getName()), *II); break; } - + Instruction *New = InsertElementInst::Create( UndefValue::get(II->getType()), TmpV, @@ -1166,9 +1266,9 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, II->getName()); InsertNewInstWith(New, *II); return New; - } + } } - + // Output elements are undefined if both are undefined. Consider things // like undef&0. The result is known zero, not undef. UndefElts &= UndefElts2; diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index cf60f0f..dd7ea14 100644 --- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -636,8 +636,11 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { // If LHS's width is changed, shift the mask value accordingly. // If newRHS == NULL, i.e. LHSOp0 == RHSOp0, we want to remap any - // references to RHSOp0 to LHSOp0, so we don't need to shift the mask. - if (eltMask >= 0 && newRHS != NULL) + // references from RHSOp0 to LHSOp0, so we don't need to shift the mask. + // If newRHS == newLHS, we want to remap any references from newRHS to + // newLHS so that we can properly identify splats that may occur due to + // obfuscation accross the two vectors. 
+ if (eltMask >= 0 && newRHS != NULL && newLHS != newRHS) eltMask += newLHSWidth; } diff --git a/lib/Transforms/InstCombine/InstCombineWorklist.h b/lib/Transforms/InstCombine/InstCombineWorklist.h index 99a02fc..57ed9e3 100644 --- a/lib/Transforms/InstCombine/InstCombineWorklist.h +++ b/lib/Transforms/InstCombine/InstCombineWorklist.h @@ -11,11 +11,11 @@ #define INSTCOMBINE_WORKLIST_H #define DEBUG_TYPE "instcombine" -#include "llvm/Instruction.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/Compiler.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/Instruction.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" namespace llvm { @@ -26,8 +26,8 @@ class LLVM_LIBRARY_VISIBILITY InstCombineWorklist { SmallVector<Instruction*, 256> Worklist; DenseMap<Instruction*, unsigned> WorklistMap; - void operator=(const InstCombineWorklist&RHS); // DO NOT IMPLEMENT - InstCombineWorklist(const InstCombineWorklist&); // DO NOT IMPLEMENT + void operator=(const InstCombineWorklist&RHS) LLVM_DELETED_FUNCTION; + InstCombineWorklist(const InstCombineWorklist&) LLVM_DELETED_FUNCTION; public: InstCombineWorklist() {} diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index ff758c4..6f24cdd 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -36,22 +36,23 @@ #define DEBUG_TYPE "instcombine" #include "llvm/Transforms/Scalar.h" #include "InstCombine.h" -#include "llvm/IntrinsicInst.h" +#include "llvm-c/Initialization.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/MemoryBuiltins.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetLibraryInfo.h" -#include "llvm/Transforms/Utils/Local.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/CFG.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/GetElementPtrTypeIterator.h" #include "llvm/Support/PatternMatch.h" #include "llvm/Support/ValueHandle.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/StringSwitch.h" -#include "llvm-c/Initialization.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> #include <climits> using namespace llvm; @@ -65,6 +66,11 @@ STATISTIC(NumExpand, "Number of expansions"); STATISTIC(NumFactor , "Number of factorizations"); STATISTIC(NumReassoc , "Number of reassociations"); +static cl::opt<bool> UnsafeFPShrink("enable-double-float-shrink", cl::Hidden, + cl::init(false), + cl::desc("Enable unsafe double to float " + "shrinking for math lib calls")); + // Initialization Routines void llvm::initializeInstCombine(PassRegistry &Registry) { initializeInstCombinerPass(Registry); @@ -88,7 +94,7 @@ void InstCombiner::getAnalysisUsage(AnalysisUsage &AU) const { Value *InstCombiner::EmitGEPOffset(User *GEP) { - return llvm::EmitGEPOffset(Builder, *getTargetData(), GEP); + return llvm::EmitGEPOffset(Builder, *getDataLayout(), GEP); } /// ShouldChangeType - Return true if it is desirable to convert a computation @@ -805,6 +811,244 @@ static bool shouldMergeGEPs(GEPOperator &GEP, GEPOperator &Src) { return true; 
} +/// Descale - Return a value X such that Val = X * Scale, or null if none. If +/// the multiplication is known not to overflow then NoSignedWrap is set. +Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { + assert(isa<IntegerType>(Val->getType()) && "Can only descale integers!"); + assert(cast<IntegerType>(Val->getType())->getBitWidth() == + Scale.getBitWidth() && "Scale not compatible with value!"); + + // If Val is zero or Scale is one then Val = Val * Scale. + if (match(Val, m_Zero()) || Scale == 1) { + NoSignedWrap = true; + return Val; + } + + // If Scale is zero then it does not divide Val. + if (Scale.isMinValue()) + return 0; + + // Look through chains of multiplications, searching for a constant that is + // divisible by Scale. For example, descaling X*(Y*(Z*4)) by a factor of 4 + // will find the constant factor 4 and produce X*(Y*Z). Descaling X*(Y*8) by + // a factor of 4 will produce X*(Y*2). The principle of operation is to bore + // down from Val: + // + // Val = M1 * X || Analysis starts here and works down + // M1 = M2 * Y || Doesn't descend into terms with more + // M2 = Z * 4 \/ than one use + // + // Then to modify a term at the bottom: + // + // Val = M1 * X + // M1 = Z * Y || Replaced M2 with Z + // + // Then to work back up correcting nsw flags. + + // Op - the term we are currently analyzing. Starts at Val then drills down. + // Replaced with its descaled value before exiting from the drill down loop. + Value *Op = Val; + + // Parent - initially null, but after drilling down notes where Op came from. + // In the example above, Parent is (Val, 0) when Op is M1, because M1 is the + // 0'th operand of Val. + std::pair<Instruction*, unsigned> Parent; + + // RequireNoSignedWrap - Set if the transform requires a descaling at deeper + // levels that doesn't overflow. + bool RequireNoSignedWrap = false; + + // logScale - log base 2 of the scale. Negative if not a power of 2. + int32_t logScale = Scale.exactLogBase2(); + + for (;; Op = Parent.first->getOperand(Parent.second)) { // Drill down + + if (ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { + // If Op is a constant divisible by Scale then descale to the quotient. + APInt Quotient(Scale), Remainder(Scale); // Init ensures right bitwidth. + APInt::sdivrem(CI->getValue(), Scale, Quotient, Remainder); + if (!Remainder.isMinValue()) + // Not divisible by Scale. + return 0; + // Replace with the quotient in the parent. + Op = ConstantInt::get(CI->getType(), Quotient); + NoSignedWrap = true; + break; + } + + if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op)) { + + if (BO->getOpcode() == Instruction::Mul) { + // Multiplication. + NoSignedWrap = BO->hasNoSignedWrap(); + if (RequireNoSignedWrap && !NoSignedWrap) + return 0; + + // There are three cases for multiplication: multiplication by exactly + // the scale, multiplication by a constant different to the scale, and + // multiplication by something else. + Value *LHS = BO->getOperand(0); + Value *RHS = BO->getOperand(1); + + if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) { + // Multiplication by a constant. + if (CI->getValue() == Scale) { + // Multiplication by exactly the scale, replace the multiplication + // by its left-hand side in the parent. + Op = LHS; + break; + } + + // Otherwise drill down into the constant. + if (!Op->hasOneUse()) + return 0; + + Parent = std::make_pair(BO, 1); + continue; + } + + // Multiplication by something else. 
Drill down into the left-hand side + // since that's where the reassociate pass puts the good stuff. + if (!Op->hasOneUse()) + return 0; + + Parent = std::make_pair(BO, 0); + continue; + } + + if (logScale > 0 && BO->getOpcode() == Instruction::Shl && + isa<ConstantInt>(BO->getOperand(1))) { + // Multiplication by a power of 2. + NoSignedWrap = BO->hasNoSignedWrap(); + if (RequireNoSignedWrap && !NoSignedWrap) + return 0; + + Value *LHS = BO->getOperand(0); + int32_t Amt = cast<ConstantInt>(BO->getOperand(1))-> + getLimitedValue(Scale.getBitWidth()); + // Op = LHS << Amt. + + if (Amt == logScale) { + // Multiplication by exactly the scale, replace the multiplication + // by its left-hand side in the parent. + Op = LHS; + break; + } + if (Amt < logScale || !Op->hasOneUse()) + return 0; + + // Multiplication by more than the scale. Reduce the multiplying amount + // by the scale in the parent. + Parent = std::make_pair(BO, 1); + Op = ConstantInt::get(BO->getType(), Amt - logScale); + break; + } + } + + if (!Op->hasOneUse()) + return 0; + + if (CastInst *Cast = dyn_cast<CastInst>(Op)) { + if (Cast->getOpcode() == Instruction::SExt) { + // Op is sign-extended from a smaller type, descale in the smaller type. + unsigned SmallSize = Cast->getSrcTy()->getPrimitiveSizeInBits(); + APInt SmallScale = Scale.trunc(SmallSize); + // Suppose Op = sext X, and we descale X as Y * SmallScale. We want to + // descale Op as (sext Y) * Scale. In order to have + // sext (Y * SmallScale) = (sext Y) * Scale + // some conditions need to hold however: SmallScale must sign-extend to + // Scale and the multiplication Y * SmallScale should not overflow. + if (SmallScale.sext(Scale.getBitWidth()) != Scale) + // SmallScale does not sign-extend to Scale. + return 0; + assert(SmallScale.exactLogBase2() == logScale); + // Require that Y * SmallScale must not overflow. + RequireNoSignedWrap = true; + + // Drill down through the cast. + Parent = std::make_pair(Cast, 0); + Scale = SmallScale; + continue; + } + + if (Cast->getOpcode() == Instruction::Trunc) { + // Op is truncated from a larger type, descale in the larger type. + // Suppose Op = trunc X, and we descale X as Y * sext Scale. Then + // trunc (Y * sext Scale) = (trunc Y) * Scale + // always holds. However (trunc Y) * Scale may overflow even if + // trunc (Y * sext Scale) does not, so nsw flags need to be cleared + // from this point up in the expression (see later). + if (RequireNoSignedWrap) + return 0; + + // Drill down through the cast. + unsigned LargeSize = Cast->getSrcTy()->getPrimitiveSizeInBits(); + Parent = std::make_pair(Cast, 0); + Scale = Scale.sext(LargeSize); + if (logScale + 1 == (int32_t)Cast->getType()->getPrimitiveSizeInBits()) + logScale = -1; + assert(Scale.exactLogBase2() == logScale); + continue; + } + } + + // Unsupported expression, bail out. + return 0; + } + + // We know that we can successfully descale, so from here on we can safely + // modify the IR. Op holds the descaled version of the deepest term in the + // expression. NoSignedWrap is 'true' if multiplying Op by Scale is known + // not to overflow. + + if (!Parent.first) + // The expression only had one term. + return Op; + + // Rewrite the parent using the descaled version of its operand. 
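Descale's drill-down can be sanity-checked arithmetically: descaling X*(Y*(Z*4)) by 4 rewrites the innermost term and yields X*(Y*Z), and scaling the result back up must reproduce the original value. A trivial standalone check with arbitrary sample values, not part of the patch:

    #include <cassert>
    #include <cstdint>

    int main() {
      int64_t X = 3, Y = 5, Z = 7, Scale = 4;
      int64_t Original = X * (Y * (Z * Scale));  // Val before descaling
      int64_t Descaled = X * (Y * Z);            // what Descale produces
      assert(Descaled * Scale == Original);
      return 0;
    }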
+ assert(Parent.first->hasOneUse() && "Drilled down when more than one use!"); + assert(Op != Parent.first->getOperand(Parent.second) && + "Descaling was a no-op?"); + Parent.first->setOperand(Parent.second, Op); + Worklist.Add(Parent.first); + + // Now work back up the expression correcting nsw flags. The logic is based + // on the following observation: if X * Y is known not to overflow as a signed + // multiplication, and Y is replaced by a value Z with smaller absolute value, + // then X * Z will not overflow as a signed multiplication either. As we work + // our way up, having NoSignedWrap 'true' means that the descaled value at the + // current level has strictly smaller absolute value than the original. + Instruction *Ancestor = Parent.first; + do { + if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Ancestor)) { + // If the multiplication wasn't nsw then we can't say anything about the + // value of the descaled multiplication, and we have to clear nsw flags + // from this point on up. + bool OpNoSignedWrap = BO->hasNoSignedWrap(); + NoSignedWrap &= OpNoSignedWrap; + if (NoSignedWrap != OpNoSignedWrap) { + BO->setHasNoSignedWrap(NoSignedWrap); + Worklist.Add(Ancestor); + } + } else if (Ancestor->getOpcode() == Instruction::Trunc) { + // The fact that the descaled input to the trunc has smaller absolute + // value than the original input doesn't tell us anything useful about + // the absolute values of the truncations. + NoSignedWrap = false; + } + assert((Ancestor->getOpcode() != Instruction::SExt || NoSignedWrap) && + "Failed to keep proper track of nsw flags while drilling down?"); + + if (Ancestor == Val) + // Got to the top, all done! + return Val; + + // Move up one level in the expression. + assert(Ancestor->hasOneUse() && "Drilled down when more than one use!"); + Ancestor = Ancestor->use_back(); + } while (1); +} + Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { SmallVector<Value*, 8> Ops(GEP.op_begin(), GEP.op_end()); @@ -817,7 +1061,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // by multiples of a zero size type with zero. if (TD) { bool MadeChange = false; - Type *IntPtrTy = TD->getIntPtrType(GEP.getContext()); + Type *IntPtrTy = TD->getIntPtrType(GEP.getPointerOperandType()); gep_type_iterator GTI = gep_type_begin(GEP); for (User::op_iterator I = GEP.op_begin() + 1, E = GEP.op_end(); @@ -836,7 +1080,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { } Type *IndexTy = (*I)->getType(); - if (IndexTy != IntPtrTy && !IndexTy->isVectorTy()) { + if (IndexTy != IntPtrTy) { // If we are using a wider index than needed for this platform, shrink // it to what we need. If narrower, sign-extend it to what we need. // This explicit cast can make subsequent optimizations more obvious. @@ -855,7 +1099,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { if (!shouldMergeGEPs(*cast<GEPOperator>(&GEP), *Src)) return 0; - // Note that if our source is a gep chain itself that we wait for that + // Note that if our source is a gep chain itself then we wait for that // chain to be resolved before we perform this transformation. This // avoids us creating a TON of code in some cases. 
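The nsw bookkeeping above rests on the observation that if X*Y does not overflow as a signed multiplication and Z has strictly smaller absolute value than Y, then X*Z cannot overflow either. A brute-force check of that observation over i8, in plain C++ and not part of the patch:

    #include <cassert>
    #include <cstdlib>

    int main() {
      for (int X = -128; X != 128; ++X)
        for (int Y = -128; Y != 128; ++Y) {
          int XY = X * Y;
          if (XY < -128 || XY > 127)
            continue;                      // X*Y overflows i8; nothing is claimed
          for (int Z = -128; Z != 128; ++Z) {
            if (std::abs(Z) >= std::abs(Y))
              continue;                    // only strictly smaller |Z| qualifies
            int XZ = X * Z;
            assert(XZ >= -128 && XZ <= 127 && "observation violated");
          }
        }
      return 0;
    }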
if (GEPOperator *SrcGEP = @@ -987,63 +1231,74 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { } // Transform things like: + // %V = mul i64 %N, 4 + // %t = getelementptr i8* bitcast (i32* %arr to i8*), i32 %V + // into: %t1 = getelementptr i32* %arr, i32 %N; bitcast + if (TD && ResElTy->isSized() && SrcElTy->isSized()) { + // Check that changing the type amounts to dividing the index by a scale + // factor. + uint64_t ResSize = TD->getTypeAllocSize(ResElTy); + uint64_t SrcSize = TD->getTypeAllocSize(SrcElTy); + if (ResSize && SrcSize % ResSize == 0) { + Value *Idx = GEP.getOperand(1); + unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits(); + uint64_t Scale = SrcSize / ResSize; + + // Earlier transforms ensure that the index has type IntPtrType, which + // considerably simplifies the logic by eliminating implicit casts. + assert(Idx->getType() == TD->getIntPtrType(GEP.getContext()) && + "Index not cast to pointer width?"); + + bool NSW; + if (Value *NewIdx = Descale(Idx, APInt(BitWidth, Scale), NSW)) { + // Successfully decomposed Idx as NewIdx * Scale, form a new GEP. + // If the multiplication NewIdx * Scale may overflow then the new + // GEP may not be "inbounds". + Value *NewGEP = GEP.isInBounds() && NSW ? + Builder->CreateInBoundsGEP(StrippedPtr, NewIdx, GEP.getName()) : + Builder->CreateGEP(StrippedPtr, NewIdx, GEP.getName()); + // The NewGEP must be pointer typed, so must the old one -> BitCast + return new BitCastInst(NewGEP, GEP.getType()); + } + } + } + + // Similarly, transform things like: // getelementptr i8* bitcast ([100 x double]* X to i8*), i32 %tmp // (where tmp = 8*tmp2) into: // getelementptr [100 x double]* %arr, i32 0, i32 %tmp2; bitcast - - if (TD && SrcElTy->isArrayTy() && ResElTy->isIntegerTy(8)) { + if (TD && ResElTy->isSized() && SrcElTy->isSized() && + SrcElTy->isArrayTy()) { + // Check that changing to the array element type amounts to dividing the + // index by a scale factor. + uint64_t ResSize = TD->getTypeAllocSize(ResElTy); uint64_t ArrayEltSize = - TD->getTypeAllocSize(cast<ArrayType>(SrcElTy)->getElementType()); - - // Check to see if "tmp" is a scale by a multiple of ArrayEltSize. We - // allow either a mul, shift, or constant here. - Value *NewIdx = 0; - ConstantInt *Scale = 0; - if (ArrayEltSize == 1) { - NewIdx = GEP.getOperand(1); - Scale = ConstantInt::get(cast<IntegerType>(NewIdx->getType()), 1); - } else if (ConstantInt *CI = dyn_cast<ConstantInt>(GEP.getOperand(1))) { - NewIdx = ConstantInt::get(CI->getType(), 1); - Scale = CI; - } else if (Instruction *Inst =dyn_cast<Instruction>(GEP.getOperand(1))){ - if (Inst->getOpcode() == Instruction::Shl && - isa<ConstantInt>(Inst->getOperand(1))) { - ConstantInt *ShAmt = cast<ConstantInt>(Inst->getOperand(1)); - uint32_t ShAmtVal = ShAmt->getLimitedValue(64); - Scale = ConstantInt::get(cast<IntegerType>(Inst->getType()), - 1ULL << ShAmtVal); - NewIdx = Inst->getOperand(0); - } else if (Inst->getOpcode() == Instruction::Mul && - isa<ConstantInt>(Inst->getOperand(1))) { - Scale = cast<ConstantInt>(Inst->getOperand(1)); - NewIdx = Inst->getOperand(0); + TD->getTypeAllocSize(cast<ArrayType>(SrcElTy)->getElementType()); + if (ResSize && ArrayEltSize % ResSize == 0) { + Value *Idx = GEP.getOperand(1); + unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits(); + uint64_t Scale = ArrayEltSize / ResSize; + + // Earlier transforms ensure that the index has type IntPtrType, which + // considerably simplifies the logic by eliminating implicit casts. 
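Both byte-GEP rewrites above are address-level identities: stepping a byte pointer by N times the element size lands on the same address as indexing the typed pointer by N, which is why only the index needs to be descaled. A plain C++ analogue of the i8/i32 case, not part of the patch:

    #include <cassert>

    int main() {
      int Arr[16] = {};
      for (unsigned N = 0; N != 16; ++N) {
        // The "i8 GEP" form: base byte pointer plus N * sizeof(i32) bytes.
        char *ByteForm  = reinterpret_cast<char *>(Arr) + N * sizeof(int);
        // The descaled, typed form: index the i32 array directly by N.
        char *TypedForm = reinterpret_cast<char *>(&Arr[N]);
        assert(ByteForm == TypedForm);
      }
      return 0;
    }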
+ assert(Idx->getType() == TD->getIntPtrType(GEP.getContext()) && + "Index not cast to pointer width?"); + + bool NSW; + if (Value *NewIdx = Descale(Idx, APInt(BitWidth, Scale), NSW)) { + // Successfully decomposed Idx as NewIdx * Scale, form a new GEP. + // If the multiplication NewIdx * Scale may overflow then the new + // GEP may not be "inbounds". + Value *Off[2]; + Off[0] = Constant::getNullValue(Type::getInt32Ty(GEP.getContext())); + Off[1] = NewIdx; + Value *NewGEP = GEP.isInBounds() && NSW ? + Builder->CreateInBoundsGEP(StrippedPtr, Off, GEP.getName()) : + Builder->CreateGEP(StrippedPtr, Off, GEP.getName()); + // The NewGEP must be pointer typed, so must the old one -> BitCast + return new BitCastInst(NewGEP, GEP.getType()); } } - - // If the index will be to exactly the right offset with the scale taken - // out, perform the transformation. Note, we don't know whether Scale is - // signed or not. We'll use unsigned version of division/modulo - // operation after making sure Scale doesn't have the sign bit set. - if (ArrayEltSize && Scale && Scale->getSExtValue() >= 0LL && - Scale->getZExtValue() % ArrayEltSize == 0) { - Scale = ConstantInt::get(Scale->getType(), - Scale->getZExtValue() / ArrayEltSize); - if (Scale->getZExtValue() != 1) { - Constant *C = ConstantExpr::getIntegerCast(Scale, NewIdx->getType(), - false /*ZExt*/); - NewIdx = Builder->CreateMul(NewIdx, C, "idxscale"); - } - - // Insert the new GEP instruction. - Value *Idx[2]; - Idx[0] = Constant::getNullValue(Type::getInt32Ty(GEP.getContext())); - Idx[1] = NewIdx; - Value *NewGEP = GEP.isInBounds() ? - Builder->CreateInBoundsGEP(StrippedPtr, Idx, GEP.getName()): - Builder->CreateGEP(StrippedPtr, Idx, GEP.getName()); - // The NewGEP must be pointer typed, so must the old one -> BitCast - return new BitCastInst(NewGEP, GEP.getType()); - } } } } @@ -1054,17 +1309,15 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { /// into a gep of the original struct. This is important for SROA and alias /// analysis of unions. If "A" is also a bitcast, wait for A/X to be merged. if (BitCastInst *BCI = dyn_cast<BitCastInst>(PtrOp)) { + APInt Offset(TD ? TD->getPointerSizeInBits() : 1, 0); if (TD && - !isa<BitCastInst>(BCI->getOperand(0)) && GEP.hasAllConstantIndices() && + !isa<BitCastInst>(BCI->getOperand(0)) && + GEP.accumulateConstantOffset(*TD, Offset) && StrippedPtrTy->getAddressSpace() == GEP.getPointerAddressSpace()) { - // Determine how much the GEP moves the pointer. - SmallVector<Value*, 8> Ops(GEP.idx_begin(), GEP.idx_end()); - int64_t Offset = TD->getIndexedOffset(GEP.getPointerOperandType(), Ops); - // If this GEP instruction doesn't move the pointer, just replace the GEP // with a bitcast of the real input to the dest type. - if (Offset == 0) { + if (!Offset) { // If the bitcast is of an allocation, and the allocation will be // converted to match the type of the cast, don't touch this. if (isa<AllocaInst>(BCI->getOperand(0)) || @@ -1088,7 +1341,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { SmallVector<Value*, 8> NewIndices; Type *InTy = cast<PointerType>(BCI->getOperand(0)->getType())->getElementType(); - if (FindElementAtOffset(InTy, Offset, NewIndices)) { + if (FindElementAtOffset(InTy, Offset.getSExtValue(), NewIndices)) { Value *NGEP = GEP.isInBounds() ? 
Builder->CreateInBoundsGEP(BCI->getOperand(0), NewIndices) : Builder->CreateGEP(BCI->getOperand(0), NewIndices); @@ -1222,6 +1475,62 @@ Instruction *InstCombiner::visitAllocSite(Instruction &MI) { return 0; } +/// \brief Move the call to free before a NULL test. +/// +/// Check if this free is accessed after its argument has been test +/// against NULL (property 0). +/// If yes, it is legal to move this call in its predecessor block. +/// +/// The move is performed only if the block containing the call to free +/// will be removed, i.e.: +/// 1. it has only one predecessor P, and P has two successors +/// 2. it contains the call and an unconditional branch +/// 3. its successor is the same as its predecessor's successor +/// +/// The profitability is out-of concern here and this function should +/// be called only if the caller knows this transformation would be +/// profitable (e.g., for code size). +static Instruction * +tryToMoveFreeBeforeNullTest(CallInst &FI) { + Value *Op = FI.getArgOperand(0); + BasicBlock *FreeInstrBB = FI.getParent(); + BasicBlock *PredBB = FreeInstrBB->getSinglePredecessor(); + + // Validate part of constraint #1: Only one predecessor + // FIXME: We can extend the number of predecessor, but in that case, we + // would duplicate the call to free in each predecessor and it may + // not be profitable even for code size. + if (!PredBB) + return 0; + + // Validate constraint #2: Does this block contains only the call to + // free and an unconditional branch? + // FIXME: We could check if we can speculate everything in the + // predecessor block + if (FreeInstrBB->size() != 2) + return 0; + BasicBlock *SuccBB; + if (!match(FreeInstrBB->getTerminator(), m_UnconditionalBr(SuccBB))) + return 0; + + // Validate the rest of constraint #1 by matching on the pred branch. + TerminatorInst *TI = PredBB->getTerminator(); + BasicBlock *TrueBB, *FalseBB; + ICmpInst::Predicate Pred; + if (!match(TI, m_Br(m_ICmp(Pred, m_Specific(Op), m_Zero()), TrueBB, FalseBB))) + return 0; + if (Pred != ICmpInst::ICMP_EQ && Pred != ICmpInst::ICMP_NE) + return 0; + + // Validate constraint #3: Ensure the null case just falls through. + if (SuccBB != (Pred == ICmpInst::ICMP_EQ ? TrueBB : FalseBB)) + return 0; + assert(FreeInstrBB == (Pred == ICmpInst::ICMP_EQ ? FalseBB : TrueBB) && + "Broken CFG: missing edge from predecessor to successor"); + + FI.moveBefore(TI); + return &FI; +} Instruction *InstCombiner::visitFree(CallInst &FI) { @@ -1240,6 +1549,16 @@ Instruction *InstCombiner::visitFree(CallInst &FI) { if (isa<ConstantPointerNull>(Op)) return EraseInstFromFunction(FI); + // If we optimize for code size, try to move the call to free before the null + // test so that simplify cfg can remove the empty block and dead code + // elimination the branch. 
I.e., helps to turn something like: + // if (foo) free(foo); + // into + // free(foo); + if (MinimizeSize) + if (Instruction *I = tryToMoveFreeBeforeNullTest(FI)) + return I; + return 0; } @@ -1854,7 +2173,7 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) { static bool AddReachableCodeToWorklist(BasicBlock *BB, SmallPtrSet<BasicBlock*, 64> &Visited, InstCombiner &IC, - const TargetData *TD, + const DataLayout *TD, const TargetLibraryInfo *TLI) { bool MadeIRChange = false; SmallVector<BasicBlock*, 256> Worklist; @@ -2118,10 +2437,31 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { return MadeIRChange; } +namespace { +class InstCombinerLibCallSimplifier : public LibCallSimplifier { + InstCombiner *IC; +public: + InstCombinerLibCallSimplifier(const DataLayout *TD, + const TargetLibraryInfo *TLI, + InstCombiner *IC) + : LibCallSimplifier(TD, TLI, UnsafeFPShrink) { + this->IC = IC; + } + + /// replaceAllUsesWith - override so that instruction replacement + /// can be defined in terms of the instruction combiner framework. + virtual void replaceAllUsesWith(Instruction *I, Value *With) const { + IC->ReplaceInstUsesWith(*I, With); + } +}; +} bool InstCombiner::runOnFunction(Function &F) { - TD = getAnalysisIfAvailable<TargetData>(); + TD = getAnalysisIfAvailable<DataLayout>(); TLI = &getAnalysis<TargetLibraryInfo>(); + // Minimizing size? + MinimizeSize = F.getAttributes().hasAttribute(AttributeSet::FunctionIndex, + Attribute::MinSize); /// Builder - This is an IRBuilder that automatically inserts new /// instructions into the worklist when they are created. @@ -2130,6 +2470,9 @@ bool InstCombiner::runOnFunction(Function &F) { InstCombineIRInserter(Worklist)); Builder = &TheBuilder; + InstCombinerLibCallSimplifier TheSimplifier(TD, TLI, this); + Simplifier = &TheSimplifier; + bool EverMadeChange = false; // Lower dbg.declare intrinsics otherwise their value may be clobbered diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 0775cf4..9bd3239 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -15,34 +15,38 @@ #define DEBUG_TYPE "asan" +#include "llvm/Transforms/Instrumentation.h" #include "BlackList.h" -#include "llvm/Function.h" -#include "llvm/IRBuilder.h" -#include "llvm/InlineAsm.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/LLVMContext.h" -#include "llvm/Module.h" -#include "llvm/Type.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/OwningPtr.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Triple.h" +#include "llvm/DIBuilder.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/InstVisitor.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/system_error.h" -#include "llvm/Target/TargetData.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include 
"llvm/Transforms/Utils/ModuleUtils.h" - -#include <string> #include <algorithm> +#include <string> using namespace llvm; @@ -69,6 +73,10 @@ static const char *kAsanMappingOffsetName = "__asan_mapping_offset"; static const char *kAsanMappingScaleName = "__asan_mapping_scale"; static const char *kAsanStackMallocName = "__asan_stack_malloc"; static const char *kAsanStackFreeName = "__asan_stack_free"; +static const char *kAsanGenPrefix = "__asan_gen_"; +static const char *kAsanPoisonStackMemoryName = "__asan_poison_stack_memory"; +static const char *kAsanUnpoisonStackMemoryName = + "__asan_unpoison_stack_memory"; static const int kAsanStackLeftRedzoneMagic = 0xf1; static const int kAsanStackMidRedzoneMagic = 0xf2; @@ -112,9 +120,10 @@ static cl::opt<bool> ClInitializers("asan-initialization-order", cl::desc("Handle C++ initializer order"), cl::Hidden, cl::init(false)); static cl::opt<bool> ClMemIntrin("asan-memintrin", cl::desc("Handle memset/memcpy/memmove"), cl::Hidden, cl::init(true)); -// This flag may need to be replaced with -fasan-blacklist. -static cl::opt<std::string> ClBlackListFile("asan-blacklist", - cl::desc("File containing the list of functions to ignore " +static cl::opt<bool> ClRealignStack("asan-realign-stack", + cl::desc("Realign stack to 32"), cl::Hidden, cl::init(true)); +static cl::opt<std::string> ClBlacklistFile("asan-blacklist", + cl::desc("File containing the list of objects to ignore " "during instrumentation"), cl::Hidden); // These flags allow to change the shadow mapping. @@ -135,6 +144,10 @@ static cl::opt<bool> ClOptSameTemp("asan-opt-same-temp", static cl::opt<bool> ClOptGlobals("asan-opt-globals", cl::desc("Don't instrument scalar globals"), cl::Hidden, cl::init(true)); +static cl::opt<bool> ClCheckLifetime("asan-check-lifetime", + cl::desc("Use llvm.lifetime intrinsics to insert extra checks"), + cl::Hidden, cl::init(false)); + // Debug flags. static cl::opt<int> ClDebug("asan-debug", cl::desc("debug"), cl::Hidden, cl::init(0)); @@ -148,80 +161,274 @@ static cl::opt<int> ClDebugMax("asan-debug-max", cl::desc("Debug man inst"), cl::Hidden, cl::init(-1)); namespace { +/// A set of dynamically initialized globals extracted from metadata. +class SetOfDynamicallyInitializedGlobals { + public: + void Init(Module& M) { + // Clang generates metadata identifying all dynamically initialized globals. + NamedMDNode *DynamicGlobals = + M.getNamedMetadata("llvm.asan.dynamically_initialized_globals"); + if (!DynamicGlobals) + return; + for (int i = 0, n = DynamicGlobals->getNumOperands(); i < n; ++i) { + MDNode *MDN = DynamicGlobals->getOperand(i); + assert(MDN->getNumOperands() == 1); + Value *VG = MDN->getOperand(0); + // The optimizer may optimize away a global entirely, in which case we + // cannot instrument access to it. + if (!VG) + continue; + DynInitGlobals.insert(cast<GlobalVariable>(VG)); + } + } + bool Contains(GlobalVariable *G) { return DynInitGlobals.count(G) != 0; } + private: + SmallSet<GlobalValue*, 32> DynInitGlobals; +}; -/// An object of this type is created while instrumenting every function. -struct AsanFunctionContext { - AsanFunctionContext(Function &Function) : F(Function) { } +static int MappingScale() { + return ClMappingScale ? ClMappingScale : kDefaultShadowScale; +} - Function &F; -}; +static size_t RedzoneSize() { + // Redzone used for stack and globals is at least 32 bytes. + // For scales 6 and 7, the redzone has to be 64 and 128 bytes respectively. 
+ return std::max(32U, 1U << MappingScale()); +} /// AddressSanitizer: instrument the code in module to find memory bugs. -struct AddressSanitizer : public ModulePass { - AddressSanitizer(); - virtual const char *getPassName() const; - void instrumentMop(AsanFunctionContext &AFC, Instruction *I); - void instrumentAddress(AsanFunctionContext &AFC, - Instruction *OrigIns, IRBuilder<> &IRB, +struct AddressSanitizer : public FunctionPass { + AddressSanitizer(bool CheckInitOrder = false, + bool CheckUseAfterReturn = false, + bool CheckLifetime = false, + StringRef BlacklistFile = StringRef()) + : FunctionPass(ID), + CheckInitOrder(CheckInitOrder || ClInitializers), + CheckUseAfterReturn(CheckUseAfterReturn || ClUseAfterReturn), + CheckLifetime(CheckLifetime || ClCheckLifetime), + BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile + : BlacklistFile) {} + virtual const char *getPassName() const { + return "AddressSanitizerFunctionPass"; + } + void instrumentMop(Instruction *I); + void instrumentAddress(Instruction *OrigIns, IRBuilder<> &IRB, Value *Addr, uint32_t TypeSize, bool IsWrite); Value *createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong, Value *ShadowValue, uint32_t TypeSize); Instruction *generateCrashCode(Instruction *InsertBefore, Value *Addr, bool IsWrite, size_t AccessSizeIndex); - bool instrumentMemIntrinsic(AsanFunctionContext &AFC, MemIntrinsic *MI); - void instrumentMemIntrinsicParam(AsanFunctionContext &AFC, - Instruction *OrigIns, Value *Addr, + bool instrumentMemIntrinsic(MemIntrinsic *MI); + void instrumentMemIntrinsicParam(Instruction *OrigIns, Value *Addr, Value *Size, Instruction *InsertBefore, bool IsWrite); Value *memToShadow(Value *Shadow, IRBuilder<> &IRB); - bool handleFunction(Module &M, Function &F); + bool runOnFunction(Function &F); void createInitializerPoisonCalls(Module &M, Value *FirstAddr, Value *LastAddr); bool maybeInsertAsanInitAtFunctionEntry(Function &F); - bool poisonStackInFunction(Module &M, Function &F); - virtual bool runOnModule(Module &M); - bool insertGlobalRedzones(Module &M); + virtual bool doInitialization(Module &M); static char ID; // Pass identification, replacement for typeid private: - uint64_t getAllocaSizeInBytes(AllocaInst *AI) { - Type *Ty = AI->getAllocatedType(); - uint64_t SizeInBytes = TD->getTypeAllocSize(Ty); - return SizeInBytes; - } - uint64_t getAlignedSize(uint64_t SizeInBytes) { - return ((SizeInBytes + RedzoneSize - 1) - / RedzoneSize) * RedzoneSize; - } - uint64_t getAlignedAllocaSize(AllocaInst *AI) { - uint64_t SizeInBytes = getAllocaSizeInBytes(AI); - return getAlignedSize(SizeInBytes); - } + void initializeCallbacks(Module &M); - Function *checkInterfaceFunction(Constant *FuncOrBitcast); bool ShouldInstrumentGlobal(GlobalVariable *G); - void PoisonStack(const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> IRB, - Value *ShadowBase, bool DoPoison); bool LooksLikeCodeInBug11395(Instruction *I); void FindDynamicInitializers(Module &M); - bool HasDynamicInitializer(GlobalVariable *G); + bool CheckInitOrder; + bool CheckUseAfterReturn; + bool CheckLifetime; LLVMContext *C; - TargetData *TD; + DataLayout *TD; uint64_t MappingOffset; - int MappingScale; - size_t RedzoneSize; int LongSize; Type *IntptrTy; - Type *IntptrPtrTy; Function *AsanCtorFunction; Function *AsanInitFunction; - Instruction *CtorInsertBefore; + Function *AsanHandleNoReturnFunc; + SmallString<64> BlacklistFile; OwningPtr<BlackList> BL; // This array is indexed by AccessIsWrite and log2(AccessSize). 
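RedzoneSize() above and the shadow mapping used elsewhere in this pass (the "(Shadow >> scale) | offset" comment in memToShadow) are both small pieces of integer arithmetic. The sketch below, not part of the patch, mirrors them; the scale of 3 and the offset 0x7fff8000 are illustrative assumptions only, since the real values are target dependent and controlled by the -asan-mapping-scale and -asan-mapping-offset-log flags (kDefaultShadowScale's value is not shown in this hunk):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // Shadow address for an application address, per the memToShadow comment.
    static uint64_t shadowFor(uint64_t Addr, int Scale, uint64_t Offset) {
      return (Addr >> Scale) | Offset;
    }

    int main() {
      const int Scale = 3;                    // assumed mapping scale
      const uint64_t Offset = 0x7fff8000ULL;  // assumed mapping offset
      printf("shadow(0x%llx) = 0x%llx\n", 0x100000ULL,
             (unsigned long long)shadowFor(0x100000ULL, Scale, Offset));

      // Redzone size as computed by RedzoneSize(): at least 32 bytes, and
      // 64/128 bytes once the scale reaches 6/7.
      for (int S = 3; S <= 7; ++S)
        printf("scale %d -> redzone %u bytes\n", S, std::max(32U, 1U << S));
      return 0;
    }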
Function *AsanErrorCallback[2][kNumberOfAccessSizes]; InlineAsm *EmptyAsm; - SmallSet<GlobalValue*, 32> DynamicallyInitializedGlobals; + SetOfDynamicallyInitializedGlobals DynamicallyInitializedGlobals; + + friend struct FunctionStackPoisoner; +}; + +class AddressSanitizerModule : public ModulePass { + public: + AddressSanitizerModule(bool CheckInitOrder = false, + StringRef BlacklistFile = StringRef()) + : ModulePass(ID), + CheckInitOrder(CheckInitOrder || ClInitializers), + BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile + : BlacklistFile) {} + bool runOnModule(Module &M); + static char ID; // Pass identification, replacement for typeid + virtual const char *getPassName() const { + return "AddressSanitizerModule"; + } + + private: + void initializeCallbacks(Module &M); + + bool ShouldInstrumentGlobal(GlobalVariable *G); + void createInitializerPoisonCalls(Module &M, Value *FirstAddr, + Value *LastAddr); + + bool CheckInitOrder; + SmallString<64> BlacklistFile; + OwningPtr<BlackList> BL; + SetOfDynamicallyInitializedGlobals DynamicallyInitializedGlobals; + Type *IntptrTy; + LLVMContext *C; + DataLayout *TD; + Function *AsanPoisonGlobals; + Function *AsanUnpoisonGlobals; + Function *AsanRegisterGlobals; + Function *AsanUnregisterGlobals; +}; + +// Stack poisoning does not play well with exception handling. +// When an exception is thrown, we essentially bypass the code +// that unpoisones the stack. This is why the run-time library has +// to intercept __cxa_throw (as well as longjmp, etc) and unpoison the entire +// stack in the interceptor. This however does not work inside the +// actual function which catches the exception. Most likely because the +// compiler hoists the load of the shadow value somewhere too high. +// This causes asan to report a non-existing bug on 453.povray. +// It sounds like an LLVM bug. +struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { + Function &F; + AddressSanitizer &ASan; + DIBuilder DIB; + LLVMContext *C; + Type *IntptrTy; + Type *IntptrPtrTy; + + SmallVector<AllocaInst*, 16> AllocaVec; + SmallVector<Instruction*, 8> RetVec; + uint64_t TotalStackSize; + unsigned StackAlignment; + + Function *AsanStackMallocFunc, *AsanStackFreeFunc; + Function *AsanPoisonStackMemoryFunc, *AsanUnpoisonStackMemoryFunc; + + // Stores a place and arguments of poisoning/unpoisoning call for alloca. + struct AllocaPoisonCall { + IntrinsicInst *InsBefore; + uint64_t Size; + bool DoPoison; + }; + SmallVector<AllocaPoisonCall, 8> AllocaPoisonCallVec; + + // Maps Value to an AllocaInst from which the Value is originated. + typedef DenseMap<Value*, AllocaInst*> AllocaForValueMapTy; + AllocaForValueMapTy AllocaForValue; + + FunctionStackPoisoner(Function &F, AddressSanitizer &ASan) + : F(F), ASan(ASan), DIB(*F.getParent()), C(ASan.C), + IntptrTy(ASan.IntptrTy), IntptrPtrTy(PointerType::get(IntptrTy, 0)), + TotalStackSize(0), StackAlignment(1 << MappingScale()) {} + + bool runOnFunction() { + if (!ClStack) return false; + // Collect alloca, ret, lifetime instructions etc. + for (df_iterator<BasicBlock*> DI = df_begin(&F.getEntryBlock()), + DE = df_end(&F.getEntryBlock()); DI != DE; ++DI) { + BasicBlock *BB = *DI; + visit(*BB); + } + if (AllocaVec.empty()) return false; + + initializeCallbacks(*F.getParent()); + + poisonStack(); + + if (ClDebugStack) { + DEBUG(dbgs() << F); + } + return true; + } + + // Finds all static Alloca instructions and puts + // poisoned red zones around all of them. + // Then unpoison everything back before the function returns. 
+ void poisonStack(); + + // ----------------------- Visitors. + /// \brief Collect all Ret instructions. + void visitReturnInst(ReturnInst &RI) { + RetVec.push_back(&RI); + } + + /// \brief Collect Alloca instructions we want (and can) handle. + void visitAllocaInst(AllocaInst &AI) { + if (!isInterestingAlloca(AI)) return; + + StackAlignment = std::max(StackAlignment, AI.getAlignment()); + AllocaVec.push_back(&AI); + uint64_t AlignedSize = getAlignedAllocaSize(&AI); + TotalStackSize += AlignedSize; + } + + /// \brief Collect lifetime intrinsic calls to check for use-after-scope + /// errors. + void visitIntrinsicInst(IntrinsicInst &II) { + if (!ASan.CheckLifetime) return; + Intrinsic::ID ID = II.getIntrinsicID(); + if (ID != Intrinsic::lifetime_start && + ID != Intrinsic::lifetime_end) + return; + // Found lifetime intrinsic, add ASan instrumentation if necessary. + ConstantInt *Size = dyn_cast<ConstantInt>(II.getArgOperand(0)); + // If size argument is undefined, don't do anything. + if (Size->isMinusOne()) return; + // Check that size doesn't saturate uint64_t and can + // be stored in IntptrTy. + const uint64_t SizeValue = Size->getValue().getLimitedValue(); + if (SizeValue == ~0ULL || + !ConstantInt::isValueValidForType(IntptrTy, SizeValue)) + return; + // Find alloca instruction that corresponds to llvm.lifetime argument. + AllocaInst *AI = findAllocaForValue(II.getArgOperand(1)); + if (!AI) return; + bool DoPoison = (ID == Intrinsic::lifetime_end); + AllocaPoisonCall APC = {&II, SizeValue, DoPoison}; + AllocaPoisonCallVec.push_back(APC); + } + + // ---------------------- Helpers. + void initializeCallbacks(Module &M); + + // Check if we want (and can) handle this alloca. + bool isInterestingAlloca(AllocaInst &AI) { + return (!AI.isArrayAllocation() && + AI.isStaticAlloca() && + AI.getAllocatedType()->isSized()); + } + + uint64_t getAllocaSizeInBytes(AllocaInst *AI) { + Type *Ty = AI->getAllocatedType(); + uint64_t SizeInBytes = ASan.TD->getTypeAllocSize(Ty); + return SizeInBytes; + } + uint64_t getAlignedSize(uint64_t SizeInBytes) { + size_t RZ = RedzoneSize(); + return ((SizeInBytes + RZ - 1) / RZ) * RZ; + } + uint64_t getAlignedAllocaSize(AllocaInst *AI) { + uint64_t SizeInBytes = getAllocaSizeInBytes(AI); + return getAlignedSize(SizeInBytes); + } + /// Finds alloca where the value comes from. + AllocaInst *findAllocaForValue(Value *V); + void poisonRedZones(const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> IRB, + Value *ShadowBase, bool DoPoison); + void poisonAlloca(Value *V, uint64_t Size, IRBuilder<> IRB, bool DoPoison); }; } // namespace @@ -230,13 +437,20 @@ char AddressSanitizer::ID = 0; INITIALIZE_PASS(AddressSanitizer, "asan", "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false, false) -AddressSanitizer::AddressSanitizer() : ModulePass(ID) { } -ModulePass *llvm::createAddressSanitizerPass() { - return new AddressSanitizer(); +FunctionPass *llvm::createAddressSanitizerFunctionPass( + bool CheckInitOrder, bool CheckUseAfterReturn, bool CheckLifetime, + StringRef BlacklistFile) { + return new AddressSanitizer(CheckInitOrder, CheckUseAfterReturn, + CheckLifetime, BlacklistFile); } -const char *AddressSanitizer::getPassName() const { - return "AddressSanitizer"; +char AddressSanitizerModule::ID = 0; +INITIALIZE_PASS(AddressSanitizerModule, "asan-module", + "AddressSanitizer: detects use-after-free and out-of-bounds bugs." 
+ "ModulePass", false, false) +ModulePass *llvm::createAddressSanitizerModulePass( + bool CheckInitOrder, StringRef BlacklistFile) { + return new AddressSanitizerModule(CheckInitOrder, BlacklistFile); } static size_t TypeSizeToSizeIndex(uint32_t TypeSize) { @@ -249,44 +463,17 @@ static size_t TypeSizeToSizeIndex(uint32_t TypeSize) { static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str) { Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str); return new GlobalVariable(M, StrConst->getType(), true, - GlobalValue::PrivateLinkage, StrConst, ""); + GlobalValue::PrivateLinkage, StrConst, + kAsanGenPrefix); } -// Split the basic block and insert an if-then code. -// Before: -// Head -// Cmp -// Tail -// After: -// Head -// if (Cmp) -// ThenBlock -// Tail -// -// ThenBlock block is created and its terminator is returned. -// If Unreachable, ThenBlock is terminated with UnreachableInst, otherwise -// it is terminated with BranchInst to Tail. -static TerminatorInst *splitBlockAndInsertIfThen(Value *Cmp, bool Unreachable) { - Instruction *SplitBefore = cast<Instruction>(Cmp)->getNextNode(); - BasicBlock *Head = SplitBefore->getParent(); - BasicBlock *Tail = Head->splitBasicBlock(SplitBefore); - TerminatorInst *HeadOldTerm = Head->getTerminator(); - LLVMContext &C = Head->getParent()->getParent()->getContext(); - BasicBlock *ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail); - TerminatorInst *CheckTerm; - if (Unreachable) - CheckTerm = new UnreachableInst(C, ThenBlock); - else - CheckTerm = BranchInst::Create(Tail, ThenBlock); - BranchInst *HeadNewTerm = - BranchInst::Create(/*ifTrue*/ThenBlock, /*ifFalse*/Tail, Cmp); - ReplaceInstWithInst(HeadOldTerm, HeadNewTerm); - return CheckTerm; +static bool GlobalWasGeneratedByAsan(GlobalVariable *G) { + return G->getName().find(kAsanGenPrefix) == 0; } Value *AddressSanitizer::memToShadow(Value *Shadow, IRBuilder<> &IRB) { // Shadow >> scale - Shadow = IRB.CreateLShr(Shadow, MappingScale); + Shadow = IRB.CreateLShr(Shadow, MappingScale()); if (MappingOffset == 0) return Shadow; // (Shadow >> scale) | offset @@ -295,12 +482,12 @@ Value *AddressSanitizer::memToShadow(Value *Shadow, IRBuilder<> &IRB) { } void AddressSanitizer::instrumentMemIntrinsicParam( - AsanFunctionContext &AFC, Instruction *OrigIns, + Instruction *OrigIns, Value *Addr, Value *Size, Instruction *InsertBefore, bool IsWrite) { // Check the first byte. { IRBuilder<> IRB(InsertBefore); - instrumentAddress(AFC, OrigIns, IRB, Addr, 8, IsWrite); + instrumentAddress(OrigIns, IRB, Addr, 8, IsWrite); } // Check the last byte. { @@ -310,13 +497,12 @@ void AddressSanitizer::instrumentMemIntrinsicParam( SizeMinusOne = IRB.CreateIntCast(SizeMinusOne, IntptrTy, false); Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy); Value *AddrPlusSizeMinisOne = IRB.CreateAdd(AddrLong, SizeMinusOne); - instrumentAddress(AFC, OrigIns, IRB, AddrPlusSizeMinisOne, 8, IsWrite); + instrumentAddress(OrigIns, IRB, AddrPlusSizeMinisOne, 8, IsWrite); } } // Instrument memset/memmove/memcpy -bool AddressSanitizer::instrumentMemIntrinsic(AsanFunctionContext &AFC, - MemIntrinsic *MI) { +bool AddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) { Value *Dst = MI->getDest(); MemTransferInst *MemTran = dyn_cast<MemTransferInst>(MI); Value *Src = MemTran ? 
MemTran->getSource() : 0; @@ -332,12 +518,12 @@ bool AddressSanitizer::instrumentMemIntrinsic(AsanFunctionContext &AFC, Value *Cmp = IRB.CreateICmpNE(Length, Constant::getNullValue(Length->getType())); - InsertBefore = splitBlockAndInsertIfThen(Cmp, false); + InsertBefore = SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), false); } - instrumentMemIntrinsicParam(AFC, MI, Dst, Length, InsertBefore, true); + instrumentMemIntrinsicParam(MI, Dst, Length, InsertBefore, true); if (Src) - instrumentMemIntrinsicParam(AFC, MI, Src, Length, InsertBefore, false); + instrumentMemIntrinsicParam(MI, Src, Length, InsertBefore, false); return true; } @@ -367,46 +553,20 @@ static Value *isInterestingMemoryAccess(Instruction *I, bool *IsWrite) { return NULL; } -void AddressSanitizer::FindDynamicInitializers(Module& M) { - // Clang generates metadata identifying all dynamically initialized globals. - NamedMDNode *DynamicGlobals = - M.getNamedMetadata("llvm.asan.dynamically_initialized_globals"); - if (!DynamicGlobals) - return; - for (int i = 0, n = DynamicGlobals->getNumOperands(); i < n; ++i) { - MDNode *MDN = DynamicGlobals->getOperand(i); - assert(MDN->getNumOperands() == 1); - Value *VG = MDN->getOperand(0); - // The optimizer may optimize away a global entirely, in which case we - // cannot instrument access to it. - if (!VG) - continue; - - GlobalVariable *G = cast<GlobalVariable>(VG); - DynamicallyInitializedGlobals.insert(G); - } -} -// Returns true if a global variable is initialized dynamically in this TU. -bool AddressSanitizer::HasDynamicInitializer(GlobalVariable *G) { - return DynamicallyInitializedGlobals.count(G); -} - -void AddressSanitizer::instrumentMop(AsanFunctionContext &AFC, Instruction *I) { - bool IsWrite; +void AddressSanitizer::instrumentMop(Instruction *I) { + bool IsWrite = false; Value *Addr = isInterestingMemoryAccess(I, &IsWrite); assert(Addr); if (ClOpt && ClOptGlobals) { if (GlobalVariable *G = dyn_cast<GlobalVariable>(Addr)) { // If initialization order checking is disabled, a simple access to a // dynamically initialized global is always valid. - if (!ClInitializers) + if (!CheckInitOrder) return; // If a global variable does not have dynamic initialization we don't - // have to instrument it. However, if a global has external linkage, we - // assume it has dynamic initialization, as it may have an initializer - // in a different TU. - if (G->getLinkage() != GlobalVariable::ExternalLinkage && - !HasDynamicInitializer(G)) + // have to instrument it. However, if a global does not have initailizer + // at all, we assume it has dynamic initializer (in other TU). + if (G->hasInitializer() && !DynamicallyInitializedGlobals.Contains(G)) return; } } @@ -424,14 +584,14 @@ void AddressSanitizer::instrumentMop(AsanFunctionContext &AFC, Instruction *I) { } IRBuilder<> IRB(I); - instrumentAddress(AFC, I, IRB, Addr, TypeSize, IsWrite); + instrumentAddress(I, IRB, Addr, TypeSize, IsWrite); } // Validate the result of Module::getOrInsertFunction called for an interface // function of AddressSanitizer. If the instrumented module defines a function // with the same name, their prototypes must match, otherwise // getOrInsertFunction returns a bitcast. 
-Function *AddressSanitizer::checkInterfaceFunction(Constant *FuncOrBitcast) { +static Function *checkInterfaceFunction(Constant *FuncOrBitcast) { if (isa<Function>(FuncOrBitcast)) return cast<Function>(FuncOrBitcast); FuncOrBitcast->dump(); report_fatal_error("trying to redefine an AddressSanitizer " @@ -454,7 +614,7 @@ Instruction *AddressSanitizer::generateCrashCode( Value *AddressSanitizer::createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong, Value *ShadowValue, uint32_t TypeSize) { - size_t Granularity = 1 << MappingScale; + size_t Granularity = 1 << MappingScale(); // Addr & (Granularity - 1) Value *LastAccessedByte = IRB.CreateAnd( AddrLong, ConstantInt::get(IntptrTy, Granularity - 1)); @@ -469,14 +629,13 @@ Value *AddressSanitizer::createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong, return IRB.CreateICmpSGE(LastAccessedByte, ShadowValue); } -void AddressSanitizer::instrumentAddress(AsanFunctionContext &AFC, - Instruction *OrigIns, +void AddressSanitizer::instrumentAddress(Instruction *OrigIns, IRBuilder<> &IRB, Value *Addr, uint32_t TypeSize, bool IsWrite) { Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy); Type *ShadowTy = IntegerType::get( - *C, std::max(8U, TypeSize >> MappingScale)); + *C, std::max(8U, TypeSize >> MappingScale())); Type *ShadowPtrTy = PointerType::get(ShadowTy, 0); Value *ShadowPtr = memToShadow(AddrLong, IRB); Value *CmpVal = Constant::getNullValue(ShadowTy); @@ -485,21 +644,23 @@ void AddressSanitizer::instrumentAddress(AsanFunctionContext &AFC, Value *Cmp = IRB.CreateICmpNE(ShadowValue, CmpVal); size_t AccessSizeIndex = TypeSizeToSizeIndex(TypeSize); - size_t Granularity = 1 << MappingScale; + size_t Granularity = 1 << MappingScale(); TerminatorInst *CrashTerm = 0; if (ClAlwaysSlowPath || (TypeSize < 8 * Granularity)) { - TerminatorInst *CheckTerm = splitBlockAndInsertIfThen(Cmp, false); + TerminatorInst *CheckTerm = + SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), false); assert(dyn_cast<BranchInst>(CheckTerm)->isUnconditional()); BasicBlock *NextBB = CheckTerm->getSuccessor(0); IRB.SetInsertPoint(CheckTerm); Value *Cmp2 = createSlowPathCmp(IRB, AddrLong, ShadowValue, TypeSize); - BasicBlock *CrashBlock = BasicBlock::Create(*C, "", &AFC.F, NextBB); + BasicBlock *CrashBlock = + BasicBlock::Create(*C, "", NextBB->getParent(), NextBB); CrashTerm = new UnreachableInst(*C, CrashBlock); BranchInst *NewTerm = BranchInst::Create(CrashBlock, NextBB, Cmp2); ReplaceInstWithInst(CheckTerm, NewTerm); } else { - CrashTerm = splitBlockAndInsertIfThen(Cmp, true); + CrashTerm = SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), true); } Instruction *Crash = @@ -507,9 +668,8 @@ void AddressSanitizer::instrumentAddress(AsanFunctionContext &AFC, Crash->setDebugLoc(OrigIns->getDebugLoc()); } -void AddressSanitizer::createInitializerPoisonCalls(Module &M, - Value *FirstAddr, - Value *LastAddr) { +void AddressSanitizerModule::createInitializerPoisonCalls( + Module &M, Value *FirstAddr, Value *LastAddr) { // We do all of our poisoning and unpoisoning within _GLOBAL__I_a. Function *GlobalInit = M.getFunction("_GLOBAL__I_a"); // If that function is not present, this TU contains no globals, or they have @@ -520,14 +680,6 @@ void AddressSanitizer::createInitializerPoisonCalls(Module &M, // Set up the arguments to our poison/unpoison functions. IRBuilder<> IRB(GlobalInit->begin()->getFirstInsertionPt()); - // Declare our poisoning and unpoisoning functions. 
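For readers following the instrumentAddress and createSlowPathCmp hunks above, this is roughly the check that gets emitted for an access narrower than the shadow granularity, written as plain C++ rather than IR. Scale and Offset stand in for MappingScale() and MappingOffset, and the callback name follows the load/store-plus-size scheme used for AsanErrorCallback; a sketch, not the generated code itself:

    #include <cstdint>

    extern "C" void __asan_report_load4(uintptr_t Addr);  // one of the callbacks

    // N-byte access at Addr, with N < (1 << Scale).
    static void emittedCheckSketch(uintptr_t Addr, unsigned Scale,
                                   uintptr_t Offset, unsigned N) {
      uintptr_t Shadow = (Addr >> Scale) | Offset;   // memToShadow()
      int8_t ShadowValue = *(int8_t *)Shadow;        // 0: whole granule addressable
      if (ShadowValue != 0) {
        uintptr_t Granularity = uintptr_t(1) << Scale;
        int8_t LastByte = int8_t((Addr & (Granularity - 1)) + N - 1);
        if (LastByte >= ShadowValue)                 // createSlowPathCmp()
          __asan_report_load4(Addr);                 // generateCrashCode()
      }
    }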
- Function *AsanPoisonGlobals = checkInterfaceFunction(M.getOrInsertFunction( - kAsanPoisonGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL)); - AsanPoisonGlobals->setLinkage(Function::ExternalLinkage); - Function *AsanUnpoisonGlobals = checkInterfaceFunction(M.getOrInsertFunction( - kAsanUnpoisonGlobalsName, IRB.getVoidTy(), NULL)); - AsanUnpoisonGlobals->setLinkage(Function::ExternalLinkage); - // Add a call to poison all external globals before the given function starts. IRB.CreateCall2(AsanPoisonGlobals, FirstAddr, LastAddr); @@ -540,13 +692,14 @@ void AddressSanitizer::createInitializerPoisonCalls(Module &M, } } -bool AddressSanitizer::ShouldInstrumentGlobal(GlobalVariable *G) { +bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) { Type *Ty = cast<PointerType>(G->getType())->getElementType(); - DEBUG(dbgs() << "GLOBAL: " << *G); + DEBUG(dbgs() << "GLOBAL: " << *G << "\n"); if (BL->isIn(*G)) return false; if (!Ty->isSized()) return false; if (!G->hasInitializer()) return false; + if (GlobalWasGeneratedByAsan(G)) return false; // Our own global. // Touch only those globals that will not be defined in other modules. // Don't handle ODR type linkages since other modules may be built w/o asan. if (G->getLinkage() != GlobalVariable::ExternalLinkage && @@ -559,7 +712,7 @@ bool AddressSanitizer::ShouldInstrumentGlobal(GlobalVariable *G) { if (G->isThreadLocal()) return false; // For now, just ignore this Alloca if the alignment is large. - if (G->getAlignment() > RedzoneSize) return false; + if (G->getAlignment() > RedzoneSize()) return false; // Ignore all the globals with the names starting with "\01L_OBJC_". // Many of those are put into the .cstring section. The linker compresses @@ -598,10 +751,41 @@ bool AddressSanitizer::ShouldInstrumentGlobal(GlobalVariable *G) { return true; } +void AddressSanitizerModule::initializeCallbacks(Module &M) { + IRBuilder<> IRB(*C); + // Declare our poisoning and unpoisoning functions. + AsanPoisonGlobals = checkInterfaceFunction(M.getOrInsertFunction( + kAsanPoisonGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL)); + AsanPoisonGlobals->setLinkage(Function::ExternalLinkage); + AsanUnpoisonGlobals = checkInterfaceFunction(M.getOrInsertFunction( + kAsanUnpoisonGlobalsName, IRB.getVoidTy(), NULL)); + AsanUnpoisonGlobals->setLinkage(Function::ExternalLinkage); + // Declare functions that register/unregister globals. + AsanRegisterGlobals = checkInterfaceFunction(M.getOrInsertFunction( + kAsanRegisterGlobalsName, IRB.getVoidTy(), + IntptrTy, IntptrTy, NULL)); + AsanRegisterGlobals->setLinkage(Function::ExternalLinkage); + AsanUnregisterGlobals = checkInterfaceFunction(M.getOrInsertFunction( + kAsanUnregisterGlobalsName, + IRB.getVoidTy(), IntptrTy, IntptrTy, NULL)); + AsanUnregisterGlobals->setLinkage(Function::ExternalLinkage); +} + // This function replaces all global variables with new variables that have // trailing redzones. It also creates a function that poisons // redzones and inserts this function into llvm.global_ctors. 
-bool AddressSanitizer::insertGlobalRedzones(Module &M) { +bool AddressSanitizerModule::runOnModule(Module &M) { + if (!ClGlobals) return false; + TD = getAnalysisIfAvailable<DataLayout>(); + if (!TD) + return false; + BL.reset(new BlackList(BlacklistFile)); + if (BL->isIn(M)) return false; + C = &(M.getContext()); + IntptrTy = Type::getIntNTy(*C, TD->getPointerSizeInBits()); + initializeCallbacks(M); + DynamicallyInitializedGlobals.Init(M); + SmallVector<GlobalVariable *, 16> GlobalsToChange; for (Module::GlobalListType::iterator G = M.global_begin(), @@ -625,10 +809,10 @@ bool AddressSanitizer::insertGlobalRedzones(Module &M) { IntptrTy, NULL); SmallVector<Constant *, 16> Initializers(n), DynamicInit; - IRBuilder<> IRB(CtorInsertBefore); - if (ClInitializers) - FindDynamicInitializers(M); + Function *CtorFunc = M.getFunction(kAsanModuleCtorName); + assert(CtorFunc); + IRBuilder<> IRB(CtorFunc->getEntryBlock().getTerminator()); // The addresses of the first and last dynamically initialized globals in // this TU. Used in initialization order checking. @@ -639,11 +823,12 @@ bool AddressSanitizer::insertGlobalRedzones(Module &M) { PointerType *PtrTy = cast<PointerType>(G->getType()); Type *Ty = PtrTy->getElementType(); uint64_t SizeInBytes = TD->getTypeAllocSize(Ty); - uint64_t RightRedzoneSize = RedzoneSize + - (RedzoneSize - (SizeInBytes % RedzoneSize)); + size_t RZ = RedzoneSize(); + uint64_t RightRedzoneSize = RZ + (RZ - (SizeInBytes % RZ)); Type *RightRedZoneTy = ArrayType::get(IRB.getInt8Ty(), RightRedzoneSize); // Determine whether this global should be poisoned in initialization. - bool GlobalHasDynamicInitializer = HasDynamicInitializer(G); + bool GlobalHasDynamicInitializer = + DynamicallyInitializedGlobals.Contains(G); // Don't check initialization order if this global is blacklisted. GlobalHasDynamicInitializer &= !BL->isInInit(*G); @@ -663,7 +848,7 @@ bool AddressSanitizer::insertGlobalRedzones(Module &M) { M, NewTy, G->isConstant(), G->getLinkage(), NewInitializer, "", G, G->getThreadLocalMode()); NewGlobal->copyAttributesFrom(G); - NewGlobal->setAlignment(RedzoneSize); + NewGlobal->setAlignment(RZ); Value *Indices2[2]; Indices2[0] = IRB.getInt32(0); @@ -684,13 +869,13 @@ bool AddressSanitizer::insertGlobalRedzones(Module &M) { NULL); // Populate the first and last globals declared in this TU. - if (ClInitializers && GlobalHasDynamicInitializer) { + if (CheckInitOrder && GlobalHasDynamicInitializer) { LastDynamic = ConstantExpr::getPointerCast(NewGlobal, IntptrTy); if (FirstDynamic == 0) FirstDynamic = LastDynamic; } - DEBUG(dbgs() << "NEW GLOBAL:\n" << *NewGlobal); + DEBUG(dbgs() << "NEW GLOBAL: " << *NewGlobal << "\n"); } ArrayType *ArrayOfGlobalStructTy = ArrayType::get(GlobalStructTy, n); @@ -699,14 +884,8 @@ bool AddressSanitizer::insertGlobalRedzones(Module &M) { ConstantArray::get(ArrayOfGlobalStructTy, Initializers), ""); // Create calls for poisoning before initializers run and unpoisoning after. 
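The right-redzone formula in the hunk above pads every instrumented global up to the next redzone boundary while always leaving at least one full redzone behind it. A worked sketch with the default 32-byte redzone:

    #include <cstdint>

    // RZ = 32:
    //   SizeInBytes = 40  ->  right redzone = 32 + (32 - 8) = 56, padded size 96
    //   SizeInBytes = 64  ->  right redzone = 32 + (32 - 0) = 64, padded size 128
    // The padded size is always a multiple of RZ and at least RZ bytes larger
    // than the original global.
    static uint64_t rightRedzoneSize(uint64_t SizeInBytes, uint64_t RZ) {
      return RZ + (RZ - (SizeInBytes % RZ));
    }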
- if (ClInitializers && FirstDynamic && LastDynamic) + if (CheckInitOrder && FirstDynamic && LastDynamic) createInitializerPoisonCalls(M, FirstDynamic, LastDynamic); - - Function *AsanRegisterGlobals = checkInterfaceFunction(M.getOrInsertFunction( - kAsanRegisterGlobalsName, IRB.getVoidTy(), - IntptrTy, IntptrTy, NULL)); - AsanRegisterGlobals->setLinkage(Function::ExternalLinkage); - IRB.CreateCall2(AsanRegisterGlobals, IRB.CreatePointerCast(AllGlobals, IntptrTy), ConstantInt::get(IntptrTy, n)); @@ -718,12 +897,6 @@ bool AddressSanitizer::insertGlobalRedzones(Module &M) { GlobalValue::InternalLinkage, kAsanModuleDtorName, &M); BasicBlock *AsanDtorBB = BasicBlock::Create(*C, "", AsanDtorFunction); IRBuilder<> IRB_Dtor(ReturnInst::Create(*C, AsanDtorBB)); - Function *AsanUnregisterGlobals = - checkInterfaceFunction(M.getOrInsertFunction( - kAsanUnregisterGlobalsName, - IRB.getVoidTy(), IntptrTy, IntptrTy, NULL)); - AsanUnregisterGlobals->setLinkage(Function::ExternalLinkage); - IRB_Dtor.CreateCall2(AsanUnregisterGlobals, IRB.CreatePointerCast(AllGlobals, IntptrTy), ConstantInt::get(IntptrTy, n)); @@ -733,49 +906,55 @@ bool AddressSanitizer::insertGlobalRedzones(Module &M) { return true; } +void AddressSanitizer::initializeCallbacks(Module &M) { + IRBuilder<> IRB(*C); + // Create __asan_report* callbacks. + for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) { + for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes; + AccessSizeIndex++) { + // IsWrite and TypeSize are encoded in the function name. + std::string FunctionName = std::string(kAsanReportErrorTemplate) + + (AccessIsWrite ? "store" : "load") + itostr(1 << AccessSizeIndex); + // If we are merging crash callbacks, they have two parameters. + AsanErrorCallback[AccessIsWrite][AccessSizeIndex] = + checkInterfaceFunction(M.getOrInsertFunction( + FunctionName, IRB.getVoidTy(), IntptrTy, NULL)); + } + } + + AsanHandleNoReturnFunc = checkInterfaceFunction(M.getOrInsertFunction( + kAsanHandleNoReturnName, IRB.getVoidTy(), NULL)); + // We insert an empty inline asm after __asan_report* to avoid callback merge. + EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false), + StringRef(""), StringRef(""), + /*hasSideEffects=*/true); +} + // virtual -bool AddressSanitizer::runOnModule(Module &M) { +bool AddressSanitizer::doInitialization(Module &M) { // Initialize the private fields. No one has accessed them before. - TD = getAnalysisIfAvailable<TargetData>(); + TD = getAnalysisIfAvailable<DataLayout>(); + if (!TD) return false; - BL.reset(new BlackList(ClBlackListFile)); + BL.reset(new BlackList(BlacklistFile)); + DynamicallyInitializedGlobals.Init(M); C = &(M.getContext()); LongSize = TD->getPointerSizeInBits(); IntptrTy = Type::getIntNTy(*C, LongSize); - IntptrPtrTy = PointerType::get(IntptrTy, 0); AsanCtorFunction = Function::Create( FunctionType::get(Type::getVoidTy(*C), false), GlobalValue::InternalLinkage, kAsanModuleCtorName, &M); BasicBlock *AsanCtorBB = BasicBlock::Create(*C, "", AsanCtorFunction); - CtorInsertBefore = ReturnInst::Create(*C, AsanCtorBB); - // call __asan_init in the module ctor. - IRBuilder<> IRB(CtorInsertBefore); + IRBuilder<> IRB(ReturnInst::Create(*C, AsanCtorBB)); AsanInitFunction = checkInterfaceFunction( M.getOrInsertFunction(kAsanInitName, IRB.getVoidTy(), NULL)); AsanInitFunction->setLinkage(Function::ExternalLinkage); IRB.CreateCall(AsanInitFunction); - // Create __asan_report* callbacks. 
- for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) { - for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes; - AccessSizeIndex++) { - // IsWrite and TypeSize are encoded in the function name. - std::string FunctionName = std::string(kAsanReportErrorTemplate) + - (AccessIsWrite ? "store" : "load") + itostr(1 << AccessSizeIndex); - // If we are merging crash callbacks, they have two parameters. - AsanErrorCallback[AccessIsWrite][AccessSizeIndex] = cast<Function>( - M.getOrInsertFunction(FunctionName, IRB.getVoidTy(), IntptrTy, NULL)); - } - } - // We insert an empty inline asm after __asan_report* to avoid callback merge. - EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false), - StringRef(""), StringRef(""), - /*hasSideEffects=*/true); - llvm::Triple targetTriple(M.getTargetTriple()); bool isAndroid = targetTriple.getEnvironment() == llvm::Triple::Android; @@ -789,18 +968,7 @@ bool AddressSanitizer::runOnModule(Module &M) { MappingOffset = 1ULL << ClMappingOffsetLog; } } - MappingScale = kDefaultShadowScale; - if (ClMappingScale) { - MappingScale = ClMappingScale; - } - // Redzone used for stack and globals is at least 32 bytes. - // For scales 6 and 7, the redzone has to be 64 and 128 bytes respectively. - RedzoneSize = std::max(32, (int)(1 << MappingScale)); - bool Res = false; - - if (ClGlobals) - Res |= insertGlobalRedzones(M); if (ClMappingOffsetLog >= 0) { // Tell the run-time the current values of mapping offset and scale. @@ -814,21 +982,15 @@ bool AddressSanitizer::runOnModule(Module &M) { if (ClMappingScale) { GlobalValue *asan_mapping_scale = new GlobalVariable(M, IntptrTy, true, GlobalValue::LinkOnceODRLinkage, - ConstantInt::get(IntptrTy, MappingScale), + ConstantInt::get(IntptrTy, MappingScale()), kAsanMappingScaleName); // Read the global, otherwise it may be optimized away. IRB.CreateLoad(asan_mapping_scale, true); } - - for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) { - if (F->isDeclaration()) continue; - Res |= handleFunction(M, *F); - } - appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndCtorPriority); - return Res; + return true; } bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) { @@ -847,19 +1009,24 @@ bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) { return false; } -bool AddressSanitizer::handleFunction(Module &M, Function &F) { +bool AddressSanitizer::runOnFunction(Function &F) { if (BL->isIn(F)) return false; if (&F == AsanCtorFunction) return false; + DEBUG(dbgs() << "ASAN instrumenting:\n" << F << "\n"); + initializeCallbacks(*F.getParent()); // If needed, insert __asan_init before checking for AddressSafety attr. maybeInsertAsanInitAtFunctionEntry(F); - if (!F.hasFnAttr(Attribute::AddressSafety)) return false; + if (!F.getAttributes().hasAttribute(AttributeSet::FunctionIndex, + Attribute::AddressSafety)) + return false; if (!ClDebugFunc.empty() && ClDebugFunc != F.getName()) return false; - // We want to instrument every address only once per basic block - // (unless there are calls between uses). + + // We want to instrument every address only once per basic block (unless there + // are calls between uses). SmallSet<Value*, 16> TempsToInstrument; SmallVector<Instruction*, 16> ToInstrument; SmallVector<Instruction*, 8> NoReturnCalls; @@ -897,8 +1064,6 @@ bool AddressSanitizer::handleFunction(Module &M, Function &F) { } } - AsanFunctionContext AFC(F); - // Instrument. 
int NumInstrumented = 0; for (size_t i = 0, n = ToInstrument.size(); i != n; i++) { @@ -906,25 +1071,24 @@ bool AddressSanitizer::handleFunction(Module &M, Function &F) { if (ClDebugMin < 0 || ClDebugMax < 0 || (NumInstrumented >= ClDebugMin && NumInstrumented <= ClDebugMax)) { if (isInterestingMemoryAccess(Inst, &IsWrite)) - instrumentMop(AFC, Inst); + instrumentMop(Inst); else - instrumentMemIntrinsic(AFC, cast<MemIntrinsic>(Inst)); + instrumentMemIntrinsic(cast<MemIntrinsic>(Inst)); } NumInstrumented++; } - DEBUG(dbgs() << F); - - bool ChangedStack = poisonStackInFunction(M, F); + FunctionStackPoisoner FSP(F, *this); + bool ChangedStack = FSP.runOnFunction(); // We must unpoison the stack before every NoReturn call (throw, _exit, etc). // See e.g. http://code.google.com/p/address-sanitizer/issues/detail?id=37 for (size_t i = 0, n = NoReturnCalls.size(); i != n; i++) { Instruction *CI = NoReturnCalls[i]; IRBuilder<> IRB(CI); - IRB.CreateCall(M.getOrInsertFunction(kAsanHandleNoReturnName, - IRB.getVoidTy(), NULL)); + IRB.CreateCall(AsanHandleNoReturnFunc); } + DEBUG(dbgs() << "ASAN done instrumenting:\n" << F << "\n"); return NumInstrumented > 0 || ChangedStack || !NoReturnCalls.empty(); } @@ -940,10 +1104,10 @@ static uint64_t ValueForPoison(uint64_t PoisonByte, size_t ShadowRedzoneSize) { static void PoisonShadowPartialRightRedzone(uint8_t *Shadow, size_t Size, - size_t RedzoneSize, + size_t RZSize, size_t ShadowGranularity, uint8_t Magic) { - for (size_t i = 0; i < RedzoneSize; + for (size_t i = 0; i < RZSize; i+= ShadowGranularity, Shadow++) { if (i + ShadowGranularity <= Size) { *Shadow = 0; // fully addressable @@ -955,10 +1119,35 @@ static void PoisonShadowPartialRightRedzone(uint8_t *Shadow, } } -void AddressSanitizer::PoisonStack(const ArrayRef<AllocaInst*> &AllocaVec, - IRBuilder<> IRB, - Value *ShadowBase, bool DoPoison) { - size_t ShadowRZSize = RedzoneSize >> MappingScale; +// Workaround for bug 11395: we don't want to instrument stack in functions +// with large assembly blobs (32-bit only), otherwise reg alloc may crash. +// FIXME: remove once the bug 11395 is fixed. +bool AddressSanitizer::LooksLikeCodeInBug11395(Instruction *I) { + if (LongSize != 32) return false; + CallInst *CI = dyn_cast<CallInst>(I); + if (!CI || !CI->isInlineAsm()) return false; + if (CI->getNumArgOperands() <= 5) return false; + // We have inline assembly with quite a few arguments. 
+ return true; +} + +void FunctionStackPoisoner::initializeCallbacks(Module &M) { + IRBuilder<> IRB(*C); + AsanStackMallocFunc = checkInterfaceFunction(M.getOrInsertFunction( + kAsanStackMallocName, IntptrTy, IntptrTy, IntptrTy, NULL)); + AsanStackFreeFunc = checkInterfaceFunction(M.getOrInsertFunction( + kAsanStackFreeName, IRB.getVoidTy(), + IntptrTy, IntptrTy, IntptrTy, NULL)); + AsanPoisonStackMemoryFunc = checkInterfaceFunction(M.getOrInsertFunction( + kAsanPoisonStackMemoryName, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL)); + AsanUnpoisonStackMemoryFunc = checkInterfaceFunction(M.getOrInsertFunction( + kAsanUnpoisonStackMemoryName, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL)); +} + +void FunctionStackPoisoner::poisonRedZones( + const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> IRB, Value *ShadowBase, + bool DoPoison) { + size_t ShadowRZSize = RedzoneSize() >> MappingScale(); assert(ShadowRZSize >= 1 && ShadowRZSize <= 4); Type *RZTy = Type::getIntNTy(*C, ShadowRZSize * 8); Type *RZPtrTy = PointerType::get(RZTy, 0); @@ -974,12 +1163,12 @@ void AddressSanitizer::PoisonStack(const ArrayRef<AllocaInst*> &AllocaVec, IRB.CreateStore(PoisonLeft, IRB.CreateIntToPtr(ShadowBase, RZPtrTy)); // poison all other red zones. - uint64_t Pos = RedzoneSize; + uint64_t Pos = RedzoneSize(); for (size_t i = 0, n = AllocaVec.size(); i < n; i++) { AllocaInst *AI = AllocaVec[i]; uint64_t SizeInBytes = getAllocaSizeInBytes(AI); uint64_t AlignedSize = getAlignedAllocaSize(AI); - assert(AlignedSize - SizeInBytes < RedzoneSize); + assert(AlignedSize - SizeInBytes < RedzoneSize()); Value *Ptr = NULL; Pos += AlignedSize; @@ -989,13 +1178,13 @@ void AddressSanitizer::PoisonStack(const ArrayRef<AllocaInst*> &AllocaVec, // Poison the partial redzone at right Ptr = IRB.CreateAdd( ShadowBase, ConstantInt::get(IntptrTy, - (Pos >> MappingScale) - ShadowRZSize)); - size_t AddressableBytes = RedzoneSize - (AlignedSize - SizeInBytes); + (Pos >> MappingScale()) - ShadowRZSize)); + size_t AddressableBytes = RedzoneSize() - (AlignedSize - SizeInBytes); uint32_t Poison = 0; if (DoPoison) { PoisonShadowPartialRightRedzone((uint8_t*)&Poison, AddressableBytes, - RedzoneSize, - 1ULL << MappingScale, + RedzoneSize(), + 1ULL << MappingScale(), kAsanStackPartialRedzoneMagic); } Value *PartialPoison = ConstantInt::get(RZTy, Poison); @@ -1004,76 +1193,23 @@ void AddressSanitizer::PoisonStack(const ArrayRef<AllocaInst*> &AllocaVec, // Poison the full redzone at right. Ptr = IRB.CreateAdd(ShadowBase, - ConstantInt::get(IntptrTy, Pos >> MappingScale)); - Value *Poison = i == AllocaVec.size() - 1 ? PoisonRight : PoisonMid; + ConstantInt::get(IntptrTy, Pos >> MappingScale())); + bool LastAlloca = (i == AllocaVec.size() - 1); + Value *Poison = LastAlloca ? PoisonRight : PoisonMid; IRB.CreateStore(Poison, IRB.CreateIntToPtr(Ptr, RZPtrTy)); - Pos += RedzoneSize; + Pos += RedzoneSize(); } } -// Workaround for bug 11395: we don't want to instrument stack in functions -// with large assembly blobs (32-bit only), otherwise reg alloc may crash. -// FIXME: remove once the bug 11395 is fixed. -bool AddressSanitizer::LooksLikeCodeInBug11395(Instruction *I) { - if (LongSize != 32) return false; - CallInst *CI = dyn_cast<CallInst>(I); - if (!CI || !CI->isInlineAsm()) return false; - if (CI->getNumArgOperands() <= 5) return false; - // We have inline assembly with quite a few arguments. 
- return true; -} +void FunctionStackPoisoner::poisonStack() { + uint64_t LocalStackSize = TotalStackSize + + (AllocaVec.size() + 1) * RedzoneSize(); -// Find all static Alloca instructions and put -// poisoned red zones around all of them. -// Then unpoison everything back before the function returns. -// -// Stack poisoning does not play well with exception handling. -// When an exception is thrown, we essentially bypass the code -// that unpoisones the stack. This is why the run-time library has -// to intercept __cxa_throw (as well as longjmp, etc) and unpoison the entire -// stack in the interceptor. This however does not work inside the -// actual function which catches the exception. Most likely because the -// compiler hoists the load of the shadow value somewhere too high. -// This causes asan to report a non-existing bug on 453.povray. -// It sounds like an LLVM bug. -bool AddressSanitizer::poisonStackInFunction(Module &M, Function &F) { - if (!ClStack) return false; - SmallVector<AllocaInst*, 16> AllocaVec; - SmallVector<Instruction*, 8> RetVec; - uint64_t TotalSize = 0; - - // Filter out Alloca instructions we want (and can) handle. - // Collect Ret instructions. - for (Function::iterator FI = F.begin(), FE = F.end(); - FI != FE; ++FI) { - BasicBlock &BB = *FI; - for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); - BI != BE; ++BI) { - if (isa<ReturnInst>(BI)) { - RetVec.push_back(BI); - continue; - } - - AllocaInst *AI = dyn_cast<AllocaInst>(BI); - if (!AI) continue; - if (AI->isArrayAllocation()) continue; - if (!AI->isStaticAlloca()) continue; - if (!AI->getAllocatedType()->isSized()) continue; - if (AI->getAlignment() > RedzoneSize) continue; - AllocaVec.push_back(AI); - uint64_t AlignedSize = getAlignedAllocaSize(AI); - TotalSize += AlignedSize; - } - } - - if (AllocaVec.empty()) return false; - - uint64_t LocalStackSize = TotalSize + (AllocaVec.size() + 1) * RedzoneSize; - - bool DoStackMalloc = ClUseAfterReturn + bool DoStackMalloc = ASan.CheckUseAfterReturn && LocalStackSize <= kMaxStackMallocSize; + assert(AllocaVec.size() > 0); Instruction *InsBefore = AllocaVec[0]; IRBuilder<> IRB(InsBefore); @@ -1081,14 +1217,14 @@ bool AddressSanitizer::poisonStackInFunction(Module &M, Function &F) { Type *ByteArrayTy = ArrayType::get(IRB.getInt8Ty(), LocalStackSize); AllocaInst *MyAlloca = new AllocaInst(ByteArrayTy, "MyAlloca", InsBefore); - MyAlloca->setAlignment(RedzoneSize); + if (ClRealignStack && StackAlignment < RedzoneSize()) + StackAlignment = RedzoneSize(); + MyAlloca->setAlignment(StackAlignment); assert(MyAlloca->isStaticAlloca()); Value *OrigStackBase = IRB.CreatePointerCast(MyAlloca, IntptrTy); Value *LocalStackBase = OrigStackBase; if (DoStackMalloc) { - Value *AsanStackMallocFunc = M.getOrInsertFunction( - kAsanStackMallocName, IntptrTy, IntptrTy, IntptrTy, NULL); LocalStackBase = IRB.CreateCall2(AsanStackMallocFunc, ConstantInt::get(IntptrTy, LocalStackSize), OrigStackBase); } @@ -1098,7 +1234,19 @@ bool AddressSanitizer::poisonStackInFunction(Module &M, Function &F) { raw_svector_ostream StackDescription(StackDescriptionStorage); StackDescription << F.getName() << " " << AllocaVec.size() << " "; - uint64_t Pos = RedzoneSize; + // Insert poison calls for lifetime intrinsics for alloca. 
+ bool HavePoisonedAllocas = false; + for (size_t i = 0, n = AllocaPoisonCallVec.size(); i < n; i++) { + const AllocaPoisonCall &APC = AllocaPoisonCallVec[i]; + IntrinsicInst *II = APC.InsBefore; + AllocaInst *AI = findAllocaForValue(II->getArgOperand(1)); + assert(AI); + IRBuilder<> IRB(II); + poisonAlloca(AI, APC.Size, IRB, APC.DoPoison); + HavePoisonedAllocas |= APC.DoPoison; + } + + uint64_t Pos = RedzoneSize(); // Replace Alloca instructions with base+offset. for (size_t i = 0, n = AllocaVec.size(); i < n; i++) { AllocaInst *AI = AllocaVec[i]; @@ -1107,12 +1255,13 @@ bool AddressSanitizer::poisonStackInFunction(Module &M, Function &F) { StackDescription << Pos << " " << SizeInBytes << " " << Name.size() << " " << Name << " "; uint64_t AlignedSize = getAlignedAllocaSize(AI); - assert((AlignedSize % RedzoneSize) == 0); - AI->replaceAllUsesWith( - IRB.CreateIntToPtr( + assert((AlignedSize % RedzoneSize()) == 0); + Value *NewAllocaPtr = IRB.CreateIntToPtr( IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, Pos)), - AI->getType())); - Pos += AlignedSize + RedzoneSize; + AI->getType()); + replaceDbgDeclareForAlloca(AI, NewAllocaPtr, DIB); + AI->replaceAllUsesWith(NewAllocaPtr); + Pos += AlignedSize + RedzoneSize(); } assert(Pos == LocalStackSize); @@ -1121,45 +1270,93 @@ bool AddressSanitizer::poisonStackInFunction(Module &M, Function &F) { IRB.CreateStore(ConstantInt::get(IntptrTy, kCurrentStackFrameMagic), BasePlus0); Value *BasePlus1 = IRB.CreateAdd(LocalStackBase, - ConstantInt::get(IntptrTy, LongSize/8)); + ConstantInt::get(IntptrTy, + ASan.LongSize/8)); BasePlus1 = IRB.CreateIntToPtr(BasePlus1, IntptrPtrTy); - Value *Description = IRB.CreatePointerCast( - createPrivateGlobalForString(M, StackDescription.str()), - IntptrTy); + GlobalVariable *StackDescriptionGlobal = + createPrivateGlobalForString(*F.getParent(), StackDescription.str()); + Value *Description = IRB.CreatePointerCast(StackDescriptionGlobal, + IntptrTy); IRB.CreateStore(Description, BasePlus1); // Poison the stack redzones at the entry. - Value *ShadowBase = memToShadow(LocalStackBase, IRB); - PoisonStack(ArrayRef<AllocaInst*>(AllocaVec), IRB, ShadowBase, true); - - Value *AsanStackFreeFunc = NULL; - if (DoStackMalloc) { - AsanStackFreeFunc = M.getOrInsertFunction( - kAsanStackFreeName, IRB.getVoidTy(), - IntptrTy, IntptrTy, IntptrTy, NULL); - } + Value *ShadowBase = ASan.memToShadow(LocalStackBase, IRB); + poisonRedZones(AllocaVec, IRB, ShadowBase, true); // Unpoison the stack before all ret instructions. for (size_t i = 0, n = RetVec.size(); i < n; i++) { Instruction *Ret = RetVec[i]; IRBuilder<> IRBRet(Ret); - // Mark the current frame as retired. IRBRet.CreateStore(ConstantInt::get(IntptrTy, kRetiredStackFrameMagic), BasePlus0); // Unpoison the stack. - PoisonStack(ArrayRef<AllocaInst*>(AllocaVec), IRBRet, ShadowBase, false); - + poisonRedZones(AllocaVec, IRBRet, ShadowBase, false); if (DoStackMalloc) { + // In use-after-return mode, mark the whole stack frame unaddressable. IRBRet.CreateCall3(AsanStackFreeFunc, LocalStackBase, ConstantInt::get(IntptrTy, LocalStackSize), OrigStackBase); + } else if (HavePoisonedAllocas) { + // If we poisoned some allocas in llvm.lifetime analysis, + // unpoison whole stack frame now. + assert(LocalStackBase == OrigStackBase); + poisonAlloca(LocalStackBase, LocalStackSize, IRBRet, false); } } - if (ClDebugStack) { - DEBUG(dbgs() << F); - } + // We are done. Remove the old unused alloca instructions. 
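Taken together, the poisonStack() pieces above lay the locals out inside one large alloca, separated and surrounded by redzones, with the frame magic and a pointer to the frame description written into the leftmost redzone. A schematic for a function with two 8-byte locals a and b and the default 32-byte redzone (offsets are illustrative):

    // LocalStackSize = TotalStackSize + (NumAllocas + 1) * RedzoneSize()
    //                = (32 + 32) + 3 * 32 = 160 bytes
    //
    //   [  0,  32)  left redzone: kCurrentStackFrameMagic, then a pointer to
    //               the description string "<fn> 2 32 8 1 a 96 8 1 b "
    //   [ 32,  64)  a  (8 bytes used; the rest is a partial right redzone)
    //   [ 64,  96)  middle redzone
    //   [ 96, 128)  b  (8 bytes used; the rest is a partial right redzone)
    //   [128, 160)  right redzone
    //
    // poisonRedZones() marks everything outside the used bytes as inaccessible
    // on entry and clears those marks again before every return.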
+ for (size_t i = 0, n = AllocaVec.size(); i < n; i++) + AllocaVec[i]->eraseFromParent(); +} - return true; +void FunctionStackPoisoner::poisonAlloca(Value *V, uint64_t Size, + IRBuilder<> IRB, bool DoPoison) { + // For now just insert the call to ASan runtime. + Value *AddrArg = IRB.CreatePointerCast(V, IntptrTy); + Value *SizeArg = ConstantInt::get(IntptrTy, Size); + IRB.CreateCall2(DoPoison ? AsanPoisonStackMemoryFunc + : AsanUnpoisonStackMemoryFunc, + AddrArg, SizeArg); +} + +// Handling llvm.lifetime intrinsics for a given %alloca: +// (1) collect all llvm.lifetime.xxx(%size, %value) describing the alloca. +// (2) if %size is constant, poison memory for llvm.lifetime.end (to detect +// invalid accesses) and unpoison it for llvm.lifetime.start (the memory +// could be poisoned by previous llvm.lifetime.end instruction, as the +// variable may go in and out of scope several times, e.g. in loops). +// (3) if we poisoned at least one %alloca in a function, +// unpoison the whole stack frame at function exit. + +AllocaInst *FunctionStackPoisoner::findAllocaForValue(Value *V) { + if (AllocaInst *AI = dyn_cast<AllocaInst>(V)) + // We're intested only in allocas we can handle. + return isInterestingAlloca(*AI) ? AI : 0; + // See if we've already calculated (or started to calculate) alloca for a + // given value. + AllocaForValueMapTy::iterator I = AllocaForValue.find(V); + if (I != AllocaForValue.end()) + return I->second; + // Store 0 while we're calculating alloca for value V to avoid + // infinite recursion if the value references itself. + AllocaForValue[V] = 0; + AllocaInst *Res = 0; + if (CastInst *CI = dyn_cast<CastInst>(V)) + Res = findAllocaForValue(CI->getOperand(0)); + else if (PHINode *PN = dyn_cast<PHINode>(V)) { + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + Value *IncValue = PN->getIncomingValue(i); + // Allow self-referencing phi-nodes. + if (IncValue == PN) continue; + AllocaInst *IncValueAI = findAllocaForValue(IncValue); + // AI for incoming values should exist and should all be equal. + if (IncValueAI == 0 || (Res != 0 && IncValueAI != Res)) + return 0; + Res = IncValueAI; + } + } + if (Res != 0) + AllocaForValue[V] = Res; + return Res; } diff --git a/lib/Transforms/Instrumentation/BlackList.cpp b/lib/Transforms/Instrumentation/BlackList.cpp index 2cb1199..4fcbea4 100644 --- a/lib/Transforms/Instrumentation/BlackList.cpp +++ b/lib/Transforms/Instrumentation/BlackList.cpp @@ -13,26 +13,26 @@ // //===----------------------------------------------------------------------===// -#include <utility> -#include <string> - #include "BlackList.h" #include "llvm/ADT/OwningPtr.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" -#include "llvm/Function.h" -#include "llvm/GlobalVariable.h" -#include "llvm/Module.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Module.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Regex.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/system_error.h" +#include <string> +#include <utility> namespace llvm { BlackList::BlackList(const StringRef Path) { // Validate and open blacklist file. 
- if (!Path.size()) return; + if (Path.empty()) return; OwningPtr<MemoryBuffer> File; if (error_code EC = MemoryBuffer::getFile(Path, File)) { report_fatal_error("Can't open blacklist file: " + Path + ": " + @@ -45,10 +45,17 @@ BlackList::BlackList(const StringRef Path) { StringMap<std::string> Regexps; for (SmallVector<StringRef, 16>::iterator I = Lines.begin(), E = Lines.end(); I != E; ++I) { + // Ignore empty lines and lines starting with "#" + if (I->empty() || I->startswith("#")) + continue; // Get our prefix and unparsed regexp. std::pair<StringRef, StringRef> SplitLine = I->split(":"); StringRef Prefix = SplitLine.first; std::string Regexp = SplitLine.second; + if (Regexp.empty()) { + // Missing ':' in the line. + report_fatal_error("malformed blacklist line: " + SplitLine.first); + } // Replace * with .* for (size_t pos = 0; (pos = Regexp.find("*", pos)) != std::string::npos; @@ -65,7 +72,7 @@ BlackList::BlackList(const StringRef Path) { } // Add this regexp into the proper group by its prefix. - if (Regexps[Prefix].size()) + if (!Regexps[Prefix].empty()) Regexps[Prefix] += "|"; Regexps[Prefix] += Regexp; } @@ -89,14 +96,29 @@ bool BlackList::isIn(const Module &M) { return inSection("src", M.getModuleIdentifier()); } +static StringRef GetGVTypeString(const GlobalVariable &G) { + // Types of GlobalVariables are always pointer types. + Type *GType = G.getType()->getElementType(); + // For now we support blacklisting struct types only. + if (StructType *SGType = dyn_cast<StructType>(GType)) { + if (!SGType->isLiteral()) + return SGType->getName(); + } + return "<unknown type>"; +} + bool BlackList::isInInit(const GlobalVariable &G) { - return isIn(*G.getParent()) || inSection("global-init", G.getName()); + return (isIn(*G.getParent()) || + inSection("global-init", G.getName()) || + inSection("global-init-type", GetGVTypeString(G))); } -bool BlackList::inSection(const StringRef Section, - const StringRef Query) { - Regex *FunctionRegex = Entries[Section]; - return FunctionRegex ? FunctionRegex->match(Query) : false; +bool BlackList::inSection(const StringRef Section, const StringRef Query) { + StringMap<Regex*>::iterator I = Entries.find(Section); + if (I == Entries.end()) return false; + + Regex *FunctionRegex = I->getValue(); + return FunctionRegex->match(Query); } } // namespace llvm diff --git a/lib/Transforms/Instrumentation/BlackList.h b/lib/Transforms/Instrumentation/BlackList.h index 73977fc..ee18a98 100644 --- a/lib/Transforms/Instrumentation/BlackList.h +++ b/lib/Transforms/Instrumentation/BlackList.h @@ -12,10 +12,13 @@ // // The blacklist disables instrumentation of various functions and global // variables. Each line contains a prefix, followed by a wild card expression. +// Empty lines and lines starting with "#" are ignored. 
// --- +// # Blacklisted items: // fun:*_ZN4base6subtle* // global:*global_with_bad_access_or_initialization* // global-init:*global_with_initialization_issues* +// global-init-type:*Namespace::ClassName* // src:file_with_tricky_code.cc // --- // Note that the wild card is in fact an llvm::Regex, but * is automatically diff --git a/lib/Transforms/Instrumentation/BoundsChecking.cpp b/lib/Transforms/Instrumentation/BoundsChecking.cpp index 6429081..b094d42 100644 --- a/lib/Transforms/Instrumentation/BoundsChecking.cpp +++ b/lib/Transforms/Instrumentation/BoundsChecking.cpp @@ -13,19 +13,19 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "bounds-checking" -#include "llvm/IRBuilder.h" -#include "llvm/Intrinsics.h" -#include "llvm/Pass.h" +#include "llvm/Transforms/Instrumentation.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/InstIterator.h" #include "llvm/Support/TargetFolder.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetData.h" #include "llvm/Target/TargetLibraryInfo.h" -#include "llvm/Transforms/Instrumentation.h" using namespace llvm; static cl::opt<bool> SingleTrapBB("bounds-checking-single-trap", @@ -41,25 +41,24 @@ namespace { struct BoundsChecking : public FunctionPass { static char ID; - BoundsChecking(unsigned _Penalty = 5) : FunctionPass(ID), Penalty(_Penalty){ + BoundsChecking() : FunctionPass(ID) { initializeBoundsCheckingPass(*PassRegistry::getPassRegistry()); } virtual bool runOnFunction(Function &F); virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<TargetData>(); + AU.addRequired<DataLayout>(); AU.addRequired<TargetLibraryInfo>(); } private: - const TargetData *TD; + const DataLayout *TD; const TargetLibraryInfo *TLI; ObjectSizeOffsetEvaluator *ObjSizeEval; BuilderTy *Builder; Instruction *Inst; BasicBlock *TrapBB; - unsigned Penalty; BasicBlock *getTrapBB(); void emitBranchToTrap(Value *Cmp = 0); @@ -109,6 +108,7 @@ void BoundsChecking::emitBranchToTrap(Value *Cmp) { else Cmp = 0; // unconditional branch } + ++ChecksAdded; Instruction *Inst = Builder->GetInsertPoint(); BasicBlock *OldBB = Inst->getParent(); @@ -143,7 +143,7 @@ bool BoundsChecking::instrument(Value *Ptr, Value *InstVal) { Value *Offset = SizeOffset.second; ConstantInt *SizeCI = dyn_cast<ConstantInt>(Size); - IntegerType *IntTy = TD->getIntPtrType(Inst->getContext()); + Type *IntTy = TD->getIntPtrType(Ptr->getType()); Value *NeededSizeVal = ConstantInt::get(IntTy, NeededSize); // three checks are required to ensure safety: @@ -163,12 +163,11 @@ bool BoundsChecking::instrument(Value *Ptr, Value *InstVal) { } emitBranchToTrap(Or); - ++ChecksAdded; return true; } bool BoundsChecking::runOnFunction(Function &F) { - TD = &getAnalysis<TargetData>(); + TD = &getAnalysis<DataLayout>(); TLI = &getAnalysis<TargetLibraryInfo>(); TrapBB = 0; @@ -208,6 +207,6 @@ bool BoundsChecking::runOnFunction(Function &F) { return MadeChange; } -FunctionPass *llvm::createBoundsCheckingPass(unsigned Penalty) { - return new BoundsChecking(Penalty); +FunctionPass *llvm::createBoundsCheckingPass() { + return new BoundsChecking(); } diff --git a/lib/Transforms/Instrumentation/CMakeLists.txt b/lib/Transforms/Instrumentation/CMakeLists.txt index 058f68c..1c9e053 100644 --- 
a/lib/Transforms/Instrumentation/CMakeLists.txt +++ b/lib/Transforms/Instrumentation/CMakeLists.txt @@ -4,6 +4,7 @@ add_llvm_library(LLVMInstrumentation BoundsChecking.cpp EdgeProfiling.cpp GCOVProfiling.cpp + MemorySanitizer.cpp Instrumentation.cpp OptimalEdgeProfiling.cpp PathProfiling.cpp diff --git a/lib/Transforms/Instrumentation/EdgeProfiling.cpp b/lib/Transforms/Instrumentation/EdgeProfiling.cpp index e8ef265..0b18b4c 100644 --- a/lib/Transforms/Instrumentation/EdgeProfiling.cpp +++ b/lib/Transforms/Instrumentation/EdgeProfiling.cpp @@ -18,13 +18,14 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "insert-edge-profiling" +#include "llvm/Transforms/Instrumentation.h" #include "ProfilingUtils.h" -#include "llvm/Module.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" #include "llvm/Pass.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Instrumentation.h" -#include "llvm/ADT/Statistic.h" #include <set> using namespace llvm; @@ -54,8 +55,8 @@ ModulePass *llvm::createEdgeProfilerPass() { return new EdgeProfiler(); } bool EdgeProfiler::runOnModule(Module &M) { Function *Main = M.getFunction("main"); if (Main == 0) { - errs() << "WARNING: cannot insert edge profiling into a module" - << " with no main function!\n"; + M.getContext().emitWarning("cannot insert edge profiling into a module" + " with no main function"); return false; // No main, no instrumentation! } diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp index 9fcde31..eb0dc1e 100644 --- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -16,19 +16,19 @@ #define DEBUG_TYPE "insert-gcov-profiling" -#include "ProfilingUtils.h" #include "llvm/Transforms/Instrumentation.h" -#include "llvm/DebugInfo.h" -#include "llvm/IRBuilder.h" -#include "llvm/Instructions.h" -#include "llvm/Module.h" -#include "llvm/Pass.h" +#include "ProfilingUtils.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/UniqueVector.h" +#include "llvm/DebugInfo.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/DebugLoc.h" #include "llvm/Support/InstIterator.h" @@ -45,13 +45,14 @@ namespace { static char ID; GCOVProfiler() : ModulePass(ID), EmitNotes(true), EmitData(true), Use402Format(false), - UseExtraChecksum(false) { + UseExtraChecksum(false), NoRedZone(false) { initializeGCOVProfilerPass(*PassRegistry::getPassRegistry()); } - GCOVProfiler(bool EmitNotes, bool EmitData, bool use402Format = false, - bool useExtraChecksum = false) + GCOVProfiler(bool EmitNotes, bool EmitData, bool use402Format, + bool useExtraChecksum, bool NoRedZone_) : ModulePass(ID), EmitNotes(EmitNotes), EmitData(EmitData), - Use402Format(use402Format), UseExtraChecksum(useExtraChecksum) { + Use402Format(use402Format), UseExtraChecksum(useExtraChecksum), + NoRedZone(NoRedZone_) { assert((EmitNotes || EmitData) && "GCOVProfiler asked to do nothing?"); initializeGCOVProfilerPass(*PassRegistry::getPassRegistry()); } @@ -90,6 +91,7 @@ namespace { // list. 
void insertCounterWriteout(ArrayRef<std::pair<GlobalVariable*, MDNode*> >); void insertIndirectCounterIncrement(); + void insertFlush(ArrayRef<std::pair<GlobalVariable*, MDNode*> >); std::string mangleName(DICompileUnit CU, const char *NewStem); @@ -97,6 +99,7 @@ namespace { bool EmitData; bool Use402Format; bool UseExtraChecksum; + bool NoRedZone; Module *M; LLVMContext *Ctx; @@ -109,8 +112,10 @@ INITIALIZE_PASS(GCOVProfiler, "insert-gcov-profiling", ModulePass *llvm::createGCOVProfilerPass(bool EmitNotes, bool EmitData, bool Use402Format, - bool UseExtraChecksum) { - return new GCOVProfiler(EmitNotes, EmitData, Use402Format, UseExtraChecksum); + bool UseExtraChecksum, + bool NoRedZone) { + return new GCOVProfiler(EmitNotes, EmitData, Use402Format, UseExtraChecksum, + NoRedZone); } namespace { @@ -518,6 +523,7 @@ bool GCOVProfiler::emitProfileArcs() { } insertCounterWriteout(CountersBySP); + insertFlush(CountersBySP); } if (InsertIndCounterIncrCode) @@ -538,13 +544,13 @@ GlobalVariable *GCOVProfiler::buildEdgeLookupTable( // read it. Threads and invoke make this untrue. // emit [(succs * preds) x i64*], logically [succ x [pred x i64*]]. + size_t TableSize = Succs.size() * Preds.size(); Type *Int64PtrTy = Type::getInt64PtrTy(*Ctx); - ArrayType *EdgeTableTy = ArrayType::get( - Int64PtrTy, Succs.size() * Preds.size()); + ArrayType *EdgeTableTy = ArrayType::get(Int64PtrTy, TableSize); - Constant **EdgeTable = new Constant*[Succs.size() * Preds.size()]; + OwningArrayPtr<Constant *> EdgeTable(new Constant*[TableSize]); Constant *NullValue = Constant::getNullValue(Int64PtrTy); - for (int i = 0, ie = Succs.size() * Preds.size(); i != ie; ++i) + for (size_t i = 0; i != TableSize; ++i) EdgeTable[i] = NullValue; unsigned Edge = 0; @@ -564,7 +570,7 @@ GlobalVariable *GCOVProfiler::buildEdgeLookupTable( Edge += Successors; } - ArrayRef<Constant*> V(&EdgeTable[0], Succs.size() * Preds.size()); + ArrayRef<Constant*> V(&EdgeTable[0], TableSize); GlobalVariable *EdgeTableGV = new GlobalVariable( *M, EdgeTableTy, true, GlobalValue::InternalLinkage, @@ -630,13 +636,17 @@ GlobalVariable *GCOVProfiler::getEdgeStateValue() { void GCOVProfiler::insertCounterWriteout( ArrayRef<std::pair<GlobalVariable *, MDNode *> > CountersBySP) { - FunctionType *WriteoutFTy = - FunctionType::get(Type::getVoidTy(*Ctx), false); - Function *WriteoutF = Function::Create(WriteoutFTy, - GlobalValue::InternalLinkage, - "__llvm_gcov_writeout", M); + FunctionType *WriteoutFTy = FunctionType::get(Type::getVoidTy(*Ctx), false); + Function *WriteoutF = M->getFunction("__llvm_gcov_writeout"); + if (!WriteoutF) + WriteoutF = Function::Create(WriteoutFTy, GlobalValue::InternalLinkage, + "__llvm_gcov_writeout", M); WriteoutF->setUnnamedAddr(true); - BasicBlock *BB = BasicBlock::Create(*Ctx, "", WriteoutF); + WriteoutF->addFnAttr(Attribute::NoInline); + if (NoRedZone) + WriteoutF->addFnAttr(Attribute::NoRedZone); + + BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", WriteoutF); IRBuilder<> Builder(BB); Constant *StartFile = getStartFileFunc(); @@ -647,8 +657,8 @@ void GCOVProfiler::insertCounterWriteout( NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu"); if (CU_Nodes) { for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) { - DICompileUnit compile_unit(CU_Nodes->getOperand(i)); - std::string FilenameGcda = mangleName(compile_unit, "gcda"); + DICompileUnit CU(CU_Nodes->getOperand(i)); + std::string FilenameGcda = mangleName(CU, "gcda"); Builder.CreateCall(StartFile, Builder.CreateGlobalStringPtr(FilenameGcda)); for 
(ArrayRef<std::pair<GlobalVariable *, MDNode *> >::iterator @@ -680,6 +690,8 @@ void GCOVProfiler::insertCounterWriteout( F->setUnnamedAddr(true); F->setLinkage(GlobalValue::InternalLinkage); F->addFnAttr(Attribute::NoInline); + if (NoRedZone) + F->addFnAttr(Attribute::NoRedZone); BB = BasicBlock::Create(*Ctx, "entry", F); Builder.SetInsertPoint(BB); @@ -699,6 +711,8 @@ void GCOVProfiler::insertIndirectCounterIncrement() { Fn->setUnnamedAddr(true); Fn->setLinkage(GlobalValue::InternalLinkage); Fn->addFnAttr(Attribute::NoInline); + if (NoRedZone) + Fn->addFnAttr(Attribute::NoRedZone); Type *Int32Ty = Type::getInt32Ty(*Ctx); Type *Int64Ty = Type::getInt64Ty(*Ctx); @@ -744,3 +758,45 @@ void GCOVProfiler::insertIndirectCounterIncrement() { Builder.SetInsertPoint(Exit); Builder.CreateRetVoid(); } + +void GCOVProfiler:: +insertFlush(ArrayRef<std::pair<GlobalVariable*, MDNode*> > CountersBySP) { + FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false); + Function *FlushF = M->getFunction("__gcov_flush"); + if (!FlushF) + FlushF = Function::Create(FTy, GlobalValue::InternalLinkage, + "__gcov_flush", M); + else + FlushF->setLinkage(GlobalValue::InternalLinkage); + FlushF->setUnnamedAddr(true); + FlushF->addFnAttr(Attribute::NoInline); + if (NoRedZone) + FlushF->addFnAttr(Attribute::NoRedZone); + + BasicBlock *Entry = BasicBlock::Create(*Ctx, "entry", FlushF); + + // Write out the current counters. + Constant *WriteoutF = M->getFunction("__llvm_gcov_writeout"); + assert(WriteoutF && "Need to create the writeout function first!"); + + IRBuilder<> Builder(Entry); + Builder.CreateCall(WriteoutF); + + // Zero out the counters. + for (ArrayRef<std::pair<GlobalVariable *, MDNode *> >::iterator + I = CountersBySP.begin(), E = CountersBySP.end(); + I != E; ++I) { + GlobalVariable *GV = I->first; + Constant *Null = Constant::getNullValue(GV->getType()->getElementType()); + Builder.CreateStore(Null, GV); + } + + Type *RetTy = FlushF->getReturnType(); + if (RetTy == Type::getVoidTy(*Ctx)) + Builder.CreateRetVoid(); + else if (RetTy->isIntegerTy()) + // Used if __gcov_flush was implicitly declared. + Builder.CreateRet(ConstantInt::get(RetTy, 0)); + else + report_fatal_error("invalid return type for __gcov_flush"); +} diff --git a/lib/Transforms/Instrumentation/Instrumentation.cpp b/lib/Transforms/Instrumentation/Instrumentation.cpp index 1e0b4a3..8ba1025 100644 --- a/lib/Transforms/Instrumentation/Instrumentation.cpp +++ b/lib/Transforms/Instrumentation/Instrumentation.cpp @@ -21,11 +21,13 @@ using namespace llvm; /// library. 
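Expressed as ordinary C, the __gcov_flush function that insertFlush() emits behaves roughly like the sketch below. The counter array is illustrative: the real pass stores a null value into each per-function counter global, and the return type may be i32 instead of void if __gcov_flush was implicitly declared.

    extern "C" void __llvm_gcov_writeout(void);   // created by insertCounterWriteout()

    static long long Counters[16];                // stand-in for the counter globals

    extern "C" void __gcov_flush(void) {
      __llvm_gcov_writeout();                     // write the current counts to .gcda
      for (unsigned i = 0; i != 16; ++i)          // then zero every counter so the
        Counters[i] = 0;                          // next flush reports fresh data
    }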
void llvm::initializeInstrumentation(PassRegistry &Registry) { initializeAddressSanitizerPass(Registry); + initializeAddressSanitizerModulePass(Registry); initializeBoundsCheckingPass(Registry); initializeEdgeProfilerPass(Registry); initializeGCOVProfilerPass(Registry); initializeOptimalEdgeProfilerPass(Registry); initializePathProfilerPass(Registry); + initializeMemorySanitizerPass(Registry); initializeThreadSanitizerPass(Registry); } diff --git a/lib/Transforms/Instrumentation/MaximumSpanningTree.h b/lib/Transforms/Instrumentation/MaximumSpanningTree.h index a4bb5a6..363539b 100644 --- a/lib/Transforms/Instrumentation/MaximumSpanningTree.h +++ b/lib/Transforms/Instrumentation/MaximumSpanningTree.h @@ -15,10 +15,10 @@ #ifndef LLVM_ANALYSIS_MAXIMUMSPANNINGTREE_H #define LLVM_ANALYSIS_MAXIMUMSPANNINGTREE_H -#include "llvm/BasicBlock.h" #include "llvm/ADT/EquivalenceClasses.h" -#include <vector> +#include "llvm/IR/BasicBlock.h" #include <algorithm> +#include <vector> namespace llvm { diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp new file mode 100644 index 0000000..58d5801 --- /dev/null +++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -0,0 +1,1857 @@ +//===-- MemorySanitizer.cpp - detector of uninitialized reads -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file is a part of MemorySanitizer, a detector of uninitialized +/// reads. +/// +/// Status: early prototype. +/// +/// The algorithm of the tool is similar to Memcheck +/// (http://goo.gl/QKbem). We associate a few shadow bits with every +/// byte of the application memory, poison the shadow of the malloc-ed +/// or alloca-ed memory, load the shadow bits on every memory read, +/// propagate the shadow bits through some of the arithmetic +/// instruction (including MOV), store the shadow bits on every memory +/// write, report a bug on some other instructions (e.g. JMP) if the +/// associated shadow is poisoned. +/// +/// But there are differences too. The first and the major one: +/// compiler instrumentation instead of binary instrumentation. This +/// gives us much better register allocation, possible compiler +/// optimizations and a fast start-up. But this brings the major issue +/// as well: msan needs to see all program events, including system +/// calls and reads/writes in system libraries, so we either need to +/// compile *everything* with msan or use a binary translation +/// component (e.g. DynamoRIO) to instrument pre-built libraries. +/// Another difference from Memcheck is that we use 8 shadow bits per +/// byte of application memory and use a direct shadow mapping. This +/// greatly simplifies the instrumentation code and avoids races on +/// shadow updates (Memcheck is single-threaded so races are not a +/// concern there. Memcheck uses 2 shadow bits per byte with a slow +/// path storage that uses 8 bits per byte). +/// +/// The default value of shadow is 0, which means "clean" (not poisoned). +/// +/// Every module initializer should call __msan_init to ensure that the +/// shadow memory is ready. On error, __msan_warning is called. 
Since +/// parameters and return values may be passed via registers, we have a +/// specialized thread-local shadow for return values +/// (__msan_retval_tls) and parameters (__msan_param_tls). +/// +/// Origin tracking. +/// +/// MemorySanitizer can track origins (allocation points) of all uninitialized +/// values. This behavior is controlled with a flag (msan-track-origins) and is +/// disabled by default. +/// +/// Origins are 4-byte values created and interpreted by the runtime library. +/// They are stored in a second shadow mapping, one 4-byte value for 4 bytes +/// of application memory. Propagation of origins is basically a bunch of +/// "select" instructions that pick the origin of a dirty argument, if an +/// instruction has one. +/// +/// Every 4 aligned, consecutive bytes of application memory have one origin +/// value associated with them. If these bytes contain uninitialized data +/// coming from 2 different allocations, the last store wins. Because of this, +/// MemorySanitizer reports can show unrelated origins, but this is unlikely in +/// practice. +/// +/// Origins are meaningless for fully initialized values, so MemorySanitizer +/// avoids storing origin to memory when a fully initialized value is stored. +/// This way it avoids needless overwritting origin of the 4-byte region on +/// a short (i.e. 1 byte) clean store, and it is also good for performance. +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "msan" + +#include "llvm/Transforms/Instrumentation.h" +#include "BlackList.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/ValueMap.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/InstVisitor.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" + +using namespace llvm; + +static const uint64_t kShadowMask32 = 1ULL << 31; +static const uint64_t kShadowMask64 = 1ULL << 46; +static const uint64_t kOriginOffset32 = 1ULL << 30; +static const uint64_t kOriginOffset64 = 1ULL << 45; +static const unsigned kMinOriginAlignment = 4; +static const unsigned kShadowTLSAlignment = 8; + +/// \brief Track origins of uninitialized values. +/// +/// Adds a section to MemorySanitizer report that points to the allocation +/// (stack or heap) the uninitialized bits came from originally. 
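To make the direct shadow mapping concrete, here is a minimal standalone sketch that applies the 64-bit constants defined in this file, kShadowMask64 = 1ULL << 46 and kOriginOffset64 = 1ULL << 45, to an example address. It mirrors the two formulas used later in getShadowPtr and getOriginPtr: ShadowAddr = Addr & ~ShadowMask and OriginAddr = (ShadowAddr + OriginOffset) & ~3ULL. The sample address is arbitrary.

#include <cstdint>
#include <cstdio>

// 64-bit constants as defined in MemorySanitizer.cpp.
static const uint64_t kShadowMask64   = 1ULL << 46;
static const uint64_t kOriginOffset64 = 1ULL << 45;

// Shadow lives at the application address with the mask bit cleared.
uint64_t shadowAddr(uint64_t appAddr) { return appAddr & ~kShadowMask64; }

// Origin is a second mapping: shadow address plus a fixed offset, rounded
// down to 4 bytes (one 32-bit origin value per 4 bytes of application memory).
uint64_t originAddr(uint64_t appAddr) {
  return (shadowAddr(appAddr) + kOriginOffset64) & ~3ULL;
}

int main() {
  uint64_t a = 0x7fff12345678ULL;  // arbitrary example address
  std::printf("shadow: %llx origin: %llx\n",
              (unsigned long long)shadowAddr(a),
              (unsigned long long)originAddr(a));
}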
+static cl::opt<bool> ClTrackOrigins("msan-track-origins", + cl::desc("Track origins (allocation sites) of poisoned memory"), + cl::Hidden, cl::init(false)); +static cl::opt<bool> ClKeepGoing("msan-keep-going", + cl::desc("keep going after reporting a UMR"), + cl::Hidden, cl::init(false)); +static cl::opt<bool> ClPoisonStack("msan-poison-stack", + cl::desc("poison uninitialized stack variables"), + cl::Hidden, cl::init(true)); +static cl::opt<bool> ClPoisonStackWithCall("msan-poison-stack-with-call", + cl::desc("poison uninitialized stack variables with a call"), + cl::Hidden, cl::init(false)); +static cl::opt<int> ClPoisonStackPattern("msan-poison-stack-pattern", + cl::desc("poison uninitialized stack variables with the given patter"), + cl::Hidden, cl::init(0xff)); + +static cl::opt<bool> ClHandleICmp("msan-handle-icmp", + cl::desc("propagate shadow through ICmpEQ and ICmpNE"), + cl::Hidden, cl::init(true)); + +static cl::opt<bool> ClStoreCleanOrigin("msan-store-clean-origin", + cl::desc("store origin for clean (fully initialized) values"), + cl::Hidden, cl::init(false)); + +// This flag controls whether we check the shadow of the address +// operand of load or store. Such bugs are very rare, since load from +// a garbage address typically results in SEGV, but still happen +// (e.g. only lower bits of address are garbage, or the access happens +// early at program startup where malloc-ed memory is more likely to +// be zeroed. As of 2012-08-28 this flag adds 20% slowdown. +static cl::opt<bool> ClCheckAccessAddress("msan-check-access-address", + cl::desc("report accesses through a pointer which has poisoned shadow"), + cl::Hidden, cl::init(true)); + +static cl::opt<bool> ClDumpStrictInstructions("msan-dump-strict-instructions", + cl::desc("print out instructions with default strict semantics"), + cl::Hidden, cl::init(false)); + +static cl::opt<std::string> ClBlacklistFile("msan-blacklist", + cl::desc("File containing the list of functions where MemorySanitizer " + "should not report bugs"), cl::Hidden); + +namespace { + +/// \brief An instrumentation pass implementing detection of uninitialized +/// reads. +/// +/// MemorySanitizer: instrument the code in module to find +/// uninitialized reads. +class MemorySanitizer : public FunctionPass { + public: + MemorySanitizer(bool TrackOrigins = false, + StringRef BlacklistFile = StringRef()) + : FunctionPass(ID), + TrackOrigins(TrackOrigins || ClTrackOrigins), + TD(0), + WarningFn(0), + BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile + : BlacklistFile) { } + const char *getPassName() const { return "MemorySanitizer"; } + bool runOnFunction(Function &F); + bool doInitialization(Module &M); + static char ID; // Pass identification, replacement for typeid. + + private: + void initializeCallbacks(Module &M); + + /// \brief Track origins (allocation points) of uninitialized values. + bool TrackOrigins; + + DataLayout *TD; + LLVMContext *C; + Type *IntptrTy; + Type *OriginTy; + /// \brief Thread-local shadow storage for function parameters. + GlobalVariable *ParamTLS; + /// \brief Thread-local origin storage for function parameters. + GlobalVariable *ParamOriginTLS; + /// \brief Thread-local shadow storage for function return value. + GlobalVariable *RetvalTLS; + /// \brief Thread-local origin storage for function return value. + GlobalVariable *RetvalOriginTLS; + /// \brief Thread-local shadow storage for in-register va_arg function + /// parameters (x86_64-specific). 
+ GlobalVariable *VAArgTLS; + /// \brief Thread-local shadow storage for va_arg overflow area + /// (x86_64-specific). + GlobalVariable *VAArgOverflowSizeTLS; + /// \brief Thread-local space used to pass origin value to the UMR reporting + /// function. + GlobalVariable *OriginTLS; + + /// \brief The run-time callback to print a warning. + Value *WarningFn; + /// \brief Run-time helper that copies origin info for a memory range. + Value *MsanCopyOriginFn; + /// \brief Run-time helper that generates a new origin value for a stack + /// allocation. + Value *MsanSetAllocaOriginFn; + /// \brief Run-time helper that poisons stack on function entry. + Value *MsanPoisonStackFn; + /// \brief MSan runtime replacements for memmove, memcpy and memset. + Value *MemmoveFn, *MemcpyFn, *MemsetFn; + + /// \brief Address mask used in application-to-shadow address calculation. + /// ShadowAddr is computed as ApplicationAddr & ~ShadowMask. + uint64_t ShadowMask; + /// \brief Offset of the origin shadow from the "normal" shadow. + /// OriginAddr is computed as (ShadowAddr + OriginOffset) & ~3ULL + uint64_t OriginOffset; + /// \brief Branch weights for error reporting. + MDNode *ColdCallWeights; + /// \brief Branch weights for origin store. + MDNode *OriginStoreWeights; + /// \bried Path to blacklist file. + SmallString<64> BlacklistFile; + /// \brief The blacklist. + OwningPtr<BlackList> BL; + /// \brief An empty volatile inline asm that prevents callback merge. + InlineAsm *EmptyAsm; + + friend struct MemorySanitizerVisitor; + friend struct VarArgAMD64Helper; +}; +} // namespace + +char MemorySanitizer::ID = 0; +INITIALIZE_PASS(MemorySanitizer, "msan", + "MemorySanitizer: detects uninitialized reads.", + false, false) + +FunctionPass *llvm::createMemorySanitizerPass(bool TrackOrigins, + StringRef BlacklistFile) { + return new MemorySanitizer(TrackOrigins, BlacklistFile); +} + +/// \brief Create a non-const global initialized with the given string. +/// +/// Creates a writable global for Str so that we can pass it to the +/// run-time lib. Runtime uses first 4 bytes of the string to store the +/// frame ID, so the string needs to be mutable. +static GlobalVariable *createPrivateNonConstGlobalForString(Module &M, + StringRef Str) { + Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str); + return new GlobalVariable(M, StrConst->getType(), /*isConstant=*/false, + GlobalValue::PrivateLinkage, StrConst, ""); +} + + +/// \brief Insert extern declaration of runtime-provided functions and globals. +void MemorySanitizer::initializeCallbacks(Module &M) { + // Only do this once. + if (WarningFn) + return; + + IRBuilder<> IRB(*C); + // Create the callback. + // FIXME: this function should have "Cold" calling conv, + // which is not yet implemented. + StringRef WarningFnName = ClKeepGoing ? 
"__msan_warning" + : "__msan_warning_noreturn"; + WarningFn = M.getOrInsertFunction(WarningFnName, IRB.getVoidTy(), NULL); + + MsanCopyOriginFn = M.getOrInsertFunction( + "__msan_copy_origin", IRB.getVoidTy(), IRB.getInt8PtrTy(), + IRB.getInt8PtrTy(), IntptrTy, NULL); + MsanSetAllocaOriginFn = M.getOrInsertFunction( + "__msan_set_alloca_origin", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy, + IRB.getInt8PtrTy(), NULL); + MsanPoisonStackFn = M.getOrInsertFunction( + "__msan_poison_stack", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy, NULL); + MemmoveFn = M.getOrInsertFunction( + "__msan_memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), + IRB.getInt8PtrTy(), IntptrTy, NULL); + MemcpyFn = M.getOrInsertFunction( + "__msan_memcpy", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), + IntptrTy, NULL); + MemsetFn = M.getOrInsertFunction( + "__msan_memset", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt32Ty(), + IntptrTy, NULL); + + // Create globals. + RetvalTLS = new GlobalVariable( + M, ArrayType::get(IRB.getInt64Ty(), 8), false, + GlobalVariable::ExternalLinkage, 0, "__msan_retval_tls", 0, + GlobalVariable::GeneralDynamicTLSModel); + RetvalOriginTLS = new GlobalVariable( + M, OriginTy, false, GlobalVariable::ExternalLinkage, 0, + "__msan_retval_origin_tls", 0, GlobalVariable::GeneralDynamicTLSModel); + + ParamTLS = new GlobalVariable( + M, ArrayType::get(IRB.getInt64Ty(), 1000), false, + GlobalVariable::ExternalLinkage, 0, "__msan_param_tls", 0, + GlobalVariable::GeneralDynamicTLSModel); + ParamOriginTLS = new GlobalVariable( + M, ArrayType::get(OriginTy, 1000), false, GlobalVariable::ExternalLinkage, + 0, "__msan_param_origin_tls", 0, GlobalVariable::GeneralDynamicTLSModel); + + VAArgTLS = new GlobalVariable( + M, ArrayType::get(IRB.getInt64Ty(), 1000), false, + GlobalVariable::ExternalLinkage, 0, "__msan_va_arg_tls", 0, + GlobalVariable::GeneralDynamicTLSModel); + VAArgOverflowSizeTLS = new GlobalVariable( + M, IRB.getInt64Ty(), false, GlobalVariable::ExternalLinkage, 0, + "__msan_va_arg_overflow_size_tls", 0, + GlobalVariable::GeneralDynamicTLSModel); + OriginTLS = new GlobalVariable( + M, IRB.getInt32Ty(), false, GlobalVariable::ExternalLinkage, 0, + "__msan_origin_tls", 0, GlobalVariable::GeneralDynamicTLSModel); + + // We insert an empty inline asm after __msan_report* to avoid callback merge. + EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false), + StringRef(""), StringRef(""), + /*hasSideEffects=*/true); +} + +/// \brief Module-level initialization. +/// +/// inserts a call to __msan_init to the module's constructor list. +bool MemorySanitizer::doInitialization(Module &M) { + TD = getAnalysisIfAvailable<DataLayout>(); + if (!TD) + return false; + BL.reset(new BlackList(BlacklistFile)); + C = &(M.getContext()); + unsigned PtrSize = TD->getPointerSizeInBits(/* AddressSpace */0); + switch (PtrSize) { + case 64: + ShadowMask = kShadowMask64; + OriginOffset = kOriginOffset64; + break; + case 32: + ShadowMask = kShadowMask32; + OriginOffset = kOriginOffset32; + break; + default: + report_fatal_error("unsupported pointer size"); + break; + } + + IRBuilder<> IRB(*C); + IntptrTy = IRB.getIntPtrTy(TD); + OriginTy = IRB.getInt32Ty(); + + ColdCallWeights = MDBuilder(*C).createBranchWeights(1, 1000); + OriginStoreWeights = MDBuilder(*C).createBranchWeights(1, 1000); + + // Insert a call to __msan_init/__msan_track_origins into the module's CTORs. 
+ appendToGlobalCtors(M, cast<Function>(M.getOrInsertFunction( + "__msan_init", IRB.getVoidTy(), NULL)), 0); + + new GlobalVariable(M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage, + IRB.getInt32(TrackOrigins), "__msan_track_origins"); + + return true; +} + +namespace { + +/// \brief A helper class that handles instrumentation of VarArg +/// functions on a particular platform. +/// +/// Implementations are expected to insert the instrumentation +/// necessary to propagate argument shadow through VarArg function +/// calls. Visit* methods are called during an InstVisitor pass over +/// the function, and should avoid creating new basic blocks. A new +/// instance of this class is created for each instrumented function. +struct VarArgHelper { + /// \brief Visit a CallSite. + virtual void visitCallSite(CallSite &CS, IRBuilder<> &IRB) = 0; + + /// \brief Visit a va_start call. + virtual void visitVAStartInst(VAStartInst &I) = 0; + + /// \brief Visit a va_copy call. + virtual void visitVACopyInst(VACopyInst &I) = 0; + + /// \brief Finalize function instrumentation. + /// + /// This method is called after visiting all interesting (see above) + /// instructions in a function. + virtual void finalizeInstrumentation() = 0; + + virtual ~VarArgHelper() {} +}; + +struct MemorySanitizerVisitor; + +VarArgHelper* +CreateVarArgHelper(Function &Func, MemorySanitizer &Msan, + MemorySanitizerVisitor &Visitor); + +/// This class does all the work for a given function. Store and Load +/// instructions store and load corresponding shadow and origin +/// values. Most instructions propagate shadow from arguments to their +/// return values. Certain instructions (most importantly, BranchInst) +/// test their argument shadow and print reports (with a runtime call) if it's +/// non-zero. +struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { + Function &F; + MemorySanitizer &MS; + SmallVector<PHINode *, 16> ShadowPHINodes, OriginPHINodes; + ValueMap<Value*, Value*> ShadowMap, OriginMap; + bool InsertChecks; + OwningPtr<VarArgHelper> VAHelper; + + // An unfortunate workaround for asymmetric lowering of va_arg stuff. + // See a comment in visitCallSite for more details. 
+ static const unsigned AMD64GpEndOffset = 48; // AMD64 ABI Draft 0.99.6 p3.5.7 + static const unsigned AMD64FpEndOffset = 176; + + struct ShadowOriginAndInsertPoint { + Instruction *Shadow; + Instruction *Origin; + Instruction *OrigIns; + ShadowOriginAndInsertPoint(Instruction *S, Instruction *O, Instruction *I) + : Shadow(S), Origin(O), OrigIns(I) { } + ShadowOriginAndInsertPoint() : Shadow(0), Origin(0), OrigIns(0) { } + }; + SmallVector<ShadowOriginAndInsertPoint, 16> InstrumentationList; + SmallVector<Instruction*, 16> StoreList; + + MemorySanitizerVisitor(Function &F, MemorySanitizer &MS) + : F(F), MS(MS), VAHelper(CreateVarArgHelper(F, MS, *this)) { + InsertChecks = !MS.BL->isIn(F); + DEBUG(if (!InsertChecks) + dbgs() << "MemorySanitizer is not inserting checks into '" + << F.getName() << "'\n"); + } + + void materializeStores() { + for (size_t i = 0, n = StoreList.size(); i < n; i++) { + StoreInst& I = *dyn_cast<StoreInst>(StoreList[i]); + + IRBuilder<> IRB(&I); + Value *Val = I.getValueOperand(); + Value *Addr = I.getPointerOperand(); + Value *Shadow = getShadow(Val); + Value *ShadowPtr = getShadowPtr(Addr, Shadow->getType(), IRB); + + StoreInst *NewSI = + IRB.CreateAlignedStore(Shadow, ShadowPtr, I.getAlignment()); + DEBUG(dbgs() << " STORE: " << *NewSI << "\n"); + (void)NewSI; + // If the store is volatile, add a check. + if (I.isVolatile()) + insertCheck(Val, &I); + if (ClCheckAccessAddress) + insertCheck(Addr, &I); + + if (MS.TrackOrigins) { + unsigned Alignment = std::max(kMinOriginAlignment, I.getAlignment()); + if (ClStoreCleanOrigin || isa<StructType>(Shadow->getType())) { + IRB.CreateAlignedStore(getOrigin(Val), getOriginPtr(Addr, IRB), + Alignment); + } else { + Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB); + + Constant *Cst = dyn_cast_or_null<Constant>(ConvertedShadow); + // TODO(eugenis): handle non-zero constant shadow by inserting an + // unconditional check (can not simply fail compilation as this could + // be in the dead code). + if (Cst) + continue; + + Value *Cmp = IRB.CreateICmpNE(ConvertedShadow, + getCleanShadow(ConvertedShadow), "_mscmp"); + Instruction *CheckTerm = + SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), false, + MS.OriginStoreWeights); + IRBuilder<> IRBNew(CheckTerm); + IRBNew.CreateAlignedStore(getOrigin(Val), getOriginPtr(Addr, IRBNew), + Alignment); + } + } + } + } + + void materializeChecks() { + for (size_t i = 0, n = InstrumentationList.size(); i < n; i++) { + Instruction *Shadow = InstrumentationList[i].Shadow; + Instruction *OrigIns = InstrumentationList[i].OrigIns; + IRBuilder<> IRB(OrigIns); + DEBUG(dbgs() << " SHAD0 : " << *Shadow << "\n"); + Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB); + DEBUG(dbgs() << " SHAD1 : " << *ConvertedShadow << "\n"); + Value *Cmp = IRB.CreateICmpNE(ConvertedShadow, + getCleanShadow(ConvertedShadow), "_mscmp"); + Instruction *CheckTerm = + SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), + /* Unreachable */ !ClKeepGoing, + MS.ColdCallWeights); + + IRB.SetInsertPoint(CheckTerm); + if (MS.TrackOrigins) { + Instruction *Origin = InstrumentationList[i].Origin; + IRB.CreateStore(Origin ? (Value*)Origin : (Value*)IRB.getInt32(0), + MS.OriginTLS); + } + CallInst *Call = IRB.CreateCall(MS.WarningFn); + Call->setDebugLoc(OrigIns->getDebugLoc()); + IRB.CreateCall(MS.EmptyAsm); + DEBUG(dbgs() << " CHECK: " << *Cmp << "\n"); + } + DEBUG(dbgs() << "DONE:\n" << F); + } + + /// \brief Add MemorySanitizer instrumentation to a function. 
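The checks queued in InstrumentationList and emitted by materializeChecks above reduce, at run time, to the pattern in this sketch. It is plain C++ with the runtime pieces stubbed out: msan_warning_stub and origin_tls stand in for __msan_warning and __msan_origin_tls. Flatten the shadow to an integer, and if any bit is set, hand the origin to the reporting function.

#include <cstdint>
#include <cstdio>

// Stand-ins for the MSan runtime pieces the instrumented code talks to.
static uint32_t origin_tls = 0;       // models __msan_origin_tls
static void msan_warning_stub() {     // models __msan_warning[_noreturn]
  std::printf("use of uninitialized value (origin %u)\n", origin_tls);
}

// What one materialized check amounts to for a value whose flattened
// shadow is `shadow` and whose origin id is `origin`.
static void check_shadow(uint64_t shadow, uint32_t origin) {
  if (shadow != 0) {        // the "_mscmp" compare against the clean shadow
    origin_tls = origin;    // origin is handed over through TLS
    msan_warning_stub();    // cold call into the runtime
  }
}

int main() {
  check_shadow(0, 0);        // fully initialized: falls through silently
  check_shadow(0x4, 1234);   // one unknown bit: the report fires
}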
+ bool runOnFunction() { + MS.initializeCallbacks(*F.getParent()); + if (!MS.TD) return false; + + // In the presence of unreachable blocks, we may see Phi nodes with + // incoming nodes from such blocks. Since InstVisitor skips unreachable + // blocks, such nodes will not have any shadow value associated with them. + // It's easier to remove unreachable blocks than deal with missing shadow. + removeUnreachableBlocks(F); + + // Iterate all BBs in depth-first order and create shadow instructions + // for all instructions (where applicable). + // For PHI nodes we create dummy shadow PHIs which will be finalized later. + for (df_iterator<BasicBlock*> DI = df_begin(&F.getEntryBlock()), + DE = df_end(&F.getEntryBlock()); DI != DE; ++DI) { + BasicBlock *BB = *DI; + visit(*BB); + } + + // Finalize PHI nodes. + for (size_t i = 0, n = ShadowPHINodes.size(); i < n; i++) { + PHINode *PN = ShadowPHINodes[i]; + PHINode *PNS = cast<PHINode>(getShadow(PN)); + PHINode *PNO = MS.TrackOrigins ? cast<PHINode>(getOrigin(PN)) : 0; + size_t NumValues = PN->getNumIncomingValues(); + for (size_t v = 0; v < NumValues; v++) { + PNS->addIncoming(getShadow(PN, v), PN->getIncomingBlock(v)); + if (PNO) + PNO->addIncoming(getOrigin(PN, v), PN->getIncomingBlock(v)); + } + } + + VAHelper->finalizeInstrumentation(); + + // Delayed instrumentation of StoreInst. + // This may add new checks to be inserted later. + materializeStores(); + + // Insert shadow value checks. + materializeChecks(); + + return true; + } + + /// \brief Compute the shadow type that corresponds to a given Value. + Type *getShadowTy(Value *V) { + return getShadowTy(V->getType()); + } + + /// \brief Compute the shadow type that corresponds to a given Type. + Type *getShadowTy(Type *OrigTy) { + if (!OrigTy->isSized()) { + return 0; + } + // For integer type, shadow is the same as the original type. + // This may return weird-sized types like i1. + if (IntegerType *IT = dyn_cast<IntegerType>(OrigTy)) + return IT; + if (VectorType *VT = dyn_cast<VectorType>(OrigTy)) { + uint32_t EltSize = MS.TD->getTypeStoreSizeInBits(VT->getElementType()); + return VectorType::get(IntegerType::get(*MS.C, EltSize), + VT->getNumElements()); + } + if (StructType *ST = dyn_cast<StructType>(OrigTy)) { + SmallVector<Type*, 4> Elements; + for (unsigned i = 0, n = ST->getNumElements(); i < n; i++) + Elements.push_back(getShadowTy(ST->getElementType(i))); + StructType *Res = StructType::get(*MS.C, Elements, ST->isPacked()); + DEBUG(dbgs() << "getShadowTy: " << *ST << " ===> " << *Res << "\n"); + return Res; + } + uint32_t TypeSize = MS.TD->getTypeStoreSizeInBits(OrigTy); + return IntegerType::get(*MS.C, TypeSize); + } + + /// \brief Flatten a vector type. + Type *getShadowTyNoVec(Type *ty) { + if (VectorType *vt = dyn_cast<VectorType>(ty)) + return IntegerType::get(*MS.C, vt->getBitWidth()); + return ty; + } + + /// \brief Convert a shadow value to it's flattened variant. + Value *convertToShadowTyNoVec(Value *V, IRBuilder<> &IRB) { + Type *Ty = V->getType(); + Type *NoVecTy = getShadowTyNoVec(Ty); + if (Ty == NoVecTy) return V; + return IRB.CreateBitCast(V, NoVecTy); + } + + /// \brief Compute the shadow address that corresponds to a given application + /// address. + /// + /// Shadow = Addr & ~ShadowMask. 
+ Value *getShadowPtr(Value *Addr, Type *ShadowTy, + IRBuilder<> &IRB) { + Value *ShadowLong = + IRB.CreateAnd(IRB.CreatePointerCast(Addr, MS.IntptrTy), + ConstantInt::get(MS.IntptrTy, ~MS.ShadowMask)); + return IRB.CreateIntToPtr(ShadowLong, PointerType::get(ShadowTy, 0)); + } + + /// \brief Compute the origin address that corresponds to a given application + /// address. + /// + /// OriginAddr = (ShadowAddr + OriginOffset) & ~3ULL + Value *getOriginPtr(Value *Addr, IRBuilder<> &IRB) { + Value *ShadowLong = + IRB.CreateAnd(IRB.CreatePointerCast(Addr, MS.IntptrTy), + ConstantInt::get(MS.IntptrTy, ~MS.ShadowMask)); + Value *Add = + IRB.CreateAdd(ShadowLong, + ConstantInt::get(MS.IntptrTy, MS.OriginOffset)); + Value *SecondAnd = + IRB.CreateAnd(Add, ConstantInt::get(MS.IntptrTy, ~3ULL)); + return IRB.CreateIntToPtr(SecondAnd, PointerType::get(IRB.getInt32Ty(), 0)); + } + + /// \brief Compute the shadow address for a given function argument. + /// + /// Shadow = ParamTLS+ArgOffset. + Value *getShadowPtrForArgument(Value *A, IRBuilder<> &IRB, + int ArgOffset) { + Value *Base = IRB.CreatePointerCast(MS.ParamTLS, MS.IntptrTy); + Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset)); + return IRB.CreateIntToPtr(Base, PointerType::get(getShadowTy(A), 0), + "_msarg"); + } + + /// \brief Compute the origin address for a given function argument. + Value *getOriginPtrForArgument(Value *A, IRBuilder<> &IRB, + int ArgOffset) { + if (!MS.TrackOrigins) return 0; + Value *Base = IRB.CreatePointerCast(MS.ParamOriginTLS, MS.IntptrTy); + Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset)); + return IRB.CreateIntToPtr(Base, PointerType::get(MS.OriginTy, 0), + "_msarg_o"); + } + + /// \brief Compute the shadow address for a retval. + Value *getShadowPtrForRetval(Value *A, IRBuilder<> &IRB) { + Value *Base = IRB.CreatePointerCast(MS.RetvalTLS, MS.IntptrTy); + return IRB.CreateIntToPtr(Base, PointerType::get(getShadowTy(A), 0), + "_msret"); + } + + /// \brief Compute the origin address for a retval. + Value *getOriginPtrForRetval(IRBuilder<> &IRB) { + // We keep a single origin for the entire retval. Might be too optimistic. + return MS.RetvalOriginTLS; + } + + /// \brief Set SV to be the shadow value for V. + void setShadow(Value *V, Value *SV) { + assert(!ShadowMap.count(V) && "Values may only have one shadow"); + ShadowMap[V] = SV; + } + + /// \brief Set Origin to be the origin value for V. + void setOrigin(Value *V, Value *Origin) { + if (!MS.TrackOrigins) return; + assert(!OriginMap.count(V) && "Values may only have one origin"); + DEBUG(dbgs() << "ORIGIN: " << *V << " ==> " << *Origin << "\n"); + OriginMap[V] = Origin; + } + + /// \brief Create a clean shadow value for a given value. + /// + /// Clean shadow (all zeroes) means all bits of the value are defined + /// (initialized). + Value *getCleanShadow(Value *V) { + Type *ShadowTy = getShadowTy(V); + if (!ShadowTy) + return 0; + return Constant::getNullValue(ShadowTy); + } + + /// \brief Create a dirty shadow of a given shadow type. + Constant *getPoisonedShadow(Type *ShadowTy) { + assert(ShadowTy); + if (isa<IntegerType>(ShadowTy) || isa<VectorType>(ShadowTy)) + return Constant::getAllOnesValue(ShadowTy); + StructType *ST = cast<StructType>(ShadowTy); + SmallVector<Constant *, 4> Vals; + for (unsigned i = 0, n = ST->getNumElements(); i < n; i++) + Vals.push_back(getPoisonedShadow(ST->getElementType(i))); + return ConstantStruct::get(ST, Vals); + } + + /// \brief Create a clean (zero) origin. 
+ Value *getCleanOrigin() { + return Constant::getNullValue(MS.OriginTy); + } + + /// \brief Get the shadow value for a given Value. + /// + /// This function either returns the value set earlier with setShadow, + /// or extracts if from ParamTLS (for function arguments). + Value *getShadow(Value *V) { + if (Instruction *I = dyn_cast<Instruction>(V)) { + // For instructions the shadow is already stored in the map. + Value *Shadow = ShadowMap[V]; + if (!Shadow) { + DEBUG(dbgs() << "No shadow: " << *V << "\n" << *(I->getParent())); + (void)I; + assert(Shadow && "No shadow for a value"); + } + return Shadow; + } + if (UndefValue *U = dyn_cast<UndefValue>(V)) { + Value *AllOnes = getPoisonedShadow(getShadowTy(V)); + DEBUG(dbgs() << "Undef: " << *U << " ==> " << *AllOnes << "\n"); + (void)U; + return AllOnes; + } + if (Argument *A = dyn_cast<Argument>(V)) { + // For arguments we compute the shadow on demand and store it in the map. + Value **ShadowPtr = &ShadowMap[V]; + if (*ShadowPtr) + return *ShadowPtr; + Function *F = A->getParent(); + IRBuilder<> EntryIRB(F->getEntryBlock().getFirstNonPHI()); + unsigned ArgOffset = 0; + for (Function::arg_iterator AI = F->arg_begin(), AE = F->arg_end(); + AI != AE; ++AI) { + if (!AI->getType()->isSized()) { + DEBUG(dbgs() << "Arg is not sized\n"); + continue; + } + unsigned Size = AI->hasByValAttr() + ? MS.TD->getTypeAllocSize(AI->getType()->getPointerElementType()) + : MS.TD->getTypeAllocSize(AI->getType()); + if (A == AI) { + Value *Base = getShadowPtrForArgument(AI, EntryIRB, ArgOffset); + if (AI->hasByValAttr()) { + // ByVal pointer itself has clean shadow. We copy the actual + // argument shadow to the underlying memory. + Value *Cpy = EntryIRB.CreateMemCpy( + getShadowPtr(V, EntryIRB.getInt8Ty(), EntryIRB), + Base, Size, AI->getParamAlignment()); + DEBUG(dbgs() << " ByValCpy: " << *Cpy << "\n"); + (void)Cpy; + *ShadowPtr = getCleanShadow(V); + } else { + *ShadowPtr = EntryIRB.CreateLoad(Base); + } + DEBUG(dbgs() << " ARG: " << *AI << " ==> " << + **ShadowPtr << "\n"); + if (MS.TrackOrigins) { + Value* OriginPtr = getOriginPtrForArgument(AI, EntryIRB, ArgOffset); + setOrigin(A, EntryIRB.CreateLoad(OriginPtr)); + } + } + ArgOffset += DataLayout::RoundUpAlignment(Size, 8); + } + assert(*ShadowPtr && "Could not find shadow for an argument"); + return *ShadowPtr; + } + // For everything else the shadow is zero. + return getCleanShadow(V); + } + + /// \brief Get the shadow for i-th argument of the instruction I. + Value *getShadow(Instruction *I, int i) { + return getShadow(I->getOperand(i)); + } + + /// \brief Get the origin for a value. + Value *getOrigin(Value *V) { + if (!MS.TrackOrigins) return 0; + if (isa<Instruction>(V) || isa<Argument>(V)) { + Value *Origin = OriginMap[V]; + if (!Origin) { + DEBUG(dbgs() << "NO ORIGIN: " << *V << "\n"); + Origin = getCleanOrigin(); + } + return Origin; + } + return getCleanOrigin(); + } + + /// \brief Get the origin for i-th argument of the instruction I. + Value *getOrigin(Instruction *I, int i) { + return getOrigin(I->getOperand(i)); + } + + /// \brief Remember the place where a shadow check should be inserted. + /// + /// This location will be later instrumented with a check that will print a + /// UMR warning in runtime if the value is not fully defined. 
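The argument-shadow lookup in getShadow above relies on a simple packing convention: each parameter's shadow occupies the next 8-byte-aligned slot of __msan_param_tls, in argument order, and the return shadow travels through the single __msan_retval_tls slot, which the caller zeroes before the call (see visitCallSite further down). A minimal sketch of the offset computation, using hypothetical argument sizes:

#include <cstdint>
#include <cstdio>
#include <vector>

// Round a size up to 8 bytes, as DataLayout::RoundUpAlignment(Size, 8) does above.
static uint64_t roundUp8(uint64_t size) { return (size + 7) & ~uint64_t(7); }

int main() {
  // Hypothetical argument sizes in bytes: an i32, a double, a 24-byte by-value struct.
  std::vector<uint64_t> argSizes = {4, 8, 24};
  uint64_t offset = 0;
  for (size_t i = 0; i < argSizes.size(); ++i) {
    // The i-th argument's shadow lives at __msan_param_tls + offset.
    std::printf("arg %zu: shadow at __msan_param_tls + %llu\n",
                i, (unsigned long long)offset);
    offset += roundUp8(argSizes[i]);
  }
  // The return value's shadow goes through the single __msan_retval_tls slot,
  // which the caller zeroes before the call and reads back right after it.
  return 0;
}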
+ void insertCheck(Value *Val, Instruction *OrigIns) { + assert(Val); + if (!InsertChecks) return; + Instruction *Shadow = dyn_cast_or_null<Instruction>(getShadow(Val)); + if (!Shadow) return; +#ifndef NDEBUG + Type *ShadowTy = Shadow->getType(); + assert((isa<IntegerType>(ShadowTy) || isa<VectorType>(ShadowTy)) && + "Can only insert checks for integer and vector shadow types"); +#endif + Instruction *Origin = dyn_cast_or_null<Instruction>(getOrigin(Val)); + InstrumentationList.push_back( + ShadowOriginAndInsertPoint(Shadow, Origin, OrigIns)); + } + + // ------------------- Visitors. + + /// \brief Instrument LoadInst + /// + /// Loads the corresponding shadow and (optionally) origin. + /// Optionally, checks that the load address is fully defined. + void visitLoadInst(LoadInst &I) { + assert(I.getType()->isSized() && "Load type must have size"); + IRBuilder<> IRB(&I); + Type *ShadowTy = getShadowTy(&I); + Value *Addr = I.getPointerOperand(); + Value *ShadowPtr = getShadowPtr(Addr, ShadowTy, IRB); + setShadow(&I, IRB.CreateAlignedLoad(ShadowPtr, I.getAlignment(), "_msld")); + + if (ClCheckAccessAddress) + insertCheck(I.getPointerOperand(), &I); + + if (MS.TrackOrigins) { + unsigned Alignment = std::max(kMinOriginAlignment, I.getAlignment()); + setOrigin(&I, IRB.CreateAlignedLoad(getOriginPtr(Addr, IRB), Alignment)); + } + } + + /// \brief Instrument StoreInst + /// + /// Stores the corresponding shadow and (optionally) origin. + /// Optionally, checks that the store address is fully defined. + /// Volatile stores check that the value being stored is fully defined. + void visitStoreInst(StoreInst &I) { + StoreList.push_back(&I); + } + + // Vector manipulation. + void visitExtractElementInst(ExtractElementInst &I) { + insertCheck(I.getOperand(1), &I); + IRBuilder<> IRB(&I); + setShadow(&I, IRB.CreateExtractElement(getShadow(&I, 0), I.getOperand(1), + "_msprop")); + setOrigin(&I, getOrigin(&I, 0)); + } + + void visitInsertElementInst(InsertElementInst &I) { + insertCheck(I.getOperand(2), &I); + IRBuilder<> IRB(&I); + setShadow(&I, IRB.CreateInsertElement(getShadow(&I, 0), getShadow(&I, 1), + I.getOperand(2), "_msprop")); + setOriginForNaryOp(I); + } + + void visitShuffleVectorInst(ShuffleVectorInst &I) { + insertCheck(I.getOperand(2), &I); + IRBuilder<> IRB(&I); + setShadow(&I, IRB.CreateShuffleVector(getShadow(&I, 0), getShadow(&I, 1), + I.getOperand(2), "_msprop")); + setOriginForNaryOp(I); + } + + // Casts. 
+ void visitSExtInst(SExtInst &I) { + IRBuilder<> IRB(&I); + setShadow(&I, IRB.CreateSExt(getShadow(&I, 0), I.getType(), "_msprop")); + setOrigin(&I, getOrigin(&I, 0)); + } + + void visitZExtInst(ZExtInst &I) { + IRBuilder<> IRB(&I); + setShadow(&I, IRB.CreateZExt(getShadow(&I, 0), I.getType(), "_msprop")); + setOrigin(&I, getOrigin(&I, 0)); + } + + void visitTruncInst(TruncInst &I) { + IRBuilder<> IRB(&I); + setShadow(&I, IRB.CreateTrunc(getShadow(&I, 0), I.getType(), "_msprop")); + setOrigin(&I, getOrigin(&I, 0)); + } + + void visitBitCastInst(BitCastInst &I) { + IRBuilder<> IRB(&I); + setShadow(&I, IRB.CreateBitCast(getShadow(&I, 0), getShadowTy(&I))); + setOrigin(&I, getOrigin(&I, 0)); + } + + void visitPtrToIntInst(PtrToIntInst &I) { + IRBuilder<> IRB(&I); + setShadow(&I, IRB.CreateIntCast(getShadow(&I, 0), getShadowTy(&I), false, + "_msprop_ptrtoint")); + setOrigin(&I, getOrigin(&I, 0)); + } + + void visitIntToPtrInst(IntToPtrInst &I) { + IRBuilder<> IRB(&I); + setShadow(&I, IRB.CreateIntCast(getShadow(&I, 0), getShadowTy(&I), false, + "_msprop_inttoptr")); + setOrigin(&I, getOrigin(&I, 0)); + } + + void visitFPToSIInst(CastInst& I) { handleShadowOr(I); } + void visitFPToUIInst(CastInst& I) { handleShadowOr(I); } + void visitSIToFPInst(CastInst& I) { handleShadowOr(I); } + void visitUIToFPInst(CastInst& I) { handleShadowOr(I); } + void visitFPExtInst(CastInst& I) { handleShadowOr(I); } + void visitFPTruncInst(CastInst& I) { handleShadowOr(I); } + + /// \brief Propagate shadow for bitwise AND. + /// + /// This code is exact, i.e. if, for example, a bit in the left argument + /// is defined and 0, then neither the value not definedness of the + /// corresponding bit in B don't affect the resulting shadow. + void visitAnd(BinaryOperator &I) { + IRBuilder<> IRB(&I); + // "And" of 0 and a poisoned value results in unpoisoned value. + // 1&1 => 1; 0&1 => 0; p&1 => p; + // 1&0 => 0; 0&0 => 0; p&0 => 0; + // 1&p => p; 0&p => 0; p&p => p; + // S = (S1 & S2) | (V1 & S2) | (S1 & V2) + Value *S1 = getShadow(&I, 0); + Value *S2 = getShadow(&I, 1); + Value *V1 = I.getOperand(0); + Value *V2 = I.getOperand(1); + if (V1->getType() != S1->getType()) { + V1 = IRB.CreateIntCast(V1, S1->getType(), false); + V2 = IRB.CreateIntCast(V2, S2->getType(), false); + } + Value *S1S2 = IRB.CreateAnd(S1, S2); + Value *V1S2 = IRB.CreateAnd(V1, S2); + Value *S1V2 = IRB.CreateAnd(S1, V2); + setShadow(&I, IRB.CreateOr(S1S2, IRB.CreateOr(V1S2, S1V2))); + setOriginForNaryOp(I); + } + + void visitOr(BinaryOperator &I) { + IRBuilder<> IRB(&I); + // "Or" of 1 and a poisoned value results in unpoisoned value. + // 1|1 => 1; 0|1 => 1; p|1 => 1; + // 1|0 => 1; 0|0 => 0; p|0 => p; + // 1|p => 1; 0|p => p; p|p => p; + // S = (S1 & S2) | (~V1 & S2) | (S1 & ~V2) + Value *S1 = getShadow(&I, 0); + Value *S2 = getShadow(&I, 1); + Value *V1 = IRB.CreateNot(I.getOperand(0)); + Value *V2 = IRB.CreateNot(I.getOperand(1)); + if (V1->getType() != S1->getType()) { + V1 = IRB.CreateIntCast(V1, S1->getType(), false); + V2 = IRB.CreateIntCast(V2, S2->getType(), false); + } + Value *S1S2 = IRB.CreateAnd(S1, S2); + Value *V1S2 = IRB.CreateAnd(V1, S2); + Value *S1V2 = IRB.CreateAnd(S1, V2); + setShadow(&I, IRB.CreateOr(S1S2, IRB.CreateOr(V1S2, S1V2))); + setOriginForNaryOp(I); + } + + /// \brief Default propagation of shadow and/or origin. + /// + /// This class implements the general case of shadow propagation, used in all + /// cases where we don't know and/or don't care about what the operation + /// actually does. 
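The truth tables in visitAnd and visitOr above boil down to two bitwise formulas: S = (S1 & S2) | (V1 & S2) | (S1 & V2) for AND, and S = (S1 & S2) | (~V1 & S2) | (S1 & ~V2) for OR. The sketch below evaluates them on 8-bit values so the rules can be checked by hand; shadow bits mean "unknown", and the test inputs are made up.

#include <cstdint>
#include <cassert>

// Shadow propagation for AND: a result bit is unknown unless it is forced
// by a known 0 on either side.  S = (S1 & S2) | (V1 & S2) | (S1 & V2).
uint8_t andShadow(uint8_t v1, uint8_t s1, uint8_t v2, uint8_t s2) {
  return (s1 & s2) | (v1 & s2) | (s1 & v2);
}

// Shadow propagation for OR: a known 1 on either side forces the result.
// S = (S1 & S2) | (~V1 & S2) | (S1 & ~V2).
uint8_t orShadow(uint8_t v1, uint8_t s1, uint8_t v2, uint8_t s2) {
  return (s1 & s2) | (uint8_t(~v1) & s2) | (s1 & uint8_t(~v2));
}

int main() {
  // 0 & poisoned == defined 0.
  assert(andShadow(/*v1=*/0x00, /*s1=*/0x00, /*v2=*/0x00, /*s2=*/0xff) == 0x00);
  // 1 | poisoned == defined 1.
  assert(orShadow(/*v1=*/0xff, /*s1=*/0x00, /*v2=*/0x00, /*s2=*/0xff) == 0x00);
  // poisoned & 1 stays poisoned.
  assert(andShadow(0x00, 0xff, 0xff, 0x00) == 0xff);
}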
It converts all input shadow values to a common type + /// (extending or truncating as necessary), and bitwise OR's them. + /// + /// This is much cheaper than inserting checks (i.e. requiring inputs to be + /// fully initialized), and less prone to false positives. + /// + /// This class also implements the general case of origin propagation. For a + /// Nary operation, result origin is set to the origin of an argument that is + /// not entirely initialized. If there is more than one such arguments, the + /// rightmost of them is picked. It does not matter which one is picked if all + /// arguments are initialized. + template <bool CombineShadow> + class Combiner { + Value *Shadow; + Value *Origin; + IRBuilder<> &IRB; + MemorySanitizerVisitor *MSV; + + public: + Combiner(MemorySanitizerVisitor *MSV, IRBuilder<> &IRB) : + Shadow(0), Origin(0), IRB(IRB), MSV(MSV) {} + + /// \brief Add a pair of shadow and origin values to the mix. + Combiner &Add(Value *OpShadow, Value *OpOrigin) { + if (CombineShadow) { + assert(OpShadow); + if (!Shadow) + Shadow = OpShadow; + else { + OpShadow = MSV->CreateShadowCast(IRB, OpShadow, Shadow->getType()); + Shadow = IRB.CreateOr(Shadow, OpShadow, "_msprop"); + } + } + + if (MSV->MS.TrackOrigins) { + assert(OpOrigin); + if (!Origin) { + Origin = OpOrigin; + } else { + Value *FlatShadow = MSV->convertToShadowTyNoVec(OpShadow, IRB); + Value *Cond = IRB.CreateICmpNE(FlatShadow, + MSV->getCleanShadow(FlatShadow)); + Origin = IRB.CreateSelect(Cond, OpOrigin, Origin); + } + } + return *this; + } + + /// \brief Add an application value to the mix. + Combiner &Add(Value *V) { + Value *OpShadow = MSV->getShadow(V); + Value *OpOrigin = MSV->MS.TrackOrigins ? MSV->getOrigin(V) : 0; + return Add(OpShadow, OpOrigin); + } + + /// \brief Set the current combined values as the given instruction's shadow + /// and origin. + void Done(Instruction *I) { + if (CombineShadow) { + assert(Shadow); + Shadow = MSV->CreateShadowCast(IRB, Shadow, MSV->getShadowTy(I)); + MSV->setShadow(I, Shadow); + } + if (MSV->MS.TrackOrigins) { + assert(Origin); + MSV->setOrigin(I, Origin); + } + } + }; + + typedef Combiner<true> ShadowAndOriginCombiner; + typedef Combiner<false> OriginCombiner; + + /// \brief Propagate origin for arbitrary operation. + void setOriginForNaryOp(Instruction &I) { + if (!MS.TrackOrigins) return; + IRBuilder<> IRB(&I); + OriginCombiner OC(this, IRB); + for (Instruction::op_iterator OI = I.op_begin(); OI != I.op_end(); ++OI) + OC.Add(OI->get()); + OC.Done(&I); + } + + size_t VectorOrPrimitiveTypeSizeInBits(Type *Ty) { + assert(!(Ty->isVectorTy() && Ty->getScalarType()->isPointerTy()) && + "Vector of pointers is not a valid shadow type"); + return Ty->isVectorTy() ? + Ty->getVectorNumElements() * Ty->getScalarSizeInBits() : + Ty->getPrimitiveSizeInBits(); + } + + /// \brief Cast between two shadow types, extending or truncating as + /// necessary. 
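The origin half of the Combiner described above amounts to a chain of selects: operands are visited left to right, and a later operand's origin replaces the accumulated one whenever that operand's shadow is non-zero, which is exactly the "rightmost uninitialized argument wins" rule. A plain C++ rendition; Op, combineOrigins and the sample values are illustrative only.

#include <cstdint>
#include <cstdio>
#include <vector>

struct Op { uint64_t shadow; uint32_t origin; };

// Mirror of Combiner::Add for origins: keep the first origin seen, then
// let each subsequent operand overwrite it if that operand is poisoned.
uint32_t combineOrigins(const std::vector<Op> &ops) {
  uint32_t origin = 0;
  bool haveOrigin = false;
  for (const Op &op : ops) {
    if (!haveOrigin) { origin = op.origin; haveOrigin = true; continue; }
    origin = (op.shadow != 0) ? op.origin : origin;  // the CreateSelect in Add()
  }
  return origin;
}

int main() {
  // Two poisoned operands: the rightmost one's origin (42) is the one reported.
  std::vector<Op> ops = {{0xff, 7}, {0x00, 9}, {0x01, 42}};
  std::printf("picked origin %u\n", combineOrigins(ops));
}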
+ Value *CreateShadowCast(IRBuilder<> &IRB, Value *V, Type *dstTy) { + Type *srcTy = V->getType(); + if (dstTy->isIntegerTy() && srcTy->isIntegerTy()) + return IRB.CreateIntCast(V, dstTy, false); + if (dstTy->isVectorTy() && srcTy->isVectorTy() && + dstTy->getVectorNumElements() == srcTy->getVectorNumElements()) + return IRB.CreateIntCast(V, dstTy, false); + size_t srcSizeInBits = VectorOrPrimitiveTypeSizeInBits(srcTy); + size_t dstSizeInBits = VectorOrPrimitiveTypeSizeInBits(dstTy); + Value *V1 = IRB.CreateBitCast(V, Type::getIntNTy(*MS.C, srcSizeInBits)); + Value *V2 = + IRB.CreateIntCast(V1, Type::getIntNTy(*MS.C, dstSizeInBits), false); + return IRB.CreateBitCast(V2, dstTy); + // TODO: handle struct types. + } + + /// \brief Propagate shadow for arbitrary operation. + void handleShadowOr(Instruction &I) { + IRBuilder<> IRB(&I); + ShadowAndOriginCombiner SC(this, IRB); + for (Instruction::op_iterator OI = I.op_begin(); OI != I.op_end(); ++OI) + SC.Add(OI->get()); + SC.Done(&I); + } + + void visitFAdd(BinaryOperator &I) { handleShadowOr(I); } + void visitFSub(BinaryOperator &I) { handleShadowOr(I); } + void visitFMul(BinaryOperator &I) { handleShadowOr(I); } + void visitAdd(BinaryOperator &I) { handleShadowOr(I); } + void visitSub(BinaryOperator &I) { handleShadowOr(I); } + void visitXor(BinaryOperator &I) { handleShadowOr(I); } + void visitMul(BinaryOperator &I) { handleShadowOr(I); } + + void handleDiv(Instruction &I) { + IRBuilder<> IRB(&I); + // Strict on the second argument. + insertCheck(I.getOperand(1), &I); + setShadow(&I, getShadow(&I, 0)); + setOrigin(&I, getOrigin(&I, 0)); + } + + void visitUDiv(BinaryOperator &I) { handleDiv(I); } + void visitSDiv(BinaryOperator &I) { handleDiv(I); } + void visitFDiv(BinaryOperator &I) { handleDiv(I); } + void visitURem(BinaryOperator &I) { handleDiv(I); } + void visitSRem(BinaryOperator &I) { handleDiv(I); } + void visitFRem(BinaryOperator &I) { handleDiv(I); } + + /// \brief Instrument == and != comparisons. + /// + /// Sometimes the comparison result is known even if some of the bits of the + /// arguments are not. + void handleEqualityComparison(ICmpInst &I) { + IRBuilder<> IRB(&I); + Value *A = I.getOperand(0); + Value *B = I.getOperand(1); + Value *Sa = getShadow(A); + Value *Sb = getShadow(B); + if (A->getType()->isPointerTy()) + A = IRB.CreatePointerCast(A, MS.IntptrTy); + if (B->getType()->isPointerTy()) + B = IRB.CreatePointerCast(B, MS.IntptrTy); + // A == B <==> (C = A^B) == 0 + // A != B <==> (C = A^B) != 0 + // Sc = Sa | Sb + Value *C = IRB.CreateXor(A, B); + Value *Sc = IRB.CreateOr(Sa, Sb); + // Now dealing with i = (C == 0) comparison (or C != 0, does not matter now) + // Result is defined if one of the following is true + // * there is a defined 1 bit in C + // * C is fully defined + // Si = !(C & ~Sc) && Sc + Value *Zero = Constant::getNullValue(Sc->getType()); + Value *MinusOne = Constant::getAllOnesValue(Sc->getType()); + Value *Si = + IRB.CreateAnd(IRB.CreateICmpNE(Sc, Zero), + IRB.CreateICmpEQ( + IRB.CreateAnd(IRB.CreateXor(Sc, MinusOne), C), Zero)); + Si->setName("_msprop_icmp"); + setShadow(&I, Si); + setOriginForNaryOp(I); + } + + /// \brief Instrument signed relational comparisons. + /// + /// Handle (x<0) and (x>=0) comparisons (essentially, sign bit tests) by + /// propagating the highest bit of the shadow. Everything else is delegated + /// to handleShadowOr(). 
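handleEqualityComparison above rests on one observation: A == B is already decided, even with unknown bits, as soon as some bit that is known in both operands differs. This sketch computes the one-bit result shadow Si = (Sc != 0) && ((C & ~Sc) == 0) from C = A ^ B and Sc = Sa | Sb, matching the IR built above; the example inputs are made up.

#include <cstdint>
#include <cstdio>

// Returns true if the result of (a == b) is *poisoned*, i.e. cannot be
// decided from the known bits.  sa/sb are the operand shadows (1 = unknown).
bool eqResultPoisoned(uint64_t a, uint64_t sa, uint64_t b, uint64_t sb) {
  uint64_t c  = a ^ b;    // differing bits (garbage where unknown)
  uint64_t sc = sa | sb;  // bits that are unknown in either operand
  // Defined when everything is known (sc == 0) or some *known* bit differs.
  return (sc != 0) && ((c & ~sc) == 0);
}

int main() {
  // Low byte unknown in `a`, but a known high bit already differs:
  // the comparison result is fully defined (prints 0).
  std::printf("%d\n", eqResultPoisoned(0x100, 0x0ff, 0x000, 0x000));
  // Only unknown bits could make the operands differ: poisoned (prints 1).
  std::printf("%d\n", eqResultPoisoned(0x0aa, 0x0ff, 0x055, 0x000));
}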
+ void handleSignedRelationalComparison(ICmpInst &I) { + Constant *constOp0 = dyn_cast<Constant>(I.getOperand(0)); + Constant *constOp1 = dyn_cast<Constant>(I.getOperand(1)); + Value* op = NULL; + CmpInst::Predicate pre = I.getPredicate(); + if (constOp0 && constOp0->isNullValue() && + (pre == CmpInst::ICMP_SGT || pre == CmpInst::ICMP_SLE)) { + op = I.getOperand(1); + } else if (constOp1 && constOp1->isNullValue() && + (pre == CmpInst::ICMP_SLT || pre == CmpInst::ICMP_SGE)) { + op = I.getOperand(0); + } + if (op) { + IRBuilder<> IRB(&I); + Value* Shadow = + IRB.CreateICmpSLT(getShadow(op), getCleanShadow(op), "_msprop_icmpslt"); + setShadow(&I, Shadow); + setOrigin(&I, getOrigin(op)); + } else { + handleShadowOr(I); + } + } + + void visitICmpInst(ICmpInst &I) { + if (ClHandleICmp && I.isEquality()) + handleEqualityComparison(I); + else if (ClHandleICmp && I.isSigned() && I.isRelational()) + handleSignedRelationalComparison(I); + else + handleShadowOr(I); + } + + void visitFCmpInst(FCmpInst &I) { + handleShadowOr(I); + } + + void handleShift(BinaryOperator &I) { + IRBuilder<> IRB(&I); + // If any of the S2 bits are poisoned, the whole thing is poisoned. + // Otherwise perform the same shift on S1. + Value *S1 = getShadow(&I, 0); + Value *S2 = getShadow(&I, 1); + Value *S2Conv = IRB.CreateSExt(IRB.CreateICmpNE(S2, getCleanShadow(S2)), + S2->getType()); + Value *V2 = I.getOperand(1); + Value *Shift = IRB.CreateBinOp(I.getOpcode(), S1, V2); + setShadow(&I, IRB.CreateOr(Shift, S2Conv)); + setOriginForNaryOp(I); + } + + void visitShl(BinaryOperator &I) { handleShift(I); } + void visitAShr(BinaryOperator &I) { handleShift(I); } + void visitLShr(BinaryOperator &I) { handleShift(I); } + + /// \brief Instrument llvm.memmove + /// + /// At this point we don't know if llvm.memmove will be inlined or not. + /// If we don't instrument it and it gets inlined, + /// our interceptor will not kick in and we will lose the memmove. + /// If we instrument the call here, but it does not get inlined, + /// we will memove the shadow twice: which is bad in case + /// of overlapping regions. So, we simply lower the intrinsic to a call. + /// + /// Similar situation exists for memcpy and memset. + void visitMemMoveInst(MemMoveInst &I) { + IRBuilder<> IRB(&I); + IRB.CreateCall3( + MS.MemmoveFn, + IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()), + IRB.CreatePointerCast(I.getArgOperand(1), IRB.getInt8PtrTy()), + IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false)); + I.eraseFromParent(); + } + + // Similar to memmove: avoid copying shadow twice. + // This is somewhat unfortunate as it may slowdown small constant memcpys. + // FIXME: consider doing manual inline for small constant sizes and proper + // alignment. + void visitMemCpyInst(MemCpyInst &I) { + IRBuilder<> IRB(&I); + IRB.CreateCall3( + MS.MemcpyFn, + IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()), + IRB.CreatePointerCast(I.getArgOperand(1), IRB.getInt8PtrTy()), + IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false)); + I.eraseFromParent(); + } + + // Same as memcpy. 
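Looking back at handleShift a few visitors up: the operand shadow is shifted by the actual shift amount, and an unknown shift amount poisons the whole result. A scalar sketch of that rule, with illustrative names and inputs:

#include <cstdint>
#include <cstdio>

// Shadow for (v1 << v2): shift the first operand's shadow by the real
// shift amount, then saturate to all-ones if the amount itself is unknown.
uint64_t shlShadow(uint64_t s1, uint64_t v2, uint64_t s2) {
  uint64_t shifted = s1 << v2;                      // same shift applied to S1
  uint64_t amountUnknown = (s2 != 0) ? ~0ULL : 0;   // sext(S2 != 0)
  return shifted | amountUnknown;
}

int main() {
  std::printf("%llx\n", (unsigned long long)shlShadow(0x0f, 4, 0));    // f0
  std::printf("%llx\n", (unsigned long long)shlShadow(0x0f, 4, 0x1));  // all ones
}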
+ void visitMemSetInst(MemSetInst &I) { + IRBuilder<> IRB(&I); + IRB.CreateCall3( + MS.MemsetFn, + IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()), + IRB.CreateIntCast(I.getArgOperand(1), IRB.getInt32Ty(), false), + IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false)); + I.eraseFromParent(); + } + + void visitVAStartInst(VAStartInst &I) { + VAHelper->visitVAStartInst(I); + } + + void visitVACopyInst(VACopyInst &I) { + VAHelper->visitVACopyInst(I); + } + + enum IntrinsicKind { + IK_DoesNotAccessMemory, + IK_OnlyReadsMemory, + IK_WritesMemory + }; + + static IntrinsicKind getIntrinsicKind(Intrinsic::ID iid) { + const int DoesNotAccessMemory = IK_DoesNotAccessMemory; + const int OnlyReadsArgumentPointees = IK_OnlyReadsMemory; + const int OnlyReadsMemory = IK_OnlyReadsMemory; + const int OnlyAccessesArgumentPointees = IK_WritesMemory; + const int UnknownModRefBehavior = IK_WritesMemory; +#define GET_INTRINSIC_MODREF_BEHAVIOR +#define ModRefBehavior IntrinsicKind +#include "llvm/IR/Intrinsics.gen" +#undef ModRefBehavior +#undef GET_INTRINSIC_MODREF_BEHAVIOR + } + + /// \brief Handle vector store-like intrinsics. + /// + /// Instrument intrinsics that look like a simple SIMD store: writes memory, + /// has 1 pointer argument and 1 vector argument, returns void. + bool handleVectorStoreIntrinsic(IntrinsicInst &I) { + IRBuilder<> IRB(&I); + Value* Addr = I.getArgOperand(0); + Value *Shadow = getShadow(&I, 1); + Value *ShadowPtr = getShadowPtr(Addr, Shadow->getType(), IRB); + + // We don't know the pointer alignment (could be unaligned SSE store!). + // Have to assume to worst case. + IRB.CreateAlignedStore(Shadow, ShadowPtr, 1); + + if (ClCheckAccessAddress) + insertCheck(Addr, &I); + + // FIXME: use ClStoreCleanOrigin + // FIXME: factor out common code from materializeStores + if (MS.TrackOrigins) + IRB.CreateStore(getOrigin(&I, 1), getOriginPtr(Addr, IRB)); + return true; + } + + /// \brief Handle vector load-like intrinsics. + /// + /// Instrument intrinsics that look like a simple SIMD load: reads memory, + /// has 1 pointer argument, returns a vector. + bool handleVectorLoadIntrinsic(IntrinsicInst &I) { + IRBuilder<> IRB(&I); + Value *Addr = I.getArgOperand(0); + + Type *ShadowTy = getShadowTy(&I); + Value *ShadowPtr = getShadowPtr(Addr, ShadowTy, IRB); + // We don't know the pointer alignment (could be unaligned SSE load!). + // Have to assume to worst case. + setShadow(&I, IRB.CreateAlignedLoad(ShadowPtr, 1, "_msld")); + + if (ClCheckAccessAddress) + insertCheck(Addr, &I); + + if (MS.TrackOrigins) + setOrigin(&I, IRB.CreateLoad(getOriginPtr(Addr, IRB))); + return true; + } + + /// \brief Handle (SIMD arithmetic)-like intrinsics. + /// + /// Instrument intrinsics with any number of arguments of the same type, + /// equal to the return type. The type should be simple (no aggregates or + /// pointers; vectors are fine). + /// Caller guarantees that this intrinsic does not access memory. 
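The shape checks described above for store-like and load-like SIMD intrinsics, together with the "same type everywhere, no memory access" test applied next in maybeHandleSimpleNomemIntrinsic and handleUnknownIntrinsic, form a small classifier over an intrinsic's signature and memory behavior. The toy model below makes that decision tree explicit; the enums replace llvm::Type and the generated ModRef table and are purely illustrative.

#include <cstddef>

enum class Kind { Pointer, Vector, Scalar, Void };
enum class Mem  { None, ReadOnly, Writes };
enum class Strategy { VectorStore, VectorLoad, SimpleArithmetic, Fallback };

Strategy classifyIntrinsic(Kind ret, const Kind *args, size_t nargs, Mem mem) {
  // Looks like a SIMD store: writes memory, (pointer, vector) -> void.
  if (mem == Mem::Writes && nargs == 2 &&
      args[0] == Kind::Pointer && args[1] == Kind::Vector && ret == Kind::Void)
    return Strategy::VectorStore;
  // Looks like a SIMD load: only reads memory, (pointer) -> vector.
  if (mem == Mem::ReadOnly && nargs == 1 &&
      args[0] == Kind::Pointer && ret == Kind::Vector)
    return Strategy::VectorLoad;
  // No memory access at all: treat as element-wise arithmetic when every
  // argument has the same type as the result.
  if (mem == Mem::None && nargs > 0) {
    bool uniform = (ret != Kind::Void && ret != Kind::Pointer);
    for (size_t i = 0; i < nargs && uniform; ++i)
      uniform = (args[i] == ret);
    if (uniform)
      return Strategy::SimpleArithmetic;
  }
  // Otherwise fall back to the strict default: check every operand shadow.
  return Strategy::Fallback;
}

int main() {
  const Kind a[] = {Kind::Pointer, Kind::Vector};
  return classifyIntrinsic(Kind::Void, a, 2, Mem::Writes) == Strategy::VectorStore ? 0 : 1;
}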
+ bool maybeHandleSimpleNomemIntrinsic(IntrinsicInst &I) { + Type *RetTy = I.getType(); + if (!(RetTy->isIntOrIntVectorTy() || + RetTy->isFPOrFPVectorTy() || + RetTy->isX86_MMXTy())) + return false; + + unsigned NumArgOperands = I.getNumArgOperands(); + + for (unsigned i = 0; i < NumArgOperands; ++i) { + Type *Ty = I.getArgOperand(i)->getType(); + if (Ty != RetTy) + return false; + } + + IRBuilder<> IRB(&I); + ShadowAndOriginCombiner SC(this, IRB); + for (unsigned i = 0; i < NumArgOperands; ++i) + SC.Add(I.getArgOperand(i)); + SC.Done(&I); + + return true; + } + + /// \brief Heuristically instrument unknown intrinsics. + /// + /// The main purpose of this code is to do something reasonable with all + /// random intrinsics we might encounter, most importantly - SIMD intrinsics. + /// We recognize several classes of intrinsics by their argument types and + /// ModRefBehaviour and apply special intrumentation when we are reasonably + /// sure that we know what the intrinsic does. + /// + /// We special-case intrinsics where this approach fails. See llvm.bswap + /// handling as an example of that. + bool handleUnknownIntrinsic(IntrinsicInst &I) { + unsigned NumArgOperands = I.getNumArgOperands(); + if (NumArgOperands == 0) + return false; + + Intrinsic::ID iid = I.getIntrinsicID(); + IntrinsicKind IK = getIntrinsicKind(iid); + bool OnlyReadsMemory = IK == IK_OnlyReadsMemory; + bool WritesMemory = IK == IK_WritesMemory; + assert(!(OnlyReadsMemory && WritesMemory)); + + if (NumArgOperands == 2 && + I.getArgOperand(0)->getType()->isPointerTy() && + I.getArgOperand(1)->getType()->isVectorTy() && + I.getType()->isVoidTy() && + WritesMemory) { + // This looks like a vector store. + return handleVectorStoreIntrinsic(I); + } + + if (NumArgOperands == 1 && + I.getArgOperand(0)->getType()->isPointerTy() && + I.getType()->isVectorTy() && + OnlyReadsMemory) { + // This looks like a vector load. + return handleVectorLoadIntrinsic(I); + } + + if (!OnlyReadsMemory && !WritesMemory) + if (maybeHandleSimpleNomemIntrinsic(I)) + return true; + + // FIXME: detect and handle SSE maskstore/maskload + return false; + } + + void handleBswap(IntrinsicInst &I) { + IRBuilder<> IRB(&I); + Value *Op = I.getArgOperand(0); + Type *OpType = Op->getType(); + Function *BswapFunc = Intrinsic::getDeclaration( + F.getParent(), Intrinsic::bswap, ArrayRef<Type*>(&OpType, 1)); + setShadow(&I, IRB.CreateCall(BswapFunc, getShadow(Op))); + setOrigin(&I, getOrigin(Op)); + } + + void visitIntrinsicInst(IntrinsicInst &I) { + switch (I.getIntrinsicID()) { + case llvm::Intrinsic::bswap: + handleBswap(I); + break; + default: + if (!handleUnknownIntrinsic(I)) + visitInstruction(I); + break; + } + } + + void visitCallSite(CallSite CS) { + Instruction &I = *CS.getInstruction(); + assert((CS.isCall() || CS.isInvoke()) && "Unknown type of CallSite"); + if (CS.isCall()) { + CallInst *Call = cast<CallInst>(&I); + + // For inline asm, do the usual thing: check argument shadow and mark all + // outputs as clean. Note that any side effects of the inline asm that are + // not immediately visible in its constraints are not handled. + if (Call->isInlineAsm()) { + visitInstruction(I); + return; + } + + // Allow only tail calls with the same types, otherwise + // we may have a false positive: shadow for a non-void RetVal + // will get propagated to a void RetVal. 
+ if (Call->isTailCall() && Call->getType() != Call->getParent()->getType()) + Call->setTailCall(false); + + assert(!isa<IntrinsicInst>(&I) && "intrinsics are handled elsewhere"); + + // We are going to insert code that relies on the fact that the callee + // will become a non-readonly function after it is instrumented by us. To + // prevent this code from being optimized out, mark that function + // non-readonly in advance. + if (Function *Func = Call->getCalledFunction()) { + // Clear out readonly/readnone attributes. + AttrBuilder B; + B.addAttribute(Attribute::ReadOnly) + .addAttribute(Attribute::ReadNone); + Func->removeAttribute(AttributeSet::FunctionIndex, + Attribute::get(Func->getContext(), B)); + } + } + IRBuilder<> IRB(&I); + unsigned ArgOffset = 0; + DEBUG(dbgs() << " CallSite: " << I << "\n"); + for (CallSite::arg_iterator ArgIt = CS.arg_begin(), End = CS.arg_end(); + ArgIt != End; ++ArgIt) { + Value *A = *ArgIt; + unsigned i = ArgIt - CS.arg_begin(); + if (!A->getType()->isSized()) { + DEBUG(dbgs() << "Arg " << i << " is not sized: " << I << "\n"); + continue; + } + unsigned Size = 0; + Value *Store = 0; + // Compute the Shadow for arg even if it is ByVal, because + // in that case getShadow() will copy the actual arg shadow to + // __msan_param_tls. + Value *ArgShadow = getShadow(A); + Value *ArgShadowBase = getShadowPtrForArgument(A, IRB, ArgOffset); + DEBUG(dbgs() << " Arg#" << i << ": " << *A << + " Shadow: " << *ArgShadow << "\n"); + if (CS.paramHasAttr(i + 1, Attribute::ByVal)) { + assert(A->getType()->isPointerTy() && + "ByVal argument is not a pointer!"); + Size = MS.TD->getTypeAllocSize(A->getType()->getPointerElementType()); + unsigned Alignment = CS.getParamAlignment(i + 1); + Store = IRB.CreateMemCpy(ArgShadowBase, + getShadowPtr(A, Type::getInt8Ty(*MS.C), IRB), + Size, Alignment); + } else { + Size = MS.TD->getTypeAllocSize(A->getType()); + Store = IRB.CreateAlignedStore(ArgShadow, ArgShadowBase, + kShadowTLSAlignment); + } + if (MS.TrackOrigins) + IRB.CreateStore(getOrigin(A), + getOriginPtrForArgument(A, IRB, ArgOffset)); + assert(Size != 0 && Store != 0); + DEBUG(dbgs() << " Param:" << *Store << "\n"); + ArgOffset += DataLayout::RoundUpAlignment(Size, 8); + } + DEBUG(dbgs() << " done with call args\n"); + + FunctionType *FT = + cast<FunctionType>(CS.getCalledValue()->getType()-> getContainedType(0)); + if (FT->isVarArg()) { + VAHelper->visitCallSite(CS, IRB); + } + + // Now, get the shadow for the RetVal. + if (!I.getType()->isSized()) return; + IRBuilder<> IRBBefore(&I); + // Untill we have full dynamic coverage, make sure the retval shadow is 0. + Value *Base = getShadowPtrForRetval(&I, IRBBefore); + IRBBefore.CreateAlignedStore(getCleanShadow(&I), Base, kShadowTLSAlignment); + Instruction *NextInsn = 0; + if (CS.isCall()) { + NextInsn = I.getNextNode(); + } else { + BasicBlock *NormalDest = cast<InvokeInst>(&I)->getNormalDest(); + if (!NormalDest->getSinglePredecessor()) { + // FIXME: this case is tricky, so we are just conservative here. + // Perhaps we need to split the edge between this BB and NormalDest, + // but a naive attempt to use SplitEdge leads to a crash. 
+ setShadow(&I, getCleanShadow(&I)); + setOrigin(&I, getCleanOrigin()); + return; + } + NextInsn = NormalDest->getFirstInsertionPt(); + assert(NextInsn && + "Could not find insertion point for retval shadow load"); + } + IRBuilder<> IRBAfter(NextInsn); + Value *RetvalShadow = + IRBAfter.CreateAlignedLoad(getShadowPtrForRetval(&I, IRBAfter), + kShadowTLSAlignment, "_msret"); + setShadow(&I, RetvalShadow); + if (MS.TrackOrigins) + setOrigin(&I, IRBAfter.CreateLoad(getOriginPtrForRetval(IRBAfter))); + } + + void visitReturnInst(ReturnInst &I) { + IRBuilder<> IRB(&I); + if (Value *RetVal = I.getReturnValue()) { + // Set the shadow for the RetVal. + Value *Shadow = getShadow(RetVal); + Value *ShadowPtr = getShadowPtrForRetval(RetVal, IRB); + DEBUG(dbgs() << "Return: " << *Shadow << "\n" << *ShadowPtr << "\n"); + IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment); + if (MS.TrackOrigins) + IRB.CreateStore(getOrigin(RetVal), getOriginPtrForRetval(IRB)); + } + } + + void visitPHINode(PHINode &I) { + IRBuilder<> IRB(&I); + ShadowPHINodes.push_back(&I); + setShadow(&I, IRB.CreatePHI(getShadowTy(&I), I.getNumIncomingValues(), + "_msphi_s")); + if (MS.TrackOrigins) + setOrigin(&I, IRB.CreatePHI(MS.OriginTy, I.getNumIncomingValues(), + "_msphi_o")); + } + + void visitAllocaInst(AllocaInst &I) { + setShadow(&I, getCleanShadow(&I)); + if (!ClPoisonStack) return; + IRBuilder<> IRB(I.getNextNode()); + uint64_t Size = MS.TD->getTypeAllocSize(I.getAllocatedType()); + if (ClPoisonStackWithCall) { + IRB.CreateCall2(MS.MsanPoisonStackFn, + IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), + ConstantInt::get(MS.IntptrTy, Size)); + } else { + Value *ShadowBase = getShadowPtr(&I, Type::getInt8PtrTy(*MS.C), IRB); + IRB.CreateMemSet(ShadowBase, IRB.getInt8(ClPoisonStackPattern), + Size, I.getAlignment()); + } + + if (MS.TrackOrigins) { + setOrigin(&I, getCleanOrigin()); + SmallString<2048> StackDescriptionStorage; + raw_svector_ostream StackDescription(StackDescriptionStorage); + // We create a string with a description of the stack allocation and + // pass it into __msan_set_alloca_origin. + // It will be printed by the run-time if stack-originated UMR is found. + // The first 4 bytes of the string are set to '----' and will be replaced + // by __msan_va_arg_overflow_size_tls at the first call. + StackDescription << "----" << I.getName() << "@" << F.getName(); + Value *Descr = + createPrivateNonConstGlobalForString(*F.getParent(), + StackDescription.str()); + IRB.CreateCall3(MS.MsanSetAllocaOriginFn, + IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), + ConstantInt::get(MS.IntptrTy, Size), + IRB.CreatePointerCast(Descr, IRB.getInt8PtrTy())); + } + } + + void visitSelectInst(SelectInst& I) { + IRBuilder<> IRB(&I); + setShadow(&I, IRB.CreateSelect(I.getCondition(), + getShadow(I.getTrueValue()), getShadow(I.getFalseValue()), + "_msprop")); + if (MS.TrackOrigins) { + // Origins are always i32, so any vector conditions must be flattened. + // FIXME: consider tracking vector origins for app vectors? + Value *Cond = I.getCondition(); + if (Cond->getType()->isVectorTy()) { + Value *ConvertedShadow = convertToShadowTyNoVec(Cond, IRB); + Cond = IRB.CreateICmpNE(ConvertedShadow, + getCleanShadow(ConvertedShadow), "_mso_select"); + } + setOrigin(&I, IRB.CreateSelect(Cond, + getOrigin(I.getTrueValue()), getOrigin(I.getFalseValue()))); + } + } + + void visitLandingPadInst(LandingPadInst &I) { + // Do nothing. 
+ // See http://code.google.com/p/memory-sanitizer/issues/detail?id=1 + setShadow(&I, getCleanShadow(&I)); + setOrigin(&I, getCleanOrigin()); + } + + void visitGetElementPtrInst(GetElementPtrInst &I) { + handleShadowOr(I); + } + + void visitExtractValueInst(ExtractValueInst &I) { + IRBuilder<> IRB(&I); + Value *Agg = I.getAggregateOperand(); + DEBUG(dbgs() << "ExtractValue: " << I << "\n"); + Value *AggShadow = getShadow(Agg); + DEBUG(dbgs() << " AggShadow: " << *AggShadow << "\n"); + Value *ResShadow = IRB.CreateExtractValue(AggShadow, I.getIndices()); + DEBUG(dbgs() << " ResShadow: " << *ResShadow << "\n"); + setShadow(&I, ResShadow); + setOrigin(&I, getCleanOrigin()); + } + + void visitInsertValueInst(InsertValueInst &I) { + IRBuilder<> IRB(&I); + DEBUG(dbgs() << "InsertValue: " << I << "\n"); + Value *AggShadow = getShadow(I.getAggregateOperand()); + Value *InsShadow = getShadow(I.getInsertedValueOperand()); + DEBUG(dbgs() << " AggShadow: " << *AggShadow << "\n"); + DEBUG(dbgs() << " InsShadow: " << *InsShadow << "\n"); + Value *Res = IRB.CreateInsertValue(AggShadow, InsShadow, I.getIndices()); + DEBUG(dbgs() << " Res: " << *Res << "\n"); + setShadow(&I, Res); + setOrigin(&I, getCleanOrigin()); + } + + void dumpInst(Instruction &I) { + if (CallInst *CI = dyn_cast<CallInst>(&I)) { + errs() << "ZZZ call " << CI->getCalledFunction()->getName() << "\n"; + } else { + errs() << "ZZZ " << I.getOpcodeName() << "\n"; + } + errs() << "QQQ " << I << "\n"; + } + + void visitResumeInst(ResumeInst &I) { + DEBUG(dbgs() << "Resume: " << I << "\n"); + // Nothing to do here. + } + + void visitInstruction(Instruction &I) { + // Everything else: stop propagating and check for poisoned shadow. + if (ClDumpStrictInstructions) + dumpInst(I); + DEBUG(dbgs() << "DEFAULT: " << I << "\n"); + for (size_t i = 0, n = I.getNumOperands(); i < n; i++) + insertCheck(I.getOperand(i), &I); + setShadow(&I, getCleanShadow(&I)); + setOrigin(&I, getCleanOrigin()); + } +}; + +/// \brief AMD64-specific implementation of VarArgHelper. +struct VarArgAMD64Helper : public VarArgHelper { + // An unfortunate workaround for asymmetric lowering of va_arg stuff. + // See a comment in visitCallSite for more details. + static const unsigned AMD64GpEndOffset = 48; // AMD64 ABI Draft 0.99.6 p3.5.7 + static const unsigned AMD64FpEndOffset = 176; + + Function &F; + MemorySanitizer &MS; + MemorySanitizerVisitor &MSV; + Value *VAArgTLSCopy; + Value *VAArgOverflowSize; + + SmallVector<CallInst*, 16> VAStartInstrumentationList; + + VarArgAMD64Helper(Function &F, MemorySanitizer &MS, + MemorySanitizerVisitor &MSV) + : F(F), MS(MS), MSV(MSV), VAArgTLSCopy(0), VAArgOverflowSize(0) { } + + enum ArgKind { AK_GeneralPurpose, AK_FloatingPoint, AK_Memory }; + + ArgKind classifyArgument(Value* arg) { + // A very rough approximation of X86_64 argument classification rules. + Type *T = arg->getType(); + if (T->isFPOrFPVectorTy() || T->isX86_MMXTy()) + return AK_FloatingPoint; + if (T->isIntegerTy() && T->getPrimitiveSizeInBits() <= 64) + return AK_GeneralPurpose; + if (T->isPointerTy()) + return AK_GeneralPurpose; + return AK_Memory; + } + + // For VarArg functions, store the argument shadow in an ABI-specific format + // that corresponds to va_list layout. + // We do this because Clang lowers va_arg in the frontend, and this pass + // only sees the low level code that deals with va_list internals. 
+ // A much easier alternative (provided that Clang emits va_arg instructions) + // would have been to associate each live instance of va_list with a copy of + // MSanParamTLS, and extract shadow on va_arg() call in the argument list + // order. + void visitCallSite(CallSite &CS, IRBuilder<> &IRB) { + unsigned GpOffset = 0; + unsigned FpOffset = AMD64GpEndOffset; + unsigned OverflowOffset = AMD64FpEndOffset; + for (CallSite::arg_iterator ArgIt = CS.arg_begin(), End = CS.arg_end(); + ArgIt != End; ++ArgIt) { + Value *A = *ArgIt; + ArgKind AK = classifyArgument(A); + if (AK == AK_GeneralPurpose && GpOffset >= AMD64GpEndOffset) + AK = AK_Memory; + if (AK == AK_FloatingPoint && FpOffset >= AMD64FpEndOffset) + AK = AK_Memory; + Value *Base; + switch (AK) { + case AK_GeneralPurpose: + Base = getShadowPtrForVAArgument(A, IRB, GpOffset); + GpOffset += 8; + break; + case AK_FloatingPoint: + Base = getShadowPtrForVAArgument(A, IRB, FpOffset); + FpOffset += 16; + break; + case AK_Memory: + uint64_t ArgSize = MS.TD->getTypeAllocSize(A->getType()); + Base = getShadowPtrForVAArgument(A, IRB, OverflowOffset); + OverflowOffset += DataLayout::RoundUpAlignment(ArgSize, 8); + } + IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment); + } + Constant *OverflowSize = + ConstantInt::get(IRB.getInt64Ty(), OverflowOffset - AMD64FpEndOffset); + IRB.CreateStore(OverflowSize, MS.VAArgOverflowSizeTLS); + } + + /// \brief Compute the shadow address for a given va_arg. + Value *getShadowPtrForVAArgument(Value *A, IRBuilder<> &IRB, + int ArgOffset) { + Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy); + Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset)); + return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(A), 0), + "_msarg"); + } + + void visitVAStartInst(VAStartInst &I) { + IRBuilder<> IRB(&I); + VAStartInstrumentationList.push_back(&I); + Value *VAListTag = I.getArgOperand(0); + Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB); + + // Unpoison the whole __va_list_tag. + // FIXME: magic ABI constants. + IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()), + /* size */24, /* alignment */16, false); + } + + void visitVACopyInst(VACopyInst &I) { + IRBuilder<> IRB(&I); + Value *VAListTag = I.getArgOperand(0); + Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB); + + // Unpoison the whole __va_list_tag. + // FIXME: magic ABI constants. + IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()), + /* size */ 24, /* alignment */ 16, false); + } + + void finalizeInstrumentation() { + assert(!VAArgOverflowSize && !VAArgTLSCopy && + "finalizeInstrumentation called twice"); + if (!VAStartInstrumentationList.empty()) { + // If there is a va_start in this function, make a backup copy of + // va_arg_tls somewhere in the function entry block. + IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI()); + VAArgOverflowSize = IRB.CreateLoad(MS.VAArgOverflowSizeTLS); + Value *CopySize = + IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, AMD64FpEndOffset), + VAArgOverflowSize); + VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize); + IRB.CreateMemCpy(VAArgTLSCopy, MS.VAArgTLS, CopySize, 8); + } + + // Instrument va_start. + // Copy va_list shadow from the backup copy of the TLS contents. 
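The 8-, 16- and 24-byte constants used in the va_start handling above, and in the register-save copy that follows, come from the System V AMD64 va_list layout. A rough reference sketch of that layout (an assumption for illustration, not code from this patch):

  // x86_64 __va_list_tag, 24 bytes total:
  struct VAListTagSketch {
    unsigned gp_offset;       // byte 0:  next unused general-purpose register slot
    unsigned fp_offset;       // byte 4:  next unused FP (XMM) register slot
    void *overflow_arg_area;  // byte 8:  arguments passed in memory (on the stack)
    void *reg_save_area;      // byte 16: 48 bytes of GP regs + 128 bytes of XMM regs,
  };                          //          i.e. the 176-byte AMD64FpEndOffset

This is why visitVAStartInst/visitVACopyInst unpoison 24 bytes, and why the loop below loads the overflow-area pointer at offset 8 and the register-save-area pointer at offset 16 of the va_list tag.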
+ for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) { + CallInst *OrigInst = VAStartInstrumentationList[i]; + IRBuilder<> IRB(OrigInst->getNextNode()); + Value *VAListTag = OrigInst->getArgOperand(0); + + Value *RegSaveAreaPtrPtr = + IRB.CreateIntToPtr( + IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), + ConstantInt::get(MS.IntptrTy, 16)), + Type::getInt64PtrTy(*MS.C)); + Value *RegSaveAreaPtr = IRB.CreateLoad(RegSaveAreaPtrPtr); + Value *RegSaveAreaShadowPtr = + MSV.getShadowPtr(RegSaveAreaPtr, IRB.getInt8Ty(), IRB); + IRB.CreateMemCpy(RegSaveAreaShadowPtr, VAArgTLSCopy, + AMD64FpEndOffset, 16); + + Value *OverflowArgAreaPtrPtr = + IRB.CreateIntToPtr( + IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), + ConstantInt::get(MS.IntptrTy, 8)), + Type::getInt64PtrTy(*MS.C)); + Value *OverflowArgAreaPtr = IRB.CreateLoad(OverflowArgAreaPtrPtr); + Value *OverflowArgAreaShadowPtr = + MSV.getShadowPtr(OverflowArgAreaPtr, IRB.getInt8Ty(), IRB); + Value *SrcPtr = + getShadowPtrForVAArgument(VAArgTLSCopy, IRB, AMD64FpEndOffset); + IRB.CreateMemCpy(OverflowArgAreaShadowPtr, SrcPtr, VAArgOverflowSize, 16); + } + } +}; + +VarArgHelper* CreateVarArgHelper(Function &Func, MemorySanitizer &Msan, + MemorySanitizerVisitor &Visitor) { + return new VarArgAMD64Helper(Func, Msan, Visitor); +} + +} // namespace + +bool MemorySanitizer::runOnFunction(Function &F) { + MemorySanitizerVisitor Visitor(F, *this); + + // Clear out readonly/readnone attributes. + AttrBuilder B; + B.addAttribute(Attribute::ReadOnly) + .addAttribute(Attribute::ReadNone); + F.removeAttribute(AttributeSet::FunctionIndex, + Attribute::get(F.getContext(), B)); + + return Visitor.runOnFunction(); +} diff --git a/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp b/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp index 1fe1254..c5a1fe9 100644 --- a/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp +++ b/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp @@ -13,20 +13,21 @@ // //===----------------------------------------------------------------------===// #define DEBUG_TYPE "insert-optimal-edge-profiling" +#include "llvm/Transforms/Instrumentation.h" +#include "MaximumSpanningTree.h" #include "ProfilingUtils.h" -#include "llvm/Constants.h" -#include "llvm/Module.h" -#include "llvm/Pass.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/Passes.h" #include "llvm/Analysis/ProfileInfo.h" #include "llvm/Analysis/ProfileInfoLoader.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Instrumentation.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/Statistic.h" -#include "MaximumSpanningTree.h" using namespace llvm; STATISTIC(NumEdgesInserted, "The # of edges inserted."); @@ -75,8 +76,8 @@ inline static void printEdgeCounter(ProfileInfo::Edge e, bool OptimalEdgeProfiler::runOnModule(Module &M) { Function *Main = M.getFunction("main"); if (Main == 0) { - errs() << "WARNING: cannot insert edge profiling into a module" - << " with no main function!\n"; + M.getContext().emitWarning("cannot insert edge profiling into a module" + " with no main function"); return false; // No main, no instrumentation! 
} diff --git a/lib/Transforms/Instrumentation/PathProfiling.cpp b/lib/Transforms/Instrumentation/PathProfiling.cpp index cc27146..358bbeb 100644 --- a/lib/Transforms/Instrumentation/PathProfiling.cpp +++ b/lib/Transforms/Instrumentation/PathProfiling.cpp @@ -45,24 +45,23 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "insert-path-profiling" -#include "llvm/DerivedTypes.h" +#include "llvm/Transforms/Instrumentation.h" #include "ProfilingUtils.h" #include "llvm/Analysis/PathNumbering.h" -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" -#include "llvm/InstrTypes.h" -#include "llvm/Instructions.h" -#include "llvm/LLVMContext.h" -#include "llvm/Module.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/TypeBuilder.h" #include "llvm/Pass.h" -#include "llvm/TypeBuilder.h" -#include "llvm/Support/Compiler.h" #include "llvm/Support/CFG.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Instrumentation.h" #include <vector> #define HASH_THRESHHOLD 100000 @@ -1346,8 +1345,8 @@ bool PathProfiler::runOnModule(Module &M) { Main = M.getFunction("MAIN__"); if (!Main) { - errs() << "WARNING: cannot insert path profiling into a module" - << " with no main function!\n"; + Context->emitWarning("cannot insert edge profiling into a module" + " with no main function"); return false; } diff --git a/lib/Transforms/Instrumentation/ProfilingUtils.cpp b/lib/Transforms/Instrumentation/ProfilingUtils.cpp index de57cd1..4b3de6d 100644 --- a/lib/Transforms/Instrumentation/ProfilingUtils.cpp +++ b/lib/Transforms/Instrumentation/ProfilingUtils.cpp @@ -15,11 +15,11 @@ //===----------------------------------------------------------------------===// #include "ProfilingUtils.h" -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Instructions.h" -#include "llvm/LLVMContext.h" -#include "llvm/Module.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" void llvm::InsertProfilingInitCall(Function *MainFn, const char *FnName, GlobalValue *Array, diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index 17b7775..29d2ece 100644 --- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -21,32 +21,41 @@ #define DEBUG_TYPE "tsan" +#include "llvm/Transforms/Instrumentation.h" #include "BlackList.h" -#include "llvm/Function.h" -#include "llvm/IRBuilder.h" -#include "llvm/Intrinsics.h" -#include "llvm/LLVMContext.h" -#include "llvm/Metadata.h" -#include "llvm/Module.h" -#include "llvm/Type.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include 
"llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/ModuleUtils.h" using namespace llvm; -static cl::opt<std::string> ClBlackListFile("tsan-blacklist", +static cl::opt<std::string> ClBlacklistFile("tsan-blacklist", cl::desc("Blacklist file"), cl::Hidden); +static cl::opt<bool> ClInstrumentMemoryAccesses( + "tsan-instrument-memory-accesses", cl::init(true), + cl::desc("Instrument memory accesses"), cl::Hidden); +static cl::opt<bool> ClInstrumentFuncEntryExit( + "tsan-instrument-func-entry-exit", cl::init(true), + cl::desc("Instrument function entry and exit"), cl::Hidden); +static cl::opt<bool> ClInstrumentAtomics( + "tsan-instrument-atomics", cl::init(true), + cl::desc("Instrument atomics"), cl::Hidden); STATISTIC(NumInstrumentedReads, "Number of instrumented reads"); STATISTIC(NumInstrumentedWrites, "Number of instrumented writes"); @@ -62,13 +71,18 @@ namespace { /// ThreadSanitizer: instrument the code in module to find races. struct ThreadSanitizer : public FunctionPass { - ThreadSanitizer(); + ThreadSanitizer(StringRef BlacklistFile = StringRef()) + : FunctionPass(ID), + TD(0), + BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile + : BlacklistFile) { } const char *getPassName() const; bool runOnFunction(Function &F); bool doInitialization(Module &M); static char ID; // Pass identification, replacement for typeid. private: + void initializeCallbacks(Module &M); bool instrumentLoadOrStore(Instruction *I); bool instrumentAtomic(Instruction *I); void chooseInstructionsToInstrument(SmallVectorImpl<Instruction*> &Local, @@ -76,7 +90,8 @@ struct ThreadSanitizer : public FunctionPass { bool addrPointsToConstantData(Value *Addr); int getMemoryAccessFuncIndex(Value *Addr); - TargetData *TD; + DataLayout *TD; + SmallString<64> BlacklistFile; OwningPtr<BlackList> BL; IntegerType *OrdTy; // Callbacks to run-time library are computed in doInitialization. @@ -88,6 +103,10 @@ struct ThreadSanitizer : public FunctionPass { Function *TsanWrite[kNumberOfAccessSizes]; Function *TsanAtomicLoad[kNumberOfAccessSizes]; Function *TsanAtomicStore[kNumberOfAccessSizes]; + Function *TsanAtomicRMW[AtomicRMWInst::LAST_BINOP + 1][kNumberOfAccessSizes]; + Function *TsanAtomicCAS[kNumberOfAccessSizes]; + Function *TsanAtomicThreadFence; + Function *TsanAtomicSignalFence; Function *TsanVptrUpdate; }; } // namespace @@ -101,13 +120,8 @@ const char *ThreadSanitizer::getPassName() const { return "ThreadSanitizer"; } -ThreadSanitizer::ThreadSanitizer() - : FunctionPass(ID), - TD(NULL) { -} - -FunctionPass *llvm::createThreadSanitizerPass() { - return new ThreadSanitizer(); +FunctionPass *llvm::createThreadSanitizerPass(StringRef BlacklistFile) { + return new ThreadSanitizer(BlacklistFile); } static Function *checkInterfaceFunction(Constant *FuncOrBitcast) { @@ -117,18 +131,8 @@ static Function *checkInterfaceFunction(Constant *FuncOrBitcast) { report_fatal_error("ThreadSanitizer interface function redefined"); } -bool ThreadSanitizer::doInitialization(Module &M) { - TD = getAnalysisIfAvailable<TargetData>(); - if (!TD) - return false; - BL.reset(new BlackList(ClBlackListFile)); - - // Always insert a call to __tsan_init into the module's CTORs. 
+void ThreadSanitizer::initializeCallbacks(Module &M) { IRBuilder<> IRB(M.getContext()); - Value *TsanInit = M.getOrInsertFunction("__tsan_init", - IRB.getVoidTy(), NULL); - appendToGlobalCtors(M, cast<Function>(TsanInit), 0); - // Initialize the callbacks. TsanFuncEntry = checkInterfaceFunction(M.getOrInsertFunction( "__tsan_func_entry", IRB.getVoidTy(), IRB.getInt8PtrTy(), NULL)); @@ -158,10 +162,58 @@ bool ThreadSanitizer::doInitialization(Module &M) { TsanAtomicStore[i] = checkInterfaceFunction(M.getOrInsertFunction( AtomicStoreName, IRB.getVoidTy(), PtrTy, Ty, OrdTy, NULL)); + + for (int op = AtomicRMWInst::FIRST_BINOP; + op <= AtomicRMWInst::LAST_BINOP; ++op) { + TsanAtomicRMW[op][i] = NULL; + const char *NamePart = NULL; + if (op == AtomicRMWInst::Xchg) + NamePart = "_exchange"; + else if (op == AtomicRMWInst::Add) + NamePart = "_fetch_add"; + else if (op == AtomicRMWInst::Sub) + NamePart = "_fetch_sub"; + else if (op == AtomicRMWInst::And) + NamePart = "_fetch_and"; + else if (op == AtomicRMWInst::Or) + NamePart = "_fetch_or"; + else if (op == AtomicRMWInst::Xor) + NamePart = "_fetch_xor"; + else if (op == AtomicRMWInst::Nand) + NamePart = "_fetch_nand"; + else + continue; + SmallString<32> RMWName("__tsan_atomic" + itostr(BitSize) + NamePart); + TsanAtomicRMW[op][i] = checkInterfaceFunction(M.getOrInsertFunction( + RMWName, Ty, PtrTy, Ty, OrdTy, NULL)); + } + + SmallString<32> AtomicCASName("__tsan_atomic" + itostr(BitSize) + + "_compare_exchange_val"); + TsanAtomicCAS[i] = checkInterfaceFunction(M.getOrInsertFunction( + AtomicCASName, Ty, PtrTy, Ty, Ty, OrdTy, OrdTy, NULL)); } TsanVptrUpdate = checkInterfaceFunction(M.getOrInsertFunction( "__tsan_vptr_update", IRB.getVoidTy(), IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), NULL)); + TsanAtomicThreadFence = checkInterfaceFunction(M.getOrInsertFunction( + "__tsan_atomic_thread_fence", IRB.getVoidTy(), OrdTy, NULL)); + TsanAtomicSignalFence = checkInterfaceFunction(M.getOrInsertFunction( + "__tsan_atomic_signal_fence", IRB.getVoidTy(), OrdTy, NULL)); +} + +bool ThreadSanitizer::doInitialization(Module &M) { + TD = getAnalysisIfAvailable<DataLayout>(); + if (!TD) + return false; + BL.reset(new BlackList(BlacklistFile)); + + // Always insert a call to __tsan_init into the module's CTORs. + IRBuilder<> IRB(M.getContext()); + Value *TsanInit = M.getOrInsertFunction("__tsan_init", + IRB.getVoidTy(), NULL); + appendToGlobalCtors(M, cast<Function>(TsanInit), 0); + return true; } @@ -244,14 +296,15 @@ static bool isAtomic(Instruction *I) { return true; if (isa<AtomicCmpXchgInst>(I)) return true; - if (FenceInst *FI = dyn_cast<FenceInst>(I)) - return FI->getSynchScope() == CrossThread; + if (isa<FenceInst>(I)) + return true; return false; } bool ThreadSanitizer::runOnFunction(Function &F) { if (!TD) return false; if (BL->isIn(F)) return false; + initializeCallbacks(*F.getParent()); SmallVector<Instruction*, 8> RetVec; SmallVector<Instruction*, 8> AllLoadsAndStores; SmallVector<Instruction*, 8> LocalLoadsAndStores; @@ -284,17 +337,19 @@ bool ThreadSanitizer::runOnFunction(Function &F) { // (e.g. variables that do not escape, etc). // Instrument memory accesses. - for (size_t i = 0, n = AllLoadsAndStores.size(); i < n; ++i) { - Res |= instrumentLoadOrStore(AllLoadsAndStores[i]); - } + if (ClInstrumentMemoryAccesses) + for (size_t i = 0, n = AllLoadsAndStores.size(); i < n; ++i) { + Res |= instrumentLoadOrStore(AllLoadsAndStores[i]); + } // Instrument atomic memory accesses. 
- for (size_t i = 0, n = AtomicAccesses.size(); i < n; ++i) { - Res |= instrumentAtomic(AtomicAccesses[i]); - } + if (ClInstrumentAtomics) + for (size_t i = 0, n = AtomicAccesses.size(); i < n; ++i) { + Res |= instrumentAtomic(AtomicAccesses[i]); + } // Instrument function entry/exit points if there were instrumented accesses. - if (Res || HasCalls) { + if ((Res || HasCalls) && ClInstrumentFuncEntryExit) { IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI()); Value *ReturnAddress = IRB.CreateCall( Intrinsic::getDeclaration(F.getParent(), Intrinsic::returnaddress), @@ -343,16 +398,39 @@ static ConstantInt *createOrdering(IRBuilder<> *IRB, AtomicOrdering ord) { switch (ord) { case NotAtomic: assert(false); case Unordered: // Fall-through. - case Monotonic: v = 1 << 0; break; - // case Consume: v = 1 << 1; break; // Not specified yet. - case Acquire: v = 1 << 2; break; - case Release: v = 1 << 3; break; - case AcquireRelease: v = 1 << 4; break; - case SequentiallyConsistent: v = 1 << 5; break; + case Monotonic: v = 0; break; + // case Consume: v = 1; break; // Not specified yet. + case Acquire: v = 2; break; + case Release: v = 3; break; + case AcquireRelease: v = 4; break; + case SequentiallyConsistent: v = 5; break; + } + return IRB->getInt32(v); +} + +static ConstantInt *createFailOrdering(IRBuilder<> *IRB, AtomicOrdering ord) { + uint32_t v = 0; + switch (ord) { + case NotAtomic: assert(false); + case Unordered: // Fall-through. + case Monotonic: v = 0; break; + // case Consume: v = 1; break; // Not specified yet. + case Acquire: v = 2; break; + case Release: v = 0; break; + case AcquireRelease: v = 2; break; + case SequentiallyConsistent: v = 5; break; } return IRB->getInt32(v); } +// Both llvm and ThreadSanitizer atomic operations are based on C++11/C1x +// standards. For background see C++11 standard. A slightly older, publically +// available draft of the standard (not entirely up-to-date, but close enough +// for casual browsing) is available here: +// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2011/n3242.pdf +// The following page contains more background information: +// http://www.hpl.hp.com/personal/Hans_Boehm/c++mm/ + bool ThreadSanitizer::instrumentAtomic(Instruction *I) { IRBuilder<> IRB(I); if (LoadInst *LI = dyn_cast<LoadInst>(I)) { @@ -385,12 +463,45 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I) { CallInst *C = CallInst::Create(TsanAtomicStore[Idx], ArrayRef<Value*>(Args)); ReplaceInstWithInst(I, C); - } else if (isa<AtomicRMWInst>(I)) { - // FIXME: Not yet supported. - } else if (isa<AtomicCmpXchgInst>(I)) { - // FIXME: Not yet supported. - } else if (isa<FenceInst>(I)) { - // FIXME: Not yet supported. 
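The rewritten createOrdering/createFailOrdering above stop returning bit flags and instead pass the plain C/C++11 memory_order values to the __tsan_atomic* callbacks used below; for reference:

  //   0  memory_order_relaxed   <- Unordered / Monotonic
  //   1  memory_order_consume   (not generated yet)
  //   2  memory_order_acquire   <- Acquire
  //   3  memory_order_release   <- Release
  //   4  memory_order_acq_rel   <- AcquireRelease
  //   5  memory_order_seq_cst   <- SequentiallyConsistent
  // createFailOrdering additionally drops the release half for the failure
  // path of a compare-exchange (Release -> relaxed, AcquireRelease -> acquire),
  // since a failed CAS performs no store.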
+ } else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(I)) { + Value *Addr = RMWI->getPointerOperand(); + int Idx = getMemoryAccessFuncIndex(Addr); + if (Idx < 0) + return false; + Function *F = TsanAtomicRMW[RMWI->getOperation()][Idx]; + if (F == NULL) + return false; + const size_t ByteSize = 1 << Idx; + const size_t BitSize = ByteSize * 8; + Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize); + Type *PtrTy = Ty->getPointerTo(); + Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy), + IRB.CreateIntCast(RMWI->getValOperand(), Ty, false), + createOrdering(&IRB, RMWI->getOrdering())}; + CallInst *C = CallInst::Create(F, ArrayRef<Value*>(Args)); + ReplaceInstWithInst(I, C); + } else if (AtomicCmpXchgInst *CASI = dyn_cast<AtomicCmpXchgInst>(I)) { + Value *Addr = CASI->getPointerOperand(); + int Idx = getMemoryAccessFuncIndex(Addr); + if (Idx < 0) + return false; + const size_t ByteSize = 1 << Idx; + const size_t BitSize = ByteSize * 8; + Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize); + Type *PtrTy = Ty->getPointerTo(); + Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy), + IRB.CreateIntCast(CASI->getCompareOperand(), Ty, false), + IRB.CreateIntCast(CASI->getNewValOperand(), Ty, false), + createOrdering(&IRB, CASI->getOrdering()), + createFailOrdering(&IRB, CASI->getOrdering())}; + CallInst *C = CallInst::Create(TsanAtomicCAS[Idx], ArrayRef<Value*>(Args)); + ReplaceInstWithInst(I, C); + } else if (FenceInst *FI = dyn_cast<FenceInst>(I)) { + Value *Args[] = {createOrdering(&IRB, FI->getOrdering())}; + Function *F = FI->getSynchScope() == SingleThread ? + TsanAtomicSignalFence : TsanAtomicThreadFence; + CallInst *C = CallInst::Create(F, ArrayRef<Value*>(Args)); + ReplaceInstWithInst(I, C); } return true; } diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp index b344952..a097308 100644 --- a/lib/Transforms/Scalar/ADCE.cpp +++ b/lib/Transforms/Scalar/ADCE.cpp @@ -16,16 +16,16 @@ #define DEBUG_TYPE "adce" #include "llvm/Transforms/Scalar.h" -#include "llvm/BasicBlock.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/Pass.h" -#include "llvm/Support/CFG.h" -#include "llvm/Support/InstIterator.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Pass.h" +#include "llvm/Support/CFG.h" +#include "llvm/Support/InstIterator.h" using namespace llvm; STATISTIC(NumRemoved, "Number of instructions removed"); diff --git a/lib/Transforms/Scalar/BasicBlockPlacement.cpp b/lib/Transforms/Scalar/BasicBlockPlacement.cpp index cee5502..e755008 100644 --- a/lib/Transforms/Scalar/BasicBlockPlacement.cpp +++ b/lib/Transforms/Scalar/BasicBlockPlacement.cpp @@ -27,12 +27,12 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "block-placement" +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ProfileInfo.h" -#include "llvm/Function.h" +#include "llvm/IR/Function.h" #include "llvm/Pass.h" #include "llvm/Support/CFG.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Transforms/Scalar.h" #include <set> using namespace llvm; diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index a01e066..b3fc6e3 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -25,6 +25,7 @@ 
add_llvm_library(LLVMScalarOpts Reassociate.cpp Reg2Mem.cpp SCCP.cpp + SROA.cpp Scalar.cpp ScalarReplAggregates.cpp SimplifyCFGPass.cpp diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp index 5912107..d513c96 100644 --- a/lib/Transforms/Scalar/CodeGenPrepare.cpp +++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp @@ -15,21 +15,23 @@ #define DEBUG_TYPE "codegenprepare" #include "llvm/Transforms/Scalar.h" -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Function.h" -#include "llvm/IRBuilder.h" -#include "llvm/InlineAsm.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/Pass.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/DominatorInternals.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/ProfileInfo.h" #include "llvm/Assembly/Writer.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Pass.h" #include "llvm/Support/CallSite.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -37,10 +39,8 @@ #include "llvm/Support/PatternMatch.h" #include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetData.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Target/TargetLowering.h" -#include "llvm/Transforms/Utils/AddrModeMatcher.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/BypassSlowDivision.h" @@ -105,6 +105,8 @@ namespace { } bool runOnFunction(Function &F); + const char *getPassName() const { return "CodeGen Prepare"; } + virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved<DominatorTree>(); AU.addPreserved<ProfileInfo>(); @@ -124,7 +126,7 @@ namespace { bool MoveExtToFormExtLoad(Instruction *I); bool OptimizeExtUses(Instruction *I); bool OptimizeSelectInst(SelectInst *SI); - bool DupRetToEnableTailCallOpts(ReturnInst *RI); + bool DupRetToEnableTailCallOpts(BasicBlock *BB); bool PlaceDbgValues(Function &F); }; } @@ -147,18 +149,16 @@ bool CodeGenPrepare::runOnFunction(Function &F) { TLInfo = &getAnalysis<TargetLibraryInfo>(); DT = getAnalysisIfAvailable<DominatorTree>(); PFI = getAnalysisIfAvailable<ProfileInfo>(); - OptSize = F.hasFnAttr(Attribute::OptimizeForSize); + OptSize = F.getAttributes().hasAttribute(AttributeSet::FunctionIndex, + Attribute::OptimizeForSize); /// This optimization identifies DIV instructions that can be /// profitably bypassed and carried out with a shorter, faster divide. 
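A sketch of what this bypass amounts to, with made-up widths; the real pairs come from TLI->getBypassSlowDivWidths(), where a target might, for instance, map 64-bit division onto a 32-bit routine:

  //   if ((a | b) >> 32 == 0) {          // both operands fit in 32 bits
  //     q = (uint32_t)a / (uint32_t)b;   // short, fast divide
  //   } else {
  //     q = a / b;                       // original full-width divide
  //   }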
if (TLI && TLI->isSlowDivBypassed()) { - const DenseMap<Type *, Type *> &BypassTypeMap = TLI->getBypassSlowDivTypes(); - - for (Function::iterator I = F.begin(); I != F.end(); I++) { - EverMadeChange |= bypassSlowDivision(F, - I, - BypassTypeMap); - } + const DenseMap<unsigned int, unsigned int> &BypassWidths = + TLI->getBypassSlowDivWidths(); + for (Function::iterator I = F.begin(); I != F.end(); I++) + EverMadeChange |= bypassSlowDivision(F, I, BypassWidths); } // Eliminate blocks that contain only PHI nodes and an @@ -173,7 +173,7 @@ bool CodeGenPrepare::runOnFunction(Function &F) { bool MadeChange = true; while (MadeChange) { MadeChange = false; - for (Function::iterator I = F.begin(), E = F.end(); I != E; ) { + for (Function::iterator I = F.begin(); I != F.end(); ) { BasicBlock *BB = I++; MadeChange |= OptimizeBlock(*BB); } @@ -196,9 +196,20 @@ bool CodeGenPrepare::runOnFunction(Function &F) { WorkList.insert(*II); } - for (SmallPtrSet<BasicBlock*, 8>::iterator - I = WorkList.begin(), E = WorkList.end(); I != E; ++I) - DeleteDeadBlock(*I); + // Delete the dead blocks and any of their dead successors. + MadeChange |= !WorkList.empty(); + while (!WorkList.empty()) { + BasicBlock *BB = *WorkList.begin(); + WorkList.erase(BB); + SmallVector<BasicBlock*, 2> Successors(succ_begin(BB), succ_end(BB)); + + DeleteDeadBlock(BB); + + for (SmallVectorImpl<BasicBlock*>::iterator + II = Successors.begin(), IE = Successors.end(); II != IE; ++II) + if (pred_begin(*II) == pred_end(*II)) + WorkList.insert(*II); + } // Merge pairs of basic blocks with unconditional branches, connected by // a single edge. @@ -228,7 +239,8 @@ bool CodeGenPrepare::EliminateFallThrough(Function &F) { // edge, just collapse it. BasicBlock *SinglePred = BB->getSinglePredecessor(); - if (!SinglePred || SinglePred == BB) continue; + // Don't merge if BB's address is taken. + if (!SinglePred || SinglePred == BB || BB->hasAddressTaken()) continue; BranchInst *Term = dyn_cast<BranchInst>(SinglePred->getTerminator()); if (Term && !Term->isConditional()) { @@ -623,7 +635,7 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { // happens. WeakVH IterHandle(CurInstIterator); - replaceAndRecursivelySimplify(CI, RetVal, TLI ? TLI->getTargetData() : 0, + replaceAndRecursivelySimplify(CI, RetVal, TLI ? TLI->getDataLayout() : 0, TLInfo, ModifiedDT ? 0 : DT); // If the iterator instruction was recursively deleted, start over at the @@ -647,8 +659,8 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { // From here on out we're working with named functions. if (CI->getCalledFunction() == 0) return false; - // We'll need TargetData from here on out. - const TargetData *TD = TLI ? TLI->getTargetData() : 0; + // We'll need DataLayout from here on out. + const DataLayout *TD = TLI ? TLI->getDataLayout() : 0; if (!TD) return false; // Lower all default uses of _chk calls. This is very similar @@ -662,6 +674,7 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { /// DupRetToEnableTailCallOpts - Look for opportunities to duplicate return /// instructions to the predecessor to enable tail call optimizations. 
The /// case it is currently looking for is: +/// @code /// bb0: /// %tmp0 = tail call i32 @f0() /// br label %return @@ -674,9 +687,11 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { /// return: /// %retval = phi i32 [ %tmp0, %bb0 ], [ %tmp1, %bb1 ], [ %tmp2, %bb2 ] /// ret i32 %retval +/// @endcode /// /// => /// +/// @code /// bb0: /// %tmp0 = tail call i32 @f0() /// ret i32 %tmp0 @@ -686,11 +701,15 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { /// bb2: /// %tmp2 = tail call i32 @f2() /// ret i32 %tmp2 -/// -bool CodeGenPrepare::DupRetToEnableTailCallOpts(ReturnInst *RI) { +/// @endcode +bool CodeGenPrepare::DupRetToEnableTailCallOpts(BasicBlock *BB) { if (!TLI) return false; + ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator()); + if (!RI) + return false; + PHINode *PN = 0; BitCastInst *BCI = 0; Value *V = RI->getReturnValue(); @@ -704,15 +723,15 @@ bool CodeGenPrepare::DupRetToEnableTailCallOpts(ReturnInst *RI) { return false; } - BasicBlock *BB = RI->getParent(); if (PN && PN->getParent() != BB) return false; // It's not safe to eliminate the sign / zero extension of the return value. // See llvm::isInTailCallPosition(). const Function *F = BB->getParent(); - Attributes CallerRetAttr = F->getAttributes().getRetAttributes(); - if ((CallerRetAttr & Attribute::ZExt) || (CallerRetAttr & Attribute::SExt)) + Attribute CallerRetAttr = F->getAttributes().getRetAttributes(); + if (CallerRetAttr.hasAttribute(Attribute::ZExt) || + CallerRetAttr.hasAttribute(Attribute::SExt)) return false; // Make sure there are no instructions between the PHI and return, or that the @@ -769,8 +788,11 @@ bool CodeGenPrepare::DupRetToEnableTailCallOpts(ReturnInst *RI) { // Conservatively require the attributes of the call to match those of the // return. Ignore noalias because it doesn't affect the call sequence. - Attributes CalleeRetAttr = CS.getAttributes().getRetAttributes(); - if ((CalleeRetAttr ^ CallerRetAttr) & ~Attribute::NoAlias) + Attribute CalleeRetAttr = CS.getAttributes().getRetAttributes(); + if (AttrBuilder(CalleeRetAttr). + removeAttribute(Attribute::NoAlias) != + AttrBuilder(CallerRetAttr). + removeAttribute(Attribute::NoAlias)) continue; // Make sure the call instruction is followed by an unconditional branch to @@ -787,7 +809,7 @@ bool CodeGenPrepare::DupRetToEnableTailCallOpts(ReturnInst *RI) { } // If we eliminated all predecessors of the block, delete the block now. - if (Changed && pred_begin(BB) == pred_end(BB)) + if (Changed && !BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB)) BB->eraseFromParent(); return Changed; @@ -797,6 +819,629 @@ bool CodeGenPrepare::DupRetToEnableTailCallOpts(ReturnInst *RI) { // Memory Optimization //===----------------------------------------------------------------------===// +namespace { + +/// ExtAddrMode - This is an extended version of TargetLowering::AddrMode +/// which holds actual Value*'s for register values. 
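To make the fields concrete, a hypothetical match for an address computed as %base + 8 + 4*%i (example values only, not taken from the source):

  //   BaseGV  = 0,     BaseOffs  = 8,
  //   BaseReg = %base  (HasBaseReg = true),
  //   Scale   = 4,     ScaledReg = %i
  // i.e. the target addressing form [%base + 8 + 4*%i], which the print()
  // method below would render roughly as "[8 + Base:%base + 4*%i]".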
+struct ExtAddrMode : public TargetLowering::AddrMode { + Value *BaseReg; + Value *ScaledReg; + ExtAddrMode() : BaseReg(0), ScaledReg(0) {} + void print(raw_ostream &OS) const; + void dump() const; + + bool operator==(const ExtAddrMode& O) const { + return (BaseReg == O.BaseReg) && (ScaledReg == O.ScaledReg) && + (BaseGV == O.BaseGV) && (BaseOffs == O.BaseOffs) && + (HasBaseReg == O.HasBaseReg) && (Scale == O.Scale); + } +}; + +static inline raw_ostream &operator<<(raw_ostream &OS, const ExtAddrMode &AM) { + AM.print(OS); + return OS; +} + +void ExtAddrMode::print(raw_ostream &OS) const { + bool NeedPlus = false; + OS << "["; + if (BaseGV) { + OS << (NeedPlus ? " + " : "") + << "GV:"; + WriteAsOperand(OS, BaseGV, /*PrintType=*/false); + NeedPlus = true; + } + + if (BaseOffs) + OS << (NeedPlus ? " + " : "") << BaseOffs, NeedPlus = true; + + if (BaseReg) { + OS << (NeedPlus ? " + " : "") + << "Base:"; + WriteAsOperand(OS, BaseReg, /*PrintType=*/false); + NeedPlus = true; + } + if (Scale) { + OS << (NeedPlus ? " + " : "") + << Scale << "*"; + WriteAsOperand(OS, ScaledReg, /*PrintType=*/false); + NeedPlus = true; + } + + OS << ']'; +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void ExtAddrMode::dump() const { + print(dbgs()); + dbgs() << '\n'; +} +#endif + + +/// \brief A helper class for matching addressing modes. +/// +/// This encapsulates the logic for matching the target-legal addressing modes. +class AddressingModeMatcher { + SmallVectorImpl<Instruction*> &AddrModeInsts; + const TargetLowering &TLI; + + /// AccessTy/MemoryInst - This is the type for the access (e.g. double) and + /// the memory instruction that we're computing this address for. + Type *AccessTy; + Instruction *MemoryInst; + + /// AddrMode - This is the addressing mode that we're building up. This is + /// part of the return value of this addressing mode matching stuff. + ExtAddrMode &AddrMode; + + /// IgnoreProfitability - This is set to true when we should not do + /// profitability checks. When true, IsProfitableToFoldIntoAddressingMode + /// always returns true. + bool IgnoreProfitability; + + AddressingModeMatcher(SmallVectorImpl<Instruction*> &AMI, + const TargetLowering &T, Type *AT, + Instruction *MI, ExtAddrMode &AM) + : AddrModeInsts(AMI), TLI(T), AccessTy(AT), MemoryInst(MI), AddrMode(AM) { + IgnoreProfitability = false; + } +public: + + /// Match - Find the maximal addressing mode that a load/store of V can fold, + /// give an access type of AccessTy. This returns a list of involved + /// instructions in AddrModeInsts. + static ExtAddrMode Match(Value *V, Type *AccessTy, + Instruction *MemoryInst, + SmallVectorImpl<Instruction*> &AddrModeInsts, + const TargetLowering &TLI) { + ExtAddrMode Result; + + bool Success = + AddressingModeMatcher(AddrModeInsts, TLI, AccessTy, + MemoryInst, Result).MatchAddr(V, 0); + (void)Success; assert(Success && "Couldn't select *anything*?"); + return Result; + } +private: + bool MatchScaledValue(Value *ScaleReg, int64_t Scale, unsigned Depth); + bool MatchAddr(Value *V, unsigned Depth); + bool MatchOperationAddr(User *Operation, unsigned Opcode, unsigned Depth); + bool IsProfitableToFoldIntoAddressingMode(Instruction *I, + ExtAddrMode &AMBefore, + ExtAddrMode &AMAfter); + bool ValueAlreadyLiveAtInst(Value *Val, Value *KnownLive1, Value *KnownLive2); +}; + +/// MatchScaledValue - Try adding ScaleReg*Scale to the current addressing mode. +/// Return true and update AddrMode if this addr mode is legal for the target, +/// false if not. 
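A worked example of the scale matching defined next, using invented values:

  //   MatchScaledValue(ScaleReg = (%x + 3), Scale = 4) on an empty mode:
  //     step 1: ScaledReg = (%x + 3), Scale = 4          -> if legal, commit
  //     step 2: peel the constant: ScaledReg = %x,
  //             BaseOffs += 3 * 4 = 12                   -> if legal, commit instead
  // Each intermediate mode is committed only if the target reports it legal.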
+bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale, + unsigned Depth) { + // If Scale is 1, then this is the same as adding ScaleReg to the addressing + // mode. Just process that directly. + if (Scale == 1) + return MatchAddr(ScaleReg, Depth); + + // If the scale is 0, it takes nothing to add this. + if (Scale == 0) + return true; + + // If we already have a scale of this value, we can add to it, otherwise, we + // need an available scale field. + if (AddrMode.Scale != 0 && AddrMode.ScaledReg != ScaleReg) + return false; + + ExtAddrMode TestAddrMode = AddrMode; + + // Add scale to turn X*4+X*3 -> X*7. This could also do things like + // [A+B + A*7] -> [B+A*8]. + TestAddrMode.Scale += Scale; + TestAddrMode.ScaledReg = ScaleReg; + + // If the new address isn't legal, bail out. + if (!TLI.isLegalAddressingMode(TestAddrMode, AccessTy)) + return false; + + // It was legal, so commit it. + AddrMode = TestAddrMode; + + // Okay, we decided that we can add ScaleReg+Scale to AddrMode. Check now + // to see if ScaleReg is actually X+C. If so, we can turn this into adding + // X*Scale + C*Scale to addr mode. + ConstantInt *CI = 0; Value *AddLHS = 0; + if (isa<Instruction>(ScaleReg) && // not a constant expr. + match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI)))) { + TestAddrMode.ScaledReg = AddLHS; + TestAddrMode.BaseOffs += CI->getSExtValue()*TestAddrMode.Scale; + + // If this addressing mode is legal, commit it and remember that we folded + // this instruction. + if (TLI.isLegalAddressingMode(TestAddrMode, AccessTy)) { + AddrModeInsts.push_back(cast<Instruction>(ScaleReg)); + AddrMode = TestAddrMode; + return true; + } + } + + // Otherwise, not (x+c)*scale, just return what we have. + return true; +} + +/// MightBeFoldableInst - This is a little filter, which returns true if an +/// addressing computation involving I might be folded into a load/store +/// accessing it. This doesn't need to be perfect, but needs to accept at least +/// the set of instructions that MatchOperationAddr can. +static bool MightBeFoldableInst(Instruction *I) { + switch (I->getOpcode()) { + case Instruction::BitCast: + // Don't touch identity bitcasts. + if (I->getType() == I->getOperand(0)->getType()) + return false; + return I->getType()->isPointerTy() || I->getType()->isIntegerTy(); + case Instruction::PtrToInt: + // PtrToInt is always a noop, as we know that the int type is pointer sized. + return true; + case Instruction::IntToPtr: + // We know the input is intptr_t, so this is foldable. + return true; + case Instruction::Add: + return true; + case Instruction::Mul: + case Instruction::Shl: + // Can only handle X*C and X << C. + return isa<ConstantInt>(I->getOperand(1)); + case Instruction::GetElementPtr: + return true; + default: + return false; + } +} + +/// MatchOperationAddr - Given an instruction or constant expr, see if we can +/// fold the operation into the addressing mode. If so, update the addressing +/// mode and return true, otherwise return false without modifying AddrMode. +bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, + unsigned Depth) { + // Avoid exponential behavior on extremely deep expression trees. + if (Depth >= 5) return false; + + switch (Opcode) { + case Instruction::PtrToInt: + // PtrToInt is always a noop, as we know that the int type is pointer sized. + return MatchAddr(AddrInst->getOperand(0), Depth); + case Instruction::IntToPtr: + // This inttoptr is a no-op if the integer type is pointer sized. 
+ if (TLI.getValueType(AddrInst->getOperand(0)->getType()) == + TLI.getPointerTy()) + return MatchAddr(AddrInst->getOperand(0), Depth); + return false; + case Instruction::BitCast: + // BitCast is always a noop, and we can handle it as long as it is + // int->int or pointer->pointer (we don't want int<->fp or something). + if ((AddrInst->getOperand(0)->getType()->isPointerTy() || + AddrInst->getOperand(0)->getType()->isIntegerTy()) && + // Don't touch identity bitcasts. These were probably put here by LSR, + // and we don't want to mess around with them. Assume it knows what it + // is doing. + AddrInst->getOperand(0)->getType() != AddrInst->getType()) + return MatchAddr(AddrInst->getOperand(0), Depth); + return false; + case Instruction::Add: { + // Check to see if we can merge in the RHS then the LHS. If so, we win. + ExtAddrMode BackupAddrMode = AddrMode; + unsigned OldSize = AddrModeInsts.size(); + if (MatchAddr(AddrInst->getOperand(1), Depth+1) && + MatchAddr(AddrInst->getOperand(0), Depth+1)) + return true; + + // Restore the old addr mode info. + AddrMode = BackupAddrMode; + AddrModeInsts.resize(OldSize); + + // Otherwise this was over-aggressive. Try merging in the LHS then the RHS. + if (MatchAddr(AddrInst->getOperand(0), Depth+1) && + MatchAddr(AddrInst->getOperand(1), Depth+1)) + return true; + + // Otherwise we definitely can't merge the ADD in. + AddrMode = BackupAddrMode; + AddrModeInsts.resize(OldSize); + break; + } + //case Instruction::Or: + // TODO: We can handle "Or Val, Imm" iff this OR is equivalent to an ADD. + //break; + case Instruction::Mul: + case Instruction::Shl: { + // Can only handle X*C and X << C. + ConstantInt *RHS = dyn_cast<ConstantInt>(AddrInst->getOperand(1)); + if (!RHS) return false; + int64_t Scale = RHS->getSExtValue(); + if (Opcode == Instruction::Shl) + Scale = 1LL << Scale; + + return MatchScaledValue(AddrInst->getOperand(0), Scale, Depth); + } + case Instruction::GetElementPtr: { + // Scan the GEP. We check it if it contains constant offsets and at most + // one variable offset. + int VariableOperand = -1; + unsigned VariableScale = 0; + + int64_t ConstantOffset = 0; + const DataLayout *TD = TLI.getDataLayout(); + gep_type_iterator GTI = gep_type_begin(AddrInst); + for (unsigned i = 1, e = AddrInst->getNumOperands(); i != e; ++i, ++GTI) { + if (StructType *STy = dyn_cast<StructType>(*GTI)) { + const StructLayout *SL = TD->getStructLayout(STy); + unsigned Idx = + cast<ConstantInt>(AddrInst->getOperand(i))->getZExtValue(); + ConstantOffset += SL->getElementOffset(Idx); + } else { + uint64_t TypeSize = TD->getTypeAllocSize(GTI.getIndexedType()); + if (ConstantInt *CI = dyn_cast<ConstantInt>(AddrInst->getOperand(i))) { + ConstantOffset += CI->getSExtValue()*TypeSize; + } else if (TypeSize) { // Scales of zero don't do anything. + // We only allow one variable index at the moment. + if (VariableOperand != -1) + return false; + + // Remember the variable index. + VariableOperand = i; + VariableScale = TypeSize; + } + } + } + + // A common case is for the GEP to only do a constant offset. In this case, + // just add it to the disp field and check validity. + if (VariableOperand == -1) { + AddrMode.BaseOffs += ConstantOffset; + if (ConstantOffset == 0 || TLI.isLegalAddressingMode(AddrMode, AccessTy)){ + // Check to see if we can fold the base pointer in too. + if (MatchAddr(AddrInst->getOperand(0), Depth+1)) + return true; + } + AddrMode.BaseOffs -= ConstantOffset; + return false; + } + + // Save the valid addressing mode in case we can't match. 
+ ExtAddrMode BackupAddrMode = AddrMode; + unsigned OldSize = AddrModeInsts.size(); + + // See if the scale and offset amount is valid for this target. + AddrMode.BaseOffs += ConstantOffset; + + // Match the base operand of the GEP. + if (!MatchAddr(AddrInst->getOperand(0), Depth+1)) { + // If it couldn't be matched, just stuff the value in a register. + if (AddrMode.HasBaseReg) { + AddrMode = BackupAddrMode; + AddrModeInsts.resize(OldSize); + return false; + } + AddrMode.HasBaseReg = true; + AddrMode.BaseReg = AddrInst->getOperand(0); + } + + // Match the remaining variable portion of the GEP. + if (!MatchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale, + Depth)) { + // If it couldn't be matched, try stuffing the base into a register + // instead of matching it, and retrying the match of the scale. + AddrMode = BackupAddrMode; + AddrModeInsts.resize(OldSize); + if (AddrMode.HasBaseReg) + return false; + AddrMode.HasBaseReg = true; + AddrMode.BaseReg = AddrInst->getOperand(0); + AddrMode.BaseOffs += ConstantOffset; + if (!MatchScaledValue(AddrInst->getOperand(VariableOperand), + VariableScale, Depth)) { + // If even that didn't work, bail. + AddrMode = BackupAddrMode; + AddrModeInsts.resize(OldSize); + return false; + } + } + + return true; + } + } + return false; +} + +/// MatchAddr - If we can, try to add the value of 'Addr' into the current +/// addressing mode. If Addr can't be added to AddrMode this returns false and +/// leaves AddrMode unmodified. This assumes that Addr is either a pointer type +/// or intptr_t for the target. +/// +bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) { + if (ConstantInt *CI = dyn_cast<ConstantInt>(Addr)) { + // Fold in immediates if legal for the target. + AddrMode.BaseOffs += CI->getSExtValue(); + if (TLI.isLegalAddressingMode(AddrMode, AccessTy)) + return true; + AddrMode.BaseOffs -= CI->getSExtValue(); + } else if (GlobalValue *GV = dyn_cast<GlobalValue>(Addr)) { + // If this is a global variable, try to fold it into the addressing mode. + if (AddrMode.BaseGV == 0) { + AddrMode.BaseGV = GV; + if (TLI.isLegalAddressingMode(AddrMode, AccessTy)) + return true; + AddrMode.BaseGV = 0; + } + } else if (Instruction *I = dyn_cast<Instruction>(Addr)) { + ExtAddrMode BackupAddrMode = AddrMode; + unsigned OldSize = AddrModeInsts.size(); + + // Check to see if it is possible to fold this operation. + if (MatchOperationAddr(I, I->getOpcode(), Depth)) { + // Okay, it's possible to fold this. Check to see if it is actually + // *profitable* to do so. We use a simple cost model to avoid increasing + // register pressure too much. + if (I->hasOneUse() || + IsProfitableToFoldIntoAddressingMode(I, BackupAddrMode, AddrMode)) { + AddrModeInsts.push_back(I); + return true; + } + + // It isn't profitable to do this, roll back. + //cerr << "NOT FOLDING: " << *I; + AddrMode = BackupAddrMode; + AddrModeInsts.resize(OldSize); + } + } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr)) { + if (MatchOperationAddr(CE, CE->getOpcode(), Depth)) + return true; + } else if (isa<ConstantPointerNull>(Addr)) { + // Null pointer gets folded without affecting the addressing mode. + return true; + } + + // Worse case, the target should support [reg] addressing modes. :) + if (!AddrMode.HasBaseReg) { + AddrMode.HasBaseReg = true; + AddrMode.BaseReg = Addr; + // Still check for legality in case the target supports [imm] but not [i+r]. 
+ if (TLI.isLegalAddressingMode(AddrMode, AccessTy)) + return true; + AddrMode.HasBaseReg = false; + AddrMode.BaseReg = 0; + } + + // If the base register is already taken, see if we can do [r+r]. + if (AddrMode.Scale == 0) { + AddrMode.Scale = 1; + AddrMode.ScaledReg = Addr; + if (TLI.isLegalAddressingMode(AddrMode, AccessTy)) + return true; + AddrMode.Scale = 0; + AddrMode.ScaledReg = 0; + } + // Couldn't match. + return false; +} + +/// IsOperandAMemoryOperand - Check to see if all uses of OpVal by the specified +/// inline asm call are due to memory operands. If so, return true, otherwise +/// return false. +static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal, + const TargetLowering &TLI) { + TargetLowering::AsmOperandInfoVector TargetConstraints = TLI.ParseConstraints(ImmutableCallSite(CI)); + for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { + TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; + + // Compute the constraint code and ConstraintType to use. + TLI.ComputeConstraintToUse(OpInfo, SDValue()); + + // If this asm operand is our Value*, and if it isn't an indirect memory + // operand, we can't fold it! + if (OpInfo.CallOperandVal == OpVal && + (OpInfo.ConstraintType != TargetLowering::C_Memory || + !OpInfo.isIndirect)) + return false; + } + + return true; +} + +/// FindAllMemoryUses - Recursively walk all the uses of I until we find a +/// memory use. If we find an obviously non-foldable instruction, return true. +/// Add the ultimately found memory instructions to MemoryUses. +static bool FindAllMemoryUses(Instruction *I, + SmallVectorImpl<std::pair<Instruction*,unsigned> > &MemoryUses, + SmallPtrSet<Instruction*, 16> &ConsideredInsts, + const TargetLowering &TLI) { + // If we already considered this instruction, we're done. + if (!ConsideredInsts.insert(I)) + return false; + + // If this is an obviously unfoldable instruction, bail out. + if (!MightBeFoldableInst(I)) + return true; + + // Loop over all the uses, recursively processing them. + for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); + UI != E; ++UI) { + User *U = *UI; + + if (LoadInst *LI = dyn_cast<LoadInst>(U)) { + MemoryUses.push_back(std::make_pair(LI, UI.getOperandNo())); + continue; + } + + if (StoreInst *SI = dyn_cast<StoreInst>(U)) { + unsigned opNo = UI.getOperandNo(); + if (opNo == 0) return true; // Storing addr, not into addr. + MemoryUses.push_back(std::make_pair(SI, opNo)); + continue; + } + + if (CallInst *CI = dyn_cast<CallInst>(U)) { + InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue()); + if (!IA) return true; + + // If this is a memory operand, we're cool, otherwise bail out. + if (!IsOperandAMemoryOperand(CI, IA, I, TLI)) + return true; + continue; + } + + if (FindAllMemoryUses(cast<Instruction>(U), MemoryUses, ConsideredInsts, + TLI)) + return true; + } + + return false; +} + +/// ValueAlreadyLiveAtInst - Retrn true if Val is already known to be live at +/// the use site that we're folding it into. If so, there is no cost to +/// include it in the addressing mode. KnownLive1 and KnownLive2 are two values +/// that we know are live at the instruction already. +bool AddressingModeMatcher::ValueAlreadyLiveAtInst(Value *Val,Value *KnownLive1, + Value *KnownLive2) { + // If Val is either of the known-live values, we know it is live! + if (Val == 0 || Val == KnownLive1 || Val == KnownLive2) + return true; + + // All values other than instructions and arguments (e.g. constants) are live. 
+ if (!isa<Instruction>(Val) && !isa<Argument>(Val)) return true; + + // If Val is a constant sized alloca in the entry block, it is live, this is + // true because it is just a reference to the stack/frame pointer, which is + // live for the whole function. + if (AllocaInst *AI = dyn_cast<AllocaInst>(Val)) + if (AI->isStaticAlloca()) + return true; + + // Check to see if this value is already used in the memory instruction's + // block. If so, it's already live into the block at the very least, so we + // can reasonably fold it. + return Val->isUsedInBasicBlock(MemoryInst->getParent()); +} + +/// IsProfitableToFoldIntoAddressingMode - It is possible for the addressing +/// mode of the machine to fold the specified instruction into a load or store +/// that ultimately uses it. However, the specified instruction has multiple +/// uses. Given this, it may actually increase register pressure to fold it +/// into the load. For example, consider this code: +/// +/// X = ... +/// Y = X+1 +/// use(Y) -> nonload/store +/// Z = Y+1 +/// load Z +/// +/// In this case, Y has multiple uses, and can be folded into the load of Z +/// (yielding load [X+2]). However, doing this will cause both "X" and "X+1" to +/// be live at the use(Y) line. If we don't fold Y into load Z, we use one +/// fewer register. Since Y can't be folded into "use(Y)" we don't increase the +/// number of computations either. +/// +/// Note that this (like most of CodeGenPrepare) is just a rough heuristic. If +/// X was live across 'load Z' for other reasons, we actually *would* want to +/// fold the addressing mode in the Z case. This would make Y die earlier. +bool AddressingModeMatcher:: +IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, + ExtAddrMode &AMAfter) { + if (IgnoreProfitability) return true; + + // AMBefore is the addressing mode before this instruction was folded into it, + // and AMAfter is the addressing mode after the instruction was folded. Get + // the set of registers referenced by AMAfter and subtract out those + // referenced by AMBefore: this is the set of values which folding in this + // address extends the lifetime of. + // + // Note that there are only two potential values being referenced here, + // BaseReg and ScaleReg (global addresses are always available, as are any + // folded immediates). + Value *BaseReg = AMAfter.BaseReg, *ScaledReg = AMAfter.ScaledReg; + + // If the BaseReg or ScaledReg was referenced by the previous addrmode, their + // lifetime wasn't extended by adding this instruction. + if (ValueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg)) + BaseReg = 0; + if (ValueAlreadyLiveAtInst(ScaledReg, AMBefore.BaseReg, AMBefore.ScaledReg)) + ScaledReg = 0; + + // If folding this instruction (and it's subexprs) didn't extend any live + // ranges, we're ok with it. + if (BaseReg == 0 && ScaledReg == 0) + return true; + + // If all uses of this instruction are ultimately load/store/inlineasm's, + // check to see if their addressing modes will include this instruction. If + // so, we can fold it into all uses, so it doesn't matter if it has multiple + // uses. + SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses; + SmallPtrSet<Instruction*, 16> ConsideredInsts; + if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI)) + return false; // Has a non-memory, non-foldable use! 
+ + // Now that we know that all uses of this instruction are part of a chain of + // computation involving only operations that could theoretically be folded + // into a memory use, loop over each of these uses and see if they could + // *actually* fold the instruction. + SmallVector<Instruction*, 32> MatchedAddrModeInsts; + for (unsigned i = 0, e = MemoryUses.size(); i != e; ++i) { + Instruction *User = MemoryUses[i].first; + unsigned OpNo = MemoryUses[i].second; + + // Get the access type of this use. If the use isn't a pointer, we don't + // know what it accesses. + Value *Address = User->getOperand(OpNo); + if (!Address->getType()->isPointerTy()) + return false; + Type *AddressAccessTy = + cast<PointerType>(Address->getType())->getElementType(); + + // Do a match against the root of this address, ignoring profitability. This + // will tell us if the addressing mode for the memory operation will + // *actually* cover the shared instruction. + ExtAddrMode Result; + AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, AddressAccessTy, + MemoryInst, Result); + Matcher.IgnoreProfitability = true; + bool Success = Matcher.MatchAddr(Address, 0); + (void)Success; assert(Success && "Couldn't select *anything*?"); + + // If the match didn't cover I, then it won't be shared by it. + if (std::find(MatchedAddrModeInsts.begin(), MatchedAddrModeInsts.end(), + I) == MatchedAddrModeInsts.end()) + return false; + + MatchedAddrModeInsts.clear(); + } + + return true; +} + +} // end anonymous namespace + /// IsNonLocalValue - Return true if the specified values are defined in a /// different basic block than BB. static bool IsNonLocalValue(Value *V, BasicBlock *BB) { @@ -927,7 +1572,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " << *MemoryInst); Type *IntPtrTy = - TLI->getTargetData()->getIntPtrType(AccessTy->getContext()); + TLI->getDataLayout()->getIntPtrType(AccessTy->getContext()); Value *Result = 0; @@ -1313,9 +1958,6 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) { if (CallInst *CI = dyn_cast<CallInst>(I)) return OptimizeCallInst(CI); - if (ReturnInst *RI = dyn_cast<ReturnInst>(I)) - return DupRetToEnableTailCallOpts(RI); - if (SelectInst *SI = dyn_cast<SelectInst>(I)) return OptimizeSelectInst(SI); @@ -1330,9 +1972,11 @@ bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) { bool MadeChange = false; CurInstIterator = BB.begin(); - for (BasicBlock::iterator E = BB.end(); CurInstIterator != E; ) + while (CurInstIterator != BB.end()) MadeChange |= OptimizeInst(CurInstIterator++); + MadeChange |= DupRetToEnableTailCallOpts(&BB); + return MadeChange; } diff --git a/lib/Transforms/Scalar/ConstantProp.cpp b/lib/Transforms/Scalar/ConstantProp.cpp index 5430f62..d5a96ec 100644 --- a/lib/Transforms/Scalar/ConstantProp.cpp +++ b/lib/Transforms/Scalar/ConstantProp.cpp @@ -20,14 +20,14 @@ #define DEBUG_TYPE "constprop" #include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ConstantFolding.h" -#include "llvm/Constant.h" -#include "llvm/Instruction.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Instruction.h" #include "llvm/Pass.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Support/InstIterator.h" -#include "llvm/ADT/Statistic.h" +#include "llvm/Target/TargetLibraryInfo.h" #include <set> using namespace llvm; @@ -67,7 +67,7 @@ bool 
ConstantPropagation::runOnFunction(Function &F) { WorkList.insert(&*i); } bool Changed = false; - TargetData *TD = getAnalysisIfAvailable<TargetData>(); + DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); while (!WorkList.empty()) { diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 9b0aadb..4c3631b 100644 --- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -13,15 +13,15 @@ #define DEBUG_TYPE "correlated-value-propagation" #include "llvm/Transforms/Scalar.h" -#include "llvm/Constants.h" -#include "llvm/Function.h" -#include "llvm/Instructions.h" -#include "llvm/Pass.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LazyValueInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Pass.h" #include "llvm/Support/CFG.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/ADT/Statistic.h" using namespace llvm; STATISTIC(NumPhis, "Number of phis propagated"); @@ -235,6 +235,11 @@ bool CorrelatedValuePropagation::processSwitch(SwitchInst *SI) { // This case never fires - remove it. CI.getCaseSuccessor()->removePredecessor(BB); SI->removeCase(CI); // Does not invalidate the iterator. + + // The condition can be modified by removePredecessor's PHI simplification + // logic. + Cond = SI->getCondition(); + ++NumDeadCases; Changed = true; } else if (State == LazyValueInfo::True) { diff --git a/lib/Transforms/Scalar/DCE.cpp b/lib/Transforms/Scalar/DCE.cpp index 086f0a1..e8a090a 100644 --- a/lib/Transforms/Scalar/DCE.cpp +++ b/lib/Transforms/Scalar/DCE.cpp @@ -18,12 +18,12 @@ #define DEBUG_TYPE "dce" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Instruction.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/Instruction.h" #include "llvm/Pass.h" #include "llvm/Support/InstIterator.h" #include "llvm/Target/TargetLibraryInfo.h" -#include "llvm/ADT/Statistic.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; STATISTIC(DIEEliminated, "Number of insts removed by DIE pass"); @@ -118,13 +118,8 @@ bool DCE::runOnFunction(Function &F) { I->eraseFromParent(); // Remove the instruction from the worklist if it still exists in it. 
- for (std::vector<Instruction*>::iterator WI = WorkList.begin(); - WI != WorkList.end(); ) { - if (*WI == I) - WI = WorkList.erase(WI); - else - ++WI; - } + WorkList.erase(std::remove(WorkList.begin(), WorkList.end(), I), + WorkList.end()); MadeChange = true; ++DCEEliminated; diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp index 1ff4329..fe3acbf 100644 --- a/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -17,24 +17,25 @@ #define DEBUG_TYPE "dse" #include "llvm/Transforms/Scalar.h" -#include "llvm/Constants.h" -#include "llvm/Function.h" -#include "llvm/GlobalVariable.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/Pass.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Transforms/Utils/Local.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Pass.h" #include "llvm/Support/Debug.h" -#include "llvm/ADT/SetVector.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; STATISTIC(NumFastStores, "Number of stores deleted"); @@ -45,6 +46,7 @@ namespace { AliasAnalysis *AA; MemoryDependenceAnalysis *MD; DominatorTree *DT; + const TargetLibraryInfo *TLI; static char ID; // Pass identification, replacement for typeid DSE() : FunctionPass(ID), AA(0), MD(0), DT(0) { @@ -55,6 +57,7 @@ namespace { AA = &getAnalysis<AliasAnalysis>(); MD = &getAnalysis<MemoryDependenceAnalysis>(); DT = &getAnalysis<DominatorTree>(); + TLI = AA->getTargetLibraryInfo(); bool Changed = false; for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) @@ -144,7 +147,7 @@ static void DeleteDeadInstruction(Instruction *I, /// hasMemoryWrite - Does this instruction write some memory? This only returns /// true for things that we can analyze with other helpers below. -static bool hasMemoryWrite(Instruction *I) { +static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo *TLI) { if (isa<StoreInst>(I)) return true; if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { @@ -159,6 +162,26 @@ static bool hasMemoryWrite(Instruction *I) { return true; } } + if (CallSite CS = I) { + if (Function *F = CS.getCalledFunction()) { + if (TLI && TLI->has(LibFunc::strcpy) && + F->getName() == TLI->getName(LibFunc::strcpy)) { + return true; + } + if (TLI && TLI->has(LibFunc::strncpy) && + F->getName() == TLI->getName(LibFunc::strncpy)) { + return true; + } + if (TLI && TLI->has(LibFunc::strcat) && + F->getName() == TLI->getName(LibFunc::strcat)) { + return true; + } + if (TLI && TLI->has(LibFunc::strncat) && + F->getName() == TLI->getName(LibFunc::strncat)) { + return true; + } + } + } return false; } @@ -176,7 +199,7 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { // If we don't have target data around, an unknown size in Location means // that we should use the size of the pointee type. 
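+// [Editor's sketch, not part of the patch] The DCE.cpp hunk above replaces a
+// hand-written erase loop with the standard erase-remove idiom, which drops
+// every matching element in a single linear pass:
+//
+//   #include <algorithm>
+//   #include <cassert>
+//   #include <vector>
+//
+//   int main() {
+//     std::vector<int> WorkList;
+//     WorkList.push_back(1); WorkList.push_back(2);
+//     WorkList.push_back(3); WorkList.push_back(2);
+//     int Dead = 2;  // stands in for the just-erased Instruction*
+//     WorkList.erase(std::remove(WorkList.begin(), WorkList.end(), Dead),
+//                    WorkList.end());
+//     assert(WorkList.size() == 2);  // both copies of '2' are gone
+//   }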
This isn't valid for // memset/memcpy, which writes more than an i8. - if (Loc.Size == AliasAnalysis::UnknownSize && AA.getTargetData() == 0) + if (Loc.Size == AliasAnalysis::UnknownSize && AA.getDataLayout() == 0) return AliasAnalysis::Location(); return Loc; } @@ -190,7 +213,7 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { // If we don't have target data around, an unknown size in Location means // that we should use the size of the pointee type. This isn't valid for // init.trampoline, which writes more than an i8. - if (AA.getTargetData() == 0) return AliasAnalysis::Location(); + if (AA.getDataLayout() == 0) return AliasAnalysis::Location(); // FIXME: We don't know the size of the trampoline, so we can't really // handle it here. @@ -206,7 +229,8 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { /// instruction if any. static AliasAnalysis::Location getLocForRead(Instruction *Inst, AliasAnalysis &AA) { - assert(hasMemoryWrite(Inst) && "Unknown instruction case"); + assert(hasMemoryWrite(Inst, AA.getTargetLibraryInfo()) && + "Unknown instruction case"); // The only instructions that both read and write are the mem transfer // instructions (memcpy/memmove). @@ -223,23 +247,29 @@ static bool isRemovable(Instruction *I) { if (StoreInst *SI = dyn_cast<StoreInst>(I)) return SI->isUnordered(); - IntrinsicInst *II = cast<IntrinsicInst>(I); - switch (II->getIntrinsicID()) { - default: llvm_unreachable("doesn't pass 'hasMemoryWrite' predicate"); - case Intrinsic::lifetime_end: - // Never remove dead lifetime_end's, e.g. because it is followed by a - // free. - return false; - case Intrinsic::init_trampoline: - // Always safe to remove init_trampoline. - return true; + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + switch (II->getIntrinsicID()) { + default: llvm_unreachable("doesn't pass 'hasMemoryWrite' predicate"); + case Intrinsic::lifetime_end: + // Never remove dead lifetime_end's, e.g. because it is followed by a + // free. + return false; + case Intrinsic::init_trampoline: + // Always safe to remove init_trampoline. + return true; - case Intrinsic::memset: - case Intrinsic::memmove: - case Intrinsic::memcpy: - // Don't remove volatile memory intrinsics. - return !cast<MemIntrinsic>(II)->isVolatile(); + case Intrinsic::memset: + case Intrinsic::memmove: + case Intrinsic::memcpy: + // Don't remove volatile memory intrinsics. + return !cast<MemIntrinsic>(II)->isVolatile(); + } } + + if (CallSite CS = I) + return CS.getInstruction()->use_empty(); + + return false; } @@ -250,14 +280,19 @@ static bool isShortenable(Instruction *I) { if (isa<StoreInst>(I)) return false; - IntrinsicInst *II = cast<IntrinsicInst>(I); - switch (II->getIntrinsicID()) { - default: return false; - case Intrinsic::memset: - case Intrinsic::memcpy: - // Do shorten memory intrinsics. - return true; + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + switch (II->getIntrinsicID()) { + default: return false; + case Intrinsic::memset: + case Intrinsic::memcpy: + // Do shorten memory intrinsics. + return true; + } } + + // Don't shorten libcalls calls for now. + + return false; } /// getStoredPointerOperand - Return the pointer that is being written to. 
@@ -267,17 +302,23 @@ static Value *getStoredPointerOperand(Instruction *I) { if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) return MI->getDest(); - IntrinsicInst *II = cast<IntrinsicInst>(I); - switch (II->getIntrinsicID()) { - default: llvm_unreachable("Unexpected intrinsic!"); - case Intrinsic::init_trampoline: - return II->getArgOperand(0); + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + switch (II->getIntrinsicID()) { + default: llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::init_trampoline: + return II->getArgOperand(0); + } } + + CallSite CS = I; + // All the supported functions so far happen to have dest as their first + // argument. + return CS.getArgument(0); } static uint64_t getPointerSize(const Value *V, AliasAnalysis &AA) { uint64_t Size; - if (getObjectSize(V, Size, AA.getTargetData(), AA.getTargetLibraryInfo())) + if (getObjectSize(V, Size, AA.getDataLayout(), AA.getTargetLibraryInfo())) return Size; return AliasAnalysis::UnknownSize; } @@ -310,10 +351,10 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, // comparison. if (Later.Size == AliasAnalysis::UnknownSize || Earlier.Size == AliasAnalysis::UnknownSize) { - // If we have no TargetData information around, then the size of the store + // If we have no DataLayout information around, then the size of the store // is inferrable from the pointee type. If they are the same type, then // we know that the store is safe. - if (AA.getTargetData() == 0 && + if (AA.getDataLayout() == 0 && Later.Ptr->getType() == Earlier.Ptr->getType()) return OverwriteComplete; @@ -329,13 +370,13 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, // larger than the earlier one. if (Later.Size == AliasAnalysis::UnknownSize || Earlier.Size == AliasAnalysis::UnknownSize || - AA.getTargetData() == 0) + AA.getDataLayout() == 0) return OverwriteUnknown; // Check to see if the later store is to the entire object (either a global, // an alloca, or a byval argument). If so, then it clearly overwrites any // other store to the same object. - const TargetData &TD = *AA.getTargetData(); + const DataLayout &TD = *AA.getDataLayout(); const Value *UO1 = GetUnderlyingObject(P1, &TD), *UO2 = GetUnderlyingObject(P2, &TD); @@ -455,13 +496,13 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { Instruction *Inst = BBI++; // Handle 'free' calls specially. - if (CallInst *F = isFreeCall(Inst, AA->getTargetLibraryInfo())) { + if (CallInst *F = isFreeCall(Inst, TLI)) { MadeChange |= HandleFree(F); continue; } // If we find something that writes memory, get its memory dependence. - if (!hasMemoryWrite(Inst)) + if (!hasMemoryWrite(Inst, TLI)) continue; MemDepResult InstDep = MD->getDependency(Inst); @@ -484,7 +525,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { // in case we need it. WeakVH NextInst(BBI); - DeleteDeadInstruction(SI, *MD, AA->getTargetLibraryInfo()); + DeleteDeadInstruction(SI, *MD, TLI); if (NextInst == 0) // Next instruction deleted. BBI = BB.begin(); @@ -531,7 +572,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { << *DepWrite << "\n KILLER: " << *Inst << '\n'); // Delete the store and now-dead instructions that feed it. 
- DeleteDeadInstruction(DepWrite, *MD, AA->getTargetLibraryInfo()); + DeleteDeadInstruction(DepWrite, *MD, TLI); ++NumFastStores; MadeChange = true; @@ -628,7 +669,7 @@ bool DSE::HandleFree(CallInst *F) { MemDepResult Dep = MD->getPointerDependencyFrom(Loc, false, InstPt, BB); while (Dep.isDef() || Dep.isClobber()) { Instruction *Dependency = Dep.getInst(); - if (!hasMemoryWrite(Dependency) || !isRemovable(Dependency)) + if (!hasMemoryWrite(Dependency, TLI) || !isRemovable(Dependency)) break; Value *DepPointer = @@ -641,7 +682,7 @@ bool DSE::HandleFree(CallInst *F) { Instruction *Next = llvm::next(BasicBlock::iterator(Dependency)); // DCE instructions only used to calculate that store - DeleteDeadInstruction(Dependency, *MD, AA->getTargetLibraryInfo()); + DeleteDeadInstruction(Dependency, *MD, TLI); ++NumFastStores; MadeChange = true; @@ -660,6 +701,22 @@ bool DSE::HandleFree(CallInst *F) { return MadeChange; } +namespace { + struct CouldRef { + typedef Value *argument_type; + const CallSite CS; + AliasAnalysis *AA; + + bool operator()(Value *I) { + // See if the call site touches the value. + AliasAnalysis::ModRefResult A = + AA->getModRefInfo(CS, I, getPointerSize(I, *AA)); + + return A == AliasAnalysis::ModRef || A == AliasAnalysis::Ref; + } + }; +} + /// handleEndBlock - Remove dead stores to stack-allocated locations in the /// function end block. Ex: /// %A = alloca i32 @@ -681,8 +738,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { // Okay, so these are dead heap objects, but if the pointer never escapes // then it's leaked by this function anyways. - else if (isAllocLikeFn(I, AA->getTargetLibraryInfo()) && - !PointerMayBeCaptured(I, true, true)) + else if (isAllocLikeFn(I, TLI) && !PointerMayBeCaptured(I, true, true)) DeadStackObjects.insert(I); } @@ -698,7 +754,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { --BBI; // If we find a store, check to see if it points into a dead stack value. - if (hasMemoryWrite(BBI) && isRemovable(BBI)) { + if (hasMemoryWrite(BBI, TLI) && isRemovable(BBI)) { // See through pointer-to-pointer bitcasts SmallVector<Value *, 4> Pointers; GetUnderlyingObjects(getStoredPointerOperand(BBI), Pointers); @@ -726,8 +782,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { dbgs() << '\n'); // DCE instructions only used to calculate that store. - DeleteDeadInstruction(Dead, *MD, AA->getTargetLibraryInfo(), - &DeadStackObjects); + DeleteDeadInstruction(Dead, *MD, TLI, &DeadStackObjects); ++NumFastStores; MadeChange = true; continue; @@ -735,10 +790,9 @@ bool DSE::handleEndBlock(BasicBlock &BB) { } // Remove any dead non-memory-mutating instructions. - if (isInstructionTriviallyDead(BBI, AA->getTargetLibraryInfo())) { + if (isInstructionTriviallyDead(BBI, TLI)) { Instruction *Inst = BBI++; - DeleteDeadInstruction(Inst, *MD, AA->getTargetLibraryInfo(), - &DeadStackObjects); + DeleteDeadInstruction(Inst, *MD, TLI, &DeadStackObjects); ++NumFastOther; MadeChange = true; continue; @@ -754,7 +808,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { if (CallSite CS = cast<Value>(BBI)) { // Remove allocation function calls from the list of dead stack objects; // there can't be any references before the definition. - if (isAllocLikeFn(BBI, AA->getTargetLibraryInfo())) + if (isAllocLikeFn(BBI, TLI)) DeadStackObjects.remove(BBI); // If this call does not access memory, it can't be loading any of our @@ -764,26 +818,14 @@ bool DSE::handleEndBlock(BasicBlock &BB) { // If the call might load from any of our allocas, then any store above // the call is live. 
- SmallVector<Value*, 8> LiveAllocas; - for (SmallSetVector<Value*, 16>::iterator I = DeadStackObjects.begin(), - E = DeadStackObjects.end(); I != E; ++I) { - // See if the call site touches it. - AliasAnalysis::ModRefResult A = - AA->getModRefInfo(CS, *I, getPointerSize(*I, *AA)); - - if (A == AliasAnalysis::ModRef || A == AliasAnalysis::Ref) - LiveAllocas.push_back(*I); - } + CouldRef Pred = { CS, AA }; + DeadStackObjects.remove_if(Pred); // If all of the allocas were clobbered by the call then we're not going // to find anything else to process. - if (DeadStackObjects.size() == LiveAllocas.size()) + if (DeadStackObjects.empty()) break; - for (SmallVector<Value*, 8>::iterator I = LiveAllocas.begin(), - E = LiveAllocas.end(); I != E; ++I) - DeadStackObjects.remove(*I); - continue; } @@ -820,6 +862,20 @@ bool DSE::handleEndBlock(BasicBlock &BB) { return MadeChange; } +namespace { + struct CouldAlias { + typedef Value *argument_type; + const AliasAnalysis::Location &LoadedLoc; + AliasAnalysis *AA; + + bool operator()(Value *I) { + // See if the loaded location could alias the stack location. + AliasAnalysis::Location StackLoc(I, getPointerSize(I, *AA)); + return !AA->isNoAlias(StackLoc, LoadedLoc); + } + }; +} + /// RemoveAccessedObjects - Check to see if the specified location may alias any /// of the stack objects in the DeadStackObjects set. If so, they become live /// because the location is being loaded. @@ -838,16 +894,7 @@ void DSE::RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc, return; } - SmallVector<Value*, 16> NowLive; - for (SmallSetVector<Value*, 16>::iterator I = DeadStackObjects.begin(), - E = DeadStackObjects.end(); I != E; ++I) { - // See if the loaded location could alias the stack location. - AliasAnalysis::Location StackLoc(*I, getPointerSize(*I, *AA)); - if (!AA->isNoAlias(StackLoc, LoadedLoc)) - NowLive.push_back(*I); - } - - for (SmallVector<Value*, 16>::iterator I = NowLive.begin(), E = NowLive.end(); - I != E; ++I) - DeadStackObjects.remove(*I); + // Remove objects that could alias LoadedLoc. + CouldAlias Pred = { LoadedLoc, AA }; + DeadStackObjects.remove_if(Pred); } diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp index 2627113..3c08634 100644 --- a/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/lib/Transforms/Scalar/EarlyCSE.cpp @@ -14,17 +14,18 @@ #define DEBUG_TYPE "early-cse" #include "llvm/Transforms/Scalar.h" -#include "llvm/Instructions.h" -#include "llvm/Pass.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/ADT/ScopedHashTable.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetLibraryInfo.h" -#include "llvm/Transforms/Utils/Local.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/RecyclingAllocator.h" -#include "llvm/ADT/ScopedHashTable.h" -#include "llvm/ADT/Statistic.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/Local.h" #include <deque> using namespace llvm; @@ -90,35 +91,56 @@ template<> struct DenseMapInfo<SimpleValue> { unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) { Instruction *Inst = Val.Inst; - // Hash in all of the operands as pointers. 
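+// [Editor's sketch, not part of the patch] The DSE hunks above drop the
+// build-a-LiveAllocas-vector-then-remove dance in favor of a predicate object
+// (CouldRef / CouldAlias) handed to remove_if. The same shape with standard
+// containers, using a made-up predicate:
+//
+//   #include <algorithm>
+//   #include <cassert>
+//   #include <vector>
+//
+//   struct CouldBeTouched {
+//     int Clobbered;                      // stands in for the CallSite/AA query
+//     bool operator()(int Obj) const { return Obj == Clobbered; }
+//   };
+//
+//   int main() {
+//     std::vector<int> DeadStackObjects;
+//     DeadStackObjects.push_back(1);
+//     DeadStackObjects.push_back(2);
+//     DeadStackObjects.push_back(3);
+//     CouldBeTouched Pred = {2};
+//     DeadStackObjects.erase(std::remove_if(DeadStackObjects.begin(),
+//                                           DeadStackObjects.end(), Pred),
+//                            DeadStackObjects.end());
+//     assert(DeadStackObjects.size() == 2);  // object 2 is now considered live
+//   }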
- unsigned Res = 0; - for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i) - Res ^= getHash(Inst->getOperand(i)) << (i & 0xF); + if (BinaryOperator* BinOp = dyn_cast<BinaryOperator>(Inst)) { + Value *LHS = BinOp->getOperand(0); + Value *RHS = BinOp->getOperand(1); + if (BinOp->isCommutative() && BinOp->getOperand(0) > BinOp->getOperand(1)) + std::swap(LHS, RHS); + + if (isa<OverflowingBinaryOperator>(BinOp)) { + // Hash the overflow behavior + unsigned Overflow = + BinOp->hasNoSignedWrap() * OverflowingBinaryOperator::NoSignedWrap | + BinOp->hasNoUnsignedWrap() * OverflowingBinaryOperator::NoUnsignedWrap; + return hash_combine(BinOp->getOpcode(), Overflow, LHS, RHS); + } - if (CastInst *CI = dyn_cast<CastInst>(Inst)) - Res ^= getHash(CI->getType()); - else if (CmpInst *CI = dyn_cast<CmpInst>(Inst)) - Res ^= CI->getPredicate(); - else if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(Inst)) { - for (ExtractValueInst::idx_iterator I = EVI->idx_begin(), - E = EVI->idx_end(); I != E; ++I) - Res ^= *I; - } else if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(Inst)) { - for (InsertValueInst::idx_iterator I = IVI->idx_begin(), - E = IVI->idx_end(); I != E; ++I) - Res ^= *I; - } else { - // nothing extra to hash in. - assert((isa<CallInst>(Inst) || - isa<BinaryOperator>(Inst) || isa<GetElementPtrInst>(Inst) || - isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) || - isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst)) && - "Invalid/unknown instruction"); + return hash_combine(BinOp->getOpcode(), LHS, RHS); } + if (CmpInst *CI = dyn_cast<CmpInst>(Inst)) { + Value *LHS = CI->getOperand(0); + Value *RHS = CI->getOperand(1); + CmpInst::Predicate Pred = CI->getPredicate(); + if (Inst->getOperand(0) > Inst->getOperand(1)) { + std::swap(LHS, RHS); + Pred = CI->getSwappedPredicate(); + } + return hash_combine(Inst->getOpcode(), Pred, LHS, RHS); + } + + if (CastInst *CI = dyn_cast<CastInst>(Inst)) + return hash_combine(CI->getOpcode(), CI->getType(), CI->getOperand(0)); + + if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(Inst)) + return hash_combine(EVI->getOpcode(), EVI->getOperand(0), + hash_combine_range(EVI->idx_begin(), EVI->idx_end())); + + if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(Inst)) + return hash_combine(IVI->getOpcode(), IVI->getOperand(0), + IVI->getOperand(1), + hash_combine_range(IVI->idx_begin(), IVI->idx_end())); + + assert((isa<CallInst>(Inst) || isa<BinaryOperator>(Inst) || + isa<GetElementPtrInst>(Inst) || isa<SelectInst>(Inst) || + isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) || + isa<ShuffleVectorInst>(Inst)) && "Invalid/unknown instruction"); + // Mix in the opcode. 
- return (Res << 1) ^ Inst->getOpcode(); + return hash_combine(Inst->getOpcode(), + hash_combine_range(Inst->value_op_begin(), + Inst->value_op_end())); } bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) { @@ -128,7 +150,41 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) { return LHSI == RHSI; if (LHSI->getOpcode() != RHSI->getOpcode()) return false; - return LHSI->isIdenticalTo(RHSI); + if (LHSI->isIdenticalTo(RHSI)) return true; + + // If we're not strictly identical, we still might be a commutable instruction + if (BinaryOperator *LHSBinOp = dyn_cast<BinaryOperator>(LHSI)) { + if (!LHSBinOp->isCommutative()) + return false; + + assert(isa<BinaryOperator>(RHSI) + && "same opcode, but different instruction type?"); + BinaryOperator *RHSBinOp = cast<BinaryOperator>(RHSI); + + // Check overflow attributes + if (isa<OverflowingBinaryOperator>(LHSBinOp)) { + assert(isa<OverflowingBinaryOperator>(RHSBinOp) + && "same opcode, but different operator type?"); + if (LHSBinOp->hasNoUnsignedWrap() != RHSBinOp->hasNoUnsignedWrap() || + LHSBinOp->hasNoSignedWrap() != RHSBinOp->hasNoSignedWrap()) + return false; + } + + // Commuted equality + return LHSBinOp->getOperand(0) == RHSBinOp->getOperand(1) && + LHSBinOp->getOperand(1) == RHSBinOp->getOperand(0); + } + if (CmpInst *LHSCmp = dyn_cast<CmpInst>(LHSI)) { + assert(isa<CmpInst>(RHSI) + && "same opcode, but different instruction type?"); + CmpInst *RHSCmp = cast<CmpInst>(RHSI); + // Commuted equality + return LHSCmp->getOperand(0) == RHSCmp->getOperand(1) && + LHSCmp->getOperand(1) == RHSCmp->getOperand(0) && + LHSCmp->getSwappedPredicate() == RHSCmp->getPredicate(); + } + + return false; } //===----------------------------------------------------------------------===// @@ -216,7 +272,7 @@ namespace { /// cases. class EarlyCSE : public FunctionPass { public: - const TargetData *TD; + const DataLayout *TD; const TargetLibraryInfo *TLI; DominatorTree *DT; typedef RecyclingAllocator<BumpPtrAllocator, @@ -274,7 +330,8 @@ private: CallScope(*availableCalls) {} private: - NodeScope(const NodeScope&); // DO NOT IMPLEMENT + NodeScope(const NodeScope&) LLVM_DELETED_FUNCTION; + void operator=(const NodeScope&) LLVM_DELETED_FUNCTION; ScopedHTType::ScopeTy Scope; LoadHTType::ScopeTy LoadScope; @@ -313,7 +370,8 @@ private: void process() { Processed = true; } private: - StackNode(const StackNode&); // DO NOT IMPLEMENT + StackNode(const StackNode&) LLVM_DELETED_FUNCTION; + void operator=(const StackNode&) LLVM_DELETED_FUNCTION; // Members. 
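+// [Editor's sketch, not part of the patch] The EarlyCSE changes above make the
+// hash and the equality test agree for commutative operations by putting the
+// two operands into a canonical order first, so 'a + b' and 'b + a' land in
+// the same bucket. A simplified stand-in for hash_combine:
+//
+//   #include <cassert>
+//   #include <cstddef>
+//   #include <utility>
+//
+//   static std::size_t mix(std::size_t Seed, std::size_t V) {
+//     return Seed ^ (V + 0x9e3779b9 + (Seed << 6) + (Seed >> 2));
+//   }
+//
+//   static std::size_t hashCommutative(unsigned Opcode, std::size_t LHS,
+//                                      std::size_t RHS) {
+//     if (LHS > RHS)
+//       std::swap(LHS, RHS);  // canonical operand order, as in getHashValue above
+//     return mix(mix(Opcode, LHS), RHS);
+//   }
+//
+//   int main() {
+//     // Operand "identities" are arbitrary numbers, not real Value pointers.
+//     assert(hashCommutative(/*Add*/13, 100, 200) ==
+//            hashCommutative(13, 200, 100));
+//   }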
unsigned CurrentGeneration; @@ -506,7 +564,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { bool EarlyCSE::runOnFunction(Function &F) { std::deque<StackNode *> nodesToProcess; - TD = getAnalysisIfAvailable<TargetData>(); + TD = getAnalysisIfAvailable<DataLayout>(); TLI = &getAnalysis<TargetLibraryInfo>(); DT = &getAnalysis<DominatorTree>(); diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 16ae6ad..14201b9 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -17,11 +17,6 @@ #define DEBUG_TYPE "gvn" #include "llvm/Transforms/Scalar.h" -#include "llvm/GlobalVariable.h" -#include "llvm/IRBuilder.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/LLVMContext.h" -#include "llvm/Metadata.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/Hashing.h" @@ -37,11 +32,16 @@ #include "llvm/Analysis/PHITransAddr.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Assembly/Writer.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/PatternMatch.h" -#include "llvm/Target/TargetData.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" @@ -503,7 +503,7 @@ namespace { bool NoLoads; MemoryDependenceAnalysis *MD; DominatorTree *DT; - const TargetData *TD; + const DataLayout *TD; const TargetLibraryInfo *TLI; ValueTable VN; @@ -535,7 +535,7 @@ namespace { InstrsToErase.push_back(I); } - const TargetData *getTargetData() const { return TD; } + const DataLayout *getDataLayout() const { return TD; } DominatorTree &getDominatorTree() const { return *DT; } AliasAnalysis *getAliasAnalysis() const { return VN.getAliasAnalysis(); } MemoryDependenceAnalysis &getMemDep() const { return *MD; } @@ -632,7 +632,7 @@ INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_END(GVN, "gvn", "Global Value Numbering", false, false) -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void GVN::dump(DenseMap<uint32_t, Value*>& d) { errs() << "{\n"; for (DenseMap<uint32_t, Value*>::iterator I = d.begin(), @@ -730,7 +730,7 @@ SpeculationFailure: /// CoerceAvailableValueToLoadType will succeed. static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy, - const TargetData &TD) { + const DataLayout &TD) { // If the loaded or stored value is an first class array or struct, don't try // to transform them. We need to be able to bitcast to integer. if (LoadTy->isStructTy() || LoadTy->isArrayTy() || @@ -746,7 +746,6 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, return true; } - /// CoerceAvailableValueToLoadType - If we saw a store of a value to memory, and /// then a load from a must-aliased pointer of a different type, try to coerce /// the stored value. 
LoadedTy is the type of the load we want to replace and @@ -756,7 +755,7 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, static Value *CoerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy, Instruction *InsertPt, - const TargetData &TD) { + const DataLayout &TD) { if (!CanCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, TD)) return 0; @@ -769,24 +768,25 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, // If the store and reload are the same size, we can always reuse it. if (StoreSize == LoadSize) { // Pointer to Pointer -> use bitcast. - if (StoredValTy->isPointerTy() && LoadedTy->isPointerTy()) + if (StoredValTy->getScalarType()->isPointerTy() && + LoadedTy->getScalarType()->isPointerTy()) return new BitCastInst(StoredVal, LoadedTy, "", InsertPt); // Convert source pointers to integers, which can be bitcast. - if (StoredValTy->isPointerTy()) { - StoredValTy = TD.getIntPtrType(StoredValTy->getContext()); + if (StoredValTy->getScalarType()->isPointerTy()) { + StoredValTy = TD.getIntPtrType(StoredValTy); StoredVal = new PtrToIntInst(StoredVal, StoredValTy, "", InsertPt); } Type *TypeToCastTo = LoadedTy; - if (TypeToCastTo->isPointerTy()) - TypeToCastTo = TD.getIntPtrType(StoredValTy->getContext()); + if (TypeToCastTo->getScalarType()->isPointerTy()) + TypeToCastTo = TD.getIntPtrType(TypeToCastTo); if (StoredValTy != TypeToCastTo) StoredVal = new BitCastInst(StoredVal, TypeToCastTo, "", InsertPt); // Cast to pointer if the load needs a pointer type. - if (LoadedTy->isPointerTy()) + if (LoadedTy->getScalarType()->isPointerTy()) StoredVal = new IntToPtrInst(StoredVal, LoadedTy, "", InsertPt); return StoredVal; @@ -798,8 +798,8 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, assert(StoreSize >= LoadSize && "CanCoerceMustAliasedValueToLoad fail"); // Convert source pointers to integers, which can be manipulated. - if (StoredValTy->isPointerTy()) { - StoredValTy = TD.getIntPtrType(StoredValTy->getContext()); + if (StoredValTy->getScalarType()->isPointerTy()) { + StoredValTy = TD.getIntPtrType(StoredValTy); StoredVal = new PtrToIntInst(StoredVal, StoredValTy, "", InsertPt); } @@ -824,7 +824,7 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, return StoredVal; // If the result is a pointer, inttoptr. - if (LoadedTy->isPointerTy()) + if (LoadedTy->getScalarType()->isPointerTy()) return new IntToPtrInst(StoredVal, LoadedTy, "inttoptr", InsertPt); // Otherwise, bitcast. @@ -842,7 +842,7 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr, Value *WritePtr, uint64_t WriteSizeInBits, - const TargetData &TD) { + const DataLayout &TD) { // If the loaded or stored value is a first class array or struct, don't try // to transform them. We need to be able to bitcast to integer. if (LoadTy->isStructTy() || LoadTy->isArrayTy()) @@ -915,7 +915,7 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr, /// memdep query of a load that ends up being a clobbering store. static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr, StoreInst *DepSI, - const TargetData &TD) { + const DataLayout &TD) { // Cannot handle reading from store of first-class aggregate yet. 
if (DepSI->getValueOperand()->getType()->isStructTy() || DepSI->getValueOperand()->getType()->isArrayTy()) @@ -931,7 +931,7 @@ static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr, /// memdep query of a load that ends up being clobbered by another load. See if /// the other load can feed into the second load. static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, - LoadInst *DepLI, const TargetData &TD){ + LoadInst *DepLI, const DataLayout &TD){ // Cannot handle reading from store of first-class aggregate yet. if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy()) return -1; @@ -959,7 +959,7 @@ static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, MemIntrinsic *MI, - const TargetData &TD) { + const DataLayout &TD) { // If the mem operation is a non-constant size, we can't handle it. ConstantInt *SizeCst = dyn_cast<ConstantInt>(MI->getLength()); if (SizeCst == 0) return -1; @@ -1009,7 +1009,7 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, /// before we give up. static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, Type *LoadTy, - Instruction *InsertPt, const TargetData &TD){ + Instruction *InsertPt, const DataLayout &TD){ LLVMContext &Ctx = SrcVal->getType()->getContext(); uint64_t StoreSize = (TD.getTypeSizeInBits(SrcVal->getType()) + 7) / 8; @@ -1019,8 +1019,9 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, // Compute which bits of the stored value are being used by the load. Convert // to an integer type to start with. - if (SrcVal->getType()->isPointerTy()) - SrcVal = Builder.CreatePtrToInt(SrcVal, TD.getIntPtrType(Ctx)); + if (SrcVal->getType()->getScalarType()->isPointerTy()) + SrcVal = Builder.CreatePtrToInt(SrcVal, + TD.getIntPtrType(SrcVal->getType())); if (!SrcVal->getType()->isIntegerTy()) SrcVal = Builder.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize*8)); @@ -1048,7 +1049,7 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, Type *LoadTy, Instruction *InsertPt, GVN &gvn) { - const TargetData &TD = *gvn.getTargetData(); + const DataLayout &TD = *gvn.getDataLayout(); // If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to // widen SrcVal out to a larger load. unsigned SrcValSize = TD.getTypeStoreSize(SrcVal->getType()); @@ -1107,7 +1108,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, /// memdep query of a load that ends up being a clobbering mem intrinsic. 
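+// [Editor's sketch, not part of the patch] At their core, the
+// AnalyzeLoadFromClobbering* helpers above answer a byte-interval question
+// (this sketch ignores the pointer-offset analysis the real code performs):
+// does the earlier write [WriteOff, WriteOff+WriteSize) fully cover the load
+// [LoadOff, LoadOff+LoadSize)? If so, the load reads bytes starting at
+// LoadOff - WriteOff inside the stored value; otherwise the helpers give up.
+//
+//   #include <cassert>
+//   #include <stdint.h>
+//
+//   static int64_t coveredOffset(int64_t WriteOff, uint64_t WriteSize,
+//                                int64_t LoadOff, uint64_t LoadSize) {
+//     if (LoadOff < WriteOff ||
+//         (uint64_t)(LoadOff - WriteOff) + LoadSize > WriteSize)
+//       return -1;                 // not fully covered by the earlier write
+//     return LoadOff - WriteOff;   // byte offset of the load inside the store
+//   }
+//
+//   int main() {
+//     assert(coveredOffset(0, 8, 4, 4) == 4);   // i32 load out of an i64 store
+//     assert(coveredOffset(0, 8, 6, 4) == -1);  // runs past the end of the store
+//   }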
static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, Type *LoadTy, Instruction *InsertPt, - const TargetData &TD){ + const DataLayout &TD){ LLVMContext &Ctx = LoadTy->getContext(); uint64_t LoadSize = TD.getTypeSizeInBits(LoadTy)/8; @@ -1231,7 +1232,7 @@ struct AvailableValueInBlock { if (isSimpleValue()) { Res = getSimpleValue(); if (Res->getType() != LoadTy) { - const TargetData *TD = gvn.getTargetData(); + const DataLayout *TD = gvn.getDataLayout(); assert(TD && "Need target data to handle type mismatch case"); Res = GetStoreValueForLoad(Res, Offset, LoadTy, BB->getTerminator(), *TD); @@ -1253,7 +1254,7 @@ struct AvailableValueInBlock { << *Res << '\n' << "\n\n\n"); } } else { - const TargetData *TD = gvn.getTargetData(); + const DataLayout *TD = gvn.getDataLayout(); assert(TD && "Need target data to handle type mismatch case"); Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset, LoadTy, BB->getTerminator(), *TD); @@ -1301,7 +1302,7 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, Value *V = SSAUpdate.GetValueInMiddleOfBlock(LI->getParent()); // If new PHI nodes were created, notify alias analysis. - if (V->getType()->isPointerTy()) { + if (V->getType()->getScalarType()->isPointerTy()) { AliasAnalysis *AA = gvn.getAliasAnalysis(); for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i) @@ -1498,7 +1499,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { if (isa<PHINode>(V)) V->takeName(LI); - if (V->getType()->isPointerTy()) + if (V->getType()->getScalarType()->isPointerTy()) MD->invalidateCachedPointerInfo(V); markInstructionForDeletion(LI); ++NumGVNLoad; @@ -1730,7 +1731,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { LI->replaceAllUsesWith(V); if (isa<PHINode>(V)) V->takeName(LI); - if (V->getType()->isPointerTy()) + if (V->getType()->getScalarType()->isPointerTy()) MD->invalidateCachedPointerInfo(V); markInstructionForDeletion(LI); ++NumPRELoad; @@ -1857,7 +1858,7 @@ bool GVN::processLoad(LoadInst *L) { // Replace the load! L->replaceAllUsesWith(AvailVal); - if (AvailVal->getType()->isPointerTy()) + if (AvailVal->getType()->getScalarType()->isPointerTy()) MD->invalidateCachedPointerInfo(AvailVal); markInstructionForDeletion(L); ++NumGVNLoad; @@ -1914,7 +1915,7 @@ bool GVN::processLoad(LoadInst *L) { // Remove it! L->replaceAllUsesWith(StoredVal); - if (StoredVal->getType()->isPointerTy()) + if (StoredVal->getType()->getScalarType()->isPointerTy()) MD->invalidateCachedPointerInfo(StoredVal); markInstructionForDeletion(L); ++NumGVNLoad; @@ -1943,7 +1944,7 @@ bool GVN::processLoad(LoadInst *L) { // Remove it! patchAndReplaceAllUsesWith(AvailableVal, L); - if (DepLI->getType()->isPointerTy()) + if (DepLI->getType()->getScalarType()->isPointerTy()) MD->invalidateCachedPointerInfo(DepLI); markInstructionForDeletion(L); ++NumGVNLoad; @@ -2184,7 +2185,7 @@ bool GVN::processInstruction(Instruction *I) { // "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify. if (Value *V = SimplifyInstruction(I, TD, TLI, DT)) { I->replaceAllUsesWith(V); - if (MD && V->getType()->isPointerTy()) + if (MD && V->getType()->getScalarType()->isPointerTy()) MD->invalidateCachedPointerInfo(V); markInstructionForDeletion(I); ++NumGVNSimpl; @@ -2284,7 +2285,7 @@ bool GVN::processInstruction(Instruction *I) { // Remove it! 
patchAndReplaceAllUsesWith(repl, I); - if (MD && repl->getType()->isPointerTy()) + if (MD && repl->getType()->getScalarType()->isPointerTy()) MD->invalidateCachedPointerInfo(repl); markInstructionForDeletion(I); return true; @@ -2295,7 +2296,7 @@ bool GVN::runOnFunction(Function& F) { if (!NoLoads) MD = &getAnalysis<MemoryDependenceAnalysis>(); DT = &getAnalysis<DominatorTree>(); - TD = getAnalysisIfAvailable<TargetData>(); + TD = getAnalysisIfAvailable<DataLayout>(); TLI = &getAnalysis<TargetLibraryInfo>(); VN.setAliasAnalysis(&getAnalysis<AliasAnalysis>()); VN.setMemDep(MD); @@ -2532,7 +2533,7 @@ bool GVN::performPRE(Function &F) { addToLeaderTable(ValNo, Phi, CurrentBlock); Phi->setDebugLoc(CurInst->getDebugLoc()); CurInst->replaceAllUsesWith(Phi); - if (Phi->getType()->isPointerTy()) { + if (Phi->getType()->getScalarType()->isPointerTy()) { // Because we have added a PHI-use of the pointer value, it has now // "escaped" from alias analysis' perspective. We need to inform // AA of this. diff --git a/lib/Transforms/Scalar/GlobalMerge.cpp b/lib/Transforms/Scalar/GlobalMerge.cpp index b36a3cb..1601a8d 100644 --- a/lib/Transforms/Scalar/GlobalMerge.cpp +++ b/lib/Transforms/Scalar/GlobalMerge.cpp @@ -53,19 +53,19 @@ #define DEBUG_TYPE "global-merge" #include "llvm/Transforms/Scalar.h" -#include "llvm/Attributes.h" -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Function.h" -#include "llvm/GlobalVariable.h" -#include "llvm/Instructions.h" -#include "llvm/Intrinsics.h" -#include "llvm/Module.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" #include "llvm/Pass.h" -#include "llvm/Target/TargetData.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetLoweringObjectFile.h" -#include "llvm/ADT/Statistic.h" using namespace llvm; STATISTIC(NumMerged , "Number of globals merged"); @@ -76,7 +76,7 @@ namespace { const TargetLowering *TLI; bool doMerge(SmallVectorImpl<GlobalVariable*> &Globals, - Module &M, bool isConst) const; + Module &M, bool isConst, unsigned AddrSpace) const; public: static char ID; // Pass identification, replacement for typeid. 
@@ -98,9 +98,9 @@ namespace { } struct GlobalCmp { - const TargetData *TD; + const DataLayout *TD; - GlobalCmp(const TargetData *td) : TD(td) { } + GlobalCmp(const DataLayout *td) : TD(td) { } bool operator()(const GlobalVariable *GV1, const GlobalVariable *GV2) { Type *Ty1 = cast<PointerType>(GV1->getType())->getElementType(); @@ -118,8 +118,8 @@ INITIALIZE_PASS(GlobalMerge, "global-merge", bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, - Module &M, bool isConst) const { - const TargetData *TD = TLI->getTargetData(); + Module &M, bool isConst, unsigned AddrSpace) const { + const DataLayout *TD = TLI->getDataLayout(); // FIXME: Infer the maximum possible offset depending on the actual users // (these max offsets are different for the users inside Thumb or ARM @@ -150,7 +150,9 @@ bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, Constant *MergedInit = ConstantStruct::get(MergedTy, Inits); GlobalVariable *MergedGV = new GlobalVariable(M, MergedTy, isConst, GlobalValue::InternalLinkage, - MergedInit, "_MergedGlobals"); + MergedInit, "_MergedGlobals", + 0, GlobalVariable::NotThreadLocal, + AddrSpace); for (size_t k = i; k < j; ++k) { Constant *Idx[2] = { ConstantInt::get(Int32Ty, 0), @@ -169,8 +171,9 @@ bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, bool GlobalMerge::doInitialization(Module &M) { - SmallVector<GlobalVariable*, 16> Globals, ConstGlobals, BSSGlobals; - const TargetData *TD = TLI->getTargetData(); + DenseMap<unsigned, SmallVector<GlobalVariable*, 16> > Globals, ConstGlobals, + BSSGlobals; + const DataLayout *TD = TLI->getDataLayout(); unsigned MaxOffset = TLI->getMaximalGlobalOffset(); bool Changed = false; @@ -181,6 +184,11 @@ bool GlobalMerge::doInitialization(Module &M) { if (!I->hasLocalLinkage() || I->isThreadLocal() || I->hasSection()) continue; + PointerType *PT = dyn_cast<PointerType>(I->getType()); + assert(PT && "Global variable is not a pointer!"); + + unsigned AddressSpace = PT->getAddressSpace(); + // Ignore fancy-aligned globals for now. unsigned Alignment = TD->getPreferredAlignment(I); Type *Ty = I->getType()->getElementType(); @@ -195,18 +203,23 @@ bool GlobalMerge::doInitialization(Module &M) { if (TD->getTypeAllocSize(Ty) < MaxOffset) { if (TargetLoweringObjectFile::getKindForGlobal(I, TLI->getTargetMachine()) .isBSSLocal()) - BSSGlobals.push_back(I); + BSSGlobals[AddressSpace].push_back(I); else if (I->isConstant()) - ConstGlobals.push_back(I); + ConstGlobals[AddressSpace].push_back(I); else - Globals.push_back(I); + Globals[AddressSpace].push_back(I); } } - if (Globals.size() > 1) - Changed |= doMerge(Globals, M, false); - if (BSSGlobals.size() > 1) - Changed |= doMerge(BSSGlobals, M, false); + for (DenseMap<unsigned, SmallVector<GlobalVariable*, 16> >::iterator + I = Globals.begin(), E = Globals.end(); I != E; ++I) + if (I->second.size() > 1) + Changed |= doMerge(I->second, M, false, I->first); + + for (DenseMap<unsigned, SmallVector<GlobalVariable*, 16> >::iterator + I = BSSGlobals.begin(), E = BSSGlobals.end(); I != E; ++I) + if (I->second.size() > 1) + Changed |= doMerge(I->second, M, false, I->first); // FIXME: This currently breaks the EH processing due to way how the // typeinfo detection works. 
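+// [Editor's sketch, not part of the patch] The GlobalMerge change above stops
+// pooling all candidates into one list and instead buckets them by address
+// space, merging each bucket on its own. The grouping pattern in miniature
+// (ToyGlobal and the data are made up for illustration):
+//
+//   #include <cstdio>
+//   #include <map>
+//   #include <vector>
+//
+//   struct ToyGlobal { const char *Name; unsigned AddrSpace; };
+//
+//   int main() {
+//     ToyGlobal Candidates[] = { {"a", 0}, {"b", 1}, {"c", 0}, {"d", 1} };
+//     std::map<unsigned, std::vector<ToyGlobal> > Buckets;
+//     for (unsigned i = 0; i != 4; ++i)
+//       Buckets[Candidates[i].AddrSpace].push_back(Candidates[i]);
+//
+//     // Only buckets with more than one member are worth merging.
+//     for (std::map<unsigned, std::vector<ToyGlobal> >::iterator
+//          I = Buckets.begin(), E = Buckets.end(); I != E; ++I)
+//       if (I->second.size() > 1)
+//         std::printf("would merge %u globals in addrspace(%u)\n",
+//                     (unsigned)I->second.size(), I->first);
+//   }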
We might want to detect the TIs and ignore diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index c933a17..97fff7e 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -26,28 +26,28 @@ #define DEBUG_TYPE "indvars" #include "llvm/Transforms/Scalar.h" -#include "llvm/BasicBlock.h" -#include "llvm/Constants.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/LLVMContext.h" -#include "llvm/Type.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/Dominators.h" -#include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Type.h" #include "llvm/Support/CFG.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SimplifyIndVar.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetLibraryInfo.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" using namespace llvm; STATISTIC(NumWidened , "Number of indvars widened"); @@ -68,7 +68,7 @@ namespace { LoopInfo *LI; ScalarEvolution *SE; DominatorTree *DT; - TargetData *TD; + DataLayout *TD; TargetLibraryInfo *TLI; SmallVector<WeakVH, 16> DeadInsts; @@ -220,8 +220,6 @@ static Instruction *getInsertPointForUses(Instruction *User, Value *Def, /// ConvertToSInt - Convert APF to an integer, if possible. static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) { bool isExact = false; - if (&APF.getSemantics() == &APFloat::PPCDoubleDouble) - return false; // See if we can convert this to an int64_t uint64_t UIntVal; if (APF.convertToInteger(&UIntVal, 64, true, APFloat::rmTowardZero, @@ -551,15 +549,17 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { PN->setIncomingValue(i, ExitVal); - // If this instruction is dead now, delete it. - RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI); + // If this instruction is dead now, delete it. Don't do it now to avoid + // invalidating iterators. + if (isInstructionTriviallyDead(Inst, TLI)) + DeadInsts.push_back(Inst); if (NumPreds == 1) { // Completely replace a single-pred PHI. This is safe, because the // NewVal won't be variant in the loop, so we don't need an LCSSA phi // node anymore. PN->replaceAllUsesWith(ExitVal); - RecursivelyDeleteTriviallyDeadInstructions(PN, TLI); + PN->eraseFromParent(); } } if (NumPreds != 1) { @@ -597,13 +597,13 @@ namespace { class WideIVVisitor : public IVVisitor { ScalarEvolution *SE; - const TargetData *TD; + const DataLayout *TD; public: WideIVInfo WI; WideIVVisitor(PHINode *NarrowIV, ScalarEvolution *SCEV, - const TargetData *TData) : + const DataLayout *TData) : SE(SCEV), TD(TData) { WI.NarrowIV = NarrowIV; } // Implement the interface used by simplifyUsersOfIV. 
@@ -1261,8 +1261,13 @@ static bool needsLFTR(Loop *L, DominatorTree *DT) { if (!Phi) return true; + // Do LFTR if PHI node is defined in the loop, but is *not* a counter. + int Idx = Phi->getBasicBlockIndex(L->getLoopLatch()); + if (Idx < 0) + return true; + // Do LFTR if the exit condition's IV is *not* a simple counter. - Value *IncV = Phi->getIncomingValueForBlock(L->getLoopLatch()); + Value *IncV = Phi->getIncomingValue(Idx); return Phi != getLoopPhiForCounter(IncV, L, DT); } @@ -1341,7 +1346,7 @@ static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) { /// could at least handle constant BECounts. static PHINode * FindLoopCounter(Loop *L, const SCEV *BECount, - ScalarEvolution *SE, DominatorTree *DT, const TargetData *TD) { + ScalarEvolution *SE, DominatorTree *DT, const DataLayout *TD) { uint64_t BCWidth = SE->getTypeSizeInBits(BECount->getType()); Value *Cond = @@ -1698,7 +1703,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { LI = &getAnalysis<LoopInfo>(); SE = &getAnalysis<ScalarEvolution>(); DT = &getAnalysis<DominatorTree>(); - TD = getAnalysisIfAvailable<TargetData>(); + TD = getAnalysisIfAvailable<DataLayout>(); TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); DeadInsts.clear(); diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index 20844c6..b61c5ba 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -13,28 +13,28 @@ #define DEBUG_TYPE "jump-threading" #include "llvm/Transforms/Scalar.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/LLVMContext.h" -#include "llvm/Pass.h" -#include "llvm/Analysis/ConstantFolding.h" -#include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/LazyValueInfo.h" -#include "llvm/Analysis/Loads.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Utils/SSAUpdater.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/Statistic.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LazyValueInfo.h" +#include "llvm/Analysis/Loads.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" using namespace llvm; STATISTIC(NumThreads, "Number of jumps threaded"); @@ -75,7 +75,7 @@ namespace { /// revectored to the false side of the second if. 
/// class JumpThreading : public FunctionPass { - TargetData *TD; + DataLayout *TD; TargetLibraryInfo *TLI; LazyValueInfo *LVI; #ifdef NDEBUG @@ -147,7 +147,7 @@ FunctionPass *llvm::createJumpThreadingPass() { return new JumpThreading(); } /// bool JumpThreading::runOnFunction(Function &F) { DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n"); - TD = getAnalysisIfAvailable<TargetData>(); + TD = getAnalysisIfAvailable<DataLayout>(); TLI = &getAnalysis<TargetLibraryInfo>(); LVI = &getAnalysis<LazyValueInfo>(); @@ -216,19 +216,24 @@ bool JumpThreading::runOnFunction(Function &F) { } /// getJumpThreadDuplicationCost - Return the cost of duplicating this block to -/// thread across it. -static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB) { +/// thread across it. Stop scanning the block when passing the threshold. +static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB, + unsigned Threshold) { /// Ignore PHI nodes, these will be flattened when duplication happens. BasicBlock::const_iterator I = BB->getFirstNonPHI(); // FIXME: THREADING will delete values that are just used to compute the // branch, so they shouldn't count against the duplication cost. - // Sum up the cost of each instruction until we get to the terminator. Don't // include the terminator because the copy won't include it. unsigned Size = 0; for (; !isa<TerminatorInst>(I); ++I) { + + // Stop scanning the block if we've reached the threshold. + if (Size > Threshold) + return Size; + // Debugger intrinsics don't incur code size. if (isa<DbgInfoIntrinsic>(I)) continue; @@ -244,7 +249,11 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB) { // as having cost of 2 total, and if they are a vector intrinsic, we model // them as having cost 1. if (const CallInst *CI = dyn_cast<CallInst>(I)) { - if (!isa<IntrinsicInst>(CI)) + if (CI->hasFnAttr(Attribute::NoDuplicate)) + // Blocks with NoDuplicate are modelled as having infinite cost, so they + // are never duplicated. 
+ return ~0U; + else if (!isa<IntrinsicInst>(CI)) Size += 3; else if (!CI->getType()->isVectorTy()) Size += 1; @@ -1337,7 +1346,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, return false; } - unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB); + unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB, Threshold); if (JumpThreadCost > Threshold) { DEBUG(dbgs() << " Not threading BB '" << BB->getName() << "' - Cost is too high: " << JumpThreadCost << "\n"); @@ -1481,7 +1490,7 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, return false; } - unsigned DuplicationCost = getJumpThreadDuplicationCost(BB); + unsigned DuplicationCost = getJumpThreadDuplicationCost(BB, Threshold); if (DuplicationCost > Threshold) { DEBUG(dbgs() << " Not duplicating BB '" << BB->getName() << "' - Cost is too high: " << DuplicationCost << "\n"); diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index 99bedce..dc6bef7 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -32,27 +32,28 @@ #define DEBUG_TYPE "licm" #include "llvm/Transforms/Scalar.h" -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/Instructions.h" -#include "llvm/LLVMContext.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Utils/SSAUpdater.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" #include "llvm/Support/CFG.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Support/Debug.h" -#include "llvm/ADT/Statistic.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" #include <algorithm> using namespace llvm; @@ -90,6 +91,8 @@ namespace { AU.addRequired<TargetLibraryInfo>(); } + using llvm::Pass::doFinalization; + bool doFinalization() { assert(LoopToAliasSetMap.empty() && "Didn't free loop alias sets"); return false; @@ -100,7 +103,7 @@ namespace { LoopInfo *LI; // Current LoopInfo DominatorTree *DT; // Dominator Tree for the current Loop. - TargetData *TD; // TargetData for constant folding. + DataLayout *TD; // DataLayout for constant folding. TargetLibraryInfo *TLI; // TargetLibraryInfo for constant folding. // State that is updated as we process loops. 
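+// [Editor's sketch, not part of the patch] The getJumpThreadDuplicationCost
+// change above stops scanning once the running size passes the threshold and
+// treats NoDuplicate calls as infinitely expensive. The early-exit shape on a
+// made-up per-instruction cost model:
+//
+//   #include <cassert>
+//   #include <cstddef>
+//   #include <vector>
+//
+//   static unsigned blockCost(const std::vector<unsigned> &InstCosts,
+//                             unsigned Threshold) {
+//     unsigned Size = 0;
+//     for (std::size_t i = 0, e = InstCosts.size(); i != e; ++i) {
+//       if (Size > Threshold)
+//         return Size;              // already too expensive, stop scanning
+//       if (InstCosts[i] == ~0U)
+//         return ~0U;               // "never duplicate", like NoDuplicate calls
+//       Size += InstCosts[i];
+//     }
+//     return Size;
+//   }
+//
+//   int main() {
+//     std::vector<unsigned> Big(1000, 1);
+//     assert(blockCost(Big, 10) > 10);                     // bailed out early
+//     assert(blockCost(std::vector<unsigned>(3, 1), 10) == 3);
+//   }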
@@ -207,7 +210,7 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { AA = &getAnalysis<AliasAnalysis>(); DT = &getAnalysis<DominatorTree>(); - TD = getAnalysisIfAvailable<TargetData>(); + TD = getAnalysisIfAvailable<DataLayout>(); TLI = &getAnalysis<TargetLibraryInfo>(); CurAST = new AliasSetTracker(*AA); @@ -663,16 +666,18 @@ namespace { AliasSetTracker &AST; DebugLoc DL; int Alignment; + MDNode *TBAATag; public: LoopPromoter(Value *SP, const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S, SmallPtrSet<Value*, 4> &PMA, SmallVectorImpl<BasicBlock*> &LEB, SmallVectorImpl<Instruction*> &LIP, - AliasSetTracker &ast, DebugLoc dl, int alignment) + AliasSetTracker &ast, DebugLoc dl, int alignment, + MDNode *TBAATag) : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA), LoopExitBlocks(LEB), LoopInsertPts(LIP), - AST(ast), DL(dl), Alignment(alignment) {} + AST(ast), DL(dl), Alignment(alignment), TBAATag(TBAATag) {} virtual bool isInstInList(Instruction *I, const SmallVectorImpl<Instruction*> &) const { @@ -696,6 +701,7 @@ namespace { StoreInst *NewSI = new StoreInst(LiveInValue, SomePtr, InsertPos); NewSI->setAlignment(Alignment); NewSI->setDebugLoc(DL); + if (TBAATag) NewSI->setMetadata(LLVMContext::MD_tbaa, TBAATag); } } @@ -749,10 +755,11 @@ void LICM::PromoteAliasSet(AliasSet &AS, // We start with an alignment of one and try to find instructions that allow // us to prove better alignment. unsigned Alignment = 1; + MDNode *TBAATag = 0; // Check that all of the pointers in the alias set have the same type. We // cannot (yet) promote a memory location that is loaded and stored in - // different sizes. + // different sizes. While we are at it, collect alignment and TBAA info. for (AliasSet::iterator ASI = AS.begin(), E = AS.end(); ASI != E; ++ASI) { Value *ASIV = ASI->getValue(); PointerMustAliases.insert(ASIV); @@ -794,8 +801,7 @@ void LICM::PromoteAliasSet(AliasSet &AS, // instruction will be executed, update the alignment. // Larger is better, with the exception of 0 being the best alignment. unsigned InstAlignment = store->getAlignment(); - if ((InstAlignment > Alignment || InstAlignment == 0) - && (Alignment != 0)) + if ((InstAlignment > Alignment || InstAlignment == 0) && Alignment != 0) if (isGuaranteedToExecute(*Use)) { GuaranteedToExecute = true; Alignment = InstAlignment; @@ -807,6 +813,15 @@ void LICM::PromoteAliasSet(AliasSet &AS, } else return; // Not a load or store. + // Merge the TBAA tags. + if (LoopUses.empty()) { + // On the first load/store, just take its TBAA tag. + TBAATag = Use->getMetadata(LLVMContext::MD_tbaa); + } else if (TBAATag) { + TBAATag = MDNode::getMostGenericTBAA(TBAATag, + Use->getMetadata(LLVMContext::MD_tbaa)); + } + LoopUses.push_back(Use); } } @@ -839,7 +854,7 @@ void LICM::PromoteAliasSet(AliasSet &AS, SmallVector<PHINode*, 16> NewPHIs; SSAUpdater SSA(&NewPHIs); LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks, - InsertPts, *CurAST, DL, Alignment); + InsertPts, *CurAST, DL, Alignment, TBAATag); // Set up the preheader to have a definition of the value. It is the live-out // value from the preheader that uses in the loop will use. 
@@ -848,6 +863,7 @@ void LICM::PromoteAliasSet(AliasSet &AS, Preheader->getTerminator()); PreheaderLoad->setAlignment(Alignment); PreheaderLoad->setDebugLoc(DL); + if (TBAATag) PreheaderLoad->setMetadata(LLVMContext::MD_tbaa, TBAATag); SSA.AddAvailableValue(Preheader, PreheaderLoad); // Rewrite all the loads in the loop and remember all the definitions from diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp index 3771f5a..9c67e32 100644 --- a/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/lib/Transforms/Scalar/LoopDeletion.cpp @@ -16,11 +16,11 @@ #define DEBUG_TYPE "loop-delete" #include "llvm/Transforms/Scalar.h" -#include "llvm/Analysis/LoopPass.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/SmallVector.h" using namespace llvm; STATISTIC(NumDeleted, "Number of loops deleted"); diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index a72e288..c4f9012 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -43,18 +43,19 @@ #define DEBUG_TYPE "loop-idiom" #include "llvm/Transforms/Scalar.h" -#include "llvm/IRBuilder.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/Module.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetData.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -63,16 +64,83 @@ STATISTIC(NumMemSet, "Number of memset's formed from loop stores"); STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores"); namespace { + + class LoopIdiomRecognize; + + /// This class defines some utility functions for loop idiom recognization. + class LIRUtil { + public: + /// Return true iff the block contains nothing but an uncondition branch + /// (aka goto instruction). + static bool isAlmostEmpty(BasicBlock *); + + static BranchInst *getBranch(BasicBlock *BB) { + return dyn_cast<BranchInst>(BB->getTerminator()); + } + + /// Return the condition of the branch terminating the given basic block. + static Value *getBrCondtion(BasicBlock *); + + /// Derive the precondition block (i.e the block that guards the loop + /// preheader) from the given preheader. + static BasicBlock *getPrecondBb(BasicBlock *PreHead); + }; + + /// This class is to recoginize idioms of population-count conducted in + /// a noncountable loop. Currently it only recognizes this pattern: + /// \code + /// while(x) {cnt++; ...; x &= x - 1; ...} + /// \endcode + class NclPopcountRecognize { + LoopIdiomRecognize &LIR; + Loop *CurLoop; + BasicBlock *PreCondBB; + + typedef IRBuilder<> IRBuilderTy; + + public: + explicit NclPopcountRecognize(LoopIdiomRecognize &TheLIR); + bool recognize(); + + private: + /// Take a glimpse of the loop to see if we need to go ahead recoginizing + /// the idiom. 
+ bool preliminaryScreen(); + + /// Check if the given conditional branch is based on the comparison + /// beween a variable and zero, and if the variable is non-zero, the + /// control yeilds to the loop entry. If the branch matches the behavior, + /// the variable involved in the comparion is returned. This function will + /// be called to see if the precondition and postcondition of the loop + /// are in desirable form. + Value *matchCondition (BranchInst *Br, BasicBlock *NonZeroTarget) const; + + /// Return true iff the idiom is detected in the loop. and 1) \p CntInst + /// is set to the instruction counting the pupulation bit. 2) \p CntPhi + /// is set to the corresponding phi node. 3) \p Var is set to the value + /// whose population bits are being counted. + bool detectIdiom + (Instruction *&CntInst, PHINode *&CntPhi, Value *&Var) const; + + /// Insert ctpop intrinsic function and some obviously dead instructions. + void transform (Instruction *CntInst, PHINode *CntPhi, Value *Var); + + /// Create llvm.ctpop.* intrinsic function. + CallInst *createPopcntIntrinsic(IRBuilderTy &IRB, Value *Val, DebugLoc DL); + }; + class LoopIdiomRecognize : public LoopPass { Loop *CurLoop; - const TargetData *TD; + const DataLayout *TD; DominatorTree *DT; ScalarEvolution *SE; TargetLibraryInfo *TLI; + const TargetTransformInfo *TTI; public: static char ID; explicit LoopIdiomRecognize() : LoopPass(ID) { initializeLoopIdiomRecognizePass(*PassRegistry::getPassRegistry()); + TD = 0; DT = 0; SE = 0; TLI = 0; TTI = 0; } bool runOnLoop(Loop *L, LPPassManager &LPM); @@ -109,7 +177,34 @@ namespace { AU.addPreserved<DominatorTree>(); AU.addRequired<DominatorTree>(); AU.addRequired<TargetLibraryInfo>(); + AU.addRequired<TargetTransformInfo>(); + } + + const DataLayout *getDataLayout() { + return TD ? TD : TD=getAnalysisIfAvailable<DataLayout>(); + } + + DominatorTree *getDominatorTree() { + return DT ? DT : (DT=&getAnalysis<DominatorTree>()); + } + + ScalarEvolution *getScalarEvolution() { + return SE ? SE : (SE = &getAnalysis<ScalarEvolution>()); + } + + TargetLibraryInfo *getTargetLibraryInfo() { + return TLI ? TLI : (TLI = &getAnalysis<TargetLibraryInfo>()); + } + + const TargetTransformInfo *getTargetTransformInfo() { + return TTI ? TTI : (TTI = &getAnalysis<TargetTransformInfo>()); } + + Loop *getLoop() const { return CurLoop; } + + private: + bool runOnNoncountableLoop(); + bool runOnCountableLoop(); }; } @@ -123,6 +218,7 @@ INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) INITIALIZE_PASS_END(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms", false, false) @@ -172,19 +268,393 @@ static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE, deleteDeadInstruction(I, SE, TLI); } -bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { - CurLoop = L; +//===----------------------------------------------------------------------===// +// +// Implementation of LIRUtil +// +//===----------------------------------------------------------------------===// - // Disable loop idiom recognition if the function's name is a common idiom. - StringRef Name = L->getHeader()->getParent()->getName(); - if (Name == "memset" || Name == "memcpy") +// This fucntion will return true iff the given block contains nothing but goto. 
+// A typical use of this function is to check whether the preheader is
+// "almost" empty, so that the generated intrinsic can be moved across the
+// preheader and placed at the end of the precondition block without worrying
+// about breaking data dependences.
+bool LIRUtil::isAlmostEmpty(BasicBlock *BB) {
+  if (BranchInst *Br = getBranch(BB)) {
+    return Br->isUnconditional() && BB->size() == 1;
+  }
+  return false;
+}
+
+Value *LIRUtil::getBrCondtion(BasicBlock *BB) {
+  BranchInst *Br = getBranch(BB);
+  return Br ? Br->getCondition() : 0;
+}
+
+BasicBlock *LIRUtil::getPrecondBb(BasicBlock *PreHead) {
+  if (BasicBlock *BB = PreHead->getSinglePredecessor()) {
+    BranchInst *Br = getBranch(BB);
+    return Br && Br->isConditional() ? BB : 0;
+  }
+  return 0;
+}
+
+//===----------------------------------------------------------------------===//
+//
+//  Implementation of NclPopcountRecognize
+//
+//===----------------------------------------------------------------------===//
+
+NclPopcountRecognize::NclPopcountRecognize(LoopIdiomRecognize &TheLIR):
+  LIR(TheLIR), CurLoop(TheLIR.getLoop()), PreCondBB(0) {
+}
+
+bool NclPopcountRecognize::preliminaryScreen() {
+  const TargetTransformInfo *TTI = LIR.getTargetTransformInfo();
+  if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
     return false;
 
-  // The trip count of the loop must be analyzable.
-  SE = &getAnalysis<ScalarEvolution>();
-  if (!SE->hasLoopInvariantBackedgeTakenCount(L))
+  // Counting the population is usually done with a few arithmetic
+  // instructions. Such instructions can be easily "absorbed" by vacant slots
+  // in a non-compact loop. Therefore, recognizing the popcount idiom only
+  // makes sense in a compact loop.
+
+  // Give up if the loop has multiple blocks or multiple backedges.
+  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
+    return false;
+
+  BasicBlock *LoopBody = *(CurLoop->block_begin());
+  if (LoopBody->size() >= 20) {
+    // The loop is too big, bail out.
+    return false;
+  }
+
+  // It should have a preheader containing nothing but a goto instruction.
+  BasicBlock *PreHead = CurLoop->getLoopPreheader();
+  if (!PreHead || !LIRUtil::isAlmostEmpty(PreHead))
+    return false;
+
+  // It should have a precondition block where the generated popcount intrinsic
+  // function will be inserted.
+  PreCondBB = LIRUtil::getPrecondBb(PreHead);
+  if (!PreCondBB)
+    return false;
+
+  return true;
+}
+
+Value *NclPopcountRecognize::matchCondition (BranchInst *Br,
+                                             BasicBlock *LoopEntry) const {
+  if (!Br || !Br->isConditional())
+    return 0;
+
+  ICmpInst *Cond = dyn_cast<ICmpInst>(Br->getCondition());
+  if (!Cond)
+    return 0;
+
+  ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
+  if (!CmpZero || !CmpZero->isZero())
+    return 0;
+
+  ICmpInst::Predicate Pred = Cond->getPredicate();
+  if ((Pred == ICmpInst::ICMP_NE && Br->getSuccessor(0) == LoopEntry) ||
+      (Pred == ICmpInst::ICMP_EQ && Br->getSuccessor(1) == LoopEntry))
+    return Cond->getOperand(0);
+
+  return 0;
+}
+
+bool NclPopcountRecognize::detectIdiom(Instruction *&CntInst,
+                                       PHINode *&CntPhi,
+                                       Value *&Var) const {
+  // The following code tries to detect this idiom:
+  //
+  //    if (x0 != 0)
+  //      goto loop-exit // the precondition of the loop
+  //    cnt0 = init-val;
+  //    do {
+  //       x1 = phi (x0, x2);
+  //       cnt1 = phi(cnt0, cnt2);
+  //
+  //       cnt2 = cnt1 + 1;
+  //        ...
+  //       x2 = x1 & (x1 - 1);
+  //        ...
+  //    } while(x != 0);
+  //
+  //  loop-exit:
+  //
+
+  // Check to see if the loop-back branch matches this pattern:
+  //    "if (a!=0) goto loop-entry".
+  BasicBlock *LoopEntry;
+  Instruction *DefX2, *CountInst;
+  Value *VarX1, *VarX0;
+  PHINode *PhiX, *CountPhi;
+
+  DefX2 = CountInst = 0;
+  VarX1 = VarX0 = 0;
+  PhiX = CountPhi = 0;
+  LoopEntry = *(CurLoop->block_begin());
+
+  // step 1: Check if the loop-back branch is in desirable form.
+  {
+    if (Value *T = matchCondition (LIRUtil::getBranch(LoopEntry), LoopEntry))
+      DefX2 = dyn_cast<Instruction>(T);
+    else
+      return false;
+  }
+
+  // step 2: Detect instructions corresponding to "x2 = x1 & (x1 - 1)"
+  {
+    if (DefX2->getOpcode() != Instruction::And)
+      return false;
+
+    BinaryOperator *SubOneOp;
+
+    if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
+      VarX1 = DefX2->getOperand(1);
+    else {
+      VarX1 = DefX2->getOperand(0);
+      SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
+    }
+    if (!SubOneOp)
+      return false;
+
+    Instruction *SubInst = cast<Instruction>(SubOneOp);
+    ConstantInt *Dec = dyn_cast<ConstantInt>(SubInst->getOperand(1));
+    if (!Dec ||
+        !((SubInst->getOpcode() == Instruction::Sub && Dec->isOne()) ||
+          (SubInst->getOpcode() == Instruction::Add && Dec->isAllOnesValue()))) {
+      return false;
+    }
+  }
+
+  // step 3: Check the recurrence of variable X
+  {
+    PhiX = dyn_cast<PHINode>(VarX1);
+    if (!PhiX ||
+        (PhiX->getOperand(0) != DefX2 && PhiX->getOperand(1) != DefX2)) {
+      return false;
+    }
+  }
+
+  // step 4: Find the instruction which counts the population: cnt2 = cnt1 + 1
+  {
+    CountInst = NULL;
+    for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI(),
+           IterE = LoopEntry->end(); Iter != IterE; Iter++) {
+      Instruction *Inst = Iter;
+      if (Inst->getOpcode() != Instruction::Add)
+        continue;
+
+      ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
+      if (!Inc || !Inc->isOne())
+        continue;
+
+      PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0));
+      if (!Phi || Phi->getParent() != LoopEntry)
+        continue;
+
+      // Check if the result of the instruction is live out of the loop.
+      bool LiveOutLoop = false;
+      for (Value::use_iterator I = Inst->use_begin(), E = Inst->use_end();
+           I != E;  I++) {
+        if ((cast<Instruction>(*I))->getParent() != LoopEntry) {
+          LiveOutLoop = true; break;
+        }
+      }
+
+      if (LiveOutLoop) {
+        CountInst = Inst;
+        CountPhi = Phi;
+        break;
+      }
+    }
+
+    if (!CountInst)
+      return false;
+  }
+
+  // step 5: Check if the precondition is in this form:
+  //   "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
+  {
+    BranchInst *PreCondBr = LIRUtil::getBranch(PreCondBB);
+    Value *T = matchCondition (PreCondBr, CurLoop->getLoopPreheader());
+    if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
+      return false;
+
+    CntInst = CountInst;
+    CntPhi = CountPhi;
+    Var = T;
+  }
+
+  return true;
+}
+
+void NclPopcountRecognize::transform(Instruction *CntInst,
+                                     PHINode *CntPhi, Value *Var) {
+
+  ScalarEvolution *SE = LIR.getScalarEvolution();
+  TargetLibraryInfo *TLI = LIR.getTargetLibraryInfo();
+  BasicBlock *PreHead = CurLoop->getLoopPreheader();
+  BranchInst *PreCondBr = LIRUtil::getBranch(PreCondBB);
+  const DebugLoc DL = CntInst->getDebugLoc();
+
+  // Assume that, before the transformation, the loop looks like this:
+  //  if (x) // the precondition
+  //     do { cnt++; x &= x - 1; } while(x);
+
+  // Step 1: Insert the ctpop instruction at the end of the precondition block
+  IRBuilderTy Builder(PreCondBr);
+  Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
+  {
+    PopCnt = createPopcntIntrinsic(Builder, Var, DL);
+    NewCount = PopCntZext =
+      Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
+
+    if (NewCount != PopCnt)
+      (cast<Instruction>(NewCount))->setDebugLoc(DL);
+
+    // TripCnt is exactly the number of iterations the loop has
+    TripCnt = NewCount;
+
+    // If the population counter's initial value is not zero, insert an Add.
+    Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
+    ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
+    if (!InitConst || !InitConst->isZero()) {
+      NewCount = Builder.CreateAdd(NewCount, CntInitVal);
+      (cast<Instruction>(NewCount))->setDebugLoc(DL);
+    }
+  }
+
+  // Step 2: Replace the precondition from "if(x == 0) goto loop-exit" with
+  //   "if(NewCount == 0) loop-exit". Without this change, the intrinsic
+  //   would be partially dead code, and downstream passes would drag it
+  //   back from the precondition block into the preheader.
+  {
+    ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
+
+    Value *Opnd0 = PopCntZext;
+    Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
+    if (PreCond->getOperand(0) != Var)
+      std::swap(Opnd0, Opnd1);
+
+    ICmpInst *NewPreCond =
+      cast<ICmpInst>(Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
+    PreCond->replaceAllUsesWith(NewPreCond);
+
+    deleteDeadInstruction(PreCond, *SE, TLI);
+  }
+
+  // Step 3: Note that the population count is exactly the trip count of the
+  // loop in question, which enables us to convert the loop from a noncountable
+  // loop into a countable one. The benefit is twofold:
+  //
+  //  - If the loop only counts the population, the entire loop becomes dead
+  //    after the transformation. It is much easier to prove a countable loop
+  //    dead than to prove a noncountable one. (In some C dialects, an
+  //    infinite loop isn't dead even if it computes nothing useful. In
+  //    general, DCE needs to prove a noncountable loop finite before it can
+  //    safely be deleted.)
+  //
+  //  - If the loop also performs something else, it remains alive.
+ // Since it is transformed to countable form, it can be aggressively + // optimized by some optimizations which are in general not applicable + // to a noncountable loop. + // + // After this step, this loop (conceptually) would look like following: + // newcnt = __builtin_ctpop(x); + // t = newcnt; + // if (x) + // do { cnt++; x &= x-1; t--) } while (t > 0); + BasicBlock *Body = *(CurLoop->block_begin()); + { + BranchInst *LbBr = LIRUtil::getBranch(Body); + ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition()); + Type *Ty = TripCnt->getType(); + + PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", Body->begin()); + + Builder.SetInsertPoint(LbCond); + Value *Opnd1 = cast<Value>(TcPhi); + Value *Opnd2 = cast<Value>(ConstantInt::get(Ty, 1)); + Instruction *TcDec = + cast<Instruction>(Builder.CreateSub(Opnd1, Opnd2, "tcdec", false, true)); + + TcPhi->addIncoming(TripCnt, PreHead); + TcPhi->addIncoming(TcDec, Body); + + CmpInst::Predicate Pred = (LbBr->getSuccessor(0) == Body) ? + CmpInst::ICMP_UGT : CmpInst::ICMP_SLE; + LbCond->setPredicate(Pred); + LbCond->setOperand(0, TcDec); + LbCond->setOperand(1, cast<Value>(ConstantInt::get(Ty, 0))); + } + + // Step 4: All the references to the original population counter outside + // the loop are replaced with the NewCount -- the value returned from + // __builtin_ctpop(). + { + SmallVector<Value *, 4> CntUses; + for (Value::use_iterator I = CntInst->use_begin(), E = CntInst->use_end(); + I != E; I++) { + if (cast<Instruction>(*I)->getParent() != Body) + CntUses.push_back(*I); + } + for (unsigned Idx = 0; Idx < CntUses.size(); Idx++) { + (cast<Instruction>(CntUses[Idx]))->replaceUsesOfWith(CntInst, NewCount); + } + } + + // step 5: Forget the "non-computable" trip-count SCEV associated with the + // loop. The loop would otherwise not be deleted even if it becomes empty. + SE->forgetLoop(CurLoop); +} + +CallInst *NclPopcountRecognize::createPopcntIntrinsic(IRBuilderTy &IRBuilder, + Value *Val, DebugLoc DL) { + Value *Ops[] = { Val }; + Type *Tys[] = { Val->getType() }; + + Module *M = (*(CurLoop->block_begin()))->getParent()->getParent(); + Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys); + CallInst *CI = IRBuilder.CreateCall(Func, Ops); + CI->setDebugLoc(DL); + + return CI; +} + +/// recognize - detect population count idiom in a non-countable loop. If +/// detected, transform the relevant code to popcount intrinsic function +/// call, and return true; otherwise, return false. +bool NclPopcountRecognize::recognize() { + + if (!LIR.getTargetTransformInfo()) + return false; + + LIR.getScalarEvolution(); + + if (!preliminaryScreen()) return false; - const SCEV *BECount = SE->getBackedgeTakenCount(L); + + Instruction *CntInst; + PHINode *CntPhi; + Value *Val; + if (!detectIdiom(CntInst, CntPhi, Val)) + return false; + + transform(CntInst, CntPhi, Val); + return true; +} + +//===----------------------------------------------------------------------===// +// +// Implementation of LoopIdiomRecognize +// +//===----------------------------------------------------------------------===// + +bool LoopIdiomRecognize::runOnCountableLoop() { + const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop); if (isa<SCEVCouldNotCompute>(BECount)) return false; // If this loop executes exactly one time, then it should be peeled, not @@ -194,24 +664,29 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { return false; // We require target data for now. 
- TD = getAnalysisIfAvailable<TargetData>(); - if (TD == 0) return false; + if (!getDataLayout()) + return false; + + // set DT + (void)getDominatorTree(); - DT = &getAnalysis<DominatorTree>(); LoopInfo &LI = getAnalysis<LoopInfo>(); TLI = &getAnalysis<TargetLibraryInfo>(); + // set TLI + (void)getTargetLibraryInfo(); + SmallVector<BasicBlock*, 8> ExitBlocks; CurLoop->getUniqueExitBlocks(ExitBlocks); DEBUG(dbgs() << "loop-idiom Scanning: F[" - << L->getHeader()->getParent()->getName() - << "] Loop %" << L->getHeader()->getName() << "\n"); + << CurLoop->getHeader()->getParent()->getName() + << "] Loop %" << CurLoop->getHeader()->getName() << "\n"); bool MadeChange = false; // Scan all the blocks in the loop that are not in subloops. - for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E; - ++BI) { + for (Loop::block_iterator BI = CurLoop->block_begin(), + E = CurLoop->block_end(); BI != E; ++BI) { // Ignore blocks in subloops. if (LI.getLoopFor(*BI) != CurLoop) continue; @@ -221,6 +696,33 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { return MadeChange; } +bool LoopIdiomRecognize::runOnNoncountableLoop() { + NclPopcountRecognize Popcount(*this); + if (Popcount.recognize()) + return true; + + return false; +} + +bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { + CurLoop = L; + + // If the loop could not be converted to canonical form, it must have an + // indirectbr in it, just give up. + if (!L->getLoopPreheader()) + return false; + + // Disable loop idiom recognition if the function's name is a common idiom. + StringRef Name = L->getHeader()->getParent()->getName(); + if (Name == "memset" || Name == "memcpy") + return false; + + SE = &getAnalysis<ScalarEvolution>(); + if (SE->hasLoopInvariantBackedgeTakenCount(L)) + return runOnCountableLoop(); + return runOnNoncountableLoop(); +} + /// runOnLoopBlock - Process the specified block, which lives in a counted loop /// with the specified backedge count. This block is known to be in the current /// loop and not in any subloops. @@ -403,7 +905,7 @@ static bool mayLoopAccessLocation(Value *Ptr,AliasAnalysis::ModRefResult Access, /// /// Note that we don't ever attempt to use memset_pattern8 or 4, because these /// just replicate their input array and then pass on to memset_pattern16. -static Constant *getMemSetPatternValue(Value *V, const TargetData &TD) { +static Constant *getMemSetPatternValue(Value *V, const DataLayout &TD) { // If the value isn't a constant, we can't promote it to being in a constant // array. We could theoretically do a store to an alloca or something, but // that doesn't seem worthwhile. 
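Taken together, the LoopIdiomRecognize changes above let the pass handle the bit-clearing popcount loop that ScalarEvolution cannot count. A source-level illustration of the shape being matched and, conceptually, what it becomes (this snippet is not from the patch; __builtin_popcount stands in for the llvm.ctpop intrinsic the pass emits):

// The idiom NclPopcountRecognize matches: a rotated loop behind a precondition,
// counting bits by clearing the lowest set bit each iteration.
unsigned popcount_idiom(unsigned x) {
  unsigned cnt = 0;
  if (x) {                 // precondition block
    do {
      cnt++;               // cnt2 = cnt1 + 1
      x &= x - 1;          // x2 = x1 & (x1 - 1)
    } while (x);
  }
  return cnt;
}

// After the transform the count comes from the intrinsic and the loop is driven
// by a real trip count; if counting was all it did, it is now trivially dead:
unsigned popcount_transformed(unsigned x) {
  return __builtin_popcount(x);   // llvm.ctpop.i32 in IR
}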
diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp index f5daa7b..c48808f 100644 --- a/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -12,17 +12,17 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "loop-instsimplify" -#include "llvm/Instructions.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Instructions.h" #include "llvm/Support/Debug.h" -#include "llvm/Target/TargetData.h" #include "llvm/Target/TargetLibraryInfo.h" -#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/ADT/Statistic.h" using namespace llvm; STATISTIC(NumSimplified, "Number of redundant instructions simplified"); @@ -66,7 +66,7 @@ Pass *llvm::createLoopInstSimplifyPass() { bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>(); LoopInfo *LI = &getAnalysis<LoopInfo>(); - const TargetData *TD = getAnalysisIfAvailable<TargetData>(); + const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); SmallVector<BasicBlock*, 8> ExitBlocks; diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp index abe07aa..0ea80f3 100644 --- a/lib/Transforms/Scalar/LoopRotation.cpp +++ b/lib/Transforms/Scalar/LoopRotation.cpp @@ -13,20 +13,20 @@ #define DEBUG_TYPE "loop-rotate" #include "llvm/Transforms/Scalar.h" -#include "llvm/Function.h" -#include "llvm/IntrinsicInst.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CodeMetrics.h" -#include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Transforms/Utils/Local.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Support/CFG.h" +#include "llvm/Support/Debug.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include "llvm/Transforms/Utils/ValueMapper.h" -#include "llvm/Support/CFG.h" -#include "llvm/Support/Debug.h" -#include "llvm/ADT/Statistic.h" using namespace llvm; #define MAX_HEADER_SIZE 16 @@ -274,10 +274,16 @@ bool LoopRotate::rotateLoop(Loop *L) { if (OrigLatch == 0 || L->isLoopExiting(OrigLatch)) return false; - // Check size of original header and reject loop if it is very big. + // Check size of original header and reject loop if it is very big or we can't + // duplicate blocks inside it. { CodeMetrics Metrics; Metrics.analyzeBasicBlock(OrigHeader); + if (Metrics.notDuplicatable) { + DEBUG(dbgs() << "LoopRotation: NOT rotating - contains non duplicatable" + << " instructions: "; L->dump()); + return false; + } if (Metrics.NumInsts > MAX_HEADER_SIZE) return false; } diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index d7495da..c7b853e 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -37,8 +37,8 @@ // // TODO: Handle multiple loops at a time. 
// -// TODO: Should TargetLowering::AddrMode::BaseGV be changed to a ConstantExpr -// instead of a GlobalValue? +// TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead +// of a GlobalValue? // // TODO: When truncation is free, truncate ICmp users' operands to make it a // smaller encoding (on x86 at least). @@ -55,25 +55,25 @@ #define DEBUG_TYPE "loop-reduce" #include "llvm/Transforms/Scalar.h" -#include "llvm/Constants.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Analysis/IVUsers.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/IVUsers.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Assembly/Writer.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/ADT/SmallBitVector.h" -#include "llvm/ADT/SetVector.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/Support/Debug.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLowering.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> using namespace llvm; @@ -121,7 +121,7 @@ void RegSortData::print(raw_ostream &OS) const { OS << "[NumUses=" << UsedByIndices.count() << ']'; } -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void RegSortData::dump() const { print(errs()); errs() << '\n'; } @@ -223,16 +223,24 @@ namespace { /// computing satisfying a use. It may include broken-out immediates and scaled /// registers. struct Formula { - /// AM - This is used to represent complex addressing, as well as other kinds - /// of interesting uses. - TargetLowering::AddrMode AM; + /// Global base address used for complex addressing. + GlobalValue *BaseGV; + + /// Base offset for complex addressing. + int64_t BaseOffset; + + /// Whether any complex addressing has a base register. + bool HasBaseReg; + + /// The scale of any complex addressing. + int64_t Scale; /// BaseRegs - The list of "base" registers for this use. When this is - /// non-empty, AM.HasBaseReg should be set to true. + /// non-empty, SmallVector<const SCEV *, 2> BaseRegs; /// ScaledReg - The 'scaled' register for this use. This should be non-null - /// when AM.Scale is not zero. + /// when Scale is not zero. const SCEV *ScaledReg; /// UnfoldedOffset - An additional constant offset which added near the @@ -240,7 +248,9 @@ struct Formula { /// live in an add immediate field rather than a register. 
int64_t UnfoldedOffset; - Formula() : ScaledReg(0), UnfoldedOffset(0) {} + Formula() + : BaseGV(0), BaseOffset(0), HasBaseReg(false), Scale(0), ScaledReg(0), + UnfoldedOffset(0) {} void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE); @@ -326,13 +336,13 @@ void Formula::InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) { const SCEV *Sum = SE.getAddExpr(Good); if (!Sum->isZero()) BaseRegs.push_back(Sum); - AM.HasBaseReg = true; + HasBaseReg = true; } if (!Bad.empty()) { const SCEV *Sum = SE.getAddExpr(Bad); if (!Sum->isZero()) BaseRegs.push_back(Sum); - AM.HasBaseReg = true; + HasBaseReg = true; } } @@ -348,7 +358,7 @@ unsigned Formula::getNumRegs() const { Type *Formula::getType() const { return !BaseRegs.empty() ? BaseRegs.front()->getType() : ScaledReg ? ScaledReg->getType() : - AM.BaseGV ? AM.BaseGV->getType() : + BaseGV ? BaseGV->getType() : 0; } @@ -381,29 +391,29 @@ bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx, void Formula::print(raw_ostream &OS) const { bool First = true; - if (AM.BaseGV) { + if (BaseGV) { if (!First) OS << " + "; else First = false; - WriteAsOperand(OS, AM.BaseGV, /*PrintType=*/false); + WriteAsOperand(OS, BaseGV, /*PrintType=*/false); } - if (AM.BaseOffs != 0) { + if (BaseOffset != 0) { if (!First) OS << " + "; else First = false; - OS << AM.BaseOffs; + OS << BaseOffset; } for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(), E = BaseRegs.end(); I != E; ++I) { if (!First) OS << " + "; else First = false; OS << "reg(" << **I << ')'; } - if (AM.HasBaseReg && BaseRegs.empty()) { + if (HasBaseReg && BaseRegs.empty()) { if (!First) OS << " + "; else First = false; OS << "**error: HasBaseReg**"; - } else if (!AM.HasBaseReg && !BaseRegs.empty()) { + } else if (!HasBaseReg && !BaseRegs.empty()) { if (!First) OS << " + "; else First = false; OS << "**error: !HasBaseReg**"; } - if (AM.Scale != 0) { + if (Scale != 0) { if (!First) OS << " + "; else First = false; - OS << AM.Scale << "*reg("; + OS << Scale << "*reg("; if (ScaledReg) OS << *ScaledReg; else @@ -416,7 +426,7 @@ void Formula::print(raw_ostream &OS) const { } } -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void Formula::dump() const { print(errs()); errs() << '\n'; } @@ -926,8 +936,8 @@ void Cost::RateFormula(const Formula &F, // Tally up the non-zero immediates. for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(), E = Offsets.end(); I != E; ++I) { - int64_t Offset = (uint64_t)*I + F.AM.BaseOffs; - if (F.AM.BaseGV) + int64_t Offset = (uint64_t)*I + F.BaseOffset; + if (F.BaseGV) ImmCost += 64; // Handle symbolic values conservatively. // TODO: This should probably be the pointer size. 
else if (Offset != 0) @@ -978,7 +988,7 @@ void Cost::print(raw_ostream &OS) const { OS << ", plus " << SetupCost << " setup cost"; } -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void Cost::dump() const { print(errs()); errs() << '\n'; } @@ -1066,7 +1076,7 @@ void LSRFixup::print(raw_ostream &OS) const { OS << ", Offset=" << Offset; } -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void LSRFixup::dump() const { print(errs()); errs() << '\n'; } @@ -1260,7 +1270,7 @@ void LSRUse::print(raw_ostream &OS) const { OS << ", widest fixup type: " << *WidestFixupType; } -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void LSRUse::dump() const { print(errs()); errs() << '\n'; } @@ -1269,46 +1279,42 @@ void LSRUse::dump() const { /// isLegalUse - Test whether the use described by AM is "legal", meaning it can /// be completely folded into the user instruction at isel time. This includes /// address-mode folding and special icmp tricks. -static bool isLegalUse(const TargetLowering::AddrMode &AM, - LSRUse::KindType Kind, Type *AccessTy, - const TargetLowering *TLI) { +static bool isLegalUse(const TargetTransformInfo &TTI, LSRUse::KindType Kind, + Type *AccessTy, GlobalValue *BaseGV, int64_t BaseOffset, + bool HasBaseReg, int64_t Scale) { switch (Kind) { case LSRUse::Address: - // If we have low-level target information, ask the target if it can - // completely fold this address. - if (TLI) return TLI->isLegalAddressingMode(AM, AccessTy); + return TTI.isLegalAddressingMode(AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale); // Otherwise, just guess that reg+reg addressing is legal. - return !AM.BaseGV && AM.BaseOffs == 0 && AM.Scale <= 1; + //return ; case LSRUse::ICmpZero: // There's not even a target hook for querying whether it would be legal to // fold a GV into an ICmp. - if (AM.BaseGV) + if (BaseGV) return false; // ICmp only has two operands; don't allow more than two non-trivial parts. - if (AM.Scale != 0 && AM.HasBaseReg && AM.BaseOffs != 0) + if (Scale != 0 && HasBaseReg && BaseOffset != 0) return false; // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by // putting the scaled register in the other operand of the icmp. - if (AM.Scale != 0 && AM.Scale != -1) + if (Scale != 0 && Scale != -1) return false; // If we have low-level target information, ask the target if it can fold an // integer immediate on an icmp. - if (AM.BaseOffs != 0) { - if (!TLI) - return false; + if (BaseOffset != 0) { // We have one of: - // ICmpZero BaseReg + Offset => ICmp BaseReg, -Offset - // ICmpZero -1*ScaleReg + Offset => ICmp ScaleReg, Offset + // ICmpZero BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset + // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset // Offs is the ICmp immediate. - int64_t Offs = AM.BaseOffs; - if (AM.Scale == 0) - Offs = -(uint64_t)Offs; // The cast does the right thing with INT64_MIN. - return TLI->isLegalICmpImmediate(Offs); + if (Scale == 0) + // The cast does the right thing with INT64_MIN. + BaseOffset = -(uint64_t)BaseOffset; + return TTI.isLegalICmpImmediate(BaseOffset); } // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg @@ -1316,92 +1322,87 @@ static bool isLegalUse(const TargetLowering::AddrMode &AM, case LSRUse::Basic: // Only handle single-register values. - return !AM.BaseGV && AM.Scale == 0 && AM.BaseOffs == 0; + return !BaseGV && Scale == 0 && BaseOffset == 0; case LSRUse::Special: // Special case Basic to handle -1 scales. 
- return !AM.BaseGV && (AM.Scale == 0 || AM.Scale == -1) && AM.BaseOffs == 0; + return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset == 0; } llvm_unreachable("Invalid LSRUse Kind!"); } -static bool isLegalUse(TargetLowering::AddrMode AM, - int64_t MinOffset, int64_t MaxOffset, - LSRUse::KindType Kind, Type *AccessTy, - const TargetLowering *TLI) { +static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, + int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy, + GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, + int64_t Scale) { // Check for overflow. - if (((int64_t)((uint64_t)AM.BaseOffs + MinOffset) > AM.BaseOffs) != + if (((int64_t)((uint64_t)BaseOffset + MinOffset) > BaseOffset) != (MinOffset > 0)) return false; - AM.BaseOffs = (uint64_t)AM.BaseOffs + MinOffset; - if (isLegalUse(AM, Kind, AccessTy, TLI)) { - AM.BaseOffs = (uint64_t)AM.BaseOffs - MinOffset; - // Check for overflow. - if (((int64_t)((uint64_t)AM.BaseOffs + MaxOffset) > AM.BaseOffs) != - (MaxOffset > 0)) - return false; - AM.BaseOffs = (uint64_t)AM.BaseOffs + MaxOffset; - return isLegalUse(AM, Kind, AccessTy, TLI); - } - return false; + MinOffset = (uint64_t)BaseOffset + MinOffset; + if (((int64_t)((uint64_t)BaseOffset + MaxOffset) > BaseOffset) != + (MaxOffset > 0)) + return false; + MaxOffset = (uint64_t)BaseOffset + MaxOffset; + + return isLegalUse(TTI, Kind, AccessTy, BaseGV, MinOffset, HasBaseReg, + Scale) && + isLegalUse(TTI, Kind, AccessTy, BaseGV, MaxOffset, HasBaseReg, Scale); +} + +static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, + int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy, + const Formula &F) { + return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV, + F.BaseOffset, F.HasBaseReg, F.Scale); } -static bool isAlwaysFoldable(int64_t BaseOffs, - GlobalValue *BaseGV, - bool HasBaseReg, +static bool isAlwaysFoldable(const TargetTransformInfo &TTI, LSRUse::KindType Kind, Type *AccessTy, - const TargetLowering *TLI) { + GlobalValue *BaseGV, int64_t BaseOffset, + bool HasBaseReg) { // Fast-path: zero is always foldable. - if (BaseOffs == 0 && !BaseGV) return true; + if (BaseOffset == 0 && !BaseGV) return true; // Conservatively, create an address with an immediate and a // base and a scale. - TargetLowering::AddrMode AM; - AM.BaseOffs = BaseOffs; - AM.BaseGV = BaseGV; - AM.HasBaseReg = HasBaseReg; - AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1; + int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1; // Canonicalize a scale of 1 to a base register if the formula doesn't // already have a base register. - if (!AM.HasBaseReg && AM.Scale == 1) { - AM.Scale = 0; - AM.HasBaseReg = true; + if (!HasBaseReg && Scale == 1) { + Scale = 0; + HasBaseReg = true; } - return isLegalUse(AM, Kind, AccessTy, TLI); + return isLegalUse(TTI, Kind, AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale); } -static bool isAlwaysFoldable(const SCEV *S, - int64_t MinOffset, int64_t MaxOffset, - bool HasBaseReg, - LSRUse::KindType Kind, Type *AccessTy, - const TargetLowering *TLI, - ScalarEvolution &SE) { +static bool isAlwaysFoldable(const TargetTransformInfo &TTI, + ScalarEvolution &SE, int64_t MinOffset, + int64_t MaxOffset, LSRUse::KindType Kind, + Type *AccessTy, const SCEV *S, bool HasBaseReg) { // Fast-path: zero is always foldable. if (S->isZero()) return true; // Conservatively, create an address with an immediate and a // base and a scale. 
- int64_t BaseOffs = ExtractImmediate(S, SE); + int64_t BaseOffset = ExtractImmediate(S, SE); GlobalValue *BaseGV = ExtractSymbol(S, SE); // If there's anything else involved, it's not foldable. if (!S->isZero()) return false; // Fast-path: zero is always foldable. - if (BaseOffs == 0 && !BaseGV) return true; + if (BaseOffset == 0 && !BaseGV) return true; // Conservatively, create an address with an immediate and a // base and a scale. - TargetLowering::AddrMode AM; - AM.BaseOffs = BaseOffs; - AM.BaseGV = BaseGV; - AM.HasBaseReg = HasBaseReg; - AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1; + int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1; - return isLegalUse(AM, MinOffset, MaxOffset, Kind, AccessTy, TLI); + return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV, + BaseOffset, HasBaseReg, Scale); } namespace { @@ -1501,7 +1502,7 @@ class LSRInstance { ScalarEvolution &SE; DominatorTree &DT; LoopInfo &LI; - const TargetLowering *const TLI; + const TargetTransformInfo &TTI; Loop *const L; bool Changed; @@ -1637,7 +1638,7 @@ class LSRInstance { Pass *P); public: - LSRInstance(const TargetLowering *tli, Loop *l, Pass *P); + LSRInstance(Loop *L, Pass *P); bool getChanged() const { return Changed; } @@ -1687,12 +1688,9 @@ void LSRInstance::OptimizeShadowIV() { } if (!DestTy) continue; - if (TLI) { - // If target does not support DestTy natively then do not apply - // this transformation. - EVT DVT = TLI->getValueType(DestTy); - if (!TLI->isTypeLegal(DVT)) continue; - } + // If target does not support DestTy natively then do not apply + // this transformation. + if (!TTI.isTypeLegal(DestTy)) continue; PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0)); if (!PH) continue; @@ -2014,18 +2012,17 @@ LSRInstance::OptimizeLoopTermCond() { if (C->getValue().getMinSignedBits() >= 64 || C->getValue().isMinSignedValue()) goto decline_post_inc; - // Without TLI, assume that any stride might be valid, and so any - // use might be shared. - if (!TLI) - goto decline_post_inc; // Check for possible scaled-address reuse. Type *AccessTy = getAccessType(UI->getUser()); - TargetLowering::AddrMode AM; - AM.Scale = C->getSExtValue(); - if (TLI->isLegalAddressingMode(AM, AccessTy)) + int64_t Scale = C->getSExtValue(); + if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ 0, + /*BaseOffset=*/ 0, + /*HasBaseReg=*/ false, Scale)) goto decline_post_inc; - AM.Scale = -AM.Scale; - if (TLI->isLegalAddressingMode(AM, AccessTy)) + Scale = -Scale; + if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ 0, + /*BaseOffset=*/ 0, + /*HasBaseReg=*/ false, Scale)) goto decline_post_inc; } } @@ -2095,13 +2092,13 @@ LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, return false; // Conservatively assume HasBaseReg is true for now. if (NewOffset < LU.MinOffset) { - if (!isAlwaysFoldable(LU.MaxOffset - NewOffset, 0, HasBaseReg, - Kind, AccessTy, TLI)) + if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ 0, + LU.MaxOffset - NewOffset, HasBaseReg)) return false; NewMinOffset = NewOffset; } else if (NewOffset > LU.MaxOffset) { - if (!isAlwaysFoldable(NewOffset - LU.MinOffset, 0, HasBaseReg, - Kind, AccessTy, TLI)) + if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ 0, + NewOffset - LU.MinOffset, HasBaseReg)) return false; NewMaxOffset = NewOffset; } @@ -2130,7 +2127,8 @@ LSRInstance::getUse(const SCEV *&Expr, int64_t Offset = ExtractImmediate(Expr, SE); // Basic uses can't accept any offset, for example. 
- if (!isAlwaysFoldable(Offset, 0, /*HasBaseReg=*/true, Kind, AccessTy, TLI)) { + if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ 0, + Offset, /*HasBaseReg=*/ true)) { Expr = Copy; Offset = 0; } @@ -2198,10 +2196,10 @@ LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF, // as OrigF. if (F.BaseRegs == OrigF.BaseRegs && F.ScaledReg == OrigF.ScaledReg && - F.AM.BaseGV == OrigF.AM.BaseGV && - F.AM.Scale == OrigF.AM.Scale && + F.BaseGV == OrigF.BaseGV && + F.Scale == OrigF.Scale && F.UnfoldedOffset == OrigF.UnfoldedOffset) { - if (F.AM.BaseOffs == 0) + if (F.BaseOffset == 0) return &LU; // This is the formula where all the registers and symbols matched; // there aren't going to be any others. Since we declined it, we @@ -2395,7 +2393,7 @@ bool IVChain::isProfitableIncrement(const SCEV *OperExpr, /// TODO: Consider IVInc free if it's already used in another chains. static bool isProfitableChain(IVChain &Chain, SmallPtrSet<Instruction*, 4> &Users, - ScalarEvolution &SE, const TargetLowering *TLI) { + ScalarEvolution &SE, const TargetTransformInfo &TTI) { if (StressIVChain) return true; @@ -2653,7 +2651,7 @@ void LSRInstance::CollectChains() { for (unsigned UsersIdx = 0, NChains = IVChainVec.size(); UsersIdx < NChains; ++UsersIdx) { if (!isProfitableChain(IVChainVec[UsersIdx], - ChainUsersVec[UsersIdx].FarUsers, SE, TLI)) + ChainUsersVec[UsersIdx].FarUsers, SE, TTI)) continue; // Preserve the chain at UsesIdx. if (ChainIdx != UsersIdx) @@ -2680,7 +2678,7 @@ void LSRInstance::FinalizeChain(IVChain &Chain) { /// Return true if the IVInc can be folded into an addressing mode. static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, - Value *Operand, const TargetLowering *TLI) { + Value *Operand, const TargetTransformInfo &TTI) { const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr); if (!IncConst || !isAddressUse(UserInst, Operand)) return false; @@ -2689,8 +2687,9 @@ static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, return false; int64_t IncOffset = IncConst->getValue()->getSExtValue(); - if (!isAlwaysFoldable(IncOffset, /*BaseGV=*/0, /*HaseBaseReg=*/false, - LSRUse::Address, getAccessType(UserInst), TLI)) + if (!isAlwaysFoldable(TTI, LSRUse::Address, + getAccessType(UserInst), /*BaseGV=*/ 0, + IncOffset, /*HaseBaseReg=*/ false)) return false; return true; @@ -2761,7 +2760,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, // If an IV increment can't be folded, use it as the next IV value. if (!canFoldIVIncExpr(LeftOverExpr, IncI->UserInst, IncI->IVOperand, - TLI)) { + TTI)) { assert(IVTy == IVOper->getType() && "inconsistent IV increment type"); IVSrc = IVOper; LeftOverExpr = 0; @@ -2892,6 +2891,7 @@ void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) { Formula F; F.InitialMatch(S, L, SE); + F.HasBaseReg = true; bool Inserted = InsertFormula(LU, LUIdx, F); assert(Inserted && "Initial formula already exists!"); (void)Inserted; } @@ -2903,7 +2903,6 @@ LSRInstance::InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) { Formula F; F.BaseRegs.push_back(S); - F.AM.HasBaseReg = true; bool Inserted = InsertFormula(LU, LUIdx, F); assert(Inserted && "Supplemental formula already exists!"); (void)Inserted; } @@ -3105,9 +3104,8 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, // Don't pull a constant into a register if the constant could be folded // into an immediate field. 
- if (isAlwaysFoldable(*J, LU.MinOffset, LU.MaxOffset, - Base.getNumRegs() > 1, - LU.Kind, LU.AccessTy, TLI, SE)) + if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind, + LU.AccessTy, *J, Base.getNumRegs() > 1)) continue; // Collect all operands except *J. @@ -3119,9 +3117,8 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, // Don't leave just a constant behind in a register if the constant could // be folded into an immediate field. if (InnerAddOps.size() == 1 && - isAlwaysFoldable(InnerAddOps[0], LU.MinOffset, LU.MaxOffset, - Base.getNumRegs() > 1, - LU.Kind, LU.AccessTy, TLI, SE)) + isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind, + LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1)) continue; const SCEV *InnerSum = SE.getAddExpr(InnerAddOps); @@ -3131,10 +3128,10 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, // Add the remaining pieces of the add back into the new formula. const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum); - if (TLI && InnerSumSC && + if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 && - TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset + - InnerSumSC->getValue()->getZExtValue())) { + TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset + + InnerSumSC->getValue()->getZExtValue())) { F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset + InnerSumSC->getValue()->getZExtValue(); F.BaseRegs.erase(F.BaseRegs.begin() + i); @@ -3143,9 +3140,9 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, // Add J as its own register, or an unfolded immediate. const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J); - if (TLI && SC && SE.getTypeSizeInBits(SC->getType()) <= 64 && - TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset + - SC->getValue()->getZExtValue())) + if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 && + TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset + + SC->getValue()->getZExtValue())) F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset + SC->getValue()->getZExtValue(); else @@ -3194,7 +3191,7 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base) { // We can't add a symbolic offset if the address already contains one. - if (Base.AM.BaseGV) return; + if (Base.BaseGV) return; for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) { const SCEV *G = Base.BaseRegs[i]; @@ -3202,9 +3199,8 @@ void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, if (G->isZero() || !GV) continue; Formula F = Base; - F.AM.BaseGV = GV; - if (!isLegalUse(F.AM, LU.MinOffset, LU.MaxOffset, - LU.Kind, LU.AccessTy, TLI)) + F.BaseGV = GV; + if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) continue; F.BaseRegs[i] = G; (void)InsertFormula(LU, LUIdx, F); @@ -3227,9 +3223,9 @@ void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, for (SmallVectorImpl<int64_t>::const_iterator I = Worklist.begin(), E = Worklist.end(); I != E; ++I) { Formula F = Base; - F.AM.BaseOffs = (uint64_t)Base.AM.BaseOffs - *I; - if (isLegalUse(F.AM, LU.MinOffset - *I, LU.MaxOffset - *I, - LU.Kind, LU.AccessTy, TLI)) { + F.BaseOffset = (uint64_t)Base.BaseOffset - *I; + if (isLegalUse(TTI, LU.MinOffset - *I, LU.MaxOffset - *I, LU.Kind, + LU.AccessTy, F)) { // Add the offset to the base register. const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), *I), G); // If it cancelled out, drop the base register, otherwise update it. 
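The LSR hunks above and below are largely one mechanical rewrite: Formula now carries BaseGV, BaseOffset, HasBaseReg and Scale directly instead of a TargetLowering::AddrMode, and every legality question goes to TargetTransformInfo. A hedged sketch of the new query shape, using the isLegalAddressingMode signature that appears in this patch (Formula is the pass-internal struct shown earlier; the helper name is made up):

// Sketch only: ask TTI whether a formula's addressing parts can be folded
// into an addressing mode for a memory access of type AccessTy.
static bool formulaFitsAddressMode(const TargetTransformInfo &TTI,
                                   Type *AccessTy, const Formula &F) {
  return TTI.isLegalAddressingMode(AccessTy, F.BaseGV, F.BaseOffset,
                                   F.HasBaseReg, F.Scale);
}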
@@ -3247,9 +3243,8 @@ void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, if (G->isZero() || Imm == 0) continue; Formula F = Base; - F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs + Imm; - if (!isLegalUse(F.AM, LU.MinOffset, LU.MaxOffset, - LU.Kind, LU.AccessTy, TLI)) + F.BaseOffset = (uint64_t)F.BaseOffset + Imm; + if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) continue; F.BaseRegs[i] = G; (void)InsertFormula(LU, LUIdx, F); @@ -3270,7 +3265,7 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, // Don't do this if there is more than one offset. if (LU.MinOffset != LU.MaxOffset) return; - assert(!Base.AM.BaseGV && "ICmpZero use is not legal!"); + assert(!Base.BaseGV && "ICmpZero use is not legal!"); // Check each interesting stride. for (SmallSetVector<int64_t, 8>::const_iterator @@ -3278,10 +3273,10 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, int64_t Factor = *I; // Check that the multiplication doesn't overflow. - if (Base.AM.BaseOffs == INT64_MIN && Factor == -1) + if (Base.BaseOffset == INT64_MIN && Factor == -1) continue; - int64_t NewBaseOffs = (uint64_t)Base.AM.BaseOffs * Factor; - if (NewBaseOffs / Factor != Base.AM.BaseOffs) + int64_t NewBaseOffset = (uint64_t)Base.BaseOffset * Factor; + if (NewBaseOffset / Factor != Base.BaseOffset) continue; // Check that multiplying with the use offset doesn't overflow. @@ -3293,14 +3288,14 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, continue; Formula F = Base; - F.AM.BaseOffs = NewBaseOffs; + F.BaseOffset = NewBaseOffset; // Check that this scale is legal. - if (!isLegalUse(F.AM, Offset, Offset, LU.Kind, LU.AccessTy, TLI)) + if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F)) continue; // Compensate for the use having MinOffset built into it. - F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs + Offset - LU.MinOffset; + F.BaseOffset = (uint64_t)F.BaseOffset + Offset - LU.MinOffset; const SCEV *FactorS = SE.getConstant(IntTy, Factor); @@ -3341,23 +3336,23 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { if (!IntTy) return; // If this Formula already has a scaled register, we can't add another one. - if (Base.AM.Scale != 0) return; + if (Base.Scale != 0) return; // Check each interesting stride. for (SmallSetVector<int64_t, 8>::const_iterator I = Factors.begin(), E = Factors.end(); I != E; ++I) { int64_t Factor = *I; - Base.AM.Scale = Factor; - Base.AM.HasBaseReg = Base.BaseRegs.size() > 1; + Base.Scale = Factor; + Base.HasBaseReg = Base.BaseRegs.size() > 1; // Check whether this scale is going to be legal. - if (!isLegalUse(Base.AM, LU.MinOffset, LU.MaxOffset, - LU.Kind, LU.AccessTy, TLI)) { + if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, + Base)) { // As a special-case, handle special out-of-loop Basic users specially. // TODO: Reconsider this special case. if (LU.Kind == LSRUse::Basic && - isLegalUse(Base.AM, LU.MinOffset, LU.MaxOffset, - LSRUse::Special, LU.AccessTy, TLI) && + isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special, + LU.AccessTy, Base) && LU.AllFixupsOutsideLoop) LU.Kind = LSRUse::Special; else @@ -3366,7 +3361,7 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { // For an ICmpZero, negating a solitary base register won't lead to // new solutions. 
if (LU.Kind == LSRUse::ICmpZero && - !Base.AM.HasBaseReg && Base.AM.BaseOffs == 0 && !Base.AM.BaseGV) + !Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV) continue; // For each addrec base reg, apply the scale, if possible. for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) @@ -3390,11 +3385,8 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { /// GenerateTruncates - Generate reuse formulae from different IV types. void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) { - // This requires TargetLowering to tell us which truncates are free. - if (!TLI) return; - // Don't bother truncating symbolic values. - if (Base.AM.BaseGV) return; + if (Base.BaseGV) return; // Determine the integer type for the base formula. Type *DstTy = Base.getType(); @@ -3404,7 +3396,7 @@ void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) { for (SmallSetVector<Type *, 4>::const_iterator I = Types.begin(), E = Types.end(); I != E; ++I) { Type *SrcTy = *I; - if (SrcTy != DstTy && TLI->isTruncateFree(SrcTy, DstTy)) { + if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) { Formula F = Base; if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, *I); @@ -3446,7 +3438,7 @@ void WorkItem::print(raw_ostream &OS) const { << " , add offset " << Imm; } -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void WorkItem::dump() const { print(errs()); errs() << '\n'; } @@ -3551,16 +3543,15 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { const Formula &F = LU.Formulae[L]; // Use the immediate in the scaled register. if (F.ScaledReg == OrigReg) { - int64_t Offs = (uint64_t)F.AM.BaseOffs + - Imm * (uint64_t)F.AM.Scale; + int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale; // Don't create 50 + reg(-50). if (F.referencesReg(SE.getSCEV( - ConstantInt::get(IntTy, -(uint64_t)Offs)))) + ConstantInt::get(IntTy, -(uint64_t)Offset)))) continue; Formula NewF = F; - NewF.AM.BaseOffs = Offs; - if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset, - LU.Kind, LU.AccessTy, TLI)) + NewF.BaseOffset = Offset; + if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, + NewF)) continue; NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg); @@ -3569,9 +3560,9 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // immediate itself, then the formula isn't worthwhile. if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg)) if (C->getValue()->isNegative() != - (NewF.AM.BaseOffs < 0) && - (C->getValue()->getValue().abs() * APInt(BitWidth, F.AM.Scale)) - .ule(abs64(NewF.AM.BaseOffs))) + (NewF.BaseOffset < 0) && + (C->getValue()->getValue().abs() * APInt(BitWidth, F.Scale)) + .ule(abs64(NewF.BaseOffset))) continue; // OK, looks good. 
@@ -3583,11 +3574,10 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { if (BaseReg != OrigReg) continue; Formula NewF = F; - NewF.AM.BaseOffs = (uint64_t)NewF.AM.BaseOffs + Imm; - if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset, - LU.Kind, LU.AccessTy, TLI)) { - if (!TLI || - !TLI->isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm)) + NewF.BaseOffset = (uint64_t)NewF.BaseOffset + Imm; + if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, + LU.Kind, LU.AccessTy, NewF)) { + if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm)) continue; NewF = F; NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm; @@ -3601,11 +3591,11 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { J = NewF.BaseRegs.begin(), JE = NewF.BaseRegs.end(); J != JE; ++J) if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*J)) - if ((C->getValue()->getValue() + NewF.AM.BaseOffs).abs().slt( - abs64(NewF.AM.BaseOffs)) && + if ((C->getValue()->getValue() + NewF.BaseOffset).abs().slt( + abs64(NewF.BaseOffset)) && (C->getValue()->getValue() + - NewF.AM.BaseOffs).countTrailingZeros() >= - CountTrailingZeros_64(NewF.AM.BaseOffs)) + NewF.BaseOffset).countTrailingZeros() >= + CountTrailingZeros_64(NewF.BaseOffset)) goto skip_formula; // Ok, looks good. @@ -3803,7 +3793,7 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() { I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) { if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) { Formula NewF = F; - NewF.AM.BaseOffs += C->getValue()->getSExtValue(); + NewF.BaseOffset += C->getValue()->getSExtValue(); NewF.BaseRegs.erase(NewF.BaseRegs.begin() + (I - F.BaseRegs.begin())); if (LU.HasFormulaWithSameRegs(NewF)) { @@ -3816,9 +3806,9 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() { } } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) { if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) - if (!F.AM.BaseGV) { + if (!F.BaseGV) { Formula NewF = F; - NewF.AM.BaseGV = GV; + NewF.BaseGV = GV; NewF.BaseRegs.erase(NewF.BaseRegs.begin() + (I - F.BaseRegs.begin())); if (LU.HasFormulaWithSameRegs(NewF)) { @@ -3861,9 +3851,9 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(), E = LU.Formulae.end(); I != E; ++I) { const Formula &F = *I; - if (F.AM.BaseOffs != 0 && F.AM.Scale == 0) { + if (F.BaseOffset != 0 && F.Scale == 0) { if (LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU)) { - if (reconcileNewOffset(*LUThatHas, F.AM.BaseOffs, + if (reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/false, LU.Kind, LU.AccessTy)) { DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); @@ -3877,7 +3867,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { LSRFixup &Fixup = *I; if (Fixup.LUIdx == LUIdx) { Fixup.LUIdx = LUThatHas - &Uses.front(); - Fixup.Offset += F.AM.BaseOffs; + Fixup.Offset += F.BaseOffset; // Add the new offset to LUThatHas' offset list. 
if (LUThatHas->Offsets.back() != Fixup.Offset) { LUThatHas->Offsets.push_back(Fixup.Offset); @@ -3897,9 +3887,8 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { bool Any = false; for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) { Formula &F = LUThatHas->Formulae[i]; - if (!isLegalUse(F.AM, - LUThatHas->MinOffset, LUThatHas->MaxOffset, - LUThatHas->Kind, LUThatHas->AccessTy, TLI)) { + if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset, + LUThatHas->Kind, LUThatHas->AccessTy, F)) { DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n'); LUThatHas->DeleteFormula(F); @@ -4307,7 +4296,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // Expand the ScaledReg portion. Value *ICmpScaledV = 0; - if (F.AM.Scale != 0) { + if (F.Scale != 0) { const SCEV *ScaledS = F.ScaledReg; // If we're expanding for a post-inc user, make the post-inc adjustment. @@ -4320,7 +4309,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // An interesting way of "folding" with an icmp is to use a negated // scale, which we'll implement by inserting it into the other operand // of the icmp. - assert(F.AM.Scale == -1 && + assert(F.Scale == -1 && "The only scale supported by ICmpZero uses is -1!"); ICmpScaledV = Rewriter.expandCodeFor(ScaledS, 0, IP); } else { @@ -4335,20 +4324,20 @@ Value *LSRInstance::Expand(const LSRFixup &LF, } ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, 0, IP)); ScaledS = SE.getMulExpr(ScaledS, - SE.getConstant(ScaledS->getType(), F.AM.Scale)); + SE.getConstant(ScaledS->getType(), F.Scale)); Ops.push_back(ScaledS); } } // Expand the GV portion. - if (F.AM.BaseGV) { + if (F.BaseGV) { // Flush the operand list to suppress SCEVExpander hoisting. if (!Ops.empty()) { Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); Ops.clear(); Ops.push_back(SE.getUnknown(FullV)); } - Ops.push_back(SE.getUnknown(F.AM.BaseGV)); + Ops.push_back(SE.getUnknown(F.BaseGV)); } // Flush the operand list to suppress SCEVExpander hoisting of both folded and @@ -4360,7 +4349,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, } // Expand the immediate portion. - int64_t Offset = (uint64_t)F.AM.BaseOffs + LF.Offset; + int64_t Offset = (uint64_t)F.BaseOffset + LF.Offset; if (Offset != 0) { if (LU.Kind == LSRUse::ICmpZero) { // The other interesting way of "folding" with an ICmpZero is to use a @@ -4401,9 +4390,9 @@ Value *LSRInstance::Expand(const LSRFixup &LF, if (LU.Kind == LSRUse::ICmpZero) { ICmpInst *CI = cast<ICmpInst>(LF.UserInst); DeadInsts.push_back(CI->getOperand(1)); - assert(!F.AM.BaseGV && "ICmp does not support folding a global value and " + assert(!F.BaseGV && "ICmp does not support folding a global value and " "a scale at the same time!"); - if (F.AM.Scale == -1) { + if (F.Scale == -1) { if (ICmpScaledV->getType() != OpTy) { Instruction *Cast = CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false, @@ -4413,7 +4402,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, } CI->setOperand(1, ICmpScaledV); } else { - assert(F.AM.Scale == 0 && + assert(F.Scale == 0 && "ICmp does not support folding a global value and " "a scale at the same time!"); Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy), @@ -4464,17 +4453,21 @@ void LSRInstance::RewriteForPHI(PHINode *PN, SplitLandingPadPredecessors(Parent, BB, "", "", P, NewBBs); NewBB = NewBBs[0]; } - - // If PN is outside of the loop and BB is in the loop, we want to - // move the block to be immediately before the PHI block, not - // immediately after BB. 
- if (L->contains(BB) && !L->contains(PN)) - NewBB->moveBefore(PN->getParent()); - - // Splitting the edge can reduce the number of PHI entries we have. - e = PN->getNumIncomingValues(); - BB = NewBB; - i = PN->getBasicBlockIndex(BB); + // If NewBB==NULL, then SplitCriticalEdge refused to split because all + // phi predecessors are identical. The simple thing to do is skip + // splitting in this case rather than complicate the API. + if (NewBB) { + // If PN is outside of the loop and BB is in the loop, we want to + // move the block to be immediately before the PHI block, not + // immediately after BB. + if (L->contains(BB) && !L->contains(PN)) + NewBB->moveBefore(PN->getParent()); + + // Splitting the edge can reduce the number of PHI entries we have. + e = PN->getNumIncomingValues(); + BB = NewBB; + i = PN->getBasicBlockIndex(BB); + } } } @@ -4584,13 +4577,11 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, Changed |= DeleteTriviallyDeadInstructions(DeadInsts); } -LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P) - : IU(P->getAnalysis<IVUsers>()), - SE(P->getAnalysis<ScalarEvolution>()), - DT(P->getAnalysis<DominatorTree>()), - LI(P->getAnalysis<LoopInfo>()), - TLI(tli), L(l), Changed(false), IVIncInsertPos(0) { - +LSRInstance::LSRInstance(Loop *L, Pass *P) + : IU(P->getAnalysis<IVUsers>()), SE(P->getAnalysis<ScalarEvolution>()), + DT(P->getAnalysis<DominatorTree>()), LI(P->getAnalysis<LoopInfo>()), + TTI(P->getAnalysis<TargetTransformInfo>()), L(L), Changed(false), + IVIncInsertPos(0) { // If LoopSimplify form is not available, stay out of trouble. if (!L->isLoopSimplifyForm()) return; @@ -4673,14 +4664,14 @@ LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P) #ifndef NDEBUG // Formulae should be legal. - for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(), - E = Uses.end(); I != E; ++I) { - const LSRUse &LU = *I; - for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(), - JE = LU.Formulae.end(); J != JE; ++J) - assert(isLegalUse(J->AM, LU.MinOffset, LU.MaxOffset, - LU.Kind, LU.AccessTy, TLI) && - "Illegal formula generated!"); + for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(), E = Uses.end(); + I != E; ++I) { + const LSRUse &LU = *I; + for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(), + JE = LU.Formulae.end(); + J != JE; ++J) + assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, + *J) && "Illegal formula generated!"); }; #endif @@ -4743,7 +4734,7 @@ void LSRInstance::print(raw_ostream &OS) const { print_uses(OS); } -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void LSRInstance::dump() const { print(errs()); errs() << '\n'; } @@ -4752,13 +4743,9 @@ void LSRInstance::dump() const { namespace { class LoopStrengthReduce : public LoopPass { - /// TLI - Keep a pointer of a TargetLowering to consult for determining - /// transformation profitability. 
- const TargetLowering *const TLI; - public: static char ID; // Pass ID, replacement for typeid - explicit LoopStrengthReduce(const TargetLowering *tli = 0); + LoopStrengthReduce(); private: bool runOnLoop(Loop *L, LPPassManager &LPM); @@ -4770,6 +4757,7 @@ private: char LoopStrengthReduce::ID = 0; INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce", "Loop Strength Reduction", false, false) +INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) INITIALIZE_PASS_DEPENDENCY(DominatorTree) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(IVUsers) @@ -4779,14 +4767,13 @@ INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce", "Loop Strength Reduction", false, false) -Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) { - return new LoopStrengthReduce(TLI); +Pass *llvm::createLoopStrengthReducePass() { + return new LoopStrengthReduce(); } -LoopStrengthReduce::LoopStrengthReduce(const TargetLowering *tli) - : LoopPass(ID), TLI(tli) { - initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry()); - } +LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) { + initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry()); +} void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const { // We split critical edges, so we change the CFG. However, we do update @@ -4805,24 +4792,27 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequiredID(LoopSimplifyID); AU.addRequired<IVUsers>(); AU.addPreserved<IVUsers>(); + AU.addRequired<TargetTransformInfo>(); } bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) { bool Changed = false; // Run the main LSR transformation. - Changed |= LSRInstance(TLI, L, this).getChanged(); + Changed |= LSRInstance(L, this).getChanged(); // Remove any extra phis created by processing inner loops. Changed |= DeleteDeadPHIs(L->getHeader()); - if (EnablePhiElim) { + if (EnablePhiElim && L->isLoopSimplifyForm()) { SmallVector<WeakVH, 16> DeadInsts; SCEVExpander Rewriter(getAnalysis<ScalarEvolution>(), "lsr"); #ifndef NDEBUG Rewriter.setDebugType(DEBUG_TYPE); #endif - unsigned numFolded = Rewriter. - replaceCongruentIVs(L, &getAnalysis<DominatorTree>(), DeadInsts, TLI); + unsigned numFolded = + Rewriter.replaceCongruentIVs(L, &getAnalysis<DominatorTree>(), + DeadInsts, + &getAnalysis<TargetTransformInfo>()); if (numFolded) { Changed = true; DeleteTriviallyDeadInstructions(DeadInsts); diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index 09a186f..e0f915b 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -13,16 +13,16 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "loop-unroll" -#include "llvm/IntrinsicInst.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/UnrollLoop.h" -#include "llvm/Target/TargetData.h" #include <climits> using namespace llvm; @@ -113,12 +113,13 @@ Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial) { /// ApproximateLoopSize - Approximate the size of the loop. 
static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls, - const TargetData *TD) { + bool &NotDuplicatable, const DataLayout *TD) { CodeMetrics Metrics; for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E; ++I) Metrics.analyzeBasicBlock(*I, TD); NumCalls = Metrics.NumInlineCandidates; + NotDuplicatable = Metrics.notDuplicatable; unsigned LoopSize = Metrics.NumInsts; @@ -145,7 +146,9 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { // not user specified. unsigned Threshold = CurrentThreshold; if (!UserThreshold && - Header->getParent()->hasFnAttr(Attribute::OptimizeForSize)) + Header->getParent()->getAttributes(). + hasAttribute(AttributeSet::FunctionIndex, + Attribute::OptimizeForSize)) Threshold = OptSizeUnrollThreshold; // Find trip count and trip multiple if count is not available @@ -178,10 +181,17 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { // Enforce the threshold. if (Threshold != NoThreshold) { - const TargetData *TD = getAnalysisIfAvailable<TargetData>(); + const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); unsigned NumInlineCandidates; - unsigned LoopSize = ApproximateLoopSize(L, NumInlineCandidates, TD); + bool notDuplicatable; + unsigned LoopSize = ApproximateLoopSize(L, NumInlineCandidates, + notDuplicatable, TD); DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); + if (notDuplicatable) { + DEBUG(dbgs() << " Not unrolling loop which contains non duplicatable" + << " instructions.\n"); + return false; + } if (NumInlineCandidates != 0) { DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n"); return false; diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index 58f7739..68d4423 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -28,25 +28,25 @@ #define DEBUG_TYPE "loop-unswitch" #include "llvm/Transforms/Scalar.h" -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Function.h" -#include "llvm/Instructions.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> #include <map> #include <set> @@ -248,6 +248,13 @@ bool LUAnalysisCache::countLoop(const Loop* L) { Props.SizeEstimation = std::min(Metrics.NumInsts, Metrics.NumBlocks * 5); Props.CanBeUnswitchedCount = MaxSize / (Props.SizeEstimation); MaxSize -= Props.SizeEstimation * Props.CanBeUnswitchedCount; + + if (Metrics.notDuplicatable) { + DEBUG(dbgs() << "NOT unswitching loop %" + << L->getHeader()->getName() << ", contents cannot be " + << "duplicated!\n"); + return false; + } } 
if (!Props.CanBeUnswitchedCount) { @@ -638,7 +645,9 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val) { // Check to see if it would be profitable to unswitch current loop. // Do not do non-trivial unswitch while optimizing for size. - if (OptimizeForSize || F->hasFnAttr(Attribute::OptimizeForSize)) + if (OptimizeForSize || + F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, + Attribute::OptimizeForSize)) return false; UnswitchNontrivialCondition(LoopCond, Val, currentLoop); @@ -906,13 +915,9 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, /// specified. static void RemoveFromWorklist(Instruction *I, std::vector<Instruction*> &Worklist) { - std::vector<Instruction*>::iterator WI = std::find(Worklist.begin(), - Worklist.end(), I); - while (WI != Worklist.end()) { - unsigned Offset = WI-Worklist.begin(); - Worklist.erase(WI); - WI = std::find(Worklist.begin()+Offset, Worklist.end(), I); - } + + Worklist.erase(std::remove(Worklist.begin(), Worklist.end(), I), + Worklist.end()); } /// ReplaceUsesOfWith - When we find that I really equals V, remove I from the diff --git a/lib/Transforms/Scalar/LowerAtomic.cpp b/lib/Transforms/Scalar/LowerAtomic.cpp index 7419a65..8ced494 100644 --- a/lib/Transforms/Scalar/LowerAtomic.cpp +++ b/lib/Transforms/Scalar/LowerAtomic.cpp @@ -14,9 +14,9 @@ #define DEBUG_TYPE "loweratomic" #include "llvm/Transforms/Scalar.h" -#include "llvm/Function.h" -#include "llvm/IRBuilder.h" -#include "llvm/IntrinsicInst.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/Pass.h" using namespace llvm; diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 2a5ee33..be0f0e8 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -14,20 +14,20 @@ #define DEBUG_TYPE "memcpyopt" #include "llvm/Transforms/Scalar.h" -#include "llvm/GlobalVariable.h" -#include "llvm/IRBuilder.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/Debug.h" #include "llvm/Support/GetElementPtrTypeIterator.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetData.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" #include <list> @@ -38,8 +38,8 @@ STATISTIC(NumMemSetInfer, "Number of memsets inferred"); STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy"); STATISTIC(NumCpyToSet, "Number of memcpys converted to memset"); -static int64_t GetOffsetFromIndex(const GetElementPtrInst *GEP, unsigned Idx, - bool &VariableIdxFound, const TargetData &TD){ +static int64_t GetOffsetFromIndex(const GEPOperator *GEP, unsigned Idx, + bool &VariableIdxFound, const DataLayout &TD){ // Skip over the first indices. gep_type_iterator GTI = gep_type_begin(GEP); for (unsigned i = 1; i != Idx; ++i, ++GTI) @@ -72,11 +72,11 @@ static int64_t GetOffsetFromIndex(const GetElementPtrInst *GEP, unsigned Idx, /// constant offset, and return that constant offset. 
For example, Ptr1 might /// be &A[42], and Ptr2 might be &A[40]. In this case offset would be -8. static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, - const TargetData &TD) { + const DataLayout &TD) { Ptr1 = Ptr1->stripPointerCasts(); Ptr2 = Ptr2->stripPointerCasts(); - GetElementPtrInst *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1); - GetElementPtrInst *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2); + GEPOperator *GEP1 = dyn_cast<GEPOperator>(Ptr1); + GEPOperator *GEP2 = dyn_cast<GEPOperator>(Ptr2); bool VariableIdxFound = false; @@ -141,12 +141,12 @@ struct MemsetRange { /// TheStores - The actual stores that make up this range. SmallVector<Instruction*, 16> TheStores; - bool isProfitableToUseMemset(const TargetData &TD) const; + bool isProfitableToUseMemset(const DataLayout &TD) const; }; } // end anon namespace -bool MemsetRange::isProfitableToUseMemset(const TargetData &TD) const { +bool MemsetRange::isProfitableToUseMemset(const DataLayout &TD) const { // If we found more than 4 stores to merge or 16 bytes, use memset. if (TheStores.size() >= 4 || End-Start >= 16) return true; @@ -192,9 +192,9 @@ class MemsetRanges { /// because each element is relatively large and expensive to copy. std::list<MemsetRange> Ranges; typedef std::list<MemsetRange>::iterator range_iterator; - const TargetData &TD; + const DataLayout &TD; public: - MemsetRanges(const TargetData &td) : TD(td) {} + MemsetRanges(const DataLayout &td) : TD(td) {} typedef std::list<MemsetRange>::const_iterator const_iterator; const_iterator begin() const { return Ranges.begin(); } @@ -302,7 +302,7 @@ namespace { class MemCpyOpt : public FunctionPass { MemoryDependenceAnalysis *MD; TargetLibraryInfo *TLI; - const TargetData *TD; + const DataLayout *TD; public: static char ID; // Pass identification, replacement for typeid MemCpyOpt() : FunctionPass(ID) { @@ -332,7 +332,7 @@ namespace { bool processMemCpy(MemCpyInst *M); bool processMemMove(MemMoveInst *M); bool performCallSlotOptzn(Instruction *cpy, Value *cpyDst, Value *cpySrc, - uint64_t cpyLen, CallInst *C); + uint64_t cpyLen, unsigned cpyAlign, CallInst *C); bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep, uint64_t MSize); bool processByValArgument(CallSite CS, unsigned ArgNo); @@ -509,10 +509,18 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { } if (C) { + unsigned storeAlign = SI->getAlignment(); + if (!storeAlign) + storeAlign = TD->getABITypeAlignment(SI->getOperand(0)->getType()); + unsigned loadAlign = LI->getAlignment(); + if (!loadAlign) + loadAlign = TD->getABITypeAlignment(LI->getType()); + bool changed = performCallSlotOptzn(LI, SI->getPointerOperand()->stripPointerCasts(), LI->getPointerOperand()->stripPointerCasts(), - TD->getTypeStoreSize(SI->getOperand(0)->getType()), C); + TD->getTypeStoreSize(SI->getOperand(0)->getType()), + std::min(storeAlign, loadAlign), C); if (changed) { MD->removeInstruction(SI); SI->eraseFromParent(); @@ -559,7 +567,8 @@ bool MemCpyOpt::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) { /// the call write its result directly into the destination of the memcpy. bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, Value *cpyDest, Value *cpySrc, - uint64_t cpyLen, CallInst *C) { + uint64_t cpyLen, unsigned cpyAlign, + CallInst *C) { // The general transformation to keep in mind is // // call @func(..., src, ...) 
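A minimal standalone sketch (not part of the patch itself) of the alignment handling the call-slot optimization gains in the hunks below: an alignment of 0 is treated as "use the ABI alignment", the weaker of the load/store alignments becomes the copy alignment, and the transform only proceeds if the destination is at least as aligned as the source alloca, or if the destination is itself an alloca whose alignment could be raised. All names and the abiAlign placeholder are assumptions for illustration.

#include <algorithm>
#include <cstdio>

struct MemOp { unsigned Align; };            // 0 means "alignment not specified"

static unsigned effectiveAlign(const MemOp &Op, unsigned abiAlign) {
  return Op.Align ? Op.Align : abiAlign;     // fall back to the ABI alignment
}

static bool callSlotAlignmentOK(unsigned srcAlign, unsigned cpyAlign,
                                bool destIsAlloca) {
  bool destSufficientlyAligned = srcAlign <= cpyAlign;
  // If dest is under-aligned and we cannot raise its alignment, bail out.
  return destSufficientlyAligned || destIsAlloca;
}

int main() {
  MemOp Store{0}, Load{4};
  unsigned cpyAlign = std::min(effectiveAlign(Store, /*abiAlign=*/8),
                               effectiveAlign(Load, /*abiAlign=*/8));
  std::printf("cpyAlign=%u ok=%d\n", cpyAlign,
              callSlotAlignmentOK(/*srcAlign=*/16, cpyAlign,
                                  /*destIsAlloca=*/true));
  return 0;
}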
@@ -625,6 +634,16 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, return false; } + // Check that dest points to memory that is at least as aligned as src. + unsigned srcAlign = srcAlloca->getAlignment(); + if (!srcAlign) + srcAlign = TD->getABITypeAlignment(srcAlloca->getAllocatedType()); + bool isDestSufficientlyAligned = srcAlign <= cpyAlign; + // If dest is not aligned enough and we can't increase its alignment then + // bail out. + if (!isDestSufficientlyAligned && !isa<AllocaInst>(cpyDest)) + return false; + // Check that src is not accessed except via the call and the memcpy. This // guarantees that it holds only undefined values when passed in (so the final // memcpy can be dropped), that it is not read or written between the call and @@ -673,20 +692,26 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, bool changedArgument = false; for (unsigned i = 0; i < CS.arg_size(); ++i) if (CS.getArgument(i)->stripPointerCasts() == cpySrc) { - if (cpySrc->getType() != cpyDest->getType()) - cpyDest = CastInst::CreatePointerCast(cpyDest, cpySrc->getType(), - cpyDest->getName(), C); + Value *Dest = cpySrc->getType() == cpyDest->getType() ? cpyDest + : CastInst::CreatePointerCast(cpyDest, cpySrc->getType(), + cpyDest->getName(), C); changedArgument = true; - if (CS.getArgument(i)->getType() == cpyDest->getType()) - CS.setArgument(i, cpyDest); + if (CS.getArgument(i)->getType() == Dest->getType()) + CS.setArgument(i, Dest); else - CS.setArgument(i, CastInst::CreatePointerCast(cpyDest, - CS.getArgument(i)->getType(), cpyDest->getName(), C)); + CS.setArgument(i, CastInst::CreatePointerCast(Dest, + CS.getArgument(i)->getType(), Dest->getName(), C)); } if (!changedArgument) return false; + // If the destination wasn't sufficiently aligned then increase its alignment. + if (!isDestSufficientlyAligned) { + assert(isa<AllocaInst>(cpyDest) && "Can only increase alloca alignment!"); + cast<AllocaInst>(cpyDest)->setAlignment(srcAlign); + } + // Drop any cached information about the call, because we may have changed // its dependence information by changing its parameter. MD->removeInstruction(C); @@ -813,7 +838,8 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) { if (DepInfo.isClobber()) { if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) { if (performCallSlotOptzn(M, M->getDest(), M->getSource(), - CopySize->getZExtValue(), C)) { + CopySize->getZExtValue(), M->getAlignment(), + C)) { MD->removeInstruction(M); M->eraseFromParent(); return true; @@ -974,7 +1000,7 @@ bool MemCpyOpt::iterateOnFunction(Function &F) { bool MemCpyOpt::runOnFunction(Function &F) { bool MadeChange = false; MD = &getAnalysis<MemoryDependenceAnalysis>(); - TD = getAnalysisIfAvailable<TargetData>(); + TD = getAnalysisIfAvailable<DataLayout>(); TLI = &getAnalysis<TargetLibraryInfo>(); // If we don't have at least memset and memcpy, there is little point of doing diff --git a/lib/Transforms/Scalar/ObjCARC.cpp b/lib/Transforms/Scalar/ObjCARC.cpp index dce8e8b..e6ec841 100644 --- a/lib/Transforms/Scalar/ObjCARC.cpp +++ b/lib/Transforms/Scalar/ObjCARC.cpp @@ -29,8 +29,10 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "objc-arc" -#include "llvm/Support/CommandLine.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; // A handy option to enable/disable all optimizations in this file. @@ -131,12 +133,12 @@ namespace { // ARC Utilities. 
//===----------------------------------------------------------------------===// -#include "llvm/Intrinsics.h" -#include "llvm/Module.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Transforms/Utils/Local.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" #include "llvm/Support/CallSite.h" -#include "llvm/ADT/StringSwitch.h" +#include "llvm/Transforms/Utils/Local.h" namespace { /// InstructionClass - A simple classification for instructions. @@ -659,9 +661,9 @@ static bool DoesObjCBlockEscape(const Value *BlockPtr) { // ARC AliasAnalysis. //===----------------------------------------------------------------------===// -#include "llvm/Pass.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/Passes.h" +#include "llvm/Pass.h" namespace { /// ObjCARCAliasAnalysis - This is a simple alias analysis @@ -885,25 +887,33 @@ bool ObjCARCExpand::runOnFunction(Function &F) { for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) { Instruction *Inst = &*I; + DEBUG(dbgs() << "ObjCARCExpand: Visiting: " << *Inst << "\n"); + switch (GetBasicInstructionClass(Inst)) { case IC_Retain: case IC_RetainRV: case IC_Autorelease: case IC_AutoreleaseRV: case IC_FusedRetainAutorelease: - case IC_FusedRetainAutoreleaseRV: + case IC_FusedRetainAutoreleaseRV: { // These calls return their argument verbatim, as a low-level // optimization. However, this makes high-level optimizations // harder. Undo any uses of this optimization that the front-end // emitted here. We'll redo them in the contract pass. Changed = true; - Inst->replaceAllUsesWith(cast<CallInst>(Inst)->getArgOperand(0)); + Value *Value = cast<CallInst>(Inst)->getArgOperand(0); + DEBUG(dbgs() << "ObjCARCExpand: Old = " << *Inst << "\n" + " New = " << *Value << "\n"); + Inst->replaceAllUsesWith(Value); break; + } default: break; } } + DEBUG(dbgs() << "ObjCARCExpand: Finished List.\n\n"); + return Changed; } @@ -911,8 +921,8 @@ bool ObjCARCExpand::runOnFunction(Function &F) { // ARC autorelease pool elimination. //===----------------------------------------------------------------------===// -#include "llvm/Constants.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/IR/Constants.h" namespace { /// ObjCARCAPElim - Autorelease pool elimination. @@ -985,6 +995,9 @@ bool ObjCARCAPElim::OptimizeBB(BasicBlock *BB) { // zap the pair. if (Push && cast<CallInst>(Inst)->getArgOperand(0) == Push) { Changed = true; + DEBUG(dbgs() << "ObjCARCAPElim::OptimizeBB: Zapping push pop autorelease pair:\n" + << " Pop: " << *Inst << "\n" + << " Push: " << *Push << "\n"); Inst->eraseFromParent(); Push->eraseFromParent(); } @@ -1092,10 +1105,10 @@ bool ObjCARCAPElim::runOnModule(Module &M) { // TODO: Delete release+retain pairs (rare). -#include "llvm/LLVMContext.h" -#include "llvm/Support/CFG.h" -#include "llvm/ADT/Statistic.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Support/CFG.h" STATISTIC(NumNoops, "Number of no-op objc calls eliminated"); STATISTIC(NumPartialNoops, "Number of partially no-op objc calls eliminated"); @@ -1120,9 +1133,8 @@ namespace { bool relatedSelect(const SelectInst *A, const Value *B); bool relatedPHI(const PHINode *A, const Value *B); - // Do not implement. 
- void operator=(const ProvenanceAnalysis &); - ProvenanceAnalysis(const ProvenanceAnalysis &); + void operator=(const ProvenanceAnalysis &) LLVM_DELETED_FUNCTION; + ProvenanceAnalysis(const ProvenanceAnalysis &) LLVM_DELETED_FUNCTION; public: ProvenanceAnalysis() {} @@ -1597,6 +1609,12 @@ void BBState::MergePred(const BBState &Other) { // loop backedge. Loop backedges are special. TopDownPathCount += Other.TopDownPathCount; + // Check for overflow. If we have overflow, fall back to conservative behavior. + if (TopDownPathCount < Other.TopDownPathCount) { + clearTopDownPointers(); + return; + } + // For each entry in the other set, if our set has an entry with the same key, // merge the entries. Otherwise, copy the entry and merge it with an empty // entry. @@ -1622,6 +1640,12 @@ void BBState::MergeSucc(const BBState &Other) { // loop backedge. Loop backedges are special. BottomUpPathCount += Other.BottomUpPathCount; + // Check for overflow. If we have overflow, fall back to conservative behavior. + if (BottomUpPathCount < Other.BottomUpPathCount) { + clearBottomUpPointers(); + return; + } + // For each entry in the other set, if our set has an entry with the // same key, merge the entries. Otherwise, copy the entry and merge // it with an empty entry. @@ -1776,10 +1800,12 @@ Constant *ObjCARCOpt::getRetainRVCallee(Module *M) { Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); Type *Params[] = { I8X }; FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); - AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); + AttributeSet Attribute = + AttributeSet().addAttr(M->getContext(), AttributeSet::FunctionIndex, + Attribute::get(C, Attribute::NoUnwind)); RetainRVCallee = M->getOrInsertFunction("objc_retainAutoreleasedReturnValue", FTy, - Attributes); + Attribute); } return RetainRVCallee; } @@ -1790,10 +1816,12 @@ Constant *ObjCARCOpt::getAutoreleaseRVCallee(Module *M) { Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); Type *Params[] = { I8X }; FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); - AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); + AttributeSet Attribute = + AttributeSet().addAttr(M->getContext(), AttributeSet::FunctionIndex, + Attribute::get(C, Attribute::NoUnwind)); AutoreleaseRVCallee = M->getOrInsertFunction("objc_autoreleaseReturnValue", FTy, - Attributes); + Attribute); } return AutoreleaseRVCallee; } @@ -1802,12 +1830,14 @@ Constant *ObjCARCOpt::getReleaseCallee(Module *M) { if (!ReleaseCallee) { LLVMContext &C = M->getContext(); Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; - AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); + AttributeSet Attribute = + AttributeSet().addAttr(M->getContext(), AttributeSet::FunctionIndex, + Attribute::get(C, Attribute::NoUnwind)); ReleaseCallee = M->getOrInsertFunction( "objc_release", FunctionType::get(Type::getVoidTy(C), Params, /*isVarArg=*/false), - Attributes); + Attribute); } return ReleaseCallee; } @@ -1816,12 +1846,14 @@ Constant *ObjCARCOpt::getRetainCallee(Module *M) { if (!RetainCallee) { LLVMContext &C = M->getContext(); Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; - AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); + AttributeSet Attribute = + AttributeSet().addAttr(M->getContext(), AttributeSet::FunctionIndex, + Attribute::get(C, Attribute::NoUnwind)); RetainCallee = M->getOrInsertFunction( "objc_retain", FunctionType::get(Params[0], Params, 
/*isVarArg=*/false), - Attributes); + Attribute); } return RetainCallee; } @@ -1836,7 +1868,7 @@ Constant *ObjCARCOpt::getRetainBlockCallee(Module *M) { M->getOrInsertFunction( "objc_retainBlock", FunctionType::get(Params[0], Params, /*isVarArg=*/false), - AttrListPtr()); + AttributeSet()); } return RetainBlockCallee; } @@ -1845,12 +1877,14 @@ Constant *ObjCARCOpt::getAutoreleaseCallee(Module *M) { if (!AutoreleaseCallee) { LLVMContext &C = M->getContext(); Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; - AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); + AttributeSet Attribute = + AttributeSet().addAttr(M->getContext(), AttributeSet::FunctionIndex, + Attribute::get(C, Attribute::NoUnwind)); AutoreleaseCallee = M->getOrInsertFunction( "objc_autorelease", FunctionType::get(Params[0], Params, /*isVarArg=*/false), - Attributes); + Attribute); } return AutoreleaseCallee; } @@ -2165,7 +2199,17 @@ ObjCARCOpt::OptimizeRetainCall(Function &F, Instruction *Retain) { // Turn it to an objc_retainAutoreleasedReturnValue.. Changed = true; ++NumPeeps; + + DEBUG(dbgs() << "ObjCARCOpt::OptimizeRetainCall: Transforming " + "objc_retainAutoreleasedReturnValue => " + "objc_retain since the operand is not a return value.\n" + " Old: " + << *Retain << "\n"); + cast<CallInst>(Retain)->setCalledFunction(getRetainRVCallee(F.getParent())); + + DEBUG(dbgs() << " New: " + << *Retain << "\n"); } /// OptimizeRetainRVCall - Turn objc_retainAutoreleasedReturnValue into @@ -2203,6 +2247,11 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { GetObjCArg(I) == Arg) { Changed = true; ++NumPeeps; + + DEBUG(dbgs() << "ObjCARCOpt::OptimizeRetainRVCall: Erasing " << *I << "\n" + << " Erasing " << *RetainRV + << "\n"); + EraseInstruction(I); EraseInstruction(RetainRV); return true; @@ -2212,7 +2261,18 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { // Turn it to a plain objc_retain. Changed = true; ++NumPeeps; + + DEBUG(dbgs() << "ObjCARCOpt::OptimizeRetainRVCall: Transforming " + "objc_retainAutoreleasedReturnValue => " + "objc_retain since the operand is not a return value.\n" + " Old: " + << *RetainRV << "\n"); + cast<CallInst>(RetainRV)->setCalledFunction(getRetainCallee(F.getParent())); + + DEBUG(dbgs() << " New: " + << *RetainRV << "\n"); + return false; } @@ -2238,8 +2298,20 @@ ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV) { Changed = true; ++NumPeeps; + + DEBUG(dbgs() << "ObjCARCOpt::OptimizeAutoreleaseRVCall: Transforming " + "objc_autoreleaseReturnValue => " + "objc_autorelease since its operand is not used as a return " + "value.\n" + " Old: " + << *AutoreleaseRV << "\n"); + cast<CallInst>(AutoreleaseRV)-> setCalledFunction(getAutoreleaseCallee(F.getParent())); + + DEBUG(dbgs() << " New: " + << *AutoreleaseRV << "\n"); + } /// OptimizeIndividualCalls - Visit each call, one at a time, and make @@ -2251,6 +2323,10 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { // Visit all objc_* calls in F. 
for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) { Instruction *Inst = &*I++; + + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Visiting: " << + *Inst << "\n"); + InstructionClass Class = GetBasicInstructionClass(Inst); switch (Class) { @@ -2267,6 +2343,8 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { case IC_NoopCast: Changed = true; ++NumNoops; + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Erasing no-op cast:" + " " << *Inst << "\n"); EraseInstruction(Inst); continue; @@ -2283,7 +2361,13 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), Constant::getNullValue(Ty), CI); - CI->replaceAllUsesWith(UndefValue::get(CI->getType())); + llvm::Value *NewValue = UndefValue::get(CI->getType()); + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: A null " + "pointer-to-weak-pointer is undefined behavior.\n" + " Old = " << *CI << + "\n New = " << + *NewValue << "\n"); + CI->replaceAllUsesWith(NewValue); CI->eraseFromParent(); continue; } @@ -2299,7 +2383,15 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), Constant::getNullValue(Ty), CI); - CI->replaceAllUsesWith(UndefValue::get(CI->getType())); + + llvm::Value *NewValue = UndefValue::get(CI->getType()); + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: A null " + "pointer-to-weak-pointer is undefined behavior.\n" + " Old = " << *CI << + "\n New = " << + *NewValue << "\n"); + + CI->replaceAllUsesWith(NewValue); CI->eraseFromParent(); continue; } @@ -2333,6 +2425,14 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { Call->getArgOperand(0), "", Call); NewCall->setMetadata(ImpreciseReleaseMDKind, MDNode::get(C, ArrayRef<Value *>())); + + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Replacing " + "objc_autorelease(x) with objc_release(x) since x is " + "otherwise unused.\n" + " Old: " << *Call << + "\n New: " << + *NewCall << "\n"); + EraseInstruction(Call); Inst = NewCall; Class = IC_Release; @@ -2343,12 +2443,17 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { // a tail keyword. if (IsAlwaysTail(Class)) { Changed = true; + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Adding tail keyword" + " to function since it can never be passed stack args: " << *Inst << + "\n"); cast<CallInst>(Inst)->setTailCall(); } // Set nounwind as needed. if (IsNoThrow(Class)) { Changed = true; + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Found no throw" + " class. Setting nounwind on: " << *Inst << "\n"); cast<CallInst>(Inst)->setDoesNotThrow(); } @@ -2363,6 +2468,8 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { if (isNullOrUndef(Arg)) { Changed = true; ++NumNoops; + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: ARC calls with " + " null are no-ops. Erasing: " << *Inst << "\n"); EraseInstruction(Inst); continue; } @@ -2464,6 +2571,9 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { } } } while (!Worklist.empty()); + + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Finished Queue.\n\n"); + } } @@ -3367,6 +3477,10 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) { // queries instead. 
for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) { Instruction *Inst = &*I++; + + DEBUG(dbgs() << "ObjCARCOpt::OptimizeWeakCalls: Visiting: " << *Inst << + "\n"); + InstructionClass Class = GetBasicInstructionClass(Inst); if (Class != IC_LoadWeak && Class != IC_LoadWeakRetained) continue; @@ -3512,6 +3626,9 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) { done:; } } + + DEBUG(dbgs() << "ObjCARCOpt::OptimizeWeakCalls: Finished List.\n\n"); + } /// OptimizeSequences - Identify program paths which execute sequences of @@ -3537,19 +3654,19 @@ bool ObjCARCOpt::OptimizeSequences(Function &F) { } /// OptimizeReturns - Look for this pattern: -/// +/// \code /// %call = call i8* @something(...) /// %2 = call i8* @objc_retain(i8* %call) /// %3 = call i8* @objc_autorelease(i8* %2) /// ret i8* %3 -/// +/// \endcode /// And delete the retain and autorelease. /// /// Otherwise if it's just this: -/// +/// \code /// %3 = call i8* @objc_autorelease(i8* %2) /// ret i8* %3 -/// +/// \endcode /// convert the autorelease to autoreleaseRV. void ObjCARCOpt::OptimizeReturns(Function &F) { if (!F.getReturnType()->isPointerTy()) @@ -3560,6 +3677,9 @@ void ObjCARCOpt::OptimizeReturns(Function &F) { for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) { BasicBlock *BB = FI; ReturnInst *Ret = dyn_cast<ReturnInst>(&BB->back()); + + DEBUG(dbgs() << "ObjCARCOpt::OptimizeReturns: Visiting: " << *Ret << "\n"); + if (!Ret) continue; const Value *Arg = StripPointerCastsAndObjCCalls(Ret->getOperand(0)); @@ -3633,6 +3753,9 @@ void ObjCARCOpt::OptimizeReturns(Function &F) { // If so, we can zap the retain and autorelease. Changed = true; ++NumRets; + DEBUG(dbgs() << "ObjCARCOpt::OptimizeReturns: Erasing: " << *Retain + << "\n Erasing: " + << *Autorelease << "\n"); EraseInstruction(Retain); EraseInstruction(Autorelease); } @@ -3643,6 +3766,9 @@ void ObjCARCOpt::OptimizeReturns(Function &F) { DependingInstructions.clear(); Visited.clear(); } + + DEBUG(dbgs() << "ObjCARCOpt::OptimizeReturns: Finished List.\n\n"); + } bool ObjCARCOpt::doInitialization(Module &M) { @@ -3734,9 +3860,9 @@ void ObjCARCOpt::releaseMemory() { // TODO: ObjCARCContract could insert PHI nodes when uses aren't // dominated by single calls. 
-#include "llvm/Operator.h" -#include "llvm/InlineAsm.h" #include "llvm/Analysis/Dominators.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Operator.h" STATISTIC(NumStoreStrongs, "Number objc_storeStrong calls formed"); @@ -3818,15 +3944,16 @@ Constant *ObjCARCContract::getStoreStrongCallee(Module *M) { Type *I8XX = PointerType::getUnqual(I8X); Type *Params[] = { I8XX, I8X }; - AttrListPtr Attributes = AttrListPtr() - .addAttr(~0u, Attribute::NoUnwind) - .addAttr(1, Attribute::NoCapture); + AttributeSet Attribute = AttributeSet() + .addAttr(M->getContext(), AttributeSet::FunctionIndex, + Attribute::get(C, Attribute::NoUnwind)) + .addAttr(M->getContext(), 1, Attribute::get(C, Attribute::NoCapture)); StoreStrongCallee = M->getOrInsertFunction( "objc_storeStrong", FunctionType::get(Type::getVoidTy(C), Params, /*isVarArg=*/false), - Attributes); + Attribute); } return StoreStrongCallee; } @@ -3837,9 +3964,11 @@ Constant *ObjCARCContract::getRetainAutoreleaseCallee(Module *M) { Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); Type *Params[] = { I8X }; FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); - AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); + AttributeSet Attribute = + AttributeSet().addAttr(M->getContext(), AttributeSet::FunctionIndex, + Attribute::get(C, Attribute::NoUnwind)); RetainAutoreleaseCallee = - M->getOrInsertFunction("objc_retainAutorelease", FTy, Attributes); + M->getOrInsertFunction("objc_retainAutorelease", FTy, Attribute); } return RetainAutoreleaseCallee; } @@ -3850,10 +3979,12 @@ Constant *ObjCARCContract::getRetainAutoreleaseRVCallee(Module *M) { Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); Type *Params[] = { I8X }; FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); - AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); + AttributeSet Attribute = + AttributeSet().addAttr(M->getContext(), AttributeSet::FunctionIndex, + Attribute::get(C, Attribute::NoUnwind)); RetainAutoreleaseRVCallee = M->getOrInsertFunction("objc_retainAutoreleaseReturnValue", FTy, - Attributes); + Attribute); } return RetainAutoreleaseRVCallee; } @@ -3897,11 +4028,19 @@ ObjCARCContract::ContractAutorelease(Function &F, Instruction *Autorelease, Changed = true; ++NumPeeps; + DEBUG(dbgs() << "ObjCARCContract::ContractAutorelease: Fusing " + "retain/autorelease. Erasing: " << *Autorelease << "\n" + " Old Retain: " + << *Retain << "\n"); + if (Class == IC_AutoreleaseRV) Retain->setCalledFunction(getRetainAutoreleaseRVCallee(F.getParent())); else Retain->setCalledFunction(getRetainAutoreleaseCallee(F.getParent())); + DEBUG(dbgs() << " New Retain: " + << *Retain << "\n"); + EraseInstruction(Autorelease); return true; } @@ -4052,6 +4191,8 @@ bool ObjCARCContract::runOnFunction(Function &F) { for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) { Instruction *Inst = &*I++; + DEBUG(dbgs() << "ObjCARCContract: Visiting: " << *Inst << "\n"); + // Only these library routines return their argument. In particular, // objc_retainBlock does not necessarily return its argument. 
InstructionClass Class = GetBasicInstructionClass(Inst); @@ -4089,6 +4230,8 @@ bool ObjCARCContract::runOnFunction(Function &F) { } while (isNoopInstruction(BBI)); if (&*BBI == GetObjCArg(Inst)) { + DEBUG(dbgs() << "ObjCARCContract: Adding inline asm marker for " + "retainAutoreleasedReturnValue optimization.\n"); Changed = true; InlineAsm *IA = InlineAsm::get(FunctionType::get(Type::getVoidTy(Inst->getContext()), @@ -4108,6 +4251,10 @@ bool ObjCARCContract::runOnFunction(Function &F) { ConstantPointerNull::get(cast<PointerType>(CI->getType())); Changed = true; new StoreInst(Null, CI->getArgOperand(0), CI); + + DEBUG(dbgs() << "OBJCARCContract: Old = " << *CI << "\n" + << " New = " << *Null << "\n"); + CI->replaceAllUsesWith(Null); CI->eraseFromParent(); } @@ -4127,6 +4274,8 @@ bool ObjCARCContract::runOnFunction(Function &F) { continue; } + DEBUG(dbgs() << "ObjCARCContract: Finished List.\n\n"); + // Don't use GetObjCArg because we don't want to look through bitcasts // and such; to do the replacement, the argument must have type i8*. const Value *Arg = cast<CallInst>(Inst)->getArgOperand(0); diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp index 09687d8..0da3746 100644 --- a/lib/Transforms/Scalar/Reassociate.cpp +++ b/lib/Transforms/Scalar/Reassociate.cpp @@ -22,24 +22,24 @@ #define DEBUG_TYPE "reassociate" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Function.h" -#include "llvm/IRBuilder.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/Pass.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Assembly/Writer.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Pass.h" #include "llvm/Support/CFG.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> using namespace llvm; @@ -339,36 +339,6 @@ static void IncorporateWeight(APInt &LHS, const APInt &RHS, unsigned Opcode) { } } -/// EvaluateRepeatedConstant - Compute C op C op ... op C where the constant C -/// is repeated Weight times. -static Constant *EvaluateRepeatedConstant(unsigned Opcode, Constant *C, - APInt Weight) { - // For addition the result can be efficiently computed as the product of the - // constant and the weight. - if (Opcode == Instruction::Add) - return ConstantExpr::getMul(C, ConstantInt::get(C->getContext(), Weight)); - - // The weight might be huge, so compute by repeated squaring to ensure that - // compile time is proportional to the logarithm of the weight. - Constant *Result = 0; - Constant *Power = C; // Successively C, C op C, (C op C) op (C op C) etc. - // Visit the bits in Weight. - while (Weight != 0) { - // If the current bit in Weight is non-zero do Result = Result op Power. - if (Weight[0]) - Result = Result ? ConstantExpr::get(Opcode, Result, Power) : Power; - // Move on to the next bit if any more are non-zero. - Weight = Weight.lshr(1); - if (Weight.isMinValue()) - break; - // Square the power. 
- Power = ConstantExpr::get(Opcode, Power, Power); - } - - assert(Result && "Only positive weights supported!"); - return Result; -} - typedef std::pair<Value*, APInt> RepeatedValue; /// LinearizeExprTree - Given an associative binary expression, return the leaf @@ -382,9 +352,7 @@ typedef std::pair<Value*, APInt> RepeatedValue; /// op /// (Ops[N].first op Ops[N].first op ... Ops[N].first) <- Ops[N].second times /// -/// Note that the values Ops[0].first, ..., Ops[N].first are all distinct, and -/// they are all non-constant except possibly for the last one, which if it is -/// constant will have weight one (Ops[N].second === 1). +/// Note that the values Ops[0].first, ..., Ops[N].first are all distinct. /// /// This routine may modify the function, in which case it returns 'true'. The /// changes it makes may well be destructive, changing the value computed by 'I' @@ -455,10 +423,6 @@ static bool LinearizeExprTree(BinaryOperator *I, assert(Instruction::isAssociative(Opcode) && Instruction::isCommutative(Opcode) && "Expected an associative and commutative operation!"); - // If we see an absorbing element then the entire expression must be equal to - // it. For example, if this is a multiplication expression and zero occurs as - // an operand somewhere in it then the result of the expression must be zero. - Constant *Absorber = ConstantExpr::getBinOpAbsorber(Opcode, I->getType()); // Visit all operands of the expression, keeping track of their weight (the // number of paths from the expression root to the operand, or if you like @@ -506,13 +470,6 @@ static bool LinearizeExprTree(BinaryOperator *I, DEBUG(dbgs() << "OPERAND: " << *Op << " (" << Weight << ")\n"); assert(!Op->use_empty() && "No uses, so how did we get to it?!"); - // If the expression contains an absorbing element then there is no need - // to analyze it further: it must evaluate to the absorbing element. - if (Op == Absorber && !Weight.isMinValue()) { - Ops.push_back(std::make_pair(Absorber, APInt(Bitwidth, 1))); - return MadeChange; - } - // If this is a binary operation of the right kind with only one use then // add its operands to the expression. if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) { @@ -604,7 +561,6 @@ static bool LinearizeExprTree(BinaryOperator *I, // The leaves, repeated according to their weights, represent the linearized // form of the expression. - Constant *Cst = 0; // Accumulate constants here. for (unsigned i = 0, e = LeafOrder.size(); i != e; ++i) { Value *V = LeafOrder[i]; LeafMap::iterator It = Leaves.find(V); @@ -618,31 +574,14 @@ static bool LinearizeExprTree(BinaryOperator *I, continue; // Ensure the leaf is only output once. It->second = 0; - // Glob all constants together into Cst. - if (Constant *C = dyn_cast<Constant>(V)) { - C = EvaluateRepeatedConstant(Opcode, C, Weight); - Cst = Cst ? ConstantExpr::get(Opcode, Cst, C) : C; - continue; - } - // Add non-constant Ops.push_back(std::make_pair(V, Weight)); } - // Add any constants back into Ops, all globbed together and reduced to having - // weight 1 for the convenience of users. - Constant *Identity = ConstantExpr::getBinOpIdentity(Opcode, I->getType()); - if (Cst && Cst != Identity) { - // If combining multiple constants resulted in the absorber then the entire - // expression must evaluate to the absorber. 
- if (Cst == Absorber) - Ops.clear(); - Ops.push_back(std::make_pair(Cst, APInt(Bitwidth, 1))); - } - // For nilpotent operations or addition there may be no operands, for example // because the expression was "X xor X" or consisted of 2^Bitwidth additions: // in both cases the weight reduces to 0 causing the value to be skipped. if (Ops.empty()) { + Constant *Identity = ConstantExpr::getBinOpIdentity(Opcode, I->getType()); assert(Identity && "Associative operation without identity!"); Ops.push_back(std::make_pair(Identity, APInt(Bitwidth, 1))); } @@ -656,8 +595,8 @@ void Reassociate::RewriteExprTree(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops) { assert(Ops.size() > 1 && "Single values should be used directly!"); - // Since our optimizations never increase the number of operations, the new - // expression can always be written by reusing the existing binary operators + // Since our optimizations should never increase the number of operations, the + // new expression can usually be written reusing the existing binary operators // from the original expression tree, without creating any new instructions, // though the rewritten expression may have a completely different topology. // We take care to not change anything if the new expression will be the same @@ -671,6 +610,20 @@ void Reassociate::RewriteExprTree(BinaryOperator *I, unsigned Opcode = I->getOpcode(); BinaryOperator *Op = I; + /// NotRewritable - The operands being written will be the leaves of the new + /// expression and must not be used as inner nodes (via NodesToRewrite) by + /// mistake. Inner nodes are always reassociable, and usually leaves are not + /// (if they were they would have been incorporated into the expression and so + /// would not be leaves), so most of the time there is no danger of this. But + /// in rare cases a leaf may become reassociable if an optimization kills uses + /// of it, or it may momentarily become reassociable during rewriting (below) + /// due it being removed as an operand of one of its uses. Ensure that misuse + /// of leaf nodes as inner nodes cannot occur by remembering all of the future + /// leaves and refusing to reuse any of them as inner nodes. + SmallPtrSet<Value*, 8> NotRewritable; + for (unsigned i = 0, e = Ops.size(); i != e; ++i) + NotRewritable.insert(Ops[i].Op); + // ExpressionChanged - Non-null if the rewritten expression differs from the // original in some non-trivial way, requiring the clearing of optional flags. // Flags are cleared from the operator in ExpressionChanged up to I inclusive. @@ -703,12 +656,14 @@ void Reassociate::RewriteExprTree(BinaryOperator *I, // the old operands with the new ones. DEBUG(dbgs() << "RA: " << *Op << '\n'); if (NewLHS != OldLHS) { - if (BinaryOperator *BO = isReassociableOp(OldLHS, Opcode)) + BinaryOperator *BO = isReassociableOp(OldLHS, Opcode); + if (BO && !NotRewritable.count(BO)) NodesToRewrite.push_back(BO); Op->setOperand(0, NewLHS); } if (NewRHS != OldRHS) { - if (BinaryOperator *BO = isReassociableOp(OldRHS, Opcode)) + BinaryOperator *BO = isReassociableOp(OldRHS, Opcode); + if (BO && !NotRewritable.count(BO)) NodesToRewrite.push_back(BO); Op->setOperand(1, NewRHS); } @@ -732,7 +687,8 @@ void Reassociate::RewriteExprTree(BinaryOperator *I, Op->swapOperands(); } else { // Overwrite with the new right-hand side. 
- if (BinaryOperator *BO = isReassociableOp(Op->getOperand(1), Opcode)) + BinaryOperator *BO = isReassociableOp(Op->getOperand(1), Opcode); + if (BO && !NotRewritable.count(BO)) NodesToRewrite.push_back(BO); Op->setOperand(1, NewRHS); ExpressionChanged = Op; @@ -745,7 +701,8 @@ void Reassociate::RewriteExprTree(BinaryOperator *I, // Now deal with the left-hand side. If this is already an operation node // from the original expression then just rewrite the rest of the expression // into it. - if (BinaryOperator *BO = isReassociableOp(Op->getOperand(0), Opcode)) { + BinaryOperator *BO = isReassociableOp(Op->getOperand(0), Opcode); + if (BO && !NotRewritable.count(BO)) { Op = BO; continue; } @@ -1446,9 +1403,26 @@ Value *Reassociate::OptimizeExpression(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops) { // Now that we have the linearized expression tree, try to optimize it. // Start by folding any constants that we found. - if (Ops.size() == 1) return Ops[0].Op; - + Constant *Cst = 0; unsigned Opcode = I->getOpcode(); + while (!Ops.empty() && isa<Constant>(Ops.back().Op)) { + Constant *C = cast<Constant>(Ops.pop_back_val().Op); + Cst = Cst ? ConstantExpr::get(Opcode, C, Cst) : C; + } + // If there was nothing but constants then we are done. + if (Ops.empty()) + return Cst; + + // Put the combined constant back at the end of the operand list, except if + // there is no point. For example, an add of 0 gets dropped here, while a + // multiplication by zero turns the whole expression into zero. + if (Cst && Cst != ConstantExpr::getBinOpIdentity(Opcode, I->getType())) { + if (Cst == ConstantExpr::getBinOpAbsorber(Opcode, I->getType())) + return Cst; + Ops.push_back(ValueEntry(0, Cst)); + } + + if (Ops.size() == 1) return Ops[0].Op; // Handle destructive annihilation due to identities between elements in the // argument list here. 
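A minimal standalone sketch (not part of the patch itself) of the constant handling that the Reassociate change above moves into OptimizeExpression: trailing constants of the linearized operand list are folded together, a combined constant equal to the operation's identity is dropped, and a combined constant equal to its absorbing element (e.g. 0 for multiplication) collapses the whole expression. Plain long integers stand in for llvm::Constant, and the constants are kept in a separate vector purely to simplify the example.

#include <cstdio>
#include <vector>

enum Opcode { Add, Mul };

static long identity(Opcode Op) { return Op == Add ? 0 : 1; }
static bool hasAbsorber(Opcode Op) { return Op == Mul; }
static long absorber(Opcode) { return 0; }   // only meaningful for Mul
static long apply(Opcode Op, long A, long B) { return Op == Add ? A + B : A * B; }

// Folds Consts into Ops. Returns true and sets Result if the whole
// expression reduced to a single constant.
static bool foldTrailingConstants(Opcode Op, std::vector<long> &Ops,
                                  std::vector<long> &Consts, long &Result) {
  long Cst = identity(Op);
  bool SawConst = false;
  while (!Consts.empty()) {                  // fold all constants together
    Cst = apply(Op, Consts.back(), Cst);
    Consts.pop_back();
    SawConst = true;
  }
  if (Ops.empty()) {                         // nothing but constants
    Result = SawConst ? Cst : identity(Op);
    return true;
  }
  if (hasAbsorber(Op) && SawConst && Cst == absorber(Op)) {
    Result = absorber(Op);                   // e.g. anything * 0 == 0
    return true;
  }
  if (SawConst && Cst != identity(Op))
    Ops.push_back(Cst);                      // keep one combined constant operand
  return false;
}

int main() {
  std::vector<long> Ops = {7, 9}, Consts = {3, 0};
  long R = 0;
  bool Folded = foldTrailingConstants(Mul, Ops, Consts, R);
  std::printf("folded=%d result=%ld\n", Folded, R);  // folded=1 result=0
  return 0;
}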
diff --git a/lib/Transforms/Scalar/Reg2Mem.cpp b/lib/Transforms/Scalar/Reg2Mem.cpp index ea1de63..07f540a 100644 --- a/lib/Transforms/Scalar/Reg2Mem.cpp +++ b/lib/Transforms/Scalar/Reg2Mem.cpp @@ -18,15 +18,15 @@ #define DEBUG_TYPE "reg2mem" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Pass.h" -#include "llvm/Function.h" -#include "llvm/LLVMContext.h" -#include "llvm/Module.h" -#include "llvm/BasicBlock.h" -#include "llvm/Instructions.h" #include "llvm/ADT/Statistic.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" #include "llvm/Support/CFG.h" +#include "llvm/Transforms/Utils/Local.h" #include <list> using namespace llvm; diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index 2c39aab..3e935d8 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -19,26 +19,26 @@ #define DEBUG_TYPE "sccp" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/IPO.h" -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Instructions.h" -#include "llvm/Pass.h" -#include "llvm/Analysis/ConstantFolding.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetLibraryInfo.h" -#include "llvm/Support/CallSite.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/InstVisitor.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Instructions.h" +#include "llvm/InstVisitor.h" +#include "llvm/Pass.h" +#include "llvm/Support/CallSite.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> using namespace llvm; @@ -153,7 +153,7 @@ namespace { /// Constant Propagation. /// class SCCPSolver : public InstVisitor<SCCPSolver> { - const TargetData *TD; + const DataLayout *TD; const TargetLibraryInfo *TLI; SmallPtrSet<BasicBlock*, 8> BBExecutable; // The BBs that are executable. DenseMap<Value*, LatticeVal> ValueState; // The state each value is in. 
@@ -205,7 +205,7 @@ class SCCPSolver : public InstVisitor<SCCPSolver> { typedef std::pair<BasicBlock*, BasicBlock*> Edge; DenseSet<Edge> KnownFeasibleEdges; public: - SCCPSolver(const TargetData *td, const TargetLibraryInfo *tli) + SCCPSolver(const DataLayout *td, const TargetLibraryInfo *tli) : TD(td), TLI(tli) {} /// MarkBlockExecutable - This method can be used by clients to mark all of @@ -1564,7 +1564,7 @@ static void DeleteInstructionInBlock(BasicBlock *BB) { // bool SCCP::runOnFunction(Function &F) { DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n"); - const TargetData *TD = getAnalysisIfAvailable<TargetData>(); + const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); SCCPSolver Solver(TD, TLI); @@ -1693,7 +1693,7 @@ static bool AddressIsTaken(const GlobalValue *GV) { } bool IPSCCP::runOnModule(Module &M) { - const TargetData *TD = getAnalysisIfAvailable<TargetData>(); + const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); SCCPSolver Solver(TD, TLI); diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp new file mode 100644 index 0000000..4204171 --- /dev/null +++ b/lib/Transforms/Scalar/SROA.cpp @@ -0,0 +1,3711 @@ +//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This transformation implements the well known scalar replacement of +/// aggregates transformation. It tries to identify promotable elements of an +/// aggregate alloca, and promote them to registers. It will also try to +/// convert uses of an element (or set of elements) of an alloca into a vector +/// or bitfield-style integer scalar if appropriate. +/// +/// It works to do this with minimal slicing of the alloca so that regions +/// which are merely transferred in and out of external memory remain unchanged +/// and are not decomposed to scalar code. +/// +/// Because this also performs alloca promotion, it can be thought of as also +/// serving the purpose of SSA formation. The algorithm iterates on the +/// function until all opportunities for promotion have been realized. 
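For orientation, here is a rough source-level picture of the kind of code the new pass targets; the example is invented and not taken from the patch. The aggregate local is sliced into its scalar elements, each of which is then promoted to an SSA value, so no alloca or memory traffic remains after optimization.

struct Point { int X, Y; };          // A small aggregate held on the stack.

int distSquared(int DX, int DY) {
  Point P;                           // SROA splits this alloca per element...
  P.X = DX;                          // ...and promotion then turns each piece
  P.Y = DY;                          // into an SSA value, leaving pure
  return P.X * P.X + P.Y * P.Y;      // arithmetic with no loads or stores.
}
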
+/// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "sroa" +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/PtrUseVisitor.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/DIBuilder.h" +#include "llvm/DebugInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/InstVisitor.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/GetElementPtrTypeIterator.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/PromoteMemToReg.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" +using namespace llvm; + +STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement"); +STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced"); +STATISTIC(NumPromoted, "Number of allocas promoted to SSA values"); +STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion"); +STATISTIC(NumDeleted, "Number of instructions deleted"); +STATISTIC(NumVectorized, "Number of vectorized aggregates"); + +/// Hidden option to force the pass to not use DomTree and mem2reg, instead +/// forming SSA values through the SSAUpdater infrastructure. +static cl::opt<bool> +ForceSSAUpdater("force-ssa-updater", cl::init(false), cl::Hidden); + +namespace { +/// \brief Alloca partitioning representation. +/// +/// This class represents a partitioning of an alloca into slices, and +/// information about the nature of uses of each slice of the alloca. The goal +/// is that this information is sufficient to decide if and how to split the +/// alloca apart and replace slices with scalars. It is also intended that this +/// structure can capture the relevant information needed both to decide about +/// and to enact these transformations. +class AllocaPartitioning { +public: + /// \brief A common base class for representing a half-open byte range. + struct ByteRange { + /// \brief The beginning offset of the range. + uint64_t BeginOffset; + + /// \brief The ending offset, not included in the range. + uint64_t EndOffset; + + ByteRange() : BeginOffset(), EndOffset() {} + ByteRange(uint64_t BeginOffset, uint64_t EndOffset) + : BeginOffset(BeginOffset), EndOffset(EndOffset) {} + + /// \brief Support for ordering ranges. + /// + /// This provides an ordering over ranges such that start offsets are + /// always increasing, and within equal start offsets, the end offsets are + /// decreasing. Thus the spanning range comes first in a cluster with the + /// same start position. + bool operator<(const ByteRange &RHS) const { + if (BeginOffset < RHS.BeginOffset) return true; + if (BeginOffset > RHS.BeginOffset) return false; + if (EndOffset > RHS.EndOffset) return true; + return false; + } + + /// \brief Support comparison with a single offset to allow binary searches. 
+ friend bool operator<(const ByteRange &LHS, uint64_t RHSOffset) { + return LHS.BeginOffset < RHSOffset; + } + + friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset, + const ByteRange &RHS) { + return LHSOffset < RHS.BeginOffset; + } + + bool operator==(const ByteRange &RHS) const { + return BeginOffset == RHS.BeginOffset && EndOffset == RHS.EndOffset; + } + bool operator!=(const ByteRange &RHS) const { return !operator==(RHS); } + }; + + /// \brief A partition of an alloca. + /// + /// This structure represents a contiguous partition of the alloca. These are + /// formed by examining the uses of the alloca. During formation, they may + /// overlap but once an AllocaPartitioning is built, the Partitions within it + /// are all disjoint. + struct Partition : public ByteRange { + /// \brief Whether this partition is splittable into smaller partitions. + /// + /// We flag partitions as splittable when they are formed entirely due to + /// accesses by trivially splittable operations such as memset and memcpy. + bool IsSplittable; + + /// \brief Test whether a partition has been marked as dead. + bool isDead() const { + if (BeginOffset == UINT64_MAX) { + assert(EndOffset == UINT64_MAX); + return true; + } + return false; + } + + /// \brief Kill a partition. + /// This is accomplished by setting both its beginning and end offset to + /// the maximum possible value. + void kill() { + assert(!isDead() && "He's Dead, Jim!"); + BeginOffset = EndOffset = UINT64_MAX; + } + + Partition() : ByteRange(), IsSplittable() {} + Partition(uint64_t BeginOffset, uint64_t EndOffset, bool IsSplittable) + : ByteRange(BeginOffset, EndOffset), IsSplittable(IsSplittable) {} + }; + + /// \brief A particular use of a partition of the alloca. + /// + /// This structure is used to associate uses of a partition with it. They + /// mark the range of bytes which are referenced by a particular instruction, + /// and includes a handle to the user itself and the pointer value in use. + /// The bounds of these uses are determined by intersecting the bounds of the + /// memory use itself with a particular partition. As a consequence there is + /// intentionally overlap between various uses of the same partition. + struct PartitionUse : public ByteRange { + /// \brief The use in question. Provides access to both user and used value. + /// + /// Note that this may be null if the partition use is *dead*, that is, it + /// should be ignored. + Use *U; + + PartitionUse() : ByteRange(), U() {} + PartitionUse(uint64_t BeginOffset, uint64_t EndOffset, Use *U) + : ByteRange(BeginOffset, EndOffset), U(U) {} + }; + + /// \brief Construct a partitioning of a particular alloca. + /// + /// Construction does most of the work for partitioning the alloca. This + /// performs the necessary walks of users and builds a partitioning from it. + AllocaPartitioning(const DataLayout &TD, AllocaInst &AI); + + /// \brief Test whether a pointer to the allocation escapes our analysis. + /// + /// If this is true, the partitioning is never fully built and should be + /// ignored. + bool isEscaped() const { return PointerEscapingInstr; } + + /// \brief Support for iterating over the partitions. 
+ /// @{ + typedef SmallVectorImpl<Partition>::iterator iterator; + iterator begin() { return Partitions.begin(); } + iterator end() { return Partitions.end(); } + + typedef SmallVectorImpl<Partition>::const_iterator const_iterator; + const_iterator begin() const { return Partitions.begin(); } + const_iterator end() const { return Partitions.end(); } + /// @} + + /// \brief Support for iterating over and manipulating a particular + /// partition's uses. + /// + /// The iteration support provided for uses is more limited, but also + /// includes some manipulation routines to support rewriting the uses of + /// partitions during SROA. + /// @{ + typedef SmallVectorImpl<PartitionUse>::iterator use_iterator; + use_iterator use_begin(unsigned Idx) { return Uses[Idx].begin(); } + use_iterator use_begin(const_iterator I) { return Uses[I - begin()].begin(); } + use_iterator use_end(unsigned Idx) { return Uses[Idx].end(); } + use_iterator use_end(const_iterator I) { return Uses[I - begin()].end(); } + + typedef SmallVectorImpl<PartitionUse>::const_iterator const_use_iterator; + const_use_iterator use_begin(unsigned Idx) const { return Uses[Idx].begin(); } + const_use_iterator use_begin(const_iterator I) const { + return Uses[I - begin()].begin(); + } + const_use_iterator use_end(unsigned Idx) const { return Uses[Idx].end(); } + const_use_iterator use_end(const_iterator I) const { + return Uses[I - begin()].end(); + } + + unsigned use_size(unsigned Idx) const { return Uses[Idx].size(); } + unsigned use_size(const_iterator I) const { return Uses[I - begin()].size(); } + const PartitionUse &getUse(unsigned PIdx, unsigned UIdx) const { + return Uses[PIdx][UIdx]; + } + const PartitionUse &getUse(const_iterator I, unsigned UIdx) const { + return Uses[I - begin()][UIdx]; + } + + void use_push_back(unsigned Idx, const PartitionUse &PU) { + Uses[Idx].push_back(PU); + } + void use_push_back(const_iterator I, const PartitionUse &PU) { + Uses[I - begin()].push_back(PU); + } + /// @} + + /// \brief Allow iterating the dead users for this alloca. + /// + /// These are instructions which will never actually use the alloca as they + /// are outside the allocated range. They are safe to replace with undef and + /// delete. + /// @{ + typedef SmallVectorImpl<Instruction *>::const_iterator dead_user_iterator; + dead_user_iterator dead_user_begin() const { return DeadUsers.begin(); } + dead_user_iterator dead_user_end() const { return DeadUsers.end(); } + /// @} + + /// \brief Allow iterating the dead expressions referring to this alloca. + /// + /// These are operands which have cannot actually be used to refer to the + /// alloca as they are outside its range and the user doesn't correct for + /// that. These mostly consist of PHI node inputs and the like which we just + /// need to replace with undef. + /// @{ + typedef SmallVectorImpl<Use *>::const_iterator dead_op_iterator; + dead_op_iterator dead_op_begin() const { return DeadOperands.begin(); } + dead_op_iterator dead_op_end() const { return DeadOperands.end(); } + /// @} + + /// \brief MemTransferInst auxiliary data. + /// This struct provides some auxiliary data about memory transfer + /// intrinsics such as memcpy and memmove. These intrinsics can use two + /// different ranges within the same alloca, and provide other challenges to + /// correctly represent. We stash extra data to help us untangle this + /// after the partitioning is complete. 
+ struct MemTransferOffsets { + /// The destination begin and end offsets when the destination is within + /// this alloca. If the end offset is zero the destination is not within + /// this alloca. + uint64_t DestBegin, DestEnd; + + /// The source begin and end offsets when the source is within this alloca. + /// If the end offset is zero, the source is not within this alloca. + uint64_t SourceBegin, SourceEnd; + + /// Flag for whether an alloca is splittable. + bool IsSplittable; + }; + MemTransferOffsets getMemTransferOffsets(MemTransferInst &II) const { + return MemTransferInstData.lookup(&II); + } + + /// \brief Map from a PHI or select operand back to a partition. + /// + /// When manipulating PHI nodes or selects, they can use more than one + /// partition of an alloca. We store a special mapping to allow finding the + /// partition referenced by each of these operands, if any. + iterator findPartitionForPHIOrSelectOperand(Use *U) { + SmallDenseMap<Use *, std::pair<unsigned, unsigned> >::const_iterator MapIt + = PHIOrSelectOpMap.find(U); + if (MapIt == PHIOrSelectOpMap.end()) + return end(); + + return begin() + MapIt->second.first; + } + + /// \brief Map from a PHI or select operand back to the specific use of + /// a partition. + /// + /// Similar to mapping these operands back to the partitions, this maps + /// directly to the use structure of that partition. + use_iterator findPartitionUseForPHIOrSelectOperand(Use *U) { + SmallDenseMap<Use *, std::pair<unsigned, unsigned> >::const_iterator MapIt + = PHIOrSelectOpMap.find(U); + assert(MapIt != PHIOrSelectOpMap.end()); + return Uses[MapIt->second.first].begin() + MapIt->second.second; + } + + /// \brief Compute a common type among the uses of a particular partition. + /// + /// This routines walks all of the uses of a particular partition and tries + /// to find a common type between them. Untyped operations such as memset and + /// memcpy are ignored. + Type *getCommonType(iterator I) const; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const; + void printUsers(raw_ostream &OS, const_iterator I, + StringRef Indent = " ") const; + void print(raw_ostream &OS) const; + void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump(const_iterator I) const; + void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump() const; +#endif + +private: + template <typename DerivedT, typename RetT = void> class BuilderBase; + class PartitionBuilder; + friend class AllocaPartitioning::PartitionBuilder; + class UseBuilder; + friend class AllocaPartitioning::UseBuilder; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// \brief Handle to alloca instruction to simplify method interfaces. + AllocaInst &AI; +#endif + + /// \brief The instruction responsible for this alloca having no partitioning. + /// + /// When an instruction (potentially) escapes the pointer to the alloca, we + /// store a pointer to that here and abort trying to partition the alloca. + /// This will be null if the alloca is partitioned successfully. + Instruction *PointerEscapingInstr; + + /// \brief The partitions of the alloca. + /// + /// We store a vector of the partitions over the alloca here. This vector is + /// sorted by increasing begin offset, and then by decreasing end offset. See + /// the Partition inner class for more details. 
Initially (during + /// construction) there are overlaps, but we form a disjoint sequence of + /// partitions while finishing construction and a fully constructed object is + /// expected to always have this as a disjoint space. + SmallVector<Partition, 8> Partitions; + + /// \brief The uses of the partitions. + /// + /// This is essentially a mapping from each partition to a list of uses of + /// that partition. The mapping is done with a Uses vector that has the exact + /// same number of entries as the partition vector. Each entry is itself + /// a vector of the uses. + SmallVector<SmallVector<PartitionUse, 2>, 8> Uses; + + /// \brief Instructions which will become dead if we rewrite the alloca. + /// + /// Note that these are not separated by partition. This is because we expect + /// a partitioned alloca to be completely rewritten or not rewritten at all. + /// If rewritten, all these instructions can simply be removed and replaced + /// with undef as they come from outside of the allocated space. + SmallVector<Instruction *, 8> DeadUsers; + + /// \brief Operands which will become dead if we rewrite the alloca. + /// + /// These are operands that in their particular use can be replaced with + /// undef when we rewrite the alloca. These show up in out-of-bounds inputs + /// to PHI nodes and the like. They aren't entirely dead (there might be + /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we + /// want to swap this particular input for undef to simplify the use lists of + /// the alloca. + SmallVector<Use *, 8> DeadOperands; + + /// \brief The underlying storage for auxiliary memcpy and memset info. + SmallDenseMap<MemTransferInst *, MemTransferOffsets, 4> MemTransferInstData; + + /// \brief A side datastructure used when building up the partitions and uses. + /// + /// This mapping is only really used during the initial building of the + /// partitioning so that we can retain information about PHI and select nodes + /// processed. + SmallDenseMap<Instruction *, std::pair<uint64_t, bool> > PHIOrSelectSizes; + + /// \brief Auxiliary information for particular PHI or select operands. + SmallDenseMap<Use *, std::pair<unsigned, unsigned>, 4> PHIOrSelectOpMap; + + /// \brief A utility routine called from the constructor. + /// + /// This does what it says on the tin. It is the key of the alloca partition + /// splitting and merging. After it is called we have the desired disjoint + /// collection of partitions. + void splitAndMergePartitions(); +}; +} + +static Value *foldSelectInst(SelectInst &SI) { + // If the condition being selected on is a constant or the same value is + // being selected between, fold the select. Yes this does (rarely) happen + // early on. + if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition())) + return SI.getOperand(1+CI->isZero()); + if (SI.getOperand(1) == SI.getOperand(2)) { + return SI.getOperand(1); + } + return 0; +} + +/// \brief Builder for the alloca partitioning. +/// +/// This class builds an alloca partitioning by recursively visiting the uses +/// of an alloca and splitting the partitions for each load and store at each +/// offset. 
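Before the builder classes below walk the uses, the partition invariant can be sketched in isolation: ranges are sorted with begin offsets increasing and, on ties, end offsets decreasing (so a spanning range comes first), and overlapping ranges are then folded into a disjoint sequence. The toy helper below is an invented simplification that ignores the splittable/unsplittable distinction and the per-partition use lists.

#include <algorithm>
#include <cstddef>
#include <vector>

struct Range { unsigned long long Begin, End; };   // Half-open [Begin, End).

// Spanning ranges sort first: begin ascending, then end descending.
static bool rangeLess(const Range &L, const Range &R) {
  if (L.Begin != R.Begin) return L.Begin < R.Begin;
  return L.End > R.End;
}

// Collapse possibly-overlapping ranges into a disjoint, sorted sequence.
static std::vector<Range> makeDisjoint(std::vector<Range> Rs) {
  std::sort(Rs.begin(), Rs.end(), rangeLess);
  std::vector<Range> Out;
  for (std::size_t i = 0, e = Rs.size(); i != e; ++i) {
    if (!Out.empty() && Rs[i].Begin < Out.back().End)
      Out.back().End = std::max(Out.back().End, Rs[i].End);   // Merge overlap.
    else
      Out.push_back(Rs[i]);
  }
  return Out;
}
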
+class AllocaPartitioning::PartitionBuilder + : public PtrUseVisitor<PartitionBuilder> { + friend class PtrUseVisitor<PartitionBuilder>; + friend class InstVisitor<PartitionBuilder>; + typedef PtrUseVisitor<PartitionBuilder> Base; + + const uint64_t AllocSize; + AllocaPartitioning &P; + + SmallDenseMap<Instruction *, unsigned> MemTransferPartitionMap; + +public: + PartitionBuilder(const DataLayout &DL, AllocaInst &AI, AllocaPartitioning &P) + : PtrUseVisitor<PartitionBuilder>(DL), + AllocSize(DL.getTypeAllocSize(AI.getAllocatedType())), + P(P) {} + +private: + void insertUse(Instruction &I, const APInt &Offset, uint64_t Size, + bool IsSplittable = false) { + // Completely skip uses which have a zero size or start either before or + // past the end of the allocation. + if (Size == 0 || Offset.isNegative() || Offset.uge(AllocSize)) { + DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset + << " which has zero size or starts outside of the " + << AllocSize << " byte alloca:\n" + << " alloca: " << P.AI << "\n" + << " use: " << I << "\n"); + return; + } + + uint64_t BeginOffset = Offset.getZExtValue(); + uint64_t EndOffset = BeginOffset + Size; + + // Clamp the end offset to the end of the allocation. Note that this is + // formulated to handle even the case where "BeginOffset + Size" overflows. + // NOTE! This may appear superficially to be something we could ignore + // entirely, but that is not so! There may be PHI-node uses where some + // instructions are dead but not others. We can't completely ignore the + // PHI node, and so have to record at least the information here. + assert(AllocSize >= BeginOffset); // Established above. + if (Size > AllocSize - BeginOffset) { + DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset + << " to remain within the " << AllocSize << " byte alloca:\n" + << " alloca: " << P.AI << "\n" + << " use: " << I << "\n"); + EndOffset = AllocSize; + } + + Partition New(BeginOffset, EndOffset, IsSplittable); + P.Partitions.push_back(New); + } + + void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset, + bool IsVolatile) { + uint64_t Size = DL.getTypeStoreSize(Ty); + + // If this memory access can be shown to *statically* extend outside the + // bounds of of the allocation, it's behavior is undefined, so simply + // ignore it. Note that this is more strict than the generic clamping + // behavior of insertUse. We also try to handle cases which might run the + // risk of overflow. + // FIXME: We should instead consider the pointer to have escaped if this + // function is being instrumented for addressing bugs or race conditions. + if (Offset.isNegative() || Size > AllocSize || + Offset.ugt(AllocSize - Size)) { + DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte " + << (isa<LoadInst>(I) ? "load" : "store") << " @" << Offset + << " which extends past the end of the " << AllocSize + << " byte alloca:\n" + << " alloca: " << P.AI << "\n" + << " use: " << I << "\n"); + return; + } + + // We allow splitting of loads and stores where the type is an integer type + // and which cover the entire alloca. Such integer loads and stores + // often require decomposition into fine grained loads and stores. 
+ bool IsSplittable = false; + if (IntegerType *ITy = dyn_cast<IntegerType>(Ty)) + IsSplittable = !IsVolatile && ITy->getBitWidth() == AllocSize*8; + + insertUse(I, Offset, Size, IsSplittable); + } + + void visitLoadInst(LoadInst &LI) { + assert((!LI.isSimple() || LI.getType()->isSingleValueType()) && + "All simple FCA loads should have been pre-split"); + + if (!IsOffsetKnown) + return PI.setAborted(&LI); + + return handleLoadOrStore(LI.getType(), LI, Offset, LI.isVolatile()); + } + + void visitStoreInst(StoreInst &SI) { + Value *ValOp = SI.getValueOperand(); + if (ValOp == *U) + return PI.setEscapedAndAborted(&SI); + if (!IsOffsetKnown) + return PI.setAborted(&SI); + + assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) && + "All simple FCA stores should have been pre-split"); + handleLoadOrStore(ValOp->getType(), SI, Offset, SI.isVolatile()); + } + + + void visitMemSetInst(MemSetInst &II) { + assert(II.getRawDest() == *U && "Pointer use is not the destination?"); + ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); + if ((Length && Length->getValue() == 0) || + (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize))) + // Zero-length mem transfer intrinsics can be ignored entirely. + return; + + if (!IsOffsetKnown) + return PI.setAborted(&II); + + insertUse(II, Offset, + Length ? Length->getLimitedValue() + : AllocSize - Offset.getLimitedValue(), + (bool)Length); + } + + void visitMemTransferInst(MemTransferInst &II) { + ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); + if ((Length && Length->getValue() == 0) || + (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize))) + // Zero-length mem transfer intrinsics can be ignored entirely. + return; + + if (!IsOffsetKnown) + return PI.setAborted(&II); + + uint64_t RawOffset = Offset.getLimitedValue(); + uint64_t Size = Length ? Length->getLimitedValue() + : AllocSize - RawOffset; + + MemTransferOffsets &Offsets = P.MemTransferInstData[&II]; + + // Only intrinsics with a constant length can be split. + Offsets.IsSplittable = Length; + + if (*U == II.getRawDest()) { + Offsets.DestBegin = RawOffset; + Offsets.DestEnd = RawOffset + Size; + } + if (*U == II.getRawSource()) { + Offsets.SourceBegin = RawOffset; + Offsets.SourceEnd = RawOffset + Size; + } + + // If we have set up end offsets for both the source and the destination, + // we have found both sides of this transfer pointing at the same alloca. + bool SeenBothEnds = Offsets.SourceEnd && Offsets.DestEnd; + if (SeenBothEnds && II.getRawDest() != II.getRawSource()) { + unsigned PrevIdx = MemTransferPartitionMap[&II]; + + // Check if the begin offsets match and this is a non-volatile transfer. + // In that case, we can completely elide the transfer. + if (!II.isVolatile() && Offsets.SourceBegin == Offsets.DestBegin) { + P.Partitions[PrevIdx].kill(); + return; + } + + // Otherwise we have an offset transfer within the same alloca. We can't + // split those. + P.Partitions[PrevIdx].IsSplittable = Offsets.IsSplittable = false; + } else if (SeenBothEnds) { + // Handle the case where this exact use provides both ends of the + // operation. + assert(II.getRawDest() == II.getRawSource()); + + // For non-volatile transfers this is a no-op. + if (!II.isVolatile()) + return; + + // Otherwise just suppress splitting. + Offsets.IsSplittable = false; + } + + + // Insert the use now that we've fixed up the splittable nature. 
+ insertUse(II, Offset, Size, Offsets.IsSplittable); + + // Setup the mapping from intrinsic to partition of we've not seen both + // ends of this transfer. + if (!SeenBothEnds) { + unsigned NewIdx = P.Partitions.size() - 1; + bool Inserted + = MemTransferPartitionMap.insert(std::make_pair(&II, NewIdx)).second; + assert(Inserted && + "Already have intrinsic in map but haven't seen both ends"); + (void)Inserted; + } + } + + // Disable SRoA for any intrinsics except for lifetime invariants. + // FIXME: What about debug instrinsics? This matches old behavior, but + // doesn't make sense. + void visitIntrinsicInst(IntrinsicInst &II) { + if (!IsOffsetKnown) + return PI.setAborted(&II); + + if (II.getIntrinsicID() == Intrinsic::lifetime_start || + II.getIntrinsicID() == Intrinsic::lifetime_end) { + ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0)); + uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(), + Length->getLimitedValue()); + insertUse(II, Offset, Size, true); + return; + } + + Base::visitIntrinsicInst(II); + } + + Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) { + // We consider any PHI or select that results in a direct load or store of + // the same offset to be a viable use for partitioning purposes. These uses + // are considered unsplittable and the size is the maximum loaded or stored + // size. + SmallPtrSet<Instruction *, 4> Visited; + SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses; + Visited.insert(Root); + Uses.push_back(std::make_pair(cast<Instruction>(*U), Root)); + // If there are no loads or stores, the access is dead. We mark that as + // a size zero access. + Size = 0; + do { + Instruction *I, *UsedI; + llvm::tie(UsedI, I) = Uses.pop_back_val(); + + if (LoadInst *LI = dyn_cast<LoadInst>(I)) { + Size = std::max(Size, DL.getTypeStoreSize(LI->getType())); + continue; + } + if (StoreInst *SI = dyn_cast<StoreInst>(I)) { + Value *Op = SI->getOperand(0); + if (Op == UsedI) + return SI; + Size = std::max(Size, DL.getTypeStoreSize(Op->getType())); + continue; + } + + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) { + if (!GEP->hasAllZeroIndices()) + return GEP; + } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) && + !isa<SelectInst>(I)) { + return I; + } + + for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); UI != UE; + ++UI) + if (Visited.insert(cast<Instruction>(*UI))) + Uses.push_back(std::make_pair(I, cast<Instruction>(*UI))); + } while (!Uses.empty()); + + return 0; + } + + void visitPHINode(PHINode &PN) { + if (PN.use_empty()) + return; + if (!IsOffsetKnown) + return PI.setAborted(&PN); + + // See if we already have computed info on this node. + std::pair<uint64_t, bool> &PHIInfo = P.PHIOrSelectSizes[&PN]; + if (PHIInfo.first) { + PHIInfo.second = true; + insertUse(PN, Offset, PHIInfo.first); + return; + } + + // Check for an unsafe use of the PHI node. + if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&PN, PHIInfo.first)) + return PI.setAborted(UnsafeI); + + insertUse(PN, Offset, PHIInfo.first); + } + + void visitSelectInst(SelectInst &SI) { + if (SI.use_empty()) + return; + if (Value *Result = foldSelectInst(SI)) { + if (Result == *U) + // If the result of the constant fold will be the pointer, recurse + // through the select as if we had RAUW'ed it. + enqueueUsers(SI); + + return; + } + if (!IsOffsetKnown) + return PI.setAborted(&SI); + + // See if we already have computed info on this node. 
+ std::pair<uint64_t, bool> &SelectInfo = P.PHIOrSelectSizes[&SI]; + if (SelectInfo.first) { + SelectInfo.second = true; + insertUse(SI, Offset, SelectInfo.first); + return; + } + + // Check for an unsafe use of the PHI node. + if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&SI, SelectInfo.first)) + return PI.setAborted(UnsafeI); + + insertUse(SI, Offset, SelectInfo.first); + } + + /// \brief Disable SROA entirely if there are unhandled users of the alloca. + void visitInstruction(Instruction &I) { + PI.setAborted(&I); + } +}; + +/// \brief Use adder for the alloca partitioning. +/// +/// This class adds the uses of an alloca to all of the partitions which they +/// use. For splittable partitions, this can end up doing essentially a linear +/// walk of the partitions, but the number of steps remains bounded by the +/// total result instruction size: +/// - The number of partitions is a result of the number unsplittable +/// instructions using the alloca. +/// - The number of users of each partition is at worst the total number of +/// splittable instructions using the alloca. +/// Thus we will produce N * M instructions in the end, where N are the number +/// of unsplittable uses and M are the number of splittable. This visitor does +/// the exact same number of updates to the partitioning. +/// +/// In the more common case, this visitor will leverage the fact that the +/// partition space is pre-sorted, and do a logarithmic search for the +/// partition needed, making the total visit a classical ((N + M) * log(N)) +/// complexity operation. +class AllocaPartitioning::UseBuilder : public PtrUseVisitor<UseBuilder> { + friend class PtrUseVisitor<UseBuilder>; + friend class InstVisitor<UseBuilder>; + typedef PtrUseVisitor<UseBuilder> Base; + + const uint64_t AllocSize; + AllocaPartitioning &P; + + /// \brief Set to de-duplicate dead instructions found in the use walk. + SmallPtrSet<Instruction *, 4> VisitedDeadInsts; + +public: + UseBuilder(const DataLayout &TD, AllocaInst &AI, AllocaPartitioning &P) + : PtrUseVisitor<UseBuilder>(TD), + AllocSize(TD.getTypeAllocSize(AI.getAllocatedType())), + P(P) {} + +private: + void markAsDead(Instruction &I) { + if (VisitedDeadInsts.insert(&I)) + P.DeadUsers.push_back(&I); + } + + void insertUse(Instruction &User, const APInt &Offset, uint64_t Size) { + // If the use has a zero size or extends outside of the allocation, record + // it as a dead use for elimination later. + if (Size == 0 || Offset.isNegative() || Offset.uge(AllocSize)) + return markAsDead(User); + + uint64_t BeginOffset = Offset.getZExtValue(); + uint64_t EndOffset = BeginOffset + Size; + + // Clamp the end offset to the end of the allocation. Note that this is + // formulated to handle even the case where "BeginOffset + Size" overflows. + assert(AllocSize >= BeginOffset); // Established above. + if (Size > AllocSize - BeginOffset) + EndOffset = AllocSize; + + // NB: This only works if we have zero overlapping partitions. 
+ iterator B = std::lower_bound(P.begin(), P.end(), BeginOffset); + if (B != P.begin() && llvm::prior(B)->EndOffset > BeginOffset) + B = llvm::prior(B); + for (iterator I = B, E = P.end(); I != E && I->BeginOffset < EndOffset; + ++I) { + PartitionUse NewPU(std::max(I->BeginOffset, BeginOffset), + std::min(I->EndOffset, EndOffset), U); + P.use_push_back(I, NewPU); + if (isa<PHINode>(U->getUser()) || isa<SelectInst>(U->getUser())) + P.PHIOrSelectOpMap[U] + = std::make_pair(I - P.begin(), P.Uses[I - P.begin()].size() - 1); + } + } + + void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset) { + uint64_t Size = DL.getTypeStoreSize(Ty); + + // If this memory access can be shown to *statically* extend outside the + // bounds of of the allocation, it's behavior is undefined, so simply + // ignore it. Note that this is more strict than the generic clamping + // behavior of insertUse. + if (Offset.isNegative() || Size > AllocSize || + Offset.ugt(AllocSize - Size)) + return markAsDead(I); + + insertUse(I, Offset, Size); + } + + void visitBitCastInst(BitCastInst &BC) { + if (BC.use_empty()) + return markAsDead(BC); + + return Base::visitBitCastInst(BC); + } + + void visitGetElementPtrInst(GetElementPtrInst &GEPI) { + if (GEPI.use_empty()) + return markAsDead(GEPI); + + return Base::visitGetElementPtrInst(GEPI); + } + + void visitLoadInst(LoadInst &LI) { + assert(IsOffsetKnown); + handleLoadOrStore(LI.getType(), LI, Offset); + } + + void visitStoreInst(StoreInst &SI) { + assert(IsOffsetKnown); + handleLoadOrStore(SI.getOperand(0)->getType(), SI, Offset); + } + + void visitMemSetInst(MemSetInst &II) { + ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); + if ((Length && Length->getValue() == 0) || + (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize))) + return markAsDead(II); + + assert(IsOffsetKnown); + insertUse(II, Offset, Length ? Length->getLimitedValue() + : AllocSize - Offset.getLimitedValue()); + } + + void visitMemTransferInst(MemTransferInst &II) { + ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); + if ((Length && Length->getValue() == 0) || + (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize))) + return markAsDead(II); + + assert(IsOffsetKnown); + uint64_t Size = Length ? Length->getLimitedValue() + : AllocSize - Offset.getLimitedValue(); + + MemTransferOffsets &Offsets = P.MemTransferInstData[&II]; + if (!II.isVolatile() && Offsets.DestEnd && Offsets.SourceEnd && + Offsets.DestBegin == Offsets.SourceBegin) + return markAsDead(II); // Skip identity transfers without side-effects. + + insertUse(II, Offset, Size); + } + + void visitIntrinsicInst(IntrinsicInst &II) { + assert(IsOffsetKnown); + assert(II.getIntrinsicID() == Intrinsic::lifetime_start || + II.getIntrinsicID() == Intrinsic::lifetime_end); + + ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0)); + insertUse(II, Offset, std::min(Length->getLimitedValue(), + AllocSize - Offset.getLimitedValue())); + } + + void insertPHIOrSelect(Instruction &User, const APInt &Offset) { + uint64_t Size = P.PHIOrSelectSizes.lookup(&User).first; + + // For PHI and select operands outside the alloca, we can't nuke the entire + // phi or select -- the other side might still be relevant, so we special + // case them here and use a separate structure to track the operands + // themselves which should be replaced with undef. 
+ if ((Offset.isNegative() && Offset.uge(Size)) || + (!Offset.isNegative() && Offset.uge(AllocSize))) { + P.DeadOperands.push_back(U); + return; + } + + insertUse(User, Offset, Size); + } + + void visitPHINode(PHINode &PN) { + if (PN.use_empty()) + return markAsDead(PN); + + assert(IsOffsetKnown); + insertPHIOrSelect(PN, Offset); + } + + void visitSelectInst(SelectInst &SI) { + if (SI.use_empty()) + return markAsDead(SI); + + if (Value *Result = foldSelectInst(SI)) { + if (Result == *U) + // If the result of the constant fold will be the pointer, recurse + // through the select as if we had RAUW'ed it. + enqueueUsers(SI); + else + // Otherwise the operand to the select is dead, and we can replace it + // with undef. + P.DeadOperands.push_back(U); + + return; + } + + assert(IsOffsetKnown); + insertPHIOrSelect(SI, Offset); + } + + /// \brief Unreachable, we've already visited the alloca once. + void visitInstruction(Instruction &I) { + llvm_unreachable("Unhandled instruction in use builder."); + } +}; + +void AllocaPartitioning::splitAndMergePartitions() { + size_t NumDeadPartitions = 0; + + // Track the range of splittable partitions that we pass when accumulating + // overlapping unsplittable partitions. + uint64_t SplitEndOffset = 0ull; + + Partition New(0ull, 0ull, false); + + for (unsigned i = 0, j = i, e = Partitions.size(); i != e; i = j) { + ++j; + + if (!Partitions[i].IsSplittable || New.BeginOffset == New.EndOffset) { + assert(New.BeginOffset == New.EndOffset); + New = Partitions[i]; + } else { + assert(New.IsSplittable); + New.EndOffset = std::max(New.EndOffset, Partitions[i].EndOffset); + } + assert(New.BeginOffset != New.EndOffset); + + // Scan the overlapping partitions. + while (j != e && New.EndOffset > Partitions[j].BeginOffset) { + // If the new partition we are forming is splittable, stop at the first + // unsplittable partition. + if (New.IsSplittable && !Partitions[j].IsSplittable) + break; + + // Grow the new partition to include any equally splittable range. 'j' is + // always equally splittable when New is splittable, but when New is not + // splittable, we may subsume some (or part of some) splitable partition + // without growing the new one. + if (New.IsSplittable == Partitions[j].IsSplittable) { + New.EndOffset = std::max(New.EndOffset, Partitions[j].EndOffset); + } else { + assert(!New.IsSplittable); + assert(Partitions[j].IsSplittable); + SplitEndOffset = std::max(SplitEndOffset, Partitions[j].EndOffset); + } + + Partitions[j].kill(); + ++NumDeadPartitions; + ++j; + } + + // If the new partition is splittable, chop off the end as soon as the + // unsplittable subsequent partition starts and ensure we eventually cover + // the splittable area. + if (j != e && New.IsSplittable) { + SplitEndOffset = std::max(SplitEndOffset, New.EndOffset); + New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset); + } + + // Add the new partition if it differs from the original one and is + // non-empty. We can end up with an empty partition here if it was + // splittable but there is an unsplittable one that starts at the same + // offset. + if (New != Partitions[i]) { + if (New.BeginOffset != New.EndOffset) + Partitions.push_back(New); + // Mark the old one for removal. 
+ Partitions[i].kill(); + ++NumDeadPartitions; + } + + New.BeginOffset = New.EndOffset; + if (!New.IsSplittable) { + New.EndOffset = std::max(New.EndOffset, SplitEndOffset); + if (j != e && !Partitions[j].IsSplittable) + New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset); + New.IsSplittable = true; + // If there is a trailing splittable partition which won't be fused into + // the next splittable partition go ahead and add it onto the partitions + // list. + if (New.BeginOffset < New.EndOffset && + (j == e || !Partitions[j].IsSplittable || + New.EndOffset < Partitions[j].BeginOffset)) { + Partitions.push_back(New); + New.BeginOffset = New.EndOffset = 0ull; + } + } + } + + // Re-sort the partitions now that they have been split and merged into + // disjoint set of partitions. Also remove any of the dead partitions we've + // replaced in the process. + std::sort(Partitions.begin(), Partitions.end()); + if (NumDeadPartitions) { + assert(Partitions.back().isDead()); + assert((ptrdiff_t)NumDeadPartitions == + std::count(Partitions.begin(), Partitions.end(), Partitions.back())); + } + Partitions.erase(Partitions.end() - NumDeadPartitions, Partitions.end()); +} + +AllocaPartitioning::AllocaPartitioning(const DataLayout &TD, AllocaInst &AI) + : +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + AI(AI), +#endif + PointerEscapingInstr(0) { + PartitionBuilder PB(TD, AI, *this); + PartitionBuilder::PtrInfo PtrI = PB.visitPtr(AI); + if (PtrI.isEscaped() || PtrI.isAborted()) { + // FIXME: We should sink the escape vs. abort info into the caller nicely, + // possibly by just storing the PtrInfo in the AllocaPartitioning. + PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst() + : PtrI.getAbortingInst(); + assert(PointerEscapingInstr && "Did not track a bad instruction"); + return; + } + + // Sort the uses. This arranges for the offsets to be in ascending order, + // and the sizes to be in descending order. + std::sort(Partitions.begin(), Partitions.end()); + + // Remove any partitions from the back which are marked as dead. + while (!Partitions.empty() && Partitions.back().isDead()) + Partitions.pop_back(); + + if (Partitions.size() > 1) { + // Intersect splittability for all partitions with equal offsets and sizes. + // Then remove all but the first so that we have a sequence of non-equal but + // potentially overlapping partitions. + for (iterator I = Partitions.begin(), J = I, E = Partitions.end(); I != E; + I = J) { + ++J; + while (J != E && *I == *J) { + I->IsSplittable &= J->IsSplittable; + ++J; + } + } + Partitions.erase(std::unique(Partitions.begin(), Partitions.end()), + Partitions.end()); + + // Split splittable and merge unsplittable partitions into a disjoint set + // of partitions over the used space of the allocation. + splitAndMergePartitions(); + } + + // Now build up the user lists for each of these disjoint partitions by + // re-walking the recursive users of the alloca. + Uses.resize(Partitions.size()); + UseBuilder UB(TD, AI, *this); + PtrI = UB.visitPtr(AI); + assert(!PtrI.isEscaped() && "Previously analyzed pointer now escapes!"); + assert(!PtrI.isAborted() && "Early aborted the visit of the pointer."); +} + +Type *AllocaPartitioning::getCommonType(iterator I) const { + Type *Ty = 0; + for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) { + if (!UI->U) + continue; // Skip dead uses. 
+ if (isa<IntrinsicInst>(*UI->U->getUser())) + continue; + if (UI->BeginOffset != I->BeginOffset || UI->EndOffset != I->EndOffset) + continue; + + Type *UserTy = 0; + if (LoadInst *LI = dyn_cast<LoadInst>(UI->U->getUser())) { + UserTy = LI->getType(); + } else if (StoreInst *SI = dyn_cast<StoreInst>(UI->U->getUser())) { + UserTy = SI->getValueOperand()->getType(); + } else { + return 0; // Bail if we have weird uses. + } + + if (IntegerType *ITy = dyn_cast<IntegerType>(UserTy)) { + // If the type is larger than the partition, skip it. We only encounter + // this for split integer operations where we want to use the type of the + // entity causing the split. + if (ITy->getBitWidth() > (I->EndOffset - I->BeginOffset)*8) + continue; + + // If we have found an integer type use covering the alloca, use that + // regardless of the other types, as integers are often used for a "bucket + // of bits" type. + return ITy; + } + + if (Ty && Ty != UserTy) + return 0; + + Ty = UserTy; + } + return Ty; +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + +void AllocaPartitioning::print(raw_ostream &OS, const_iterator I, + StringRef Indent) const { + OS << Indent << "partition #" << (I - begin()) + << " [" << I->BeginOffset << "," << I->EndOffset << ")" + << (I->IsSplittable ? " (splittable)" : "") + << (Uses[I - begin()].empty() ? " (zero uses)" : "") + << "\n"; +} + +void AllocaPartitioning::printUsers(raw_ostream &OS, const_iterator I, + StringRef Indent) const { + for (const_use_iterator UI = use_begin(I), UE = use_end(I); + UI != UE; ++UI) { + if (!UI->U) + continue; // Skip dead uses. + OS << Indent << " [" << UI->BeginOffset << "," << UI->EndOffset << ") " + << "used by: " << *UI->U->getUser() << "\n"; + if (MemTransferInst *II = dyn_cast<MemTransferInst>(UI->U->getUser())) { + const MemTransferOffsets &MTO = MemTransferInstData.lookup(II); + bool IsDest; + if (!MTO.IsSplittable) + IsDest = UI->BeginOffset == MTO.DestBegin; + else + IsDest = MTO.DestBegin != 0u; + OS << Indent << " (original " << (IsDest ? "dest" : "source") << ": " + << "[" << (IsDest ? MTO.DestBegin : MTO.SourceBegin) + << "," << (IsDest ? MTO.DestEnd : MTO.SourceEnd) << ")\n"; + } + } +} + +void AllocaPartitioning::print(raw_ostream &OS) const { + if (PointerEscapingInstr) { + OS << "No partitioning for alloca: " << AI << "\n" + << " A pointer to this alloca escaped by:\n" + << " " << *PointerEscapingInstr << "\n"; + return; + } + + OS << "Partitioning of alloca: " << AI << "\n"; + unsigned Num = 0; + for (const_iterator I = begin(), E = end(); I != E; ++I, ++Num) { + print(OS, I); + printUsers(OS, I); + } +} + +void AllocaPartitioning::dump(const_iterator I) const { print(dbgs(), I); } +void AllocaPartitioning::dump() const { print(dbgs()); } + +#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + + +namespace { +/// \brief Implementation of LoadAndStorePromoter for promoting allocas. +/// +/// This subclass of LoadAndStorePromoter adds overrides to handle promoting +/// the loads and stores of an alloca instruction, as well as updating its +/// debug information. This is used when a domtree is unavailable and thus +/// mem2reg in its full form can't be used to handle promotion of allocas to +/// scalar values. 
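The promotion itself can be pictured with a deliberately tiny, single-block model; the types and helper below are invented for the example and are not the LLVM API. Stores define the current value, loads are rewritten to use it, and afterwards every access to the promoted slot is dead. The real class extends this across blocks via SSAUpdater and also migrates the associated debug intrinsics.

#include <cstddef>
#include <cstdio>
#include <vector>

struct Op { bool IsStore; int Value; };            // Value is only used by stores.

// Rewrite the loads of one promoted slot within a single block: each load
// yields the most recently stored value (0 standing in for "undef" when no
// store has been seen yet).  All of the Ops can then be deleted.
static std::vector<int> promoteSingleBlock(const std::vector<Op> &Ops) {
  std::vector<int> LoadResults;
  bool HaveValue = false;
  int Current = 0;
  for (std::size_t i = 0, e = Ops.size(); i != e; ++i) {
    if (Ops[i].IsStore) {
      Current = Ops[i].Value;                      // Store defines the value.
      HaveValue = true;
    } else {
      LoadResults.push_back(HaveValue ? Current : 0);  // Load uses it.
    }
  }
  return LoadResults;
}

int main() {
  Op Seq[] = { {true, 7}, {false, 0}, {true, 9}, {false, 0} };
  std::vector<Op> Ops(Seq, Seq + 4);
  std::vector<int> R = promoteSingleBlock(Ops);
  std::printf("%d %d\n", R[0], R[1]);              // Prints "7 9".
}
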
+class AllocaPromoter : public LoadAndStorePromoter { + AllocaInst &AI; + DIBuilder &DIB; + + SmallVector<DbgDeclareInst *, 4> DDIs; + SmallVector<DbgValueInst *, 4> DVIs; + +public: + AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S, + AllocaInst &AI, DIBuilder &DIB) + : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {} + + void run(const SmallVectorImpl<Instruction*> &Insts) { + // Remember which alloca we're promoting (for isInstInList). + if (MDNode *DebugNode = MDNode::getIfExists(AI.getContext(), &AI)) { + for (Value::use_iterator UI = DebugNode->use_begin(), + UE = DebugNode->use_end(); + UI != UE; ++UI) + if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(*UI)) + DDIs.push_back(DDI); + else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(*UI)) + DVIs.push_back(DVI); + } + + LoadAndStorePromoter::run(Insts); + AI.eraseFromParent(); + while (!DDIs.empty()) + DDIs.pop_back_val()->eraseFromParent(); + while (!DVIs.empty()) + DVIs.pop_back_val()->eraseFromParent(); + } + + virtual bool isInstInList(Instruction *I, + const SmallVectorImpl<Instruction*> &Insts) const { + if (LoadInst *LI = dyn_cast<LoadInst>(I)) + return LI->getOperand(0) == &AI; + return cast<StoreInst>(I)->getPointerOperand() == &AI; + } + + virtual void updateDebugInfo(Instruction *Inst) const { + for (SmallVector<DbgDeclareInst *, 4>::const_iterator I = DDIs.begin(), + E = DDIs.end(); I != E; ++I) { + DbgDeclareInst *DDI = *I; + if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) + ConvertDebugDeclareToDebugValue(DDI, SI, DIB); + else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) + ConvertDebugDeclareToDebugValue(DDI, LI, DIB); + } + for (SmallVector<DbgValueInst *, 4>::const_iterator I = DVIs.begin(), + E = DVIs.end(); I != E; ++I) { + DbgValueInst *DVI = *I; + Value *Arg = NULL; + if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + // If an argument is zero extended then use argument directly. The ZExt + // may be zapped by an optimization pass in future. + if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0))) + Arg = dyn_cast<Argument>(ZExt->getOperand(0)); + if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0))) + Arg = dyn_cast<Argument>(SExt->getOperand(0)); + if (!Arg) + Arg = SI->getOperand(0); + } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { + Arg = LI->getOperand(0); + } else { + continue; + } + Instruction *DbgVal = + DIB.insertDbgValueIntrinsic(Arg, 0, DIVariable(DVI->getVariable()), + Inst); + DbgVal->setDebugLoc(DVI->getDebugLoc()); + } + } +}; +} // end anon namespace + + +namespace { +/// \brief An optimization pass providing Scalar Replacement of Aggregates. +/// +/// This pass takes allocations which can be completely analyzed (that is, they +/// don't escape) and tries to turn them into scalar SSA values. There are +/// a few steps to this process. +/// +/// 1) It takes allocations of aggregates and analyzes the ways in which they +/// are used to try to split them into smaller allocations, ideally of +/// a single scalar data type. It will split up memcpy and memset accesses +/// as necessary and try to isolate invidual scalar accesses. +/// 2) It will transform accesses into forms which are suitable for SSA value +/// promotion. This can be replacing a memset with a scalar store of an +/// integer value, or it can involve speculating operations on a PHI or +/// select to be a PHI or select of the results. 
+/// 3) Finally, this will try to detect a pattern of accesses which map cleanly +/// onto insert and extract operations on a vector value, and convert them to +/// this form. By doing so, it will enable promotion of vector aggregates to +/// SSA vector values. +class SROA : public FunctionPass { + const bool RequiresDomTree; + + LLVMContext *C; + const DataLayout *TD; + DominatorTree *DT; + + /// \brief Worklist of alloca instructions to simplify. + /// + /// Each alloca in the function is added to this. Each new alloca formed gets + /// added to it as well to recursively simplify unless that alloca can be + /// directly promoted. Finally, each time we rewrite a use of an alloca other + /// the one being actively rewritten, we add it back onto the list if not + /// already present to ensure it is re-visited. + SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > Worklist; + + /// \brief A collection of instructions to delete. + /// We try to batch deletions to simplify code and make things a bit more + /// efficient. + SetVector<Instruction *, SmallVector<Instruction *, 8> > DeadInsts; + + /// \brief Post-promotion worklist. + /// + /// Sometimes we discover an alloca which has a high probability of becoming + /// viable for SROA after a round of promotion takes place. In those cases, + /// the alloca is enqueued here for re-processing. + /// + /// Note that we have to be very careful to clear allocas out of this list in + /// the event they are deleted. + SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > PostPromotionWorklist; + + /// \brief A collection of alloca instructions we can directly promote. + std::vector<AllocaInst *> PromotableAllocas; + +public: + SROA(bool RequiresDomTree = true) + : FunctionPass(ID), RequiresDomTree(RequiresDomTree), + C(0), TD(0), DT(0) { + initializeSROAPass(*PassRegistry::getPassRegistry()); + } + bool runOnFunction(Function &F); + void getAnalysisUsage(AnalysisUsage &AU) const; + + const char *getPassName() const { return "SROA"; } + static char ID; + +private: + friend class PHIOrSelectSpeculator; + friend class AllocaPartitionRewriter; + friend class AllocaPartitionVectorRewriter; + + bool rewriteAllocaPartition(AllocaInst &AI, + AllocaPartitioning &P, + AllocaPartitioning::iterator PI); + bool splitAlloca(AllocaInst &AI, AllocaPartitioning &P); + bool runOnAlloca(AllocaInst &AI); + void deleteDeadInstructions(SmallPtrSet<AllocaInst *, 4> &DeletedAllocas); + bool promoteAllocas(Function &F); +}; +} + +char SROA::ID = 0; + +FunctionPass *llvm::createSROAPass(bool RequiresDomTree) { + return new SROA(RequiresDomTree); +} + +INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates", + false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates", + false, false) + +namespace { +/// \brief Visitor to speculate PHIs and Selects where possible. +class PHIOrSelectSpeculator : public InstVisitor<PHIOrSelectSpeculator> { + // Befriend the base class so it can delegate to private visit methods. + friend class llvm::InstVisitor<PHIOrSelectSpeculator>; + + const DataLayout &TD; + AllocaPartitioning &P; + SROA &Pass; + +public: + PHIOrSelectSpeculator(const DataLayout &TD, AllocaPartitioning &P, SROA &Pass) + : TD(TD), P(P), Pass(Pass) {} + + /// \brief Visit the users of an alloca partition and rewrite them. 
+ void visitUsers(AllocaPartitioning::const_iterator PI) { + // Note that we need to use an index here as the underlying vector of uses + // may be grown during speculation. However, we never need to re-visit the + // new uses, and so we can use the initial size bound. + for (unsigned Idx = 0, Size = P.use_size(PI); Idx != Size; ++Idx) { + const AllocaPartitioning::PartitionUse &PU = P.getUse(PI, Idx); + if (!PU.U) + continue; // Skip dead use. + + visit(cast<Instruction>(PU.U->getUser())); + } + } + +private: + // By default, skip this instruction. + void visitInstruction(Instruction &I) {} + + /// PHI instructions that use an alloca and are subsequently loaded can be + /// rewritten to load both input pointers in the pred blocks and then PHI the + /// results, allowing the load of the alloca to be promoted. + /// From this: + /// %P2 = phi [i32* %Alloca, i32* %Other] + /// %V = load i32* %P2 + /// to: + /// %V1 = load i32* %Alloca -> will be mem2reg'd + /// ... + /// %V2 = load i32* %Other + /// ... + /// %V = phi [i32 %V1, i32 %V2] + /// + /// We can do this to a select if its only uses are loads and if the operands + /// to the select can be loaded unconditionally. + /// + /// FIXME: This should be hoisted into a generic utility, likely in + /// Transforms/Util/Local.h + bool isSafePHIToSpeculate(PHINode &PN, SmallVectorImpl<LoadInst *> &Loads) { + // For now, we can only do this promotion if the load is in the same block + // as the PHI, and if there are no stores between the phi and load. + // TODO: Allow recursive phi users. + // TODO: Allow stores. + BasicBlock *BB = PN.getParent(); + unsigned MaxAlign = 0; + for (Value::use_iterator UI = PN.use_begin(), UE = PN.use_end(); + UI != UE; ++UI) { + LoadInst *LI = dyn_cast<LoadInst>(*UI); + if (LI == 0 || !LI->isSimple()) return false; + + // For now we only allow loads in the same block as the PHI. This is + // a common case that happens when instcombine merges two loads through + // a PHI. + if (LI->getParent() != BB) return false; + + // Ensure that there are no instructions between the PHI and the load that + // could store. + for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI) + if (BBI->mayWriteToMemory()) + return false; + + MaxAlign = std::max(MaxAlign, LI->getAlignment()); + Loads.push_back(LI); + } + + // We can only transform this if it is safe to push the loads into the + // predecessor blocks. The only thing to watch out for is that we can't put + // a possibly trapping load in the predecessor if it is a critical edge. + for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; + ++Idx) { + TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator(); + Value *InVal = PN.getIncomingValue(Idx); + + // If the value is produced by the terminator of the predecessor (an + // invoke) or it has side-effects, there is no valid place to put a load + // in the predecessor. + if (TI == InVal || TI->mayHaveSideEffects()) + return false; + + // If the predecessor has a single successor, then the edge isn't + // critical. + if (TI->getNumSuccessors() == 1) + continue; + + // If this pointer is always safe to load, or if we can prove that there + // is already a load in the block, then we can move the load to the pred + // block. 
+ if (InVal->isDereferenceablePointer() || + isSafeToLoadUnconditionally(InVal, TI, MaxAlign, &TD)) + continue; + + return false; + } + + return true; + } + + void visitPHINode(PHINode &PN) { + DEBUG(dbgs() << " original: " << PN << "\n"); + + SmallVector<LoadInst *, 4> Loads; + if (!isSafePHIToSpeculate(PN, Loads)) + return; + + assert(!Loads.empty()); + + Type *LoadTy = cast<PointerType>(PN.getType())->getElementType(); + IRBuilder<> PHIBuilder(&PN); + PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(), + PN.getName() + ".sroa.speculated"); + + // Get the TBAA tag and alignment to use from one of the loads. It doesn't + // matter which one we get and if any differ, it doesn't matter. + LoadInst *SomeLoad = cast<LoadInst>(Loads.back()); + MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa); + unsigned Align = SomeLoad->getAlignment(); + + // Rewrite all loads of the PN to use the new PHI. + do { + LoadInst *LI = Loads.pop_back_val(); + LI->replaceAllUsesWith(NewPN); + Pass.DeadInsts.insert(LI); + } while (!Loads.empty()); + + // Inject loads into all of the pred blocks. + for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { + BasicBlock *Pred = PN.getIncomingBlock(Idx); + TerminatorInst *TI = Pred->getTerminator(); + Use *InUse = &PN.getOperandUse(PN.getOperandNumForIncomingValue(Idx)); + Value *InVal = PN.getIncomingValue(Idx); + IRBuilder<> PredBuilder(TI); + + LoadInst *Load + = PredBuilder.CreateLoad(InVal, (PN.getName() + ".sroa.speculate.load." + + Pred->getName())); + ++NumLoadsSpeculated; + Load->setAlignment(Align); + if (TBAATag) + Load->setMetadata(LLVMContext::MD_tbaa, TBAATag); + NewPN->addIncoming(Load, Pred); + + Instruction *Ptr = dyn_cast<Instruction>(InVal); + if (!Ptr) + // No uses to rewrite. + continue; + + // Try to lookup and rewrite any partition uses corresponding to this phi + // input. + AllocaPartitioning::iterator PI + = P.findPartitionForPHIOrSelectOperand(InUse); + if (PI == P.end()) + continue; + + // Replace the Use in the PartitionUse for this operand with the Use + // inside the load. + AllocaPartitioning::use_iterator UI + = P.findPartitionUseForPHIOrSelectOperand(InUse); + assert(isa<PHINode>(*UI->U->getUser())); + UI->U = &Load->getOperandUse(Load->getPointerOperandIndex()); + } + DEBUG(dbgs() << " speculated to: " << *NewPN << "\n"); + } + + /// Select instructions that use an alloca and are subsequently loaded can be + /// rewritten to load both input pointers and then select between the result, + /// allowing the load of the alloca to be promoted. + /// From this: + /// %P2 = select i1 %cond, i32* %Alloca, i32* %Other + /// %V = load i32* %P2 + /// to: + /// %V1 = load i32* %Alloca -> will be mem2reg'd + /// %V2 = load i32* %Other + /// %V = select i1 %cond, i32 %V1, i32 %V2 + /// + /// We can do this to a select if its only uses are loads and if the operand + /// to the select can be loaded unconditionally. + bool isSafeSelectToSpeculate(SelectInst &SI, + SmallVectorImpl<LoadInst *> &Loads) { + Value *TValue = SI.getTrueValue(); + Value *FValue = SI.getFalseValue(); + bool TDerefable = TValue->isDereferenceablePointer(); + bool FDerefable = FValue->isDereferenceablePointer(); + + for (Value::use_iterator UI = SI.use_begin(), UE = SI.use_end(); + UI != UE; ++UI) { + LoadInst *LI = dyn_cast<LoadInst>(*UI); + if (LI == 0 || !LI->isSimple()) return false; + + // Both operands to the select need to be dereferencable, either + // absolutely (e.g. 
allocas) or at this point because we can see other + // accesses to it. + if (!TDerefable && !isSafeToLoadUnconditionally(TValue, LI, + LI->getAlignment(), &TD)) + return false; + if (!FDerefable && !isSafeToLoadUnconditionally(FValue, LI, + LI->getAlignment(), &TD)) + return false; + Loads.push_back(LI); + } + + return true; + } + + void visitSelectInst(SelectInst &SI) { + DEBUG(dbgs() << " original: " << SI << "\n"); + IRBuilder<> IRB(&SI); + + // If the select isn't safe to speculate, just use simple logic to emit it. + SmallVector<LoadInst *, 4> Loads; + if (!isSafeSelectToSpeculate(SI, Loads)) + return; + + Use *Ops[2] = { &SI.getOperandUse(1), &SI.getOperandUse(2) }; + AllocaPartitioning::iterator PIs[2]; + AllocaPartitioning::PartitionUse PUs[2]; + for (unsigned i = 0, e = 2; i != e; ++i) { + PIs[i] = P.findPartitionForPHIOrSelectOperand(Ops[i]); + if (PIs[i] != P.end()) { + // If the pointer is within the partitioning, remove the select from + // its uses. We'll add in the new loads below. + AllocaPartitioning::use_iterator UI + = P.findPartitionUseForPHIOrSelectOperand(Ops[i]); + PUs[i] = *UI; + // Clear out the use here so that the offsets into the use list remain + // stable but this use is ignored when rewriting. + UI->U = 0; + } + } + + Value *TV = SI.getTrueValue(); + Value *FV = SI.getFalseValue(); + // Replace the loads of the select with a select of two loads. + while (!Loads.empty()) { + LoadInst *LI = Loads.pop_back_val(); + + IRB.SetInsertPoint(LI); + LoadInst *TL = + IRB.CreateLoad(TV, LI->getName() + ".sroa.speculate.load.true"); + LoadInst *FL = + IRB.CreateLoad(FV, LI->getName() + ".sroa.speculate.load.false"); + NumLoadsSpeculated += 2; + + // Transfer alignment and TBAA info if present. + TL->setAlignment(LI->getAlignment()); + FL->setAlignment(LI->getAlignment()); + if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) { + TL->setMetadata(LLVMContext::MD_tbaa, Tag); + FL->setMetadata(LLVMContext::MD_tbaa, Tag); + } + + Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL, + LI->getName() + ".sroa.speculated"); + + LoadInst *Loads[2] = { TL, FL }; + for (unsigned i = 0, e = 2; i != e; ++i) { + if (PIs[i] != P.end()) { + Use *LoadUse = &Loads[i]->getOperandUse(0); + assert(PUs[i].U->get() == LoadUse->get()); + PUs[i].U = LoadUse; + P.use_push_back(PIs[i], PUs[i]); + } + } + + DEBUG(dbgs() << " speculated to: " << *V << "\n"); + LI->replaceAllUsesWith(V); + Pass.DeadInsts.insert(LI); + } + } +}; +} + +/// \brief Build a GEP out of a base pointer and indices. +/// +/// This will return the BasePtr if that is valid, or build a new GEP +/// instruction using the IRBuilder if GEP-ing is needed. +static Value *buildGEP(IRBuilder<> &IRB, Value *BasePtr, + SmallVectorImpl<Value *> &Indices, + const Twine &Prefix) { + if (Indices.empty()) + return BasePtr; + + // A single zero index is a no-op, so check for this and avoid building a GEP + // in that case. + if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero()) + return BasePtr; + + return IRB.CreateInBoundsGEP(BasePtr, Indices, Prefix + ".idx"); +} + +/// \brief Get a natural GEP off of the BasePtr walking through Ty toward +/// TargetTy without changing the offset of the pointer. +/// +/// This routine assumes we've already established a properly offset GEP with +/// Indices, and arrived at the Ty type. The goal is to continue to GEP with +/// zero-indices down through type layers until we find one the same as +/// TargetTy. 
If we can't find one with the same type, we at least try to use +/// one with the same size. If none of that works, we just produce the GEP as +/// indicated by Indices to have the correct offset. +static Value *getNaturalGEPWithType(IRBuilder<> &IRB, const DataLayout &TD, + Value *BasePtr, Type *Ty, Type *TargetTy, + SmallVectorImpl<Value *> &Indices, + const Twine &Prefix) { + if (Ty == TargetTy) + return buildGEP(IRB, BasePtr, Indices, Prefix); + + // See if we can descend into a struct and locate a field with the correct + // type. + unsigned NumLayers = 0; + Type *ElementTy = Ty; + do { + if (ElementTy->isPointerTy()) + break; + if (SequentialType *SeqTy = dyn_cast<SequentialType>(ElementTy)) { + ElementTy = SeqTy->getElementType(); + // Note that we use the default address space as this index is over an + // array or a vector, not a pointer. + Indices.push_back(IRB.getInt(APInt(TD.getPointerSizeInBits(0), 0))); + } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) { + if (STy->element_begin() == STy->element_end()) + break; // Nothing left to descend into. + ElementTy = *STy->element_begin(); + Indices.push_back(IRB.getInt32(0)); + } else { + break; + } + ++NumLayers; + } while (ElementTy != TargetTy); + if (ElementTy != TargetTy) + Indices.erase(Indices.end() - NumLayers, Indices.end()); + + return buildGEP(IRB, BasePtr, Indices, Prefix); +} + +/// \brief Recursively compute indices for a natural GEP. +/// +/// This is the recursive step for getNaturalGEPWithOffset that walks down the +/// element types adding appropriate indices for the GEP. +static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const DataLayout &TD, + Value *Ptr, Type *Ty, APInt &Offset, + Type *TargetTy, + SmallVectorImpl<Value *> &Indices, + const Twine &Prefix) { + if (Offset == 0) + return getNaturalGEPWithType(IRB, TD, Ptr, Ty, TargetTy, Indices, Prefix); + + // We can't recurse through pointer types. + if (Ty->isPointerTy()) + return 0; + + // We try to analyze GEPs over vectors here, but note that these GEPs are + // extremely poorly defined currently. The long-term goal is to remove GEPing + // over a vector from the IR completely. + if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) { + unsigned ElementSizeInBits = TD.getTypeSizeInBits(VecTy->getScalarType()); + if (ElementSizeInBits % 8) + return 0; // GEPs over non-multiple of 8 size vector elements are invalid. 
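+ // E.g. for a <4 x i32> vector, ElementSizeInBits is 32, so each element
+ // spans 4 bytes and a byte offset of 8 lands on element index 2.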
+ APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8); + APInt NumSkippedElements = Offset.sdiv(ElementSize); + if (NumSkippedElements.ugt(VecTy->getNumElements())) + return 0; + Offset -= NumSkippedElements * ElementSize; + Indices.push_back(IRB.getInt(NumSkippedElements)); + return getNaturalGEPRecursively(IRB, TD, Ptr, VecTy->getElementType(), + Offset, TargetTy, Indices, Prefix); + } + + if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) { + Type *ElementTy = ArrTy->getElementType(); + APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy)); + APInt NumSkippedElements = Offset.sdiv(ElementSize); + if (NumSkippedElements.ugt(ArrTy->getNumElements())) + return 0; + + Offset -= NumSkippedElements * ElementSize; + Indices.push_back(IRB.getInt(NumSkippedElements)); + return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, + Indices, Prefix); + } + + StructType *STy = dyn_cast<StructType>(Ty); + if (!STy) + return 0; + + const StructLayout *SL = TD.getStructLayout(STy); + uint64_t StructOffset = Offset.getZExtValue(); + if (StructOffset >= SL->getSizeInBytes()) + return 0; + unsigned Index = SL->getElementContainingOffset(StructOffset); + Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index)); + Type *ElementTy = STy->getElementType(Index); + if (Offset.uge(TD.getTypeAllocSize(ElementTy))) + return 0; // The offset points into alignment padding. + + Indices.push_back(IRB.getInt32(Index)); + return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, + Indices, Prefix); +} + +/// \brief Get a natural GEP from a base pointer to a particular offset and +/// resulting in a particular type. +/// +/// The goal is to produce a "natural" looking GEP that works with the existing +/// composite types to arrive at the appropriate offset and element type for +/// a pointer. TargetTy is the element type the returned GEP should point-to if +/// possible. We recurse by decreasing Offset, adding the appropriate index to +/// Indices, and setting Ty to the result subtype. +/// +/// If no natural GEP can be constructed, this function returns null. +static Value *getNaturalGEPWithOffset(IRBuilder<> &IRB, const DataLayout &TD, + Value *Ptr, APInt Offset, Type *TargetTy, + SmallVectorImpl<Value *> &Indices, + const Twine &Prefix) { + PointerType *Ty = cast<PointerType>(Ptr->getType()); + + // Don't consider any GEPs through an i8* as natural unless the TargetTy is + // an i8. + if (Ty == IRB.getInt8PtrTy() && TargetTy->isIntegerTy(8)) + return 0; + + Type *ElementTy = Ty->getElementType(); + if (!ElementTy->isSized()) + return 0; // We can't GEP through an unsized element. + APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy)); + if (ElementSize == 0) + return 0; // Zero-length arrays can't help us build a natural GEP. + APInt NumSkippedElements = Offset.sdiv(ElementSize); + + Offset -= NumSkippedElements * ElementSize; + Indices.push_back(IRB.getInt(NumSkippedElements)); + return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, + Indices, Prefix); +} + +/// \brief Compute an adjusted pointer from Ptr by Offset bytes where the +/// resulting pointer has PointerTy. +/// +/// This tries very hard to compute a "natural" GEP which arrives at the offset +/// and produces the pointer type desired. Where it cannot, it will try to use +/// the natural GEP to arrive at the offset and bitcast to the type. 
Where that +/// fails, it will try to use an existing i8* and GEP to the byte offset and +/// bitcast to the type. +/// +/// The strategy for finding the more natural GEPs is to peel off layers of the +/// pointer, walking back through bit casts and GEPs, searching for a base +/// pointer from which we can compute a natural GEP with the desired +/// properities. The algorithm tries to fold as many constant indices into +/// a single GEP as possible, thus making each GEP more independent of the +/// surrounding code. +static Value *getAdjustedPtr(IRBuilder<> &IRB, const DataLayout &TD, + Value *Ptr, APInt Offset, Type *PointerTy, + const Twine &Prefix) { + // Even though we don't look through PHI nodes, we could be called on an + // instruction in an unreachable block, which may be on a cycle. + SmallPtrSet<Value *, 4> Visited; + Visited.insert(Ptr); + SmallVector<Value *, 4> Indices; + + // We may end up computing an offset pointer that has the wrong type. If we + // never are able to compute one directly that has the correct type, we'll + // fall back to it, so keep it around here. + Value *OffsetPtr = 0; + + // Remember any i8 pointer we come across to re-use if we need to do a raw + // byte offset. + Value *Int8Ptr = 0; + APInt Int8PtrOffset(Offset.getBitWidth(), 0); + + Type *TargetTy = PointerTy->getPointerElementType(); + + do { + // First fold any existing GEPs into the offset. + while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) { + APInt GEPOffset(Offset.getBitWidth(), 0); + if (!GEP->accumulateConstantOffset(TD, GEPOffset)) + break; + Offset += GEPOffset; + Ptr = GEP->getPointerOperand(); + if (!Visited.insert(Ptr)) + break; + } + + // See if we can perform a natural GEP here. + Indices.clear(); + if (Value *P = getNaturalGEPWithOffset(IRB, TD, Ptr, Offset, TargetTy, + Indices, Prefix)) { + if (P->getType() == PointerTy) { + // Zap any offset pointer that we ended up computing in previous rounds. + if (OffsetPtr && OffsetPtr->use_empty()) + if (Instruction *I = dyn_cast<Instruction>(OffsetPtr)) + I->eraseFromParent(); + return P; + } + if (!OffsetPtr) { + OffsetPtr = P; + } + } + + // Stash this pointer if we've found an i8*. + if (Ptr->getType()->isIntegerTy(8)) { + Int8Ptr = Ptr; + Int8PtrOffset = Offset; + } + + // Peel off a layer of the pointer and update the offset appropriately. + if (Operator::getOpcode(Ptr) == Instruction::BitCast) { + Ptr = cast<Operator>(Ptr)->getOperand(0); + } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) { + if (GA->mayBeOverridden()) + break; + Ptr = GA->getAliasee(); + } else { + break; + } + assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!"); + } while (Visited.insert(Ptr)); + + if (!OffsetPtr) { + if (!Int8Ptr) { + Int8Ptr = IRB.CreateBitCast(Ptr, IRB.getInt8PtrTy(), + Prefix + ".raw_cast"); + Int8PtrOffset = Offset; + } + + OffsetPtr = Int8PtrOffset == 0 ? Int8Ptr : + IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset), + Prefix + ".raw_idx"); + } + Ptr = OffsetPtr; + + // On the off chance we were targeting i8*, guard the bitcast here. + if (Ptr->getType() != PointerTy) + Ptr = IRB.CreateBitCast(Ptr, PointerTy, Prefix + ".cast"); + + return Ptr; +} + +/// \brief Test whether we can convert a value from the old to the new type. +/// +/// This predicate should be used to guard calls to convertValue in order to +/// ensure that we only try to convert viable values. The strategy is that we +/// will peel off single element struct and array wrappings to get to an +/// underlying value, and convert that value. 
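+/// For example, an i64 and a <2 x i32> have the same size and are mutually
+/// convertible (via a bitcast), while an i64 and an i32 are not because their
+/// sizes differ.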
+static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { + if (OldTy == NewTy) + return true; + if (DL.getTypeSizeInBits(NewTy) != DL.getTypeSizeInBits(OldTy)) + return false; + if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType()) + return false; + + if (NewTy->isPointerTy() || OldTy->isPointerTy()) { + if (NewTy->isPointerTy() && OldTy->isPointerTy()) + return true; + if (NewTy->isIntegerTy() || OldTy->isIntegerTy()) + return true; + return false; + } + + return true; +} + +/// \brief Generic routine to convert an SSA value to a value of a different +/// type. +/// +/// This will try various different casting techniques, such as bitcasts, +/// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test +/// two types for viability with this routine. +static Value *convertValue(const DataLayout &DL, IRBuilder<> &IRB, Value *V, + Type *Ty) { + assert(canConvertValue(DL, V->getType(), Ty) && + "Value not convertable to type"); + if (V->getType() == Ty) + return V; + if (V->getType()->isIntegerTy() && Ty->isPointerTy()) + return IRB.CreateIntToPtr(V, Ty); + if (V->getType()->isPointerTy() && Ty->isIntegerTy()) + return IRB.CreatePtrToInt(V, Ty); + + return IRB.CreateBitCast(V, Ty); +} + +/// \brief Test whether the given alloca partition can be promoted to a vector. +/// +/// This is a quick test to check whether we can rewrite a particular alloca +/// partition (and its newly formed alloca) into a vector alloca with only +/// whole-vector loads and stores such that it could be promoted to a vector +/// SSA value. We only can ensure this for a limited set of operations, and we +/// don't want to do the rewrites unless we are confident that the result will +/// be promotable, so we have an early test here. +static bool isVectorPromotionViable(const DataLayout &TD, + Type *AllocaTy, + AllocaPartitioning &P, + uint64_t PartitionBeginOffset, + uint64_t PartitionEndOffset, + AllocaPartitioning::const_use_iterator I, + AllocaPartitioning::const_use_iterator E) { + VectorType *Ty = dyn_cast<VectorType>(AllocaTy); + if (!Ty) + return false; + + uint64_t ElementSize = TD.getTypeSizeInBits(Ty->getScalarType()); + + // While the definition of LLVM vectors is bitpacked, we don't support sizes + // that aren't byte sized. + if (ElementSize % 8) + return false; + assert((TD.getTypeSizeInBits(Ty) % 8) == 0 && + "vector size not a multiple of element size?"); + ElementSize /= 8; + + for (; I != E; ++I) { + if (!I->U) + continue; // Skip dead use. + + uint64_t BeginOffset = I->BeginOffset - PartitionBeginOffset; + uint64_t BeginIndex = BeginOffset / ElementSize; + if (BeginIndex * ElementSize != BeginOffset || + BeginIndex >= Ty->getNumElements()) + return false; + uint64_t EndOffset = I->EndOffset - PartitionBeginOffset; + uint64_t EndIndex = EndOffset / ElementSize; + if (EndIndex * ElementSize != EndOffset || + EndIndex > Ty->getNumElements()) + return false; + + assert(EndIndex > BeginIndex && "Empty vector!"); + uint64_t NumElements = EndIndex - BeginIndex; + Type *PartitionTy + = (NumElements == 1) ? 
Ty->getElementType()
+ : VectorType::get(Ty->getElementType(), NumElements);
+
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I->U->getUser())) {
+ if (MI->isVolatile())
+ return false;
+ if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(I->U->getUser())) {
+ const AllocaPartitioning::MemTransferOffsets &MTO
+ = P.getMemTransferOffsets(*MTI);
+ if (!MTO.IsSplittable)
+ return false;
+ }
+ } else if (I->U->get()->getType()->getPointerElementType()->isStructTy()) {
+ // Disable vector promotion when there are loads or stores of an FCA.
+ return false;
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(I->U->getUser())) {
+ if (LI->isVolatile())
+ return false;
+ if (!canConvertValue(TD, PartitionTy, LI->getType()))
+ return false;
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(I->U->getUser())) {
+ if (SI->isVolatile())
+ return false;
+ if (!canConvertValue(TD, SI->getValueOperand()->getType(), PartitionTy))
+ return false;
+ } else {
+ return false;
+ }
+ }
+ return true;
+}
+
+/// \brief Test whether the given alloca partition's integer operations can be
+/// widened to promotable ones.
+///
+/// This is a quick test to check whether we can rewrite the integer loads and
+/// stores to a particular alloca into wider loads and stores and be able to
+/// promote the resulting alloca.
+static bool isIntegerWideningViable(const DataLayout &TD,
+ Type *AllocaTy,
+ uint64_t AllocBeginOffset,
+ AllocaPartitioning &P,
+ AllocaPartitioning::const_use_iterator I,
+ AllocaPartitioning::const_use_iterator E) {
+ uint64_t SizeInBits = TD.getTypeSizeInBits(AllocaTy);
+ // Don't create integer types larger than the maximum bitwidth.
+ if (SizeInBits > IntegerType::MAX_INT_BITS)
+ return false;
+
+ // Don't try to handle allocas with bit-padding.
+ if (SizeInBits != TD.getTypeStoreSizeInBits(AllocaTy))
+ return false;
+
+ // We need to ensure that an integer type with the appropriate bitwidth can
+ // be converted to the alloca type, whatever that is. We don't want to force
+ // the alloca itself to have an integer type if there is a more suitable one.
+ Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
+ if (!canConvertValue(TD, AllocaTy, IntTy) ||
+ !canConvertValue(TD, IntTy, AllocaTy))
+ return false;
+
+ uint64_t Size = TD.getTypeStoreSize(AllocaTy);
+
+ // Check the uses to ensure they are (likely) promotable integer uses.
+ // Also ensure that the alloca has a covering load or store. We don't want
+ // to widen the integer operations only to fail to promote due to some other
+ // unsplittable entry (which we may make splittable later).
+ bool WholeAllocaOp = false;
+ for (; I != E; ++I) {
+ if (!I->U)
+ continue; // Skip dead use.
+
+ uint64_t RelBegin = I->BeginOffset - AllocBeginOffset;
+ uint64_t RelEnd = I->EndOffset - AllocBeginOffset;
+
+ // We can't reasonably handle cases where the load or store extends past
+ // the end of the alloca's type and into its padding.
+ if (RelEnd > Size)
+ return false;
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(I->U->getUser())) {
+ if (LI->isVolatile())
+ return false;
+ if (RelBegin == 0 && RelEnd == Size)
+ WholeAllocaOp = true;
+ if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
+ if (ITy->getBitWidth() < TD.getTypeStoreSizeInBits(ITy))
+ return false;
+ continue;
+ }
+ // Non-integer loads need to be convertible from the alloca type so that
+ // they are promotable.
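+ // E.g. a float load covering the whole alloca is fine (it can be converted
+ // to and from the wide integer), but a float load of only part of the
+ // alloca defeats widening.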
+ if (RelBegin != 0 || RelEnd != Size || + !canConvertValue(TD, AllocaTy, LI->getType())) + return false; + } else if (StoreInst *SI = dyn_cast<StoreInst>(I->U->getUser())) { + Type *ValueTy = SI->getValueOperand()->getType(); + if (SI->isVolatile()) + return false; + if (RelBegin == 0 && RelEnd == Size) + WholeAllocaOp = true; + if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) { + if (ITy->getBitWidth() < TD.getTypeStoreSizeInBits(ITy)) + return false; + continue; + } + // Non-integer stores need to be convertible to the alloca type so that + // they are promotable. + if (RelBegin != 0 || RelEnd != Size || + !canConvertValue(TD, ValueTy, AllocaTy)) + return false; + } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I->U->getUser())) { + if (MI->isVolatile() || !isa<Constant>(MI->getLength())) + return false; + if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(I->U->getUser())) { + const AllocaPartitioning::MemTransferOffsets &MTO + = P.getMemTransferOffsets(*MTI); + if (!MTO.IsSplittable) + return false; + } + } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I->U->getUser())) { + if (II->getIntrinsicID() != Intrinsic::lifetime_start && + II->getIntrinsicID() != Intrinsic::lifetime_end) + return false; + } else { + return false; + } + } + return WholeAllocaOp; +} + +static Value *extractInteger(const DataLayout &DL, IRBuilder<> &IRB, Value *V, + IntegerType *Ty, uint64_t Offset, + const Twine &Name) { + DEBUG(dbgs() << " start: " << *V << "\n"); + IntegerType *IntTy = cast<IntegerType>(V->getType()); + assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) && + "Element extends past full value"); + uint64_t ShAmt = 8*Offset; + if (DL.isBigEndian()) + ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); + if (ShAmt) { + V = IRB.CreateLShr(V, ShAmt, Name + ".shift"); + DEBUG(dbgs() << " shifted: " << *V << "\n"); + } + assert(Ty->getBitWidth() <= IntTy->getBitWidth() && + "Cannot extract to a larger integer!"); + if (Ty != IntTy) { + V = IRB.CreateTrunc(V, Ty, Name + ".trunc"); + DEBUG(dbgs() << " trunced: " << *V << "\n"); + } + return V; +} + +static Value *insertInteger(const DataLayout &DL, IRBuilder<> &IRB, Value *Old, + Value *V, uint64_t Offset, const Twine &Name) { + IntegerType *IntTy = cast<IntegerType>(Old->getType()); + IntegerType *Ty = cast<IntegerType>(V->getType()); + assert(Ty->getBitWidth() <= IntTy->getBitWidth() && + "Cannot insert a larger integer!"); + DEBUG(dbgs() << " start: " << *V << "\n"); + if (Ty != IntTy) { + V = IRB.CreateZExt(V, IntTy, Name + ".ext"); + DEBUG(dbgs() << " extended: " << *V << "\n"); + } + assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) && + "Element store outside of alloca store"); + uint64_t ShAmt = 8*Offset; + if (DL.isBigEndian()) + ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); + if (ShAmt) { + V = IRB.CreateShl(V, ShAmt, Name + ".shift"); + DEBUG(dbgs() << " shifted: " << *V << "\n"); + } + + if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) { + APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt); + Old = IRB.CreateAnd(Old, Mask, Name + ".mask"); + DEBUG(dbgs() << " masked: " << *Old << "\n"); + V = IRB.CreateOr(Old, V, Name + ".insert"); + DEBUG(dbgs() << " inserted: " << *V << "\n"); + } + return V; +} + +static Value *extractVector(IRBuilder<> &IRB, Value *V, + unsigned BeginIndex, unsigned EndIndex, + const Twine &Name) { + VectorType *VecTy = cast<VectorType>(V->getType()); + unsigned NumElements = EndIndex - 
BeginIndex;
+ assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
+
+ if (NumElements == VecTy->getNumElements())
+ return V;
+
+ if (NumElements == 1) {
+ V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
+ Name + ".extract");
+ DEBUG(dbgs() << " extract: " << *V << "\n");
+ return V;
+ }
+
+ SmallVector<Constant*, 8> Mask;
+ Mask.reserve(NumElements);
+ for (unsigned i = BeginIndex; i != EndIndex; ++i)
+ Mask.push_back(IRB.getInt32(i));
+ V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
+ ConstantVector::get(Mask),
+ Name + ".extract");
+ DEBUG(dbgs() << " shuffle: " << *V << "\n");
+ return V;
+}
+
+static Value *insertVector(IRBuilder<> &IRB, Value *Old, Value *V,
+ unsigned BeginIndex, const Twine &Name) {
+ VectorType *VecTy = cast<VectorType>(Old->getType());
+ assert(VecTy && "Can only insert a vector into a vector");
+
+ VectorType *Ty = dyn_cast<VectorType>(V->getType());
+ if (!Ty) {
+ // Single element to insert.
+ V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
+ Name + ".insert");
+ DEBUG(dbgs() << " insert: " << *V << "\n");
+ return V;
+ }
+
+ assert(Ty->getNumElements() <= VecTy->getNumElements() &&
+ "Too many elements!");
+ if (Ty->getNumElements() == VecTy->getNumElements()) {
+ assert(V->getType() == VecTy && "Vector type mismatch");
+ return V;
+ }
+ unsigned EndIndex = BeginIndex + Ty->getNumElements();
+
+ // When inserting a smaller vector into the larger one to store, we first
+ // use a shuffle vector to widen it with undef elements, and then
+ // a second shuffle vector to select between the loaded vector and the
+ // incoming vector.
+ SmallVector<Constant*, 8> Mask;
+ Mask.reserve(VecTy->getNumElements());
+ for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
+ if (i >= BeginIndex && i < EndIndex)
+ Mask.push_back(IRB.getInt32(i - BeginIndex));
+ else
+ Mask.push_back(UndefValue::get(IRB.getInt32Ty()));
+ V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
+ ConstantVector::get(Mask),
+ Name + ".expand");
+ DEBUG(dbgs() << " shuffle1: " << *V << "\n");
+
+ Mask.clear();
+ for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
+ if (i >= BeginIndex && i < EndIndex)
+ Mask.push_back(IRB.getInt32(i));
+ else
+ Mask.push_back(IRB.getInt32(i + VecTy->getNumElements()));
+ V = IRB.CreateShuffleVector(V, Old, ConstantVector::get(Mask),
+ Name + ".insert");
+ DEBUG(dbgs() << " shuffle2: " << *V << "\n");
+ return V;
+}
+
+namespace {
+/// \brief Visitor to rewrite instructions using a partition of an alloca to
+/// use a new alloca.
+///
+/// Also implements the rewriting to vector-based accesses when the partition
+/// passes the isVectorPromotionViable predicate. Most of the rewriting logic
+/// lives here.
+class AllocaPartitionRewriter : public InstVisitor<AllocaPartitionRewriter,
+ bool> {
+ // Befriend the base class so it can delegate to private visit methods.
+ friend class llvm::InstVisitor<AllocaPartitionRewriter, bool>;
+
+ const DataLayout &TD;
+ AllocaPartitioning &P;
+ SROA &Pass;
+ AllocaInst &OldAI, &NewAI;
+ const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
+ Type *NewAllocaTy;
+
+ // If we are rewriting an alloca partition which can be written as pure
+ // vector operations, we stash extra information here. When VecTy is
+ // non-null, we have some strict guarantees about the rewritten alloca:
+ // - The new alloca is exactly the size of the vector type here.
+ // - The accesses all either map to the entire vector or to a single
+ // element.
+ // - The set of accessing instructions is only one of those handled above + // in isVectorPromotionViable. Generally these are the same access kinds + // which are promotable via mem2reg. + VectorType *VecTy; + Type *ElementTy; + uint64_t ElementSize; + + // This is a convenience and flag variable that will be null unless the new + // alloca's integer operations should be widened to this integer type due to + // passing isIntegerWideningViable above. If it is non-null, the desired + // integer type will be stored here for easy access during rewriting. + IntegerType *IntTy; + + // The offset of the partition user currently being rewritten. + uint64_t BeginOffset, EndOffset; + Use *OldUse; + Instruction *OldPtr; + + // The name prefix to use when rewriting instructions for this alloca. + std::string NamePrefix; + +public: + AllocaPartitionRewriter(const DataLayout &TD, AllocaPartitioning &P, + AllocaPartitioning::iterator PI, + SROA &Pass, AllocaInst &OldAI, AllocaInst &NewAI, + uint64_t NewBeginOffset, uint64_t NewEndOffset) + : TD(TD), P(P), Pass(Pass), + OldAI(OldAI), NewAI(NewAI), + NewAllocaBeginOffset(NewBeginOffset), + NewAllocaEndOffset(NewEndOffset), + NewAllocaTy(NewAI.getAllocatedType()), + VecTy(), ElementTy(), ElementSize(), IntTy(), + BeginOffset(), EndOffset() { + } + + /// \brief Visit the users of the alloca partition and rewrite them. + bool visitUsers(AllocaPartitioning::const_use_iterator I, + AllocaPartitioning::const_use_iterator E) { + if (isVectorPromotionViable(TD, NewAI.getAllocatedType(), P, + NewAllocaBeginOffset, NewAllocaEndOffset, + I, E)) { + ++NumVectorized; + VecTy = cast<VectorType>(NewAI.getAllocatedType()); + ElementTy = VecTy->getElementType(); + assert((TD.getTypeSizeInBits(VecTy->getScalarType()) % 8) == 0 && + "Only multiple-of-8 sized vector elements are viable"); + ElementSize = TD.getTypeSizeInBits(VecTy->getScalarType()) / 8; + } else if (isIntegerWideningViable(TD, NewAI.getAllocatedType(), + NewAllocaBeginOffset, P, I, E)) { + IntTy = Type::getIntNTy(NewAI.getContext(), + TD.getTypeSizeInBits(NewAI.getAllocatedType())); + } + bool CanSROA = true; + for (; I != E; ++I) { + if (!I->U) + continue; // Skip dead uses. + BeginOffset = I->BeginOffset; + EndOffset = I->EndOffset; + OldUse = I->U; + OldPtr = cast<Instruction>(I->U->get()); + NamePrefix = (Twine(NewAI.getName()) + "." + Twine(BeginOffset)).str(); + CanSROA &= visit(cast<Instruction>(I->U->getUser())); + } + if (VecTy) { + assert(CanSROA); + VecTy = 0; + ElementTy = 0; + ElementSize = 0; + } + if (IntTy) { + assert(CanSROA); + IntTy = 0; + } + return CanSROA; + } + +private: + // Every instruction which can end up as a user must have a rewrite rule. + bool visitInstruction(Instruction &I) { + DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n"); + llvm_unreachable("No rewrite rule for this instruction!"); + } + + Twine getName(const Twine &Suffix) { + return NamePrefix + Suffix; + } + + Value *getAdjustedAllocaPtr(IRBuilder<> &IRB, Type *PointerTy) { + assert(BeginOffset >= NewAllocaBeginOffset); + APInt Offset(TD.getPointerSizeInBits(), BeginOffset - NewAllocaBeginOffset); + return getAdjustedPtr(IRB, TD, &NewAI, Offset, PointerTy, getName("")); + } + + /// \brief Compute suitable alignment to access an offset into the new alloca. 
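+ /// For example, a 16-byte-aligned alloca accessed at offset 4 can only be
+ /// assumed to be 4-byte aligned, since MinAlign(16, 4) == 4.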
+ unsigned getOffsetAlign(uint64_t Offset) { + unsigned NewAIAlign = NewAI.getAlignment(); + if (!NewAIAlign) + NewAIAlign = TD.getABITypeAlignment(NewAI.getAllocatedType()); + return MinAlign(NewAIAlign, Offset); + } + + /// \brief Compute suitable alignment to access this partition of the new + /// alloca. + unsigned getPartitionAlign() { + return getOffsetAlign(BeginOffset - NewAllocaBeginOffset); + } + + /// \brief Compute suitable alignment to access a type at an offset of the + /// new alloca. + /// + /// \returns zero if the type's ABI alignment is a suitable alignment, + /// otherwise returns the maximal suitable alignment. + unsigned getOffsetTypeAlign(Type *Ty, uint64_t Offset) { + unsigned Align = getOffsetAlign(Offset); + return Align == TD.getABITypeAlignment(Ty) ? 0 : Align; + } + + /// \brief Compute suitable alignment to access a type at the beginning of + /// this partition of the new alloca. + /// + /// See \c getOffsetTypeAlign for details; this routine delegates to it. + unsigned getPartitionTypeAlign(Type *Ty) { + return getOffsetTypeAlign(Ty, BeginOffset - NewAllocaBeginOffset); + } + + unsigned getIndex(uint64_t Offset) { + assert(VecTy && "Can only call getIndex when rewriting a vector"); + uint64_t RelOffset = Offset - NewAllocaBeginOffset; + assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds"); + uint32_t Index = RelOffset / ElementSize; + assert(Index * ElementSize == RelOffset); + return Index; + } + + void deleteIfTriviallyDead(Value *V) { + Instruction *I = cast<Instruction>(V); + if (isInstructionTriviallyDead(I)) + Pass.DeadInsts.insert(I); + } + + Value *rewriteVectorizedLoadInst(IRBuilder<> &IRB) { + unsigned BeginIndex = getIndex(BeginOffset); + unsigned EndIndex = getIndex(EndOffset); + assert(EndIndex > BeginIndex && "Empty vector!"); + + Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".load")); + return extractVector(IRB, V, BeginIndex, EndIndex, getName(".vec")); + } + + Value *rewriteIntegerLoad(IRBuilder<> &IRB, LoadInst &LI) { + assert(IntTy && "We cannot insert an integer to the alloca"); + assert(!LI.isVolatile()); + Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".load")); + V = convertValue(TD, IRB, V, IntTy); + assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); + uint64_t Offset = BeginOffset - NewAllocaBeginOffset; + if (Offset > 0 || EndOffset < NewAllocaEndOffset) + V = extractInteger(TD, IRB, V, cast<IntegerType>(LI.getType()), Offset, + getName(".extract")); + return V; + } + + bool visitLoadInst(LoadInst &LI) { + DEBUG(dbgs() << " original: " << LI << "\n"); + Value *OldOp = LI.getOperand(0); + assert(OldOp == OldPtr); + IRBuilder<> IRB(&LI); + + uint64_t Size = EndOffset - BeginOffset; + bool IsSplitIntLoad = Size < TD.getTypeStoreSize(LI.getType()); + + // If this memory access can be shown to *statically* extend outside the + // bounds of the original allocation it's behavior is undefined. Rather + // than trying to transform it, just replace it with undef. + // FIXME: We should do something more clever for functions being + // instrumented by asan. + // FIXME: Eventually, once ASan and friends can flush out bugs here, this + // should be transformed to a load of null making it unreachable. 
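+ // For example, an i64 load of an alloca whose type is only four bytes wide
+ // is statically out of bounds; the code below simply replaces it with undef.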
+ uint64_t OldAllocSize = TD.getTypeAllocSize(OldAI.getAllocatedType()); + if (TD.getTypeStoreSize(LI.getType()) > OldAllocSize) { + LI.replaceAllUsesWith(UndefValue::get(LI.getType())); + Pass.DeadInsts.insert(&LI); + deleteIfTriviallyDead(OldOp); + DEBUG(dbgs() << " to: undef!!\n"); + return true; + } + + Type *TargetTy = IsSplitIntLoad ? Type::getIntNTy(LI.getContext(), Size * 8) + : LI.getType(); + bool IsPtrAdjusted = false; + Value *V; + if (VecTy) { + V = rewriteVectorizedLoadInst(IRB); + } else if (IntTy && LI.getType()->isIntegerTy()) { + V = rewriteIntegerLoad(IRB, LI); + } else if (BeginOffset == NewAllocaBeginOffset && + canConvertValue(TD, NewAllocaTy, LI.getType())) { + V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + LI.isVolatile(), getName(".load")); + } else { + Type *LTy = TargetTy->getPointerTo(); + V = IRB.CreateAlignedLoad(getAdjustedAllocaPtr(IRB, LTy), + getPartitionTypeAlign(TargetTy), + LI.isVolatile(), getName(".load")); + IsPtrAdjusted = true; + } + V = convertValue(TD, IRB, V, TargetTy); + + if (IsSplitIntLoad) { + assert(!LI.isVolatile()); + assert(LI.getType()->isIntegerTy() && + "Only integer type loads and stores are split"); + assert(LI.getType()->getIntegerBitWidth() == + TD.getTypeStoreSizeInBits(LI.getType()) && + "Non-byte-multiple bit width"); + assert(LI.getType()->getIntegerBitWidth() == + TD.getTypeAllocSizeInBits(OldAI.getAllocatedType()) && + "Only alloca-wide loads can be split and recomposed"); + // Move the insertion point just past the load so that we can refer to it. + IRB.SetInsertPoint(llvm::next(BasicBlock::iterator(&LI))); + // Create a placeholder value with the same type as LI to use as the + // basis for the new value. This allows us to replace the uses of LI with + // the computed value, and then replace the placeholder with LI, leaving + // LI only used for this computation. + Value *Placeholder + = new LoadInst(UndefValue::get(LI.getType()->getPointerTo())); + V = insertInteger(TD, IRB, Placeholder, V, BeginOffset, + getName(".insert")); + LI.replaceAllUsesWith(V); + Placeholder->replaceAllUsesWith(&LI); + delete Placeholder; + } else { + LI.replaceAllUsesWith(V); + } + + Pass.DeadInsts.insert(&LI); + deleteIfTriviallyDead(OldOp); + DEBUG(dbgs() << " to: " << *V << "\n"); + return !LI.isVolatile() && !IsPtrAdjusted; + } + + bool rewriteVectorizedStoreInst(IRBuilder<> &IRB, Value *V, + StoreInst &SI, Value *OldOp) { + unsigned BeginIndex = getIndex(BeginOffset); + unsigned EndIndex = getIndex(EndOffset); + assert(EndIndex > BeginIndex && "Empty vector!"); + unsigned NumElements = EndIndex - BeginIndex; + assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); + Type *PartitionTy + = (NumElements == 1) ? ElementTy + : VectorType::get(ElementTy, NumElements); + if (V->getType() != PartitionTy) + V = convertValue(TD, IRB, V, PartitionTy); + + // Mix in the existing elements. 
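+ // That is, load the alloca's current vector value, insert the new elements
+ // over the range covered by this use, and store the whole vector back.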
+ Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+ getName(".load"));
+ V = insertVector(IRB, Old, V, BeginIndex, getName(".vec"));
+
+ StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
+ Pass.DeadInsts.insert(&SI);
+
+ (void)Store;
+ DEBUG(dbgs() << " to: " << *Store << "\n");
+ return true;
+ }
+
+ bool rewriteIntegerStore(IRBuilder<> &IRB, Value *V, StoreInst &SI) {
+ assert(IntTy && "We cannot extract an integer from the alloca");
+ assert(!SI.isVolatile());
+ if (TD.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) {
+ Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+ getName(".oldload"));
+ Old = convertValue(TD, IRB, Old, IntTy);
+ assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
+ uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
+ V = insertInteger(TD, IRB, Old, SI.getValueOperand(), Offset,
+ getName(".insert"));
+ }
+ V = convertValue(TD, IRB, V, NewAllocaTy);
+ StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
+ Pass.DeadInsts.insert(&SI);
+ (void)Store;
+ DEBUG(dbgs() << " to: " << *Store << "\n");
+ return true;
+ }
+
+ bool visitStoreInst(StoreInst &SI) {
+ DEBUG(dbgs() << " original: " << SI << "\n");
+ Value *OldOp = SI.getOperand(1);
+ assert(OldOp == OldPtr);
+ IRBuilder<> IRB(&SI);
+
+ Value *V = SI.getValueOperand();
+
+ // Strip all inbounds GEPs and pointer casts to try to dig out any root
+ // alloca that should be re-examined after promoting this alloca.
+ if (V->getType()->isPointerTy())
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
+ Pass.PostPromotionWorklist.insert(AI);
+
+ uint64_t Size = EndOffset - BeginOffset;
+ if (Size < TD.getTypeStoreSize(V->getType())) {
+ assert(!SI.isVolatile());
+ assert(V->getType()->isIntegerTy() &&
+ "Only integer type loads and stores are split");
+ assert(V->getType()->getIntegerBitWidth() ==
+ TD.getTypeStoreSizeInBits(V->getType()) &&
+ "Non-byte-multiple bit width");
+ assert(V->getType()->getIntegerBitWidth() ==
+ TD.getTypeAllocSizeInBits(OldAI.getAllocatedType()) &&
+ "Only alloca-wide stores can be split and recomposed");
+ IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), Size * 8);
+ V = extractInteger(TD, IRB, V, NarrowTy, BeginOffset,
+ getName(".extract"));
+ }
+
+ if (VecTy)
+ return rewriteVectorizedStoreInst(IRB, V, SI, OldOp);
+ if (IntTy && V->getType()->isIntegerTy())
+ return rewriteIntegerStore(IRB, V, SI);
+
+ StoreInst *NewSI;
+ if (BeginOffset == NewAllocaBeginOffset &&
+ canConvertValue(TD, V->getType(), NewAllocaTy)) {
+ V = convertValue(TD, IRB, V, NewAllocaTy);
+ NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
+ SI.isVolatile());
+ } else {
+ Value *NewPtr = getAdjustedAllocaPtr(IRB, V->getType()->getPointerTo());
+ NewSI = IRB.CreateAlignedStore(V, NewPtr,
+ getPartitionTypeAlign(V->getType()),
+ SI.isVolatile());
+ }
+ (void)NewSI;
+ Pass.DeadInsts.insert(&SI);
+ deleteIfTriviallyDead(OldOp);
+
+ DEBUG(dbgs() << " to: " << *NewSI << "\n");
+ return NewSI->getPointerOperand() == &NewAI && !SI.isVolatile();
+ }
+
+ /// \brief Compute an integer value from splatting an i8 across the given
+ /// number of bytes.
+ ///
+ /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
+ /// call this routine.
+ /// FIXME: Heed the advice above.
+ ///
+ /// \param V The i8 value to splat.
+ /// \param Size The number of bytes in the output (assuming i8 is one byte) + Value *getIntegerSplat(IRBuilder<> &IRB, Value *V, unsigned Size) { + assert(Size > 0 && "Expected a positive number of bytes."); + IntegerType *VTy = cast<IntegerType>(V->getType()); + assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte"); + if (Size == 1) + return V; + + Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size*8); + V = IRB.CreateMul(IRB.CreateZExt(V, SplatIntTy, getName(".zext")), + ConstantExpr::getUDiv( + Constant::getAllOnesValue(SplatIntTy), + ConstantExpr::getZExt( + Constant::getAllOnesValue(V->getType()), + SplatIntTy)), + getName(".isplat")); + return V; + } + + /// \brief Compute a vector splat for a given element value. + Value *getVectorSplat(IRBuilder<> &IRB, Value *V, unsigned NumElements) { + V = IRB.CreateVectorSplat(NumElements, V, NamePrefix); + DEBUG(dbgs() << " splat: " << *V << "\n"); + return V; + } + + bool visitMemSetInst(MemSetInst &II) { + DEBUG(dbgs() << " original: " << II << "\n"); + IRBuilder<> IRB(&II); + assert(II.getRawDest() == OldPtr); + + // If the memset has a variable size, it cannot be split, just adjust the + // pointer to the new alloca. + if (!isa<Constant>(II.getLength())) { + II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType())); + Type *CstTy = II.getAlignmentCst()->getType(); + II.setAlignment(ConstantInt::get(CstTy, getPartitionAlign())); + + deleteIfTriviallyDead(OldPtr); + return false; + } + + // Record this instruction for deletion. + Pass.DeadInsts.insert(&II); + + Type *AllocaTy = NewAI.getAllocatedType(); + Type *ScalarTy = AllocaTy->getScalarType(); + + // If this doesn't map cleanly onto the alloca type, and that type isn't + // a single value type, just emit a memset. + if (!VecTy && !IntTy && + (BeginOffset != NewAllocaBeginOffset || + EndOffset != NewAllocaEndOffset || + !AllocaTy->isSingleValueType() || + !TD.isLegalInteger(TD.getTypeSizeInBits(ScalarTy)) || + TD.getTypeSizeInBits(ScalarTy)%8 != 0)) { + Type *SizeTy = II.getLength()->getType(); + Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset); + CallInst *New + = IRB.CreateMemSet(getAdjustedAllocaPtr(IRB, + II.getRawDest()->getType()), + II.getValue(), Size, getPartitionAlign(), + II.isVolatile()); + (void)New; + DEBUG(dbgs() << " to: " << *New << "\n"); + return false; + } + + // If we can represent this as a simple value, we have to build the actual + // value to store, which requires expanding the byte present in memset to + // a sensible representation for the alloca type. This is essentially + // splatting the byte to a sufficiently wide integer, splatting it across + // any desired vector width, and bitcasting to the final type. + Value *V; + + if (VecTy) { + // If this is a memset of a vectorized alloca, insert it. 
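+ // For example, a memset of bytes 4-7 of a <4 x i32> alloca becomes a splat
+ // of the byte into an i32, an insertelement at index 1 of the loaded
+ // vector, and a store of the full vector.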
+ assert(ElementTy == ScalarTy);
+
+ unsigned BeginIndex = getIndex(BeginOffset);
+ unsigned EndIndex = getIndex(EndOffset);
+ assert(EndIndex > BeginIndex && "Empty vector!");
+ unsigned NumElements = EndIndex - BeginIndex;
+ assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
+
+ Value *Splat = getIntegerSplat(IRB, II.getValue(),
+ TD.getTypeSizeInBits(ElementTy)/8);
+ Splat = convertValue(TD, IRB, Splat, ElementTy);
+ if (NumElements > 1)
+ Splat = getVectorSplat(IRB, Splat, NumElements);
+
+ Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+ getName(".oldload"));
+ V = insertVector(IRB, Old, Splat, BeginIndex, getName(".vec"));
+ } else if (IntTy) {
+ // If this is a memset on an alloca where we can widen stores, insert the
+ // set integer.
+ assert(!II.isVolatile());
+
+ uint64_t Size = EndOffset - BeginOffset;
+ V = getIntegerSplat(IRB, II.getValue(), Size);
+
+ if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
+ EndOffset != NewAllocaEndOffset)) {
+ Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+ getName(".oldload"));
+ Old = convertValue(TD, IRB, Old, IntTy);
+ assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
+ uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
+ V = insertInteger(TD, IRB, Old, V, Offset, getName(".insert"));
+ } else {
+ assert(V->getType() == IntTy &&
+ "Wrong type for an alloca wide integer!");
+ }
+ V = convertValue(TD, IRB, V, AllocaTy);
+ } else {
+ // Established these invariants above.
+ assert(BeginOffset == NewAllocaBeginOffset);
+ assert(EndOffset == NewAllocaEndOffset);
+
+ V = getIntegerSplat(IRB, II.getValue(),
+ TD.getTypeSizeInBits(ScalarTy)/8);
+ if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy))
+ V = getVectorSplat(IRB, V, AllocaVecTy->getNumElements());
+
+ V = convertValue(TD, IRB, V, AllocaTy);
+ }
+
+ Value *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
+ II.isVolatile());
+ (void)New;
+ DEBUG(dbgs() << " to: " << *New << "\n");
+ return !II.isVolatile();
+ }
+
+ bool visitMemTransferInst(MemTransferInst &II) {
+ // Rewriting of memory transfer instructions can be a bit tricky. We break
+ // them into two categories: split intrinsics and unsplit intrinsics.
+
+ DEBUG(dbgs() << " original: " << II << "\n");
+ IRBuilder<> IRB(&II);
+
+ assert(II.getRawSource() == OldPtr || II.getRawDest() == OldPtr);
+ bool IsDest = II.getRawDest() == OldPtr;
+
+ const AllocaPartitioning::MemTransferOffsets &MTO
+ = P.getMemTransferOffsets(II);
+
+ // Compute the relative offset within the transfer.
+ unsigned IntPtrWidth = TD.getPointerSizeInBits();
+ APInt RelOffset(IntPtrWidth, BeginOffset - (IsDest ? MTO.DestBegin
+ : MTO.SourceBegin));
+
+ unsigned Align = II.getAlignment();
+ if (Align > 1)
+ Align = MinAlign(RelOffset.zextOrTrunc(64).getZExtValue(),
+ MinAlign(II.getAlignment(), getPartitionAlign()));
+
+ // For unsplit intrinsics, we simply modify the source and destination
+ // pointers in place. This isn't just an optimization, it is a matter of
+ // correctness. With unsplit intrinsics we may be dealing with transfers
+ // within a single alloca before SROA ran, or with transfers that have
+ // a variable length. We may also be dealing with memmove instead of
+ // memcpy, and so simply updating the pointers is necessary for us to
+ // update both source and dest of a single call.
+ if (!MTO.IsSplittable) {
+ Value *OldOp = IsDest ?
II.getRawDest() : II.getRawSource(); + if (IsDest) + II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType())); + else + II.setSource(getAdjustedAllocaPtr(IRB, II.getRawSource()->getType())); + + Type *CstTy = II.getAlignmentCst()->getType(); + II.setAlignment(ConstantInt::get(CstTy, Align)); + + DEBUG(dbgs() << " to: " << II << "\n"); + deleteIfTriviallyDead(OldOp); + return false; + } + // For split transfer intrinsics we have an incredibly useful assurance: + // the source and destination do not reside within the same alloca, and at + // least one of them does not escape. This means that we can replace + // memmove with memcpy, and we don't need to worry about all manner of + // downsides to splitting and transforming the operations. + + // If this doesn't map cleanly onto the alloca type, and that type isn't + // a single value type, just emit a memcpy. + bool EmitMemCpy + = !VecTy && !IntTy && (BeginOffset != NewAllocaBeginOffset || + EndOffset != NewAllocaEndOffset || + !NewAI.getAllocatedType()->isSingleValueType()); + + // If we're just going to emit a memcpy, the alloca hasn't changed, and the + // size hasn't been shrunk based on analysis of the viable range, this is + // a no-op. + if (EmitMemCpy && &OldAI == &NewAI) { + uint64_t OrigBegin = IsDest ? MTO.DestBegin : MTO.SourceBegin; + uint64_t OrigEnd = IsDest ? MTO.DestEnd : MTO.SourceEnd; + // Ensure the start lines up. + assert(BeginOffset == OrigBegin); + (void)OrigBegin; + + // Rewrite the size as needed. + if (EndOffset != OrigEnd) + II.setLength(ConstantInt::get(II.getLength()->getType(), + EndOffset - BeginOffset)); + return false; + } + // Record this instruction for deletion. + Pass.DeadInsts.insert(&II); + + // Strip all inbounds GEPs and pointer casts to try to dig out any root + // alloca that should be re-examined after rewriting this instruction. + Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest(); + if (AllocaInst *AI + = dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) + Pass.Worklist.insert(AI); + + if (EmitMemCpy) { + Type *OtherPtrTy = IsDest ? II.getRawSource()->getType() + : II.getRawDest()->getType(); + + // Compute the other pointer, folding as much as possible to produce + // a single, simple GEP in most cases. + OtherPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy, + getName("." + OtherPtr->getName())); + + Value *OurPtr + = getAdjustedAllocaPtr(IRB, IsDest ? II.getRawDest()->getType() + : II.getRawSource()->getType()); + Type *SizeTy = II.getLength()->getType(); + Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset); + + CallInst *New = IRB.CreateMemCpy(IsDest ? OurPtr : OtherPtr, + IsDest ? OtherPtr : OurPtr, + Size, Align, II.isVolatile()); + (void)New; + DEBUG(dbgs() << " to: " << *New << "\n"); + return false; + } + + // Note that we clamp the alignment to 1 here as a 0 alignment for a memcpy + // is equivalent to 1, but that isn't true if we end up rewriting this as + // a load or store. + if (!Align) + Align = 1; + + bool IsWholeAlloca = BeginOffset == NewAllocaBeginOffset && + EndOffset == NewAllocaEndOffset; + uint64_t Size = EndOffset - BeginOffset; + unsigned BeginIndex = VecTy ? getIndex(BeginOffset) : 0; + unsigned EndIndex = VecTy ? getIndex(EndOffset) : 0; + unsigned NumElements = EndIndex - BeginIndex; + IntegerType *SubIntTy + = IntTy ? 
Type::getIntNTy(IntTy->getContext(), Size*8) : 0; + + Type *OtherPtrTy = NewAI.getType(); + if (VecTy && !IsWholeAlloca) { + if (NumElements == 1) + OtherPtrTy = VecTy->getElementType(); + else + OtherPtrTy = VectorType::get(VecTy->getElementType(), NumElements); + + OtherPtrTy = OtherPtrTy->getPointerTo(); + } else if (IntTy && !IsWholeAlloca) { + OtherPtrTy = SubIntTy->getPointerTo(); + } + + Value *SrcPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy, + getName("." + OtherPtr->getName())); + Value *DstPtr = &NewAI; + if (!IsDest) + std::swap(SrcPtr, DstPtr); + + Value *Src; + if (VecTy && !IsWholeAlloca && !IsDest) { + Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".load")); + Src = extractVector(IRB, Src, BeginIndex, EndIndex, getName(".vec")); + } else if (IntTy && !IsWholeAlloca && !IsDest) { + Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".load")); + Src = convertValue(TD, IRB, Src, IntTy); + assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); + uint64_t Offset = BeginOffset - NewAllocaBeginOffset; + Src = extractInteger(TD, IRB, Src, SubIntTy, Offset, getName(".extract")); + } else { + Src = IRB.CreateAlignedLoad(SrcPtr, Align, II.isVolatile(), + getName(".copyload")); + } + + if (VecTy && !IsWholeAlloca && IsDest) { + Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".oldload")); + Src = insertVector(IRB, Old, Src, BeginIndex, getName(".vec")); + } else if (IntTy && !IsWholeAlloca && IsDest) { + Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".oldload")); + Old = convertValue(TD, IRB, Old, IntTy); + assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); + uint64_t Offset = BeginOffset - NewAllocaBeginOffset; + Src = insertInteger(TD, IRB, Old, Src, Offset, getName(".insert")); + Src = convertValue(TD, IRB, Src, NewAllocaTy); + } + + StoreInst *Store = cast<StoreInst>( + IRB.CreateAlignedStore(Src, DstPtr, Align, II.isVolatile())); + (void)Store; + DEBUG(dbgs() << " to: " << *Store << "\n"); + return !II.isVolatile(); + } + + bool visitIntrinsicInst(IntrinsicInst &II) { + assert(II.getIntrinsicID() == Intrinsic::lifetime_start || + II.getIntrinsicID() == Intrinsic::lifetime_end); + DEBUG(dbgs() << " original: " << II << "\n"); + IRBuilder<> IRB(&II); + assert(II.getArgOperand(1) == OldPtr); + + // Record this instruction for deletion. + Pass.DeadInsts.insert(&II); + + ConstantInt *Size + = ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()), + EndOffset - BeginOffset); + Value *Ptr = getAdjustedAllocaPtr(IRB, II.getArgOperand(1)->getType()); + Value *New; + if (II.getIntrinsicID() == Intrinsic::lifetime_start) + New = IRB.CreateLifetimeStart(Ptr, Size); + else + New = IRB.CreateLifetimeEnd(Ptr, Size); + + DEBUG(dbgs() << " to: " << *New << "\n"); + return true; + } + + bool visitPHINode(PHINode &PN) { + DEBUG(dbgs() << " original: " << PN << "\n"); + + // We would like to compute a new pointer in only one place, but have it be + // as local as possible to the PHI. To do that, we re-use the location of + // the old pointer, which necessarily must be in the right position to + // dominate the PHI. + IRBuilder<> PtrBuilder(cast<Instruction>(OldPtr)); + + Value *NewPtr = getAdjustedAllocaPtr(PtrBuilder, OldPtr->getType()); + // Replace the operands which were using the old pointer. 
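+ // (std::replace is used because the PHI may reference the old pointer on
+ // more than one incoming edge.)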
+ std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr); + + DEBUG(dbgs() << " to: " << PN << "\n"); + deleteIfTriviallyDead(OldPtr); + return false; + } + + bool visitSelectInst(SelectInst &SI) { + DEBUG(dbgs() << " original: " << SI << "\n"); + IRBuilder<> IRB(&SI); + + // Find the operand we need to rewrite here. + bool IsTrueVal = SI.getTrueValue() == OldPtr; + if (IsTrueVal) + assert(SI.getFalseValue() != OldPtr && "Pointer is both operands!"); + else + assert(SI.getFalseValue() == OldPtr && "Pointer isn't an operand!"); + + Value *NewPtr = getAdjustedAllocaPtr(IRB, OldPtr->getType()); + SI.setOperand(IsTrueVal ? 1 : 2, NewPtr); + DEBUG(dbgs() << " to: " << SI << "\n"); + deleteIfTriviallyDead(OldPtr); + return false; + } + +}; +} + +namespace { +/// \brief Visitor to rewrite aggregate loads and stores as scalar. +/// +/// This pass aggressively rewrites all aggregate loads and stores on +/// a particular pointer (or any pointer derived from it which we can identify) +/// with scalar loads and stores. +class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> { + // Befriend the base class so it can delegate to private visit methods. + friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>; + + const DataLayout &TD; + + /// Queue of pointer uses to analyze and potentially rewrite. + SmallVector<Use *, 8> Queue; + + /// Set to prevent us from cycling with phi nodes and loops. + SmallPtrSet<User *, 8> Visited; + + /// The current pointer use being rewritten. This is used to dig up the used + /// value (as opposed to the user). + Use *U; + +public: + AggLoadStoreRewriter(const DataLayout &TD) : TD(TD) {} + + /// Rewrite loads and stores through a pointer and all pointers derived from + /// it. + bool rewrite(Instruction &I) { + DEBUG(dbgs() << " Rewriting FCA loads and stores...\n"); + enqueueUsers(I); + bool Changed = false; + while (!Queue.empty()) { + U = Queue.pop_back_val(); + Changed |= visit(cast<Instruction>(U->getUser())); + } + return Changed; + } + +private: + /// Enqueue all the users of the given instruction for further processing. + /// This uses a set to de-duplicate users. + void enqueueUsers(Instruction &I) { + for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE; + ++UI) + if (Visited.insert(*UI)) + Queue.push_back(&UI.getUse()); + } + + // Conservative default is to not rewrite anything. + bool visitInstruction(Instruction &I) { return false; } + + /// \brief Generic recursive split emission class. + template <typename Derived> + class OpSplitter { + protected: + /// The builder used to form new instructions. + IRBuilder<> IRB; + /// The indices which to be used with insert- or extractvalue to select the + /// appropriate value within the aggregate. + SmallVector<unsigned, 4> Indices; + /// The indices to a GEP instruction which will move Ptr to the correct slot + /// within the aggregate. + SmallVector<Value *, 4> GEPIndices; + /// The base pointer of the original op, used as a base for GEPing the + /// split operations. + Value *Ptr; + + /// Initialize the splitter with an insertion point, Ptr and start with a + /// single zero GEP index. + OpSplitter(Instruction *InsertionPoint, Value *Ptr) + : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {} + + public: + /// \brief Generic recursive split emission routine. + /// + /// This method recursively splits an aggregate op (load or store) into + /// scalar or vector ops. 
It splits recursively until it hits a single value + /// and emits that single value operation via the template argument. + /// + /// The logic of this routine relies on GEPs and insertvalue and + /// extractvalue all operating with the same fundamental index list, merely + /// formatted differently (GEPs need actual values). + /// + /// \param Ty The type being split recursively into smaller ops. + /// \param Agg The aggregate value being built up or stored, depending on + /// whether this is splitting a load or a store respectively. + void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) { + if (Ty->isSingleValueType()) + return static_cast<Derived *>(this)->emitFunc(Ty, Agg, Name); + + if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { + unsigned OldSize = Indices.size(); + (void)OldSize; + for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size; + ++Idx) { + assert(Indices.size() == OldSize && "Did not return to the old size"); + Indices.push_back(Idx); + GEPIndices.push_back(IRB.getInt32(Idx)); + emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx)); + GEPIndices.pop_back(); + Indices.pop_back(); + } + return; + } + + if (StructType *STy = dyn_cast<StructType>(Ty)) { + unsigned OldSize = Indices.size(); + (void)OldSize; + for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size; + ++Idx) { + assert(Indices.size() == OldSize && "Did not return to the old size"); + Indices.push_back(Idx); + GEPIndices.push_back(IRB.getInt32(Idx)); + emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx)); + GEPIndices.pop_back(); + Indices.pop_back(); + } + return; + } + + llvm_unreachable("Only arrays and structs are aggregate loadable types"); + } + }; + + struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> { + LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr) + : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {} + + /// Emit a leaf load of a single value. This is called at the leaves of the + /// recursive emission to actually load values. + void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) { + assert(Ty->isSingleValueType()); + // Load the single value and insert it using the indices. + Value *Load = IRB.CreateLoad(IRB.CreateInBoundsGEP(Ptr, GEPIndices, + Name + ".gep"), + Name + ".load"); + Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert"); + DEBUG(dbgs() << " to: " << *Load << "\n"); + } + }; + + bool visitLoadInst(LoadInst &LI) { + assert(LI.getPointerOperand() == *U); + if (!LI.isSimple() || LI.getType()->isSingleValueType()) + return false; + + // We have an aggregate being loaded, split it apart. + DEBUG(dbgs() << " original: " << LI << "\n"); + LoadOpSplitter Splitter(&LI, *U); + Value *V = UndefValue::get(LI.getType()); + Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca"); + LI.replaceAllUsesWith(V); + LI.eraseFromParent(); + return true; + } + + struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> { + StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr) + : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {} + + /// Emit a leaf store of a single value. This is called at the leaves of the + /// recursive emission to actually produce stores. + void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) { + assert(Ty->isSingleValueType()); + // Extract the single value and store it using the indices. 
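+ // E.g. storing a {i32, i64} aggregate becomes two extractvalues, two GEPs
+ // and two scalar stores, one per leaf field.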
+ Value *Store = IRB.CreateStore( + IRB.CreateExtractValue(Agg, Indices, Name + ".extract"), + IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep")); + (void)Store; + DEBUG(dbgs() << " to: " << *Store << "\n"); + } + }; + + bool visitStoreInst(StoreInst &SI) { + if (!SI.isSimple() || SI.getPointerOperand() != *U) + return false; + Value *V = SI.getValueOperand(); + if (V->getType()->isSingleValueType()) + return false; + + // We have an aggregate being stored, split it apart. + DEBUG(dbgs() << " original: " << SI << "\n"); + StoreOpSplitter Splitter(&SI, *U); + Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca"); + SI.eraseFromParent(); + return true; + } + + bool visitBitCastInst(BitCastInst &BC) { + enqueueUsers(BC); + return false; + } + + bool visitGetElementPtrInst(GetElementPtrInst &GEPI) { + enqueueUsers(GEPI); + return false; + } + + bool visitPHINode(PHINode &PN) { + enqueueUsers(PN); + return false; + } + + bool visitSelectInst(SelectInst &SI) { + enqueueUsers(SI); + return false; + } +}; +} + +/// \brief Strip aggregate type wrapping. +/// +/// This removes no-op aggregate types wrapping an underlying type. It will +/// strip as many layers of types as it can without changing either the type +/// size or the allocated size. +static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) { + if (Ty->isSingleValueType()) + return Ty; + + uint64_t AllocSize = DL.getTypeAllocSize(Ty); + uint64_t TypeSize = DL.getTypeSizeInBits(Ty); + + Type *InnerTy; + if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) { + InnerTy = ArrTy->getElementType(); + } else if (StructType *STy = dyn_cast<StructType>(Ty)) { + const StructLayout *SL = DL.getStructLayout(STy); + unsigned Index = SL->getElementContainingOffset(0); + InnerTy = STy->getElementType(Index); + } else { + return Ty; + } + + if (AllocSize > DL.getTypeAllocSize(InnerTy) || + TypeSize > DL.getTypeSizeInBits(InnerTy)) + return Ty; + + return stripAggregateTypeWrapping(DL, InnerTy); +} + +/// \brief Try to find a partition of the aggregate type passed in for a given +/// offset and size. +/// +/// This recurses through the aggregate type and tries to compute a subtype +/// based on the offset and size. When the offset and size span a sub-section +/// of an array, it will even compute a new array type for that sub-section, +/// and the same for structs. +/// +/// Note that this routine is very strict and tries to find a partition of the +/// type which produces the *exact* right offset and size. It is not forgiving +/// when the size or offset cause either end of type-based partition to be off. +/// Also, this is a best-effort routine. It is reasonable to give up and not +/// return a type if necessary. +static Type *getTypePartition(const DataLayout &TD, Type *Ty, + uint64_t Offset, uint64_t Size) { + if (Offset == 0 && TD.getTypeAllocSize(Ty) == Size) + return stripAggregateTypeWrapping(TD, Ty); + if (Offset > TD.getTypeAllocSize(Ty) || + (TD.getTypeAllocSize(Ty) - Offset) < Size) + return 0; + + if (SequentialType *SeqTy = dyn_cast<SequentialType>(Ty)) { + // We can't partition pointers... 
+ if (SeqTy->isPointerTy()) + return 0; + + Type *ElementTy = SeqTy->getElementType(); + uint64_t ElementSize = TD.getTypeAllocSize(ElementTy); + uint64_t NumSkippedElements = Offset / ElementSize; + if (ArrayType *ArrTy = dyn_cast<ArrayType>(SeqTy)) + if (NumSkippedElements >= ArrTy->getNumElements()) + return 0; + if (VectorType *VecTy = dyn_cast<VectorType>(SeqTy)) + if (NumSkippedElements >= VecTy->getNumElements()) + return 0; + Offset -= NumSkippedElements * ElementSize; + + // First check if we need to recurse. + if (Offset > 0 || Size < ElementSize) { + // Bail if the partition ends in a different array element. + if ((Offset + Size) > ElementSize) + return 0; + // Recurse through the element type trying to peel off offset bytes. + return getTypePartition(TD, ElementTy, Offset, Size); + } + assert(Offset == 0); + + if (Size == ElementSize) + return stripAggregateTypeWrapping(TD, ElementTy); + assert(Size > ElementSize); + uint64_t NumElements = Size / ElementSize; + if (NumElements * ElementSize != Size) + return 0; + return ArrayType::get(ElementTy, NumElements); + } + + StructType *STy = dyn_cast<StructType>(Ty); + if (!STy) + return 0; + + const StructLayout *SL = TD.getStructLayout(STy); + if (Offset >= SL->getSizeInBytes()) + return 0; + uint64_t EndOffset = Offset + Size; + if (EndOffset > SL->getSizeInBytes()) + return 0; + + unsigned Index = SL->getElementContainingOffset(Offset); + Offset -= SL->getElementOffset(Index); + + Type *ElementTy = STy->getElementType(Index); + uint64_t ElementSize = TD.getTypeAllocSize(ElementTy); + if (Offset >= ElementSize) + return 0; // The offset points into alignment padding. + + // See if any partition must be contained by the element. + if (Offset > 0 || Size < ElementSize) { + if ((Offset + Size) > ElementSize) + return 0; + return getTypePartition(TD, ElementTy, Offset, Size); + } + assert(Offset == 0); + + if (Size == ElementSize) + return stripAggregateTypeWrapping(TD, ElementTy); + + StructType::element_iterator EI = STy->element_begin() + Index, + EE = STy->element_end(); + if (EndOffset < SL->getSizeInBytes()) { + unsigned EndIndex = SL->getElementContainingOffset(EndOffset); + if (Index == EndIndex) + return 0; // Within a single element and its padding. + + // Don't try to form "natural" types if the elements don't line up with the + // expected size. + // FIXME: We could potentially recurse down through the last element in the + // sub-struct to find a natural end point. + if (SL->getElementOffset(EndIndex) != EndOffset) + return 0; + + assert(Index < EndIndex); + EE = STy->element_begin() + EndIndex; + } + + // Try to build up a sub-structure. + StructType *SubTy = StructType::get(STy->getContext(), makeArrayRef(EI, EE), + STy->isPacked()); + const StructLayout *SubSL = TD.getStructLayout(SubTy); + if (Size != SubSL->getSizeInBytes()) + return 0; // The sub-struct doesn't have quite the size needed. + + return SubTy; +} + +/// \brief Rewrite an alloca partition's users. +/// +/// This routine drives both of the rewriting goals of the SROA pass. It tries +/// to rewrite uses of an alloca partition to be conducive for SSA value +/// promotion. If the partition needs a new, more refined alloca, this will +/// build that new alloca, preserving as much type information as possible, and +/// rewrite the uses of the old alloca to point at the new one and have the +/// appropriate new offsets. It also evaluates how successful the rewrite was +/// at enabling promotion and if it was successful queues the alloca to be +/// promoted. 
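For intuition about the byte-range arithmetic in getTypePartition above, here is a small self-contained sketch (illustrative only; it is not LLVM API, every name in it is made up, and it deliberately skips the recursion into sub-elements and the whole struct case) of the array branch: a byte range either covers a whole number of elements or the routine gives up and the caller falls back to an integer or i8-array type.

#include <cassert>
#include <cstdint>

// Sketch of the array case: how many whole elements of size EltSize does the
// byte range [Offset, Offset + Size) cover inside an array of NumElts
// elements? Returns 0 if the range starts or ends mid-element or runs past
// the array. (The real getTypePartition additionally recurses into the
// element type when the range is contained within a single element.)
static uint64_t coveredElements(uint64_t EltSize, uint64_t NumElts,
                                uint64_t Offset, uint64_t Size) {
  uint64_t Skipped = Offset / EltSize;
  if (Skipped >= NumElts)
    return 0;
  if (Offset % EltSize != 0 || Size % EltSize != 0)
    return 0;                              // partial element: give up
  uint64_t N = Size / EltSize;
  return (Skipped + N <= NumElts) ? N : 0; // must stay inside the array
}

int main() {
  assert(coveredElements(4, 8, 8, 16) == 4); // bytes [8,24) of [8 x i32] -> [4 x i32]
  assert(coveredElements(4, 8, 2, 4) == 0);  // straddles two i32 elements
  assert(coveredElements(4, 8, 28, 8) == 0); // runs off the end of the array
}

The struct branch in the patch is analogous, except that it consults StructLayout for element offsets and may synthesize a packed sub-struct covering the range exactly.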
+bool SROA::rewriteAllocaPartition(AllocaInst &AI,
+                                  AllocaPartitioning &P,
+                                  AllocaPartitioning::iterator PI) {
+  uint64_t AllocaSize = PI->EndOffset - PI->BeginOffset;
+  bool IsLive = false;
+  for (AllocaPartitioning::use_iterator UI = P.use_begin(PI),
+                                        UE = P.use_end(PI);
+       UI != UE && !IsLive; ++UI)
+    if (UI->U)
+      IsLive = true;
+  if (!IsLive)
+    return false; // No live uses left of this partition.
+
+  DEBUG(dbgs() << "Speculating PHIs and selects in partition "
+               << "[" << PI->BeginOffset << "," << PI->EndOffset << ")\n");
+
+  PHIOrSelectSpeculator Speculator(*TD, P, *this);
+  DEBUG(dbgs() << "  speculating ");
+  DEBUG(P.print(dbgs(), PI, ""));
+  Speculator.visitUsers(PI);
+
+  // Try to compute a friendly type for this partition of the alloca. This
+  // won't always succeed, in which case we fall back to a legal integer type
+  // or an i8 array of an appropriate size.
+  Type *AllocaTy = 0;
+  if (Type *PartitionTy = P.getCommonType(PI))
+    if (TD->getTypeAllocSize(PartitionTy) >= AllocaSize)
+      AllocaTy = PartitionTy;
+  if (!AllocaTy)
+    if (Type *PartitionTy = getTypePartition(*TD, AI.getAllocatedType(),
+                                             PI->BeginOffset, AllocaSize))
+      AllocaTy = PartitionTy;
+  if ((!AllocaTy ||
+       (AllocaTy->isArrayTy() &&
+        AllocaTy->getArrayElementType()->isIntegerTy())) &&
+      TD->isLegalInteger(AllocaSize * 8))
+    AllocaTy = Type::getIntNTy(*C, AllocaSize * 8);
+  if (!AllocaTy)
+    AllocaTy = ArrayType::get(Type::getInt8Ty(*C), AllocaSize);
+  assert(TD->getTypeAllocSize(AllocaTy) >= AllocaSize);
+
+  // Check for the case where we're going to rewrite to a new alloca of the
+  // exact same type as the original, and with the same access offsets. In that
+  // case, re-use the existing alloca, but still run through the rewriter to
+  // perform phi and select speculation.
+  AllocaInst *NewAI;
+  if (AllocaTy == AI.getAllocatedType()) {
+    assert(PI->BeginOffset == 0 &&
+           "Non-zero begin offset but same alloca type");
+    assert(PI == P.begin() && "Begin offset is zero on later partition");
+    NewAI = &AI;
+  } else {
+    unsigned Alignment = AI.getAlignment();
+    if (!Alignment) {
+      // The minimum alignment which users can rely on when the explicit
+      // alignment is omitted or zero is that required by the ABI for this
+      // type.
+      Alignment = TD->getABITypeAlignment(AI.getAllocatedType());
+    }
+    Alignment = MinAlign(Alignment, PI->BeginOffset);
+    // If we will get at least this much alignment from the type alone, leave
+    // the alloca's alignment unconstrained.
+    if (Alignment <= TD->getABITypeAlignment(AllocaTy))
+      Alignment = 0;
+    NewAI = new AllocaInst(AllocaTy, 0, Alignment,
+                           AI.getName() + ".sroa." + Twine(PI - P.begin()),
+                           &AI);
+    ++NumNewAllocas;
+  }
+
+  DEBUG(dbgs() << "Rewriting alloca partition "
+               << "[" << PI->BeginOffset << "," << PI->EndOffset << ") to: "
+               << *NewAI << "\n");
+
+  // Track the high watermark of the post-promotion worklist. We will reset it
+  // to this point if the alloca is not in fact scheduled for promotion.
+  unsigned PPWOldSize = PostPromotionWorklist.size();
+
+  AllocaPartitionRewriter Rewriter(*TD, P, PI, *this, AI, *NewAI,
+                                   PI->BeginOffset, PI->EndOffset);
+  DEBUG(dbgs() << "  rewriting ");
+  DEBUG(P.print(dbgs(), PI, ""));
+  bool Promotable = Rewriter.visitUsers(P.use_begin(PI), P.use_end(PI));
+  if (Promotable) {
+    DEBUG(dbgs() << "  and queuing for promotion\n");
+    PromotableAllocas.push_back(NewAI);
+  } else if (NewAI != &AI) {
+    // If we can't promote the alloca, iterate on it to check for new
+    // refinements exposed by splitting the current alloca.
Don't iterate on an + // alloca which didn't actually change and didn't get promoted. + Worklist.insert(NewAI); + } + + // Drop any post-promotion work items if promotion didn't happen. + if (!Promotable) + while (PostPromotionWorklist.size() > PPWOldSize) + PostPromotionWorklist.pop_back(); + + return true; +} + +/// \brief Walks the partitioning of an alloca rewriting uses of each partition. +bool SROA::splitAlloca(AllocaInst &AI, AllocaPartitioning &P) { + bool Changed = false; + for (AllocaPartitioning::iterator PI = P.begin(), PE = P.end(); PI != PE; + ++PI) + Changed |= rewriteAllocaPartition(AI, P, PI); + + return Changed; +} + +/// \brief Analyze an alloca for SROA. +/// +/// This analyzes the alloca to ensure we can reason about it, builds +/// a partitioning of the alloca, and then hands it off to be split and +/// rewritten as needed. +bool SROA::runOnAlloca(AllocaInst &AI) { + DEBUG(dbgs() << "SROA alloca: " << AI << "\n"); + ++NumAllocasAnalyzed; + + // Special case dead allocas, as they're trivial. + if (AI.use_empty()) { + AI.eraseFromParent(); + return true; + } + + // Skip alloca forms that this analysis can't handle. + if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() || + TD->getTypeAllocSize(AI.getAllocatedType()) == 0) + return false; + + bool Changed = false; + + // First, split any FCA loads and stores touching this alloca to promote + // better splitting and promotion opportunities. + AggLoadStoreRewriter AggRewriter(*TD); + Changed |= AggRewriter.rewrite(AI); + + // Build the partition set using a recursive instruction-visiting builder. + AllocaPartitioning P(*TD, AI); + DEBUG(P.print(dbgs())); + if (P.isEscaped()) + return Changed; + + // Delete all the dead users of this alloca before splitting and rewriting it. + for (AllocaPartitioning::dead_user_iterator DI = P.dead_user_begin(), + DE = P.dead_user_end(); + DI != DE; ++DI) { + Changed = true; + (*DI)->replaceAllUsesWith(UndefValue::get((*DI)->getType())); + DeadInsts.insert(*DI); + } + for (AllocaPartitioning::dead_op_iterator DO = P.dead_op_begin(), + DE = P.dead_op_end(); + DO != DE; ++DO) { + Value *OldV = **DO; + // Clobber the use with an undef value. + **DO = UndefValue::get(OldV->getType()); + if (Instruction *OldI = dyn_cast<Instruction>(OldV)) + if (isInstructionTriviallyDead(OldI)) { + Changed = true; + DeadInsts.insert(OldI); + } + } + + // No partitions to split. Leave the dead alloca for a later pass to clean up. + if (P.begin() == P.end()) + return Changed; + + return splitAlloca(AI, P) || Changed; +} + +/// \brief Delete the dead instructions accumulated in this run. +/// +/// Recursively deletes the dead instructions we've accumulated. This is done +/// at the very end to maximize locality of the recursive delete and to +/// minimize the problems of invalidated instruction pointers as such pointers +/// are used heavily in the intermediate stages of the algorithm. +/// +/// We also record the alloca instructions deleted here so that they aren't +/// subsequently handed to mem2reg to promote. +void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) { + while (!DeadInsts.empty()) { + Instruction *I = DeadInsts.pop_back_val(); + DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n"); + + I->replaceAllUsesWith(UndefValue::get(I->getType())); + + for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) + if (Instruction *U = dyn_cast<Instruction>(*OI)) { + // Zero out the operand and see if it becomes trivially dead. 
+      *OI = 0;
+      if (isInstructionTriviallyDead(U))
+        DeadInsts.insert(U);
+    }
+
+    if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
+      DeletedAllocas.insert(AI);
+
+    ++NumDeleted;
+    I->eraseFromParent();
+  }
+}
+
+/// \brief Promote the allocas, using the best available technique.
+///
+/// This attempts to promote whatever allocas have been identified as viable in
+/// the PromotableAllocas list. If that list is empty, there is nothing to do.
+/// If there is a domtree available, we attempt to promote using the full power
+/// of mem2reg. Otherwise, we build and use the AllocaPromoter above which is
+/// based on the SSAUpdater utilities. This function returns whether any
+/// promotion occurred.
+bool SROA::promoteAllocas(Function &F) {
+  if (PromotableAllocas.empty())
+    return false;
+
+  NumPromoted += PromotableAllocas.size();
+
+  if (DT && !ForceSSAUpdater) {
+    DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
+    PromoteMemToReg(PromotableAllocas, *DT);
+    PromotableAllocas.clear();
+    return true;
+  }
+
+  DEBUG(dbgs() << "Promoting allocas with SSAUpdater...\n");
+  SSAUpdater SSA;
+  DIBuilder DIB(*F.getParent());
+  SmallVector<Instruction*, 64> Insts;
+
+  for (unsigned Idx = 0, Size = PromotableAllocas.size(); Idx != Size; ++Idx) {
+    AllocaInst *AI = PromotableAllocas[Idx];
+    for (Value::use_iterator UI = AI->use_begin(), UE = AI->use_end();
+         UI != UE;) {
+      Instruction *I = cast<Instruction>(*UI++);
+      // FIXME: Currently the SSAUpdater infrastructure doesn't reason about
+      // lifetime intrinsics and so we strip them (and the bitcasts+GEPs
+      // leading to them) here. Eventually it should use them to optimize the
+      // scalar values produced.
+      if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I)) {
+        assert(onlyUsedByLifetimeMarkers(I) &&
+               "Found a bitcast used outside of a lifetime marker.");
+        while (!I->use_empty())
+          cast<Instruction>(*I->use_begin())->eraseFromParent();
+        I->eraseFromParent();
+        continue;
+      }
+      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+        assert(II->getIntrinsicID() == Intrinsic::lifetime_start ||
+               II->getIntrinsicID() == Intrinsic::lifetime_end);
+        II->eraseFromParent();
+        continue;
+      }
+
+      Insts.push_back(I);
+    }
+    AllocaPromoter(Insts, SSA, *AI, DIB).run(Insts);
+    Insts.clear();
+  }
+
+  PromotableAllocas.clear();
+  return true;
+}
+
+namespace {
+  /// \brief A predicate to test whether an alloca belongs to a set.
+  class IsAllocaInSet {
+    typedef SmallPtrSet<AllocaInst *, 4> SetType;
+    const SetType &Set;
+
+  public:
+    typedef AllocaInst *argument_type;
+
+    IsAllocaInSet(const SetType &Set) : Set(Set) {}
+    bool operator()(AllocaInst *AI) const { return Set.count(AI); }
+  };
+}
+
+bool SROA::runOnFunction(Function &F) {
+  DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
+  C = &F.getContext();
+  TD = getAnalysisIfAvailable<DataLayout>();
+  if (!TD) {
+    DEBUG(dbgs() << "  Skipping SROA -- no target data!\n");
+    return false;
+  }
+  DT = getAnalysisIfAvailable<DominatorTree>();
+
+  BasicBlock &EntryBB = F.getEntryBlock();
+  for (BasicBlock::iterator I = EntryBB.begin(), E = llvm::prior(EntryBB.end());
+       I != E; ++I)
+    if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
+      Worklist.insert(AI);
+
+  bool Changed = false;
+  // A set of deleted alloca instruction pointers which should be removed from
+  // the list of promotable allocas.
+ SmallPtrSet<AllocaInst *, 4> DeletedAllocas; + + do { + while (!Worklist.empty()) { + Changed |= runOnAlloca(*Worklist.pop_back_val()); + deleteDeadInstructions(DeletedAllocas); + + // Remove the deleted allocas from various lists so that we don't try to + // continue processing them. + if (!DeletedAllocas.empty()) { + Worklist.remove_if(IsAllocaInSet(DeletedAllocas)); + PostPromotionWorklist.remove_if(IsAllocaInSet(DeletedAllocas)); + PromotableAllocas.erase(std::remove_if(PromotableAllocas.begin(), + PromotableAllocas.end(), + IsAllocaInSet(DeletedAllocas)), + PromotableAllocas.end()); + DeletedAllocas.clear(); + } + } + + Changed |= promoteAllocas(F); + + Worklist = PostPromotionWorklist; + PostPromotionWorklist.clear(); + } while (!Worklist.empty()); + + return Changed; +} + +void SROA::getAnalysisUsage(AnalysisUsage &AU) const { + if (RequiresDomTree) + AU.addRequired<DominatorTree>(); + AU.setPreservesCFG(); +} diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index 48318c8..35d2fa0 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -13,14 +13,14 @@ // //===----------------------------------------------------------------------===// -#include "llvm-c/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar.h" #include "llvm-c/Initialization.h" -#include "llvm/InitializePasses.h" -#include "llvm/PassManager.h" +#include "llvm-c/Transforms/Scalar.h" #include "llvm/Analysis/Passes.h" #include "llvm/Analysis/Verifier.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Transforms/Scalar.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/InitializePasses.h" +#include "llvm/PassManager.h" using namespace llvm; @@ -59,6 +59,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeRegToMemPass(Registry); initializeSCCPPass(Registry); initializeIPSCCPPass(Registry); + initializeSROAPass(Registry); initializeSROA_DTPass(Registry); initializeSROA_SSAUpPass(Registry); initializeCFGSimplifyPassPass(Registry); diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp index 8090fdf..e590a37 100644 --- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -21,32 +21,32 @@ #define DEBUG_TYPE "scalarrepl" #include "llvm/Transforms/Scalar.h" -#include "llvm/Constants.h" -#include "llvm/DIBuilder.h" -#include "llvm/DebugInfo.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Function.h" -#include "llvm/GlobalVariable.h" -#include "llvm/IRBuilder.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/LLVMContext.h" -#include "llvm/Module.h" -#include "llvm/Operator.h" -#include "llvm/Pass.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/DIBuilder.h" +#include "llvm/DebugInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/Pass.h" #include "llvm/Support/CallSite.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include 
"llvm/Support/GetElementPtrTypeIterator.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetData.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" #include "llvm/Transforms/Utils/SSAUpdater.h" @@ -87,7 +87,7 @@ namespace { private: bool HasDomTree; - TargetData *TD; + DataLayout *TD; /// DeadInsts - Keep track of instructions we have made dead, so that /// we can remove them after we are done working. @@ -258,7 +258,7 @@ namespace { class ConvertToScalarInfo { /// AllocaSize - The size of the alloca being considered in bytes. unsigned AllocaSize; - const TargetData &TD; + const DataLayout &TD; unsigned ScalarLoadThreshold; /// IsNotTrivial - This is set to true if there is some access to the object @@ -301,7 +301,7 @@ class ConvertToScalarInfo { bool HadDynamicAccess; public: - explicit ConvertToScalarInfo(unsigned Size, const TargetData &td, + explicit ConvertToScalarInfo(unsigned Size, const DataLayout &td, unsigned SLT) : AllocaSize(Size), TD(td), ScalarLoadThreshold(SLT), IsNotTrivial(false), ScalarKind(Unknown), VectorTy(0), HadNonMemTransferAccess(false), @@ -1020,11 +1020,11 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, bool SROA::runOnFunction(Function &F) { - TD = getAnalysisIfAvailable<TargetData>(); + TD = getAnalysisIfAvailable<DataLayout>(); bool Changed = performPromotion(F); - // FIXME: ScalarRepl currently depends on TargetData more than it + // FIXME: ScalarRepl currently depends on DataLayout more than it // theoretically needs to. It should be refactored in order to support // target-independent IR. Until this is done, just skip the actual // scalar-replacement portion of this pass. @@ -1134,7 +1134,7 @@ public: /// /// We can do this to a select if its only uses are loads and if the operand to /// the select can be loaded unconditionally. -static bool isSafeSelectToSpeculate(SelectInst *SI, const TargetData *TD) { +static bool isSafeSelectToSpeculate(SelectInst *SI, const DataLayout *TD) { bool TDerefable = SI->getTrueValue()->isDereferenceablePointer(); bool FDerefable = SI->getFalseValue()->isDereferenceablePointer(); @@ -1172,7 +1172,7 @@ static bool isSafeSelectToSpeculate(SelectInst *SI, const TargetData *TD) { /// /// We can do this to a select if its only uses are loads and if the operand to /// the select can be loaded unconditionally. -static bool isSafePHIToSpeculate(PHINode *PN, const TargetData *TD) { +static bool isSafePHIToSpeculate(PHINode *PN, const DataLayout *TD) { // For now, we can only do this promotion if the load is in the same block as // the PHI, and if there are no stores between the phi and load. // TODO: Allow recursive phi users. @@ -1236,7 +1236,7 @@ static bool isSafePHIToSpeculate(PHINode *PN, const TargetData *TD) { /// direct (non-volatile) loads and stores to it. If the alloca is close but /// not quite there, this will transform the code to allow promotion. As such, /// it is a non-pure predicate. -static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { +static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *TD) { SetVector<Instruction*, SmallVector<Instruction*, 4>, SmallPtrSet<Instruction*, 4> > InstsToRewrite; @@ -2537,7 +2537,7 @@ void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, /// HasPadding - Return true if the specified type has any structure or /// alignment padding in between the elements that would be split apart /// by SROA; return false otherwise. 
-static bool HasPadding(Type *Ty, const TargetData &TD) { +static bool HasPadding(Type *Ty, const DataLayout &TD) { if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { Ty = ATy->getElementType(); return TD.getTypeSizeInBits(Ty) != TD.getTypeAllocSizeInBits(Ty); diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp index 6d27db1..c243d34 100644 --- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -23,18 +23,19 @@ #define DEBUG_TYPE "simplifycfg" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Constants.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/Module.h" -#include "llvm/Attributes.h" -#include "llvm/Support/CFG.h" -#include "llvm/Pass.h" -#include "llvm/Target/TargetData.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/CFG.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; STATISTIC(NumSimpl, "Number of blocks simplified"); @@ -47,12 +48,19 @@ namespace { } virtual bool runOnFunction(Function &F); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetTransformInfo>(); + } }; } char CFGSimplifyPass::ID = 0; -INITIALIZE_PASS(CFGSimplifyPass, "simplifycfg", - "Simplify the CFG", false, false) +INITIALIZE_PASS_BEGIN(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", + false, false) +INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", + false, false) // Public interface to the CFGSimplification pass FunctionPass *llvm::createCFGSimplificationPass() { @@ -110,13 +118,11 @@ static bool markAliveBlocks(BasicBlock *BB, SmallVector<BasicBlock*, 128> Worklist; Worklist.push_back(BB); + Reachable.insert(BB); bool Changed = false; do { BB = Worklist.pop_back_val(); - if (!Reachable.insert(BB)) - continue; - // Do a quick scan of the basic block, turning any obviously unreachable // instructions into LLVM unreachable insts. The instruction combining pass // canonicalizes unreachable insts into stores to null or undef. @@ -175,7 +181,8 @@ static bool markAliveBlocks(BasicBlock *BB, Changed |= ConstantFoldTerminator(BB, true); for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) - Worklist.push_back(*SI); + if (Reachable.insert(*SI)) + Worklist.push_back(*SI); } while (!Worklist.empty()); return Changed; } @@ -293,7 +300,8 @@ static bool mergeEmptyReturnBlocks(Function &F) { /// iterativelySimplifyCFG - Call SimplifyCFG on all the blocks in the function, /// iterating until no more changes are made. -static bool iterativelySimplifyCFG(Function &F, const TargetData *TD) { +static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, + const DataLayout *TD) { bool Changed = false; bool LocalChange = true; while (LocalChange) { @@ -302,7 +310,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetData *TD) { // Loop over all of the basic blocks and remove them if they are unneeded... 
// for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) { - if (SimplifyCFG(BBIt++, TD)) { + if (SimplifyCFG(BBIt++, TTI, TD)) { LocalChange = true; ++NumSimpl; } @@ -316,10 +324,11 @@ static bool iterativelySimplifyCFG(Function &F, const TargetData *TD) { // simplify the CFG. // bool CFGSimplifyPass::runOnFunction(Function &F) { - const TargetData *TD = getAnalysisIfAvailable<TargetData>(); + const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>(); + const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); bool EverChanged = removeUnreachableBlocksFromFn(F); EverChanged |= mergeEmptyReturnBlocks(F); - EverChanged |= iterativelySimplifyCFG(F, TD); + EverChanged |= iterativelySimplifyCFG(F, TTI, TD); // If neither pass changed anything, we're done. if (!EverChanged) return false; @@ -333,7 +342,7 @@ bool CFGSimplifyPass::runOnFunction(Function &F) { return true; do { - EverChanged = iterativelySimplifyCFG(F, TD); + EverChanged = iterativelySimplifyCFG(F, TTI, TD); EverChanged |= removeUnreachableBlocksFromFn(F); } while (EverChanged); diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp index 65311fe..d5cefa3 100644 --- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp +++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp @@ -17,32 +17,26 @@ #define DEBUG_TYPE "simplify-libcalls" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/BuildLibCalls.h" -#include "llvm/IRBuilder.h" -#include "llvm/Intrinsics.h" -#include "llvm/LLVMContext.h" -#include "llvm/Module.h" -#include "llvm/Pass.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringMap.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Config/config.h" // FIXME: Shouldn't depend on host! +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetData.h" #include "llvm/Target/TargetLibraryInfo.h" -#include "llvm/Config/config.h" // FIXME: Shouldn't depend on host! 
+#include "llvm/Transforms/Utils/BuildLibCalls.h" using namespace llvm; -STATISTIC(NumSimplified, "Number of library calls simplified"); STATISTIC(NumAnnotated, "Number of attributes added to library functions"); -static cl::opt<bool> UnsafeFPShrink("enable-double-float-shrink", cl::Hidden, - cl::init(false), - cl::desc("Enable unsafe double to float " - "shrinking for math lib calls")); //===----------------------------------------------------------------------===// // Optimizer Base Class //===----------------------------------------------------------------------===// @@ -53,7 +47,7 @@ namespace { class LibCallOptimization { protected: Function *Caller; - const TargetData *TD; + const DataLayout *TD; const TargetLibraryInfo *TLI; LLVMContext* Context; public: @@ -68,7 +62,7 @@ public: virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) =0; - Value *OptimizeCall(CallInst *CI, const TargetData *TD, + Value *OptimizeCall(CallInst *CI, const DataLayout *TD, const TargetLibraryInfo *TLI, IRBuilder<> &B) { Caller = CI->getParent()->getParent(); this->TD = TD; @@ -87,1470 +81,6 @@ public: //===----------------------------------------------------------------------===// -// Helper Functions -//===----------------------------------------------------------------------===// - -/// IsOnlyUsedInZeroEqualityComparison - Return true if it only matters that the -/// value is equal or not-equal to zero. -static bool IsOnlyUsedInZeroEqualityComparison(Value *V) { - for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); - UI != E; ++UI) { - if (ICmpInst *IC = dyn_cast<ICmpInst>(*UI)) - if (IC->isEquality()) - if (Constant *C = dyn_cast<Constant>(IC->getOperand(1))) - if (C->isNullValue()) - continue; - // Unknown instruction. - return false; - } - return true; -} - -static bool CallHasFloatingPointArgument(const CallInst *CI) { - for (CallInst::const_op_iterator it = CI->op_begin(), e = CI->op_end(); - it != e; ++it) { - if ((*it)->getType()->isFloatingPointTy()) - return true; - } - return false; -} - -/// IsOnlyUsedInEqualityComparison - Return true if it is only used in equality -/// comparisons with With. -static bool IsOnlyUsedInEqualityComparison(Value *V, Value *With) { - for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); - UI != E; ++UI) { - if (ICmpInst *IC = dyn_cast<ICmpInst>(*UI)) - if (IC->isEquality() && IC->getOperand(1) == With) - continue; - // Unknown instruction. - return false; - } - return true; -} - -//===----------------------------------------------------------------------===// -// String and Memory LibCall Optimizations -//===----------------------------------------------------------------------===// - -//===---------------------------------------===// -// 'strcat' Optimizations -namespace { -struct StrCatOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // Verify the "strcat" function prototype. - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || - FT->getReturnType() != B.getInt8PtrTy() || - FT->getParamType(0) != FT->getReturnType() || - FT->getParamType(1) != FT->getReturnType()) - return 0; - - // Extract some information from the instruction - Value *Dst = CI->getArgOperand(0); - Value *Src = CI->getArgOperand(1); - - // See if we can get the length of the input string. - uint64_t Len = GetStringLength(Src); - if (Len == 0) return 0; - --Len; // Unbias length. 
- - // Handle the simple, do-nothing case: strcat(x, "") -> x - if (Len == 0) - return Dst; - - // These optimizations require TargetData. - if (!TD) return 0; - - return EmitStrLenMemCpy(Src, Dst, Len, B); - } - - Value *EmitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len, IRBuilder<> &B) { - // We need to find the end of the destination string. That's where the - // memory is to be moved to. We just generate a call to strlen. - Value *DstLen = EmitStrLen(Dst, B, TD, TLI); - if (!DstLen) - return 0; - - // Now that we have the destination's length, we must index into the - // destination's pointer to get the actual memcpy destination (end of - // the string .. we're concatenating). - Value *CpyDst = B.CreateGEP(Dst, DstLen, "endptr"); - - // We have enough information to now generate the memcpy call to do the - // concatenation for us. Make a memcpy to copy the nul byte with align = 1. - B.CreateMemCpy(CpyDst, Src, - ConstantInt::get(TD->getIntPtrType(*Context), Len + 1), 1); - return Dst; - } -}; - -//===---------------------------------------===// -// 'strncat' Optimizations - -struct StrNCatOpt : public StrCatOpt { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // Verify the "strncat" function prototype. - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 3 || - FT->getReturnType() != B.getInt8PtrTy() || - FT->getParamType(0) != FT->getReturnType() || - FT->getParamType(1) != FT->getReturnType() || - !FT->getParamType(2)->isIntegerTy()) - return 0; - - // Extract some information from the instruction - Value *Dst = CI->getArgOperand(0); - Value *Src = CI->getArgOperand(1); - uint64_t Len; - - // We don't do anything if length is not constant - if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2))) - Len = LengthArg->getZExtValue(); - else - return 0; - - // See if we can get the length of the input string. - uint64_t SrcLen = GetStringLength(Src); - if (SrcLen == 0) return 0; - --SrcLen; // Unbias length. - - // Handle the simple, do-nothing cases: - // strncat(x, "", c) -> x - // strncat(x, c, 0) -> x - if (SrcLen == 0 || Len == 0) return Dst; - - // These optimizations require TargetData. - if (!TD) return 0; - - // We don't optimize this case - if (Len < SrcLen) return 0; - - // strncat(x, s, c) -> strcat(x, s) - // s is constant so the strcat can be optimized further - return EmitStrLenMemCpy(Src, Dst, SrcLen, B); - } -}; - -//===---------------------------------------===// -// 'strchr' Optimizations - -struct StrChrOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // Verify the "strchr" function prototype. - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || - FT->getReturnType() != B.getInt8PtrTy() || - FT->getParamType(0) != FT->getReturnType() || - !FT->getParamType(1)->isIntegerTy(32)) - return 0; - - Value *SrcStr = CI->getArgOperand(0); - - // If the second operand is non-constant, see if we can compute the length - // of the input string and turn this into memchr. - ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); - if (CharC == 0) { - // These optimizations require TargetData. - if (!TD) return 0; - - uint64_t Len = GetStringLength(SrcStr); - if (Len == 0 || !FT->getParamType(1)->isIntegerTy(32))// memchr needs i32. - return 0; - - return EmitMemChr(SrcStr, CI->getArgOperand(1), // include nul. 
- ConstantInt::get(TD->getIntPtrType(*Context), Len), - B, TD, TLI); - } - - // Otherwise, the character is a constant, see if the first argument is - // a string literal. If so, we can constant fold. - StringRef Str; - if (!getConstantStringInfo(SrcStr, Str)) - return 0; - - // Compute the offset, make sure to handle the case when we're searching for - // zero (a weird way to spell strlen). - size_t I = CharC->getSExtValue() == 0 ? - Str.size() : Str.find(CharC->getSExtValue()); - if (I == StringRef::npos) // Didn't find the char. strchr returns null. - return Constant::getNullValue(CI->getType()); - - // strchr(s+n,c) -> gep(s+n+i,c) - return B.CreateGEP(SrcStr, B.getInt64(I), "strchr"); - } -}; - -//===---------------------------------------===// -// 'strrchr' Optimizations - -struct StrRChrOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // Verify the "strrchr" function prototype. - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || - FT->getReturnType() != B.getInt8PtrTy() || - FT->getParamType(0) != FT->getReturnType() || - !FT->getParamType(1)->isIntegerTy(32)) - return 0; - - Value *SrcStr = CI->getArgOperand(0); - ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); - - // Cannot fold anything if we're not looking for a constant. - if (!CharC) - return 0; - - StringRef Str; - if (!getConstantStringInfo(SrcStr, Str)) { - // strrchr(s, 0) -> strchr(s, 0) - if (TD && CharC->isZero()) - return EmitStrChr(SrcStr, '\0', B, TD, TLI); - return 0; - } - - // Compute the offset. - size_t I = CharC->getSExtValue() == 0 ? - Str.size() : Str.rfind(CharC->getSExtValue()); - if (I == StringRef::npos) // Didn't find the char. Return null. - return Constant::getNullValue(CI->getType()); - - // strrchr(s+n,c) -> gep(s+n+i,c) - return B.CreateGEP(SrcStr, B.getInt64(I), "strrchr"); - } -}; - -//===---------------------------------------===// -// 'strcmp' Optimizations - -struct StrCmpOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // Verify the "strcmp" function prototype. - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || - !FT->getReturnType()->isIntegerTy(32) || - FT->getParamType(0) != FT->getParamType(1) || - FT->getParamType(0) != B.getInt8PtrTy()) - return 0; - - Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1); - if (Str1P == Str2P) // strcmp(x,x) -> 0 - return ConstantInt::get(CI->getType(), 0); - - StringRef Str1, Str2; - bool HasStr1 = getConstantStringInfo(Str1P, Str1); - bool HasStr2 = getConstantStringInfo(Str2P, Str2); - - // strcmp(x, y) -> cnst (if both x and y are constant strings) - if (HasStr1 && HasStr2) - return ConstantInt::get(CI->getType(), Str1.compare(Str2)); - - if (HasStr1 && Str1.empty()) // strcmp("", x) -> -*x - return B.CreateNeg(B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), - CI->getType())); - - if (HasStr2 && Str2.empty()) // strcmp(x,"") -> *x - return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType()); - - // strcmp(P, "x") -> memcmp(P, "x", 2) - uint64_t Len1 = GetStringLength(Str1P); - uint64_t Len2 = GetStringLength(Str2P); - if (Len1 && Len2) { - // These optimizations require TargetData. 
- if (!TD) return 0; - - return EmitMemCmp(Str1P, Str2P, - ConstantInt::get(TD->getIntPtrType(*Context), - std::min(Len1, Len2)), B, TD, TLI); - } - - return 0; - } -}; - -//===---------------------------------------===// -// 'strncmp' Optimizations - -struct StrNCmpOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // Verify the "strncmp" function prototype. - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 3 || - !FT->getReturnType()->isIntegerTy(32) || - FT->getParamType(0) != FT->getParamType(1) || - FT->getParamType(0) != B.getInt8PtrTy() || - !FT->getParamType(2)->isIntegerTy()) - return 0; - - Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1); - if (Str1P == Str2P) // strncmp(x,x,n) -> 0 - return ConstantInt::get(CI->getType(), 0); - - // Get the length argument if it is constant. - uint64_t Length; - if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2))) - Length = LengthArg->getZExtValue(); - else - return 0; - - if (Length == 0) // strncmp(x,y,0) -> 0 - return ConstantInt::get(CI->getType(), 0); - - if (TD && Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1) - return EmitMemCmp(Str1P, Str2P, CI->getArgOperand(2), B, TD, TLI); - - StringRef Str1, Str2; - bool HasStr1 = getConstantStringInfo(Str1P, Str1); - bool HasStr2 = getConstantStringInfo(Str2P, Str2); - - // strncmp(x, y) -> cnst (if both x and y are constant strings) - if (HasStr1 && HasStr2) { - StringRef SubStr1 = Str1.substr(0, Length); - StringRef SubStr2 = Str2.substr(0, Length); - return ConstantInt::get(CI->getType(), SubStr1.compare(SubStr2)); - } - - if (HasStr1 && Str1.empty()) // strncmp("", x, n) -> -*x - return B.CreateNeg(B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), - CI->getType())); - - if (HasStr2 && Str2.empty()) // strncmp(x, "", n) -> *x - return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType()); - - return 0; - } -}; - - -//===---------------------------------------===// -// 'strcpy' Optimizations - -struct StrCpyOpt : public LibCallOptimization { - bool OptChkCall; // True if it's optimizing a __strcpy_chk libcall. - - StrCpyOpt(bool c) : OptChkCall(c) {} - - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // Verify the "strcpy" function prototype. - unsigned NumParams = OptChkCall ? 3 : 2; - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != NumParams || - FT->getReturnType() != FT->getParamType(0) || - FT->getParamType(0) != FT->getParamType(1) || - FT->getParamType(0) != B.getInt8PtrTy()) - return 0; - - Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); - if (Dst == Src) // strcpy(x,x) -> x - return Src; - - // These optimizations require TargetData. - if (!TD) return 0; - - // See if we can get the length of the input string. - uint64_t Len = GetStringLength(Src); - if (Len == 0) return 0; - - // We have enough information to now generate the memcpy call to do the - // concatenation for us. Make a memcpy to copy the nul byte with align = 1. - if (!OptChkCall || - !EmitMemCpyChk(Dst, Src, - ConstantInt::get(TD->getIntPtrType(*Context), Len), - CI->getArgOperand(2), B, TD, TLI)) - B.CreateMemCpy(Dst, Src, - ConstantInt::get(TD->getIntPtrType(*Context), Len), 1); - return Dst; - } -}; - -//===---------------------------------------===// -// 'stpcpy' Optimizations - -struct StpCpyOpt: public LibCallOptimization { - bool OptChkCall; // True if it's optimizing a __stpcpy_chk libcall. 
- - StpCpyOpt(bool c) : OptChkCall(c) {} - - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // Verify the "stpcpy" function prototype. - unsigned NumParams = OptChkCall ? 3 : 2; - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != NumParams || - FT->getReturnType() != FT->getParamType(0) || - FT->getParamType(0) != FT->getParamType(1) || - FT->getParamType(0) != B.getInt8PtrTy()) - return 0; - - // These optimizations require TargetData. - if (!TD) return 0; - - Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); - if (Dst == Src) { // stpcpy(x,x) -> x+strlen(x) - Value *StrLen = EmitStrLen(Src, B, TD, TLI); - return StrLen ? B.CreateInBoundsGEP(Dst, StrLen) : 0; - } - - // See if we can get the length of the input string. - uint64_t Len = GetStringLength(Src); - if (Len == 0) return 0; - - Value *LenV = ConstantInt::get(TD->getIntPtrType(*Context), Len); - Value *DstEnd = B.CreateGEP(Dst, - ConstantInt::get(TD->getIntPtrType(*Context), - Len - 1)); - - // We have enough information to now generate the memcpy call to do the - // copy for us. Make a memcpy to copy the nul byte with align = 1. - if (!OptChkCall || !EmitMemCpyChk(Dst, Src, LenV, CI->getArgOperand(2), B, - TD, TLI)) - B.CreateMemCpy(Dst, Src, LenV, 1); - return DstEnd; - } -}; - -//===---------------------------------------===// -// 'strncpy' Optimizations - -struct StrNCpyOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || - FT->getParamType(0) != FT->getParamType(1) || - FT->getParamType(0) != B.getInt8PtrTy() || - !FT->getParamType(2)->isIntegerTy()) - return 0; - - Value *Dst = CI->getArgOperand(0); - Value *Src = CI->getArgOperand(1); - Value *LenOp = CI->getArgOperand(2); - - // See if we can get the length of the input string. - uint64_t SrcLen = GetStringLength(Src); - if (SrcLen == 0) return 0; - --SrcLen; - - if (SrcLen == 0) { - // strncpy(x, "", y) -> memset(x, '\0', y, 1) - B.CreateMemSet(Dst, B.getInt8('\0'), LenOp, 1); - return Dst; - } - - uint64_t Len; - if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(LenOp)) - Len = LengthArg->getZExtValue(); - else - return 0; - - if (Len == 0) return Dst; // strncpy(x, y, 0) -> x - - // These optimizations require TargetData. 
- if (!TD) return 0; - - // Let strncpy handle the zero padding - if (Len > SrcLen+1) return 0; - - // strncpy(x, s, c) -> memcpy(x, s, c, 1) [s and c are constant] - B.CreateMemCpy(Dst, Src, - ConstantInt::get(TD->getIntPtrType(*Context), Len), 1); - - return Dst; - } -}; - -//===---------------------------------------===// -// 'strlen' Optimizations - -struct StrLenOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 1 || - FT->getParamType(0) != B.getInt8PtrTy() || - !FT->getReturnType()->isIntegerTy()) - return 0; - - Value *Src = CI->getArgOperand(0); - - // Constant folding: strlen("xyz") -> 3 - if (uint64_t Len = GetStringLength(Src)) - return ConstantInt::get(CI->getType(), Len-1); - - // strlen(x) != 0 --> *x != 0 - // strlen(x) == 0 --> *x == 0 - if (IsOnlyUsedInZeroEqualityComparison(CI)) - return B.CreateZExt(B.CreateLoad(Src, "strlenfirst"), CI->getType()); - return 0; - } -}; - - -//===---------------------------------------===// -// 'strpbrk' Optimizations - -struct StrPBrkOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || - FT->getParamType(0) != B.getInt8PtrTy() || - FT->getParamType(1) != FT->getParamType(0) || - FT->getReturnType() != FT->getParamType(0)) - return 0; - - StringRef S1, S2; - bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1); - bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2); - - // strpbrk(s, "") -> NULL - // strpbrk("", s) -> NULL - if ((HasS1 && S1.empty()) || (HasS2 && S2.empty())) - return Constant::getNullValue(CI->getType()); - - // Constant folding. - if (HasS1 && HasS2) { - size_t I = S1.find_first_of(S2); - if (I == std::string::npos) // No match. - return Constant::getNullValue(CI->getType()); - - return B.CreateGEP(CI->getArgOperand(0), B.getInt64(I), "strpbrk"); - } - - // strpbrk(s, "a") -> strchr(s, 'a') - if (TD && HasS2 && S2.size() == 1) - return EmitStrChr(CI->getArgOperand(0), S2[0], B, TD, TLI); - - return 0; - } -}; - -//===---------------------------------------===// -// 'strto*' Optimizations. This handles strtol, strtod, strtof, strtoul, etc. - -struct StrToOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - FunctionType *FT = Callee->getFunctionType(); - if ((FT->getNumParams() != 2 && FT->getNumParams() != 3) || - !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isPointerTy()) - return 0; - - Value *EndPtr = CI->getArgOperand(1); - if (isa<ConstantPointerNull>(EndPtr)) { - // With a null EndPtr, this function won't capture the main argument. - // It would be readonly too, except that it still may write to errno. 
- CI->addAttribute(1, Attribute::NoCapture); - } - - return 0; - } -}; - -//===---------------------------------------===// -// 'strspn' Optimizations - -struct StrSpnOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || - FT->getParamType(0) != B.getInt8PtrTy() || - FT->getParamType(1) != FT->getParamType(0) || - !FT->getReturnType()->isIntegerTy()) - return 0; - - StringRef S1, S2; - bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1); - bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2); - - // strspn(s, "") -> 0 - // strspn("", s) -> 0 - if ((HasS1 && S1.empty()) || (HasS2 && S2.empty())) - return Constant::getNullValue(CI->getType()); - - // Constant folding. - if (HasS1 && HasS2) { - size_t Pos = S1.find_first_not_of(S2); - if (Pos == StringRef::npos) Pos = S1.size(); - return ConstantInt::get(CI->getType(), Pos); - } - - return 0; - } -}; - -//===---------------------------------------===// -// 'strcspn' Optimizations - -struct StrCSpnOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || - FT->getParamType(0) != B.getInt8PtrTy() || - FT->getParamType(1) != FT->getParamType(0) || - !FT->getReturnType()->isIntegerTy()) - return 0; - - StringRef S1, S2; - bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1); - bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2); - - // strcspn("", s) -> 0 - if (HasS1 && S1.empty()) - return Constant::getNullValue(CI->getType()); - - // Constant folding. - if (HasS1 && HasS2) { - size_t Pos = S1.find_first_of(S2); - if (Pos == StringRef::npos) Pos = S1.size(); - return ConstantInt::get(CI->getType(), Pos); - } - - // strcspn(s, "") -> strlen(s) - if (TD && HasS2 && S2.empty()) - return EmitStrLen(CI->getArgOperand(0), B, TD, TLI); - - return 0; - } -}; - -//===---------------------------------------===// -// 'strstr' Optimizations - -struct StrStrOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || - !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isPointerTy() || - !FT->getReturnType()->isPointerTy()) - return 0; - - // fold strstr(x, x) -> x. - if (CI->getArgOperand(0) == CI->getArgOperand(1)) - return B.CreateBitCast(CI->getArgOperand(0), CI->getType()); - - // fold strstr(a, b) == a -> strncmp(a, b, strlen(b)) == 0 - if (TD && IsOnlyUsedInEqualityComparison(CI, CI->getArgOperand(0))) { - Value *StrLen = EmitStrLen(CI->getArgOperand(1), B, TD, TLI); - if (!StrLen) - return 0; - Value *StrNCmp = EmitStrNCmp(CI->getArgOperand(0), CI->getArgOperand(1), - StrLen, B, TD, TLI); - if (!StrNCmp) - return 0; - for (Value::use_iterator UI = CI->use_begin(), UE = CI->use_end(); - UI != UE; ) { - ICmpInst *Old = cast<ICmpInst>(*UI++); - Value *Cmp = B.CreateICmp(Old->getPredicate(), StrNCmp, - ConstantInt::getNullValue(StrNCmp->getType()), - "cmp"); - Old->replaceAllUsesWith(Cmp); - Old->eraseFromParent(); - } - return CI; - } - - // See if either input string is a constant string. - StringRef SearchStr, ToFindStr; - bool HasStr1 = getConstantStringInfo(CI->getArgOperand(0), SearchStr); - bool HasStr2 = getConstantStringInfo(CI->getArgOperand(1), ToFindStr); - - // fold strstr(x, "") -> x. 
- if (HasStr2 && ToFindStr.empty()) - return B.CreateBitCast(CI->getArgOperand(0), CI->getType()); - - // If both strings are known, constant fold it. - if (HasStr1 && HasStr2) { - std::string::size_type Offset = SearchStr.find(ToFindStr); - - if (Offset == StringRef::npos) // strstr("foo", "bar") -> null - return Constant::getNullValue(CI->getType()); - - // strstr("abcd", "bc") -> gep((char*)"abcd", 1) - Value *Result = CastToCStr(CI->getArgOperand(0), B); - Result = B.CreateConstInBoundsGEP1_64(Result, Offset, "strstr"); - return B.CreateBitCast(Result, CI->getType()); - } - - // fold strstr(x, "y") -> strchr(x, 'y'). - if (HasStr2 && ToFindStr.size() == 1) { - Value *StrChr= EmitStrChr(CI->getArgOperand(0), ToFindStr[0], B, TD, TLI); - return StrChr ? B.CreateBitCast(StrChr, CI->getType()) : 0; - } - return 0; - } -}; - - -//===---------------------------------------===// -// 'memcmp' Optimizations - -struct MemCmpOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 3 || !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isPointerTy() || - !FT->getReturnType()->isIntegerTy(32)) - return 0; - - Value *LHS = CI->getArgOperand(0), *RHS = CI->getArgOperand(1); - - if (LHS == RHS) // memcmp(s,s,x) -> 0 - return Constant::getNullValue(CI->getType()); - - // Make sure we have a constant length. - ConstantInt *LenC = dyn_cast<ConstantInt>(CI->getArgOperand(2)); - if (!LenC) return 0; - uint64_t Len = LenC->getZExtValue(); - - if (Len == 0) // memcmp(s1,s2,0) -> 0 - return Constant::getNullValue(CI->getType()); - - // memcmp(S1,S2,1) -> *(unsigned char*)LHS - *(unsigned char*)RHS - if (Len == 1) { - Value *LHSV = B.CreateZExt(B.CreateLoad(CastToCStr(LHS, B), "lhsc"), - CI->getType(), "lhsv"); - Value *RHSV = B.CreateZExt(B.CreateLoad(CastToCStr(RHS, B), "rhsc"), - CI->getType(), "rhsv"); - return B.CreateSub(LHSV, RHSV, "chardiff"); - } - - // Constant folding: memcmp(x, y, l) -> cnst (all arguments are constant) - StringRef LHSStr, RHSStr; - if (getConstantStringInfo(LHS, LHSStr) && - getConstantStringInfo(RHS, RHSStr)) { - // Make sure we're not reading out-of-bounds memory. - if (Len > LHSStr.size() || Len > RHSStr.size()) - return 0; - uint64_t Ret = memcmp(LHSStr.data(), RHSStr.data(), Len); - return ConstantInt::get(CI->getType(), Ret); - } - - return 0; - } -}; - -//===---------------------------------------===// -// 'memcpy' Optimizations - -struct MemCpyOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // These optimizations require TargetData. - if (!TD) return 0; - - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || - !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isPointerTy() || - FT->getParamType(2) != TD->getIntPtrType(*Context)) - return 0; - - // memcpy(x, y, n) -> llvm.memcpy(x, y, n, 1) - B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), 1); - return CI->getArgOperand(0); - } -}; - -//===---------------------------------------===// -// 'memmove' Optimizations - -struct MemMoveOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // These optimizations require TargetData. 
- if (!TD) return 0; - - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || - !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isPointerTy() || - FT->getParamType(2) != TD->getIntPtrType(*Context)) - return 0; - - // memmove(x, y, n) -> llvm.memmove(x, y, n, 1) - B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), 1); - return CI->getArgOperand(0); - } -}; - -//===---------------------------------------===// -// 'memset' Optimizations - -struct MemSetOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // These optimizations require TargetData. - if (!TD) return 0; - - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || - !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isIntegerTy() || - FT->getParamType(2) != TD->getIntPtrType(*Context)) - return 0; - - // memset(p, v, n) -> llvm.memset(p, v, n, 1) - Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false); - B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1); - return CI->getArgOperand(0); - } -}; - -//===----------------------------------------------------------------------===// -// Math Library Optimizations -//===----------------------------------------------------------------------===// - -//===---------------------------------------===// -// Double -> Float Shrinking Optimizations for Unary Functions like 'floor' - -struct UnaryDoubleFPOpt : public LibCallOptimization { - bool CheckRetType; - UnaryDoubleFPOpt(bool CheckReturnType): CheckRetType(CheckReturnType) {} - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 1 || !FT->getReturnType()->isDoubleTy() || - !FT->getParamType(0)->isDoubleTy()) - return 0; - - if (CheckRetType) { - // Check if all the uses for function like 'sin' are converted to float. - for (Value::use_iterator UseI = CI->use_begin(); UseI != CI->use_end(); - ++UseI) { - FPTruncInst *Cast = dyn_cast<FPTruncInst>(*UseI); - if (Cast == 0 || !Cast->getType()->isFloatTy()) - return 0; - } - } - - // If this is something like 'floor((double)floatval)', convert to floorf. - FPExtInst *Cast = dyn_cast<FPExtInst>(CI->getArgOperand(0)); - if (Cast == 0 || !Cast->getOperand(0)->getType()->isFloatTy()) - return 0; - - // floor((double)floatval) -> (double)floorf(floatval) - Value *V = Cast->getOperand(0); - V = EmitUnaryFloatFnCall(V, Callee->getName(), B, Callee->getAttributes()); - return B.CreateFPExt(V, B.getDoubleTy()); - } -}; - -//===---------------------------------------===// -// 'cos*' Optimizations -struct CosOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - Value *Ret = NULL; - if (UnsafeFPShrink && Callee->getName() == "cos" && - TLI->has(LibFunc::cosf)) { - UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true); - Ret = UnsafeUnaryDoubleFP.CallOptimizer(Callee, CI, B); - } - - FunctionType *FT = Callee->getFunctionType(); - // Just make sure this has 1 argument of FP type, which matches the - // result type. 
- if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || - !FT->getParamType(0)->isFloatingPointTy()) - return Ret; - - // cos(-x) -> cos(x) - Value *Op1 = CI->getArgOperand(0); - if (BinaryOperator::isFNeg(Op1)) { - BinaryOperator *BinExpr = cast<BinaryOperator>(Op1); - return B.CreateCall(Callee, BinExpr->getOperand(1), "cos"); - } - return Ret; - } -}; - -//===---------------------------------------===// -// 'pow*' Optimizations - -struct PowOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - Value *Ret = NULL; - if (UnsafeFPShrink && Callee->getName() == "pow" && - TLI->has(LibFunc::powf)) { - UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true); - Ret = UnsafeUnaryDoubleFP.CallOptimizer(Callee, CI, B); - } - - FunctionType *FT = Callee->getFunctionType(); - // Just make sure this has 2 arguments of the same FP type, which match the - // result type. - if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) || - FT->getParamType(0) != FT->getParamType(1) || - !FT->getParamType(0)->isFloatingPointTy()) - return Ret; - - Value *Op1 = CI->getArgOperand(0), *Op2 = CI->getArgOperand(1); - if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) { - if (Op1C->isExactlyValue(1.0)) // pow(1.0, x) -> 1.0 - return Op1C; - if (Op1C->isExactlyValue(2.0)) // pow(2.0, x) -> exp2(x) - return EmitUnaryFloatFnCall(Op2, "exp2", B, Callee->getAttributes()); - } - - ConstantFP *Op2C = dyn_cast<ConstantFP>(Op2); - if (Op2C == 0) return Ret; - - if (Op2C->getValueAPF().isZero()) // pow(x, 0.0) -> 1.0 - return ConstantFP::get(CI->getType(), 1.0); - - if (Op2C->isExactlyValue(0.5)) { - // Expand pow(x, 0.5) to (x == -infinity ? +infinity : fabs(sqrt(x))). - // This is faster than calling pow, and still handles negative zero - // and negative infinity correctly. - // TODO: In fast-math mode, this could be just sqrt(x). - // TODO: In finite-only mode, this could be just fabs(sqrt(x)). - Value *Inf = ConstantFP::getInfinity(CI->getType()); - Value *NegInf = ConstantFP::getInfinity(CI->getType(), true); - Value *Sqrt = EmitUnaryFloatFnCall(Op1, "sqrt", B, - Callee->getAttributes()); - Value *FAbs = EmitUnaryFloatFnCall(Sqrt, "fabs", B, - Callee->getAttributes()); - Value *FCmp = B.CreateFCmpOEQ(Op1, NegInf); - Value *Sel = B.CreateSelect(FCmp, Inf, FAbs); - return Sel; - } - - if (Op2C->isExactlyValue(1.0)) // pow(x, 1.0) -> x - return Op1; - if (Op2C->isExactlyValue(2.0)) // pow(x, 2.0) -> x*x - return B.CreateFMul(Op1, Op1, "pow2"); - if (Op2C->isExactlyValue(-1.0)) // pow(x, -1.0) -> 1.0/x - return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), - Op1, "powrecip"); - return 0; - } -}; - -//===---------------------------------------===// -// 'exp2' Optimizations - -struct Exp2Opt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - Value *Ret = NULL; - if (UnsafeFPShrink && Callee->getName() == "exp2" && - TLI->has(LibFunc::exp2)) { - UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true); - Ret = UnsafeUnaryDoubleFP.CallOptimizer(Callee, CI, B); - } - - FunctionType *FT = Callee->getFunctionType(); - // Just make sure this has 1 argument of FP type, which matches the - // result type. 
- if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || - !FT->getParamType(0)->isFloatingPointTy()) - return Ret; - - Value *Op = CI->getArgOperand(0); - // Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x)) if sizeof(x) <= 32 - // Turn exp2(uitofp(x)) -> ldexp(1.0, zext(x)) if sizeof(x) < 32 - Value *LdExpArg = 0; - if (SIToFPInst *OpC = dyn_cast<SIToFPInst>(Op)) { - if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() <= 32) - LdExpArg = B.CreateSExt(OpC->getOperand(0), B.getInt32Ty()); - } else if (UIToFPInst *OpC = dyn_cast<UIToFPInst>(Op)) { - if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() < 32) - LdExpArg = B.CreateZExt(OpC->getOperand(0), B.getInt32Ty()); - } - - if (LdExpArg) { - const char *Name; - if (Op->getType()->isFloatTy()) - Name = "ldexpf"; - else if (Op->getType()->isDoubleTy()) - Name = "ldexp"; - else - Name = "ldexpl"; - - Constant *One = ConstantFP::get(*Context, APFloat(1.0f)); - if (!Op->getType()->isFloatTy()) - One = ConstantExpr::getFPExtend(One, Op->getType()); - - Module *M = Caller->getParent(); - Value *Callee = M->getOrInsertFunction(Name, Op->getType(), - Op->getType(), - B.getInt32Ty(), NULL); - CallInst *CI = B.CreateCall2(Callee, One, LdExpArg); - if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts())) - CI->setCallingConv(F->getCallingConv()); - - return CI; - } - return Ret; - } -}; - -//===----------------------------------------------------------------------===// -// Integer Optimizations -//===----------------------------------------------------------------------===// - -//===---------------------------------------===// -// 'ffs*' Optimizations - -struct FFSOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - FunctionType *FT = Callee->getFunctionType(); - // Just make sure this has one integer argument and an i32 - // result type. - if (FT->getNumParams() != 1 || - !FT->getReturnType()->isIntegerTy(32) || - !FT->getParamType(0)->isIntegerTy()) - return 0; - - Value *Op = CI->getArgOperand(0); - - // Constant fold. - if (ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { - if (CI->getValue() == 0) // ffs(0) -> 0. - return Constant::getNullValue(CI->getType()); - // ffs(c) -> cttz(c)+1 - return B.getInt32(CI->getValue().countTrailingZeros() + 1); - } - - // ffs(x) -> x != 0 ? 
(i32)llvm.cttz(x)+1 : 0 - Type *ArgType = Op->getType(); - Value *F = Intrinsic::getDeclaration(Callee->getParent(), - Intrinsic::cttz, ArgType); - Value *V = B.CreateCall2(F, Op, B.getFalse(), "cttz"); - V = B.CreateAdd(V, ConstantInt::get(V->getType(), 1)); - V = B.CreateIntCast(V, B.getInt32Ty(), false); - - Value *Cond = B.CreateICmpNE(Op, Constant::getNullValue(ArgType)); - return B.CreateSelect(Cond, V, B.getInt32(0)); - } -}; - -//===---------------------------------------===// -// 'isdigit' Optimizations - -struct IsDigitOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - FunctionType *FT = Callee->getFunctionType(); - // We require integer(i32) - if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() || - !FT->getParamType(0)->isIntegerTy(32)) - return 0; - - // isdigit(c) -> (c-'0') <u 10 - Value *Op = CI->getArgOperand(0); - Op = B.CreateSub(Op, B.getInt32('0'), "isdigittmp"); - Op = B.CreateICmpULT(Op, B.getInt32(10), "isdigit"); - return B.CreateZExt(Op, CI->getType()); - } -}; - -//===---------------------------------------===// -// 'isascii' Optimizations - -struct IsAsciiOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - FunctionType *FT = Callee->getFunctionType(); - // We require integer(i32) - if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() || - !FT->getParamType(0)->isIntegerTy(32)) - return 0; - - // isascii(c) -> c <u 128 - Value *Op = CI->getArgOperand(0); - Op = B.CreateICmpULT(Op, B.getInt32(128), "isascii"); - return B.CreateZExt(Op, CI->getType()); - } -}; - -//===---------------------------------------===// -// 'abs', 'labs', 'llabs' Optimizations - -struct AbsOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - FunctionType *FT = Callee->getFunctionType(); - // We require integer(integer) where the types agree. - if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() || - FT->getParamType(0) != FT->getReturnType()) - return 0; - - // abs(x) -> x >s -1 ? x : -x - Value *Op = CI->getArgOperand(0); - Value *Pos = B.CreateICmpSGT(Op, Constant::getAllOnesValue(Op->getType()), - "ispos"); - Value *Neg = B.CreateNeg(Op, "neg"); - return B.CreateSelect(Pos, Op, Neg); - } -}; - - -//===---------------------------------------===// -// 'toascii' Optimizations - -struct ToAsciiOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - FunctionType *FT = Callee->getFunctionType(); - // We require i32(i32) - if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || - !FT->getParamType(0)->isIntegerTy(32)) - return 0; - - // toascii(c) -> c & 0x7f - return B.CreateAnd(CI->getArgOperand(0), - ConstantInt::get(CI->getType(),0x7F)); - } -}; - -//===----------------------------------------------------------------------===// -// Formatting and IO Optimizations -//===----------------------------------------------------------------------===// - -//===---------------------------------------===// -// 'printf' Optimizations - -struct PrintFOpt : public LibCallOptimization { - Value *OptimizeFixedFormatString(Function *Callee, CallInst *CI, - IRBuilder<> &B) { - // Check for a fixed format string. - StringRef FormatStr; - if (!getConstantStringInfo(CI->getArgOperand(0), FormatStr)) - return 0; - - // Empty format string -> noop. 
- if (FormatStr.empty()) // Tolerate printf's declared void. - return CI->use_empty() ? (Value*)CI : - ConstantInt::get(CI->getType(), 0); - - // Do not do any of the following transformations if the printf return value - // is used, in general the printf return value is not compatible with either - // putchar() or puts(). - if (!CI->use_empty()) - return 0; - - // printf("x") -> putchar('x'), even for '%'. - if (FormatStr.size() == 1) { - Value *Res = EmitPutChar(B.getInt32(FormatStr[0]), B, TD, TLI); - if (CI->use_empty() || !Res) return Res; - return B.CreateIntCast(Res, CI->getType(), true); - } - - // printf("foo\n") --> puts("foo") - if (FormatStr[FormatStr.size()-1] == '\n' && - FormatStr.find('%') == std::string::npos) { // no format characters. - // Create a string literal with no \n on it. We expect the constant merge - // pass to be run after this pass, to merge duplicate strings. - FormatStr = FormatStr.drop_back(); - Value *GV = B.CreateGlobalString(FormatStr, "str"); - Value *NewCI = EmitPutS(GV, B, TD, TLI); - return (CI->use_empty() || !NewCI) ? - NewCI : - ConstantInt::get(CI->getType(), FormatStr.size()+1); - } - - // Optimize specific format strings. - // printf("%c", chr) --> putchar(chr) - if (FormatStr == "%c" && CI->getNumArgOperands() > 1 && - CI->getArgOperand(1)->getType()->isIntegerTy()) { - Value *Res = EmitPutChar(CI->getArgOperand(1), B, TD, TLI); - - if (CI->use_empty() || !Res) return Res; - return B.CreateIntCast(Res, CI->getType(), true); - } - - // printf("%s\n", str) --> puts(str) - if (FormatStr == "%s\n" && CI->getNumArgOperands() > 1 && - CI->getArgOperand(1)->getType()->isPointerTy()) { - return EmitPutS(CI->getArgOperand(1), B, TD, TLI); - } - return 0; - } - - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // Require one fixed pointer argument and an integer/void result. - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() || - !(FT->getReturnType()->isIntegerTy() || - FT->getReturnType()->isVoidTy())) - return 0; - - if (Value *V = OptimizeFixedFormatString(Callee, CI, B)) { - return V; - } - - // printf(format, ...) -> iprintf(format, ...) if no floating point - // arguments. - if (TLI->has(LibFunc::iprintf) && !CallHasFloatingPointArgument(CI)) { - Module *M = B.GetInsertBlock()->getParent()->getParent(); - Constant *IPrintFFn = - M->getOrInsertFunction("iprintf", FT, Callee->getAttributes()); - CallInst *New = cast<CallInst>(CI->clone()); - New->setCalledFunction(IPrintFFn); - B.Insert(New); - return New; - } - return 0; - } -}; - -//===---------------------------------------===// -// 'sprintf' Optimizations - -struct SPrintFOpt : public LibCallOptimization { - Value *OptimizeFixedFormatString(Function *Callee, CallInst *CI, - IRBuilder<> &B) { - // Check for a fixed format string. - StringRef FormatStr; - if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr)) - return 0; - - // If we just have a format string (nothing else crazy) transform it. - if (CI->getNumArgOperands() == 2) { - // Make sure there's no % in the constant array. We could try to handle - // %% -> % in the future if we cared. - for (unsigned i = 0, e = FormatStr.size(); i != e; ++i) - if (FormatStr[i] == '%') - return 0; // we found a format specifier, bail out. - - // These optimizations require TargetData. 
- if (!TD) return 0; - - // sprintf(str, fmt) -> llvm.memcpy(str, fmt, strlen(fmt)+1, 1) - B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), - ConstantInt::get(TD->getIntPtrType(*Context), // Copy the - FormatStr.size() + 1), 1); // nul byte. - return ConstantInt::get(CI->getType(), FormatStr.size()); - } - - // The remaining optimizations require the format string to be "%s" or "%c" - // and have an extra operand. - if (FormatStr.size() != 2 || FormatStr[0] != '%' || - CI->getNumArgOperands() < 3) - return 0; - - // Decode the second character of the format string. - if (FormatStr[1] == 'c') { - // sprintf(dst, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0 - if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return 0; - Value *V = B.CreateTrunc(CI->getArgOperand(2), B.getInt8Ty(), "char"); - Value *Ptr = CastToCStr(CI->getArgOperand(0), B); - B.CreateStore(V, Ptr); - Ptr = B.CreateGEP(Ptr, B.getInt32(1), "nul"); - B.CreateStore(B.getInt8(0), Ptr); - - return ConstantInt::get(CI->getType(), 1); - } - - if (FormatStr[1] == 's') { - // These optimizations require TargetData. - if (!TD) return 0; - - // sprintf(dest, "%s", str) -> llvm.memcpy(dest, str, strlen(str)+1, 1) - if (!CI->getArgOperand(2)->getType()->isPointerTy()) return 0; - - Value *Len = EmitStrLen(CI->getArgOperand(2), B, TD, TLI); - if (!Len) - return 0; - Value *IncLen = B.CreateAdd(Len, - ConstantInt::get(Len->getType(), 1), - "leninc"); - B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(2), IncLen, 1); - - // The sprintf result is the unincremented number of bytes in the string. - return B.CreateIntCast(Len, CI->getType(), false); - } - return 0; - } - - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // Require two fixed pointer arguments and an integer result. - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isPointerTy() || - !FT->getReturnType()->isIntegerTy()) - return 0; - - if (Value *V = OptimizeFixedFormatString(Callee, CI, B)) { - return V; - } - - // sprintf(str, format, ...) -> siprintf(str, format, ...) if no floating - // point arguments. - if (TLI->has(LibFunc::siprintf) && !CallHasFloatingPointArgument(CI)) { - Module *M = B.GetInsertBlock()->getParent()->getParent(); - Constant *SIPrintFFn = - M->getOrInsertFunction("siprintf", FT, Callee->getAttributes()); - CallInst *New = cast<CallInst>(CI->clone()); - New->setCalledFunction(SIPrintFFn); - B.Insert(New); - return New; - } - return 0; - } -}; - -//===---------------------------------------===// -// 'fwrite' Optimizations - -struct FWriteOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // Require a pointer, an integer, an integer, a pointer, returning integer. - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 4 || !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isIntegerTy() || - !FT->getParamType(2)->isIntegerTy() || - !FT->getParamType(3)->isPointerTy() || - !FT->getReturnType()->isIntegerTy()) - return 0; - - // Get the element size and count. - ConstantInt *SizeC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); - ConstantInt *CountC = dyn_cast<ConstantInt>(CI->getArgOperand(2)); - if (!SizeC || !CountC) return 0; - uint64_t Bytes = SizeC->getZExtValue()*CountC->getZExtValue(); - - // If this is writing zero records, remove the call (it's a noop). 
- if (Bytes == 0) - return ConstantInt::get(CI->getType(), 0); - - // If this is writing one byte, turn it into fputc. - // This optimization is only valid if the return value is unused. - if (Bytes == 1 && CI->use_empty()) { // fwrite(S,1,1,F) -> fputc(S[0],F) - Value *Char = B.CreateLoad(CastToCStr(CI->getArgOperand(0), B), "char"); - Value *NewCI = EmitFPutC(Char, CI->getArgOperand(3), B, TD, TLI); - return NewCI ? ConstantInt::get(CI->getType(), 1) : 0; - } - - return 0; - } -}; - -//===---------------------------------------===// -// 'fputs' Optimizations - -struct FPutsOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // These optimizations require TargetData. - if (!TD) return 0; - - // Require two pointers. Also, we can't optimize if return value is used. - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isPointerTy() || - !CI->use_empty()) - return 0; - - // fputs(s,F) --> fwrite(s,1,strlen(s),F) - uint64_t Len = GetStringLength(CI->getArgOperand(0)); - if (!Len) return 0; - // Known to have no uses (see above). - return EmitFWrite(CI->getArgOperand(0), - ConstantInt::get(TD->getIntPtrType(*Context), Len-1), - CI->getArgOperand(1), B, TD, TLI); - } -}; - -//===---------------------------------------===// -// 'fprintf' Optimizations - -struct FPrintFOpt : public LibCallOptimization { - Value *OptimizeFixedFormatString(Function *Callee, CallInst *CI, - IRBuilder<> &B) { - // All the optimizations depend on the format string. - StringRef FormatStr; - if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr)) - return 0; - - // fprintf(F, "foo") --> fwrite("foo", 3, 1, F) - if (CI->getNumArgOperands() == 2) { - for (unsigned i = 0, e = FormatStr.size(); i != e; ++i) - if (FormatStr[i] == '%') // Could handle %% -> % if we cared. - return 0; // We found a format specifier. - - // These optimizations require TargetData. - if (!TD) return 0; - - Value *NewCI = EmitFWrite(CI->getArgOperand(1), - ConstantInt::get(TD->getIntPtrType(*Context), - FormatStr.size()), - CI->getArgOperand(0), B, TD, TLI); - return NewCI ? ConstantInt::get(CI->getType(), FormatStr.size()) : 0; - } - - // The remaining optimizations require the format string to be "%s" or "%c" - // and have an extra operand. - if (FormatStr.size() != 2 || FormatStr[0] != '%' || - CI->getNumArgOperands() < 3) - return 0; - - // Decode the second character of the format string. - if (FormatStr[1] == 'c') { - // fprintf(F, "%c", chr) --> fputc(chr, F) - if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return 0; - Value *NewCI = EmitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, - TD, TLI); - return NewCI ? ConstantInt::get(CI->getType(), 1) : 0; - } - - if (FormatStr[1] == 's') { - // fprintf(F, "%s", str) --> fputs(str, F) - if (!CI->getArgOperand(2)->getType()->isPointerTy() || !CI->use_empty()) - return 0; - return EmitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TD, TLI); - } - return 0; - } - - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // Require two fixed parameters as pointers and an integer result. 
- FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isPointerTy() || - !FT->getReturnType()->isIntegerTy()) - return 0; - - if (Value *V = OptimizeFixedFormatString(Callee, CI, B)) { - return V; - } - - // fprintf(stream, format, ...) -> fiprintf(stream, format, ...) if no - // floating point arguments. - if (TLI->has(LibFunc::fiprintf) && !CallHasFloatingPointArgument(CI)) { - Module *M = B.GetInsertBlock()->getParent()->getParent(); - Constant *FIPrintFFn = - M->getOrInsertFunction("fiprintf", FT, Callee->getAttributes()); - CallInst *New = cast<CallInst>(CI->clone()); - New->setCalledFunction(FIPrintFFn); - B.Insert(New); - return New; - } - return 0; - } -}; - -//===---------------------------------------===// -// 'puts' Optimizations - -struct PutsOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // Require one fixed pointer argument and an integer/void result. - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() || - !(FT->getReturnType()->isIntegerTy() || - FT->getReturnType()->isVoidTy())) - return 0; - - // Check for a constant string. - StringRef Str; - if (!getConstantStringInfo(CI->getArgOperand(0), Str)) - return 0; - - if (Str.empty() && CI->use_empty()) { - // puts("") -> putchar('\n') - Value *Res = EmitPutChar(B.getInt32('\n'), B, TD, TLI); - if (CI->use_empty() || !Res) return Res; - return B.CreateIntCast(Res, CI->getType(), true); - } - - return 0; - } -}; - -} // end anonymous namespace. - -//===----------------------------------------------------------------------===// // SimplifyLibCalls Pass Implementation //===----------------------------------------------------------------------===// @@ -1561,32 +91,11 @@ namespace { TargetLibraryInfo *TLI; StringMap<LibCallOptimization*> Optimizations; - // String and Memory LibCall Optimizations - StrCatOpt StrCat; StrNCatOpt StrNCat; StrChrOpt StrChr; StrRChrOpt StrRChr; - StrCmpOpt StrCmp; StrNCmpOpt StrNCmp; - StrCpyOpt StrCpy; StrCpyOpt StrCpyChk; - StpCpyOpt StpCpy; StpCpyOpt StpCpyChk; - StrNCpyOpt StrNCpy; - StrLenOpt StrLen; StrPBrkOpt StrPBrk; - StrToOpt StrTo; StrSpnOpt StrSpn; StrCSpnOpt StrCSpn; StrStrOpt StrStr; - MemCmpOpt MemCmp; MemCpyOpt MemCpy; MemMoveOpt MemMove; MemSetOpt MemSet; - // Math Library Optimizations - CosOpt Cos; PowOpt Pow; Exp2Opt Exp2; - UnaryDoubleFPOpt UnaryDoubleFP, UnsafeUnaryDoubleFP; - // Integer Optimizations - FFSOpt FFS; AbsOpt Abs; IsDigitOpt IsDigit; IsAsciiOpt IsAscii; - ToAsciiOpt ToAscii; - // Formatting and IO Optimizations - SPrintFOpt SPrintF; PrintFOpt PrintF; - FWriteOpt FWrite; FPutsOpt FPuts; FPrintFOpt FPrintF; - PutsOpt Puts; bool Modified; // This is only used by doInitialization. public: static char ID; // Pass identification - SimplifyLibCalls() : FunctionPass(ID), StrCpy(false), StrCpyChk(true), - StpCpy(false), StpCpyChk(true), - UnaryDoubleFP(false), UnsafeUnaryDoubleFP(true) { + SimplifyLibCalls() : FunctionPass(ID) { initializeSimplifyLibCallsPass(*PassRegistry::getPassRegistry()); } void AddOpt(LibFunc::Func F, LibCallOptimization* Opt); @@ -1636,108 +145,6 @@ void SimplifyLibCalls::AddOpt(LibFunc::Func F1, LibFunc::Func F2, /// Optimizations - Populate the Optimizations map with all the optimizations /// we know. 
void SimplifyLibCalls::InitOptimizations() { - // String and Memory LibCall Optimizations - Optimizations["strcat"] = &StrCat; - Optimizations["strncat"] = &StrNCat; - Optimizations["strchr"] = &StrChr; - Optimizations["strrchr"] = &StrRChr; - Optimizations["strcmp"] = &StrCmp; - Optimizations["strncmp"] = &StrNCmp; - Optimizations["strcpy"] = &StrCpy; - Optimizations["strncpy"] = &StrNCpy; - Optimizations["stpcpy"] = &StpCpy; - Optimizations["strlen"] = &StrLen; - Optimizations["strpbrk"] = &StrPBrk; - Optimizations["strtol"] = &StrTo; - Optimizations["strtod"] = &StrTo; - Optimizations["strtof"] = &StrTo; - Optimizations["strtoul"] = &StrTo; - Optimizations["strtoll"] = &StrTo; - Optimizations["strtold"] = &StrTo; - Optimizations["strtoull"] = &StrTo; - Optimizations["strspn"] = &StrSpn; - Optimizations["strcspn"] = &StrCSpn; - Optimizations["strstr"] = &StrStr; - Optimizations["memcmp"] = &MemCmp; - AddOpt(LibFunc::memcpy, &MemCpy); - Optimizations["memmove"] = &MemMove; - AddOpt(LibFunc::memset, &MemSet); - - // _chk variants of String and Memory LibCall Optimizations. - Optimizations["__strcpy_chk"] = &StrCpyChk; - Optimizations["__stpcpy_chk"] = &StpCpyChk; - - // Math Library Optimizations - Optimizations["cosf"] = &Cos; - Optimizations["cos"] = &Cos; - Optimizations["cosl"] = &Cos; - Optimizations["powf"] = &Pow; - Optimizations["pow"] = &Pow; - Optimizations["powl"] = &Pow; - Optimizations["llvm.pow.f32"] = &Pow; - Optimizations["llvm.pow.f64"] = &Pow; - Optimizations["llvm.pow.f80"] = &Pow; - Optimizations["llvm.pow.f128"] = &Pow; - Optimizations["llvm.pow.ppcf128"] = &Pow; - Optimizations["exp2l"] = &Exp2; - Optimizations["exp2"] = &Exp2; - Optimizations["exp2f"] = &Exp2; - Optimizations["llvm.exp2.ppcf128"] = &Exp2; - Optimizations["llvm.exp2.f128"] = &Exp2; - Optimizations["llvm.exp2.f80"] = &Exp2; - Optimizations["llvm.exp2.f64"] = &Exp2; - Optimizations["llvm.exp2.f32"] = &Exp2; - - AddOpt(LibFunc::ceil, LibFunc::ceilf, &UnaryDoubleFP); - AddOpt(LibFunc::fabs, LibFunc::fabsf, &UnaryDoubleFP); - AddOpt(LibFunc::floor, LibFunc::floorf, &UnaryDoubleFP); - AddOpt(LibFunc::rint, LibFunc::rintf, &UnaryDoubleFP); - AddOpt(LibFunc::round, LibFunc::roundf, &UnaryDoubleFP); - AddOpt(LibFunc::nearbyint, LibFunc::nearbyintf, &UnaryDoubleFP); - AddOpt(LibFunc::trunc, LibFunc::truncf, &UnaryDoubleFP); - - if(UnsafeFPShrink) { - AddOpt(LibFunc::acos, LibFunc::acosf, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::acosh, LibFunc::acoshf, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::asin, LibFunc::asinf, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::asinh, LibFunc::asinhf, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::atan, LibFunc::atanf, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::atanh, LibFunc::atanhf, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::cbrt, LibFunc::cbrtf, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::cosh, LibFunc::coshf, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::exp, LibFunc::expf, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::exp10, LibFunc::exp10f, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::expm1, LibFunc::expm1f, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::log, LibFunc::logf, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::log10, LibFunc::log10f, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::log1p, LibFunc::log1pf, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::log2, LibFunc::log2f, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::logb, LibFunc::logbf, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::sin, LibFunc::sinf, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::sinh, LibFunc::sinhf, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::sqrt, LibFunc::sqrtf, 
&UnsafeUnaryDoubleFP); - AddOpt(LibFunc::tan, LibFunc::tanf, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::tanh, LibFunc::tanhf, &UnsafeUnaryDoubleFP); - } - - // Integer Optimizations - Optimizations["ffs"] = &FFS; - Optimizations["ffsl"] = &FFS; - Optimizations["ffsll"] = &FFS; - Optimizations["abs"] = &Abs; - Optimizations["labs"] = &Abs; - Optimizations["llabs"] = &Abs; - Optimizations["isdigit"] = &IsDigit; - Optimizations["isascii"] = &IsAscii; - Optimizations["toascii"] = &ToAscii; - - // Formatting and IO Optimizations - Optimizations["sprintf"] = &SPrintF; - Optimizations["printf"] = &PrintF; - AddOpt(LibFunc::fwrite, &FWrite); - AddOpt(LibFunc::fputs, &FPuts); - Optimizations["fprintf"] = &FPrintF; - Optimizations["puts"] = &Puts; } @@ -1749,7 +156,7 @@ bool SimplifyLibCalls::runOnFunction(Function &F) { if (Optimizations.empty()) InitOptimizations(); - const TargetData *TD = getAnalysisIfAvailable<TargetData>(); + const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); IRBuilder<> Builder(F.getContext()); @@ -1785,7 +192,6 @@ bool SimplifyLibCalls::runOnFunction(Function &F) { // Something changed! Changed = true; - ++NumSimplified; // Inspect the instruction after the call (which was potentially just // added) next. diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp index 34f1d6c..d4595bb 100644 --- a/lib/Transforms/Scalar/Sink.cpp +++ b/lib/Transforms/Scalar/Sink.cpp @@ -14,13 +14,13 @@ #define DEBUG_TYPE "sink" #include "llvm/Transforms/Scalar.h" -#include "llvm/IntrinsicInst.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Assembly/Writer.h" -#include "llvm/ADT/Statistic.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/CFG.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp index 6557d63..6572e09 100644 --- a/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -52,25 +52,25 @@ #define DEBUG_TYPE "tailcallelim" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Function.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/Module.h" -#include "llvm/Pass.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/Loads.h" -#include "llvm/Support/CallSite.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" #include "llvm/Support/CFG.h" +#include "llvm/Support/CallSite.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; STATISTIC(NumEliminated, "Number of tail calls removed"); diff --git a/lib/Transforms/Utils/AddrModeMatcher.cpp 
b/lib/Transforms/Utils/AddrModeMatcher.cpp deleted file mode 100644 index 1e6586b..0000000 --- a/lib/Transforms/Utils/AddrModeMatcher.cpp +++ /dev/null @@ -1,577 +0,0 @@ -//===- AddrModeMatcher.cpp - Addressing mode matching facility --*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements target addressing mode matcher class. -// -//===----------------------------------------------------------------------===// - -#include "llvm/Transforms/Utils/AddrModeMatcher.h" -#include "llvm/DerivedTypes.h" -#include "llvm/GlobalValue.h" -#include "llvm/Instruction.h" -#include "llvm/Assembly/Writer.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/GetElementPtrTypeIterator.h" -#include "llvm/Support/PatternMatch.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Support/CallSite.h" - -using namespace llvm; -using namespace llvm::PatternMatch; - -void ExtAddrMode::print(raw_ostream &OS) const { - bool NeedPlus = false; - OS << "["; - if (BaseGV) { - OS << (NeedPlus ? " + " : "") - << "GV:"; - WriteAsOperand(OS, BaseGV, /*PrintType=*/false); - NeedPlus = true; - } - - if (BaseOffs) - OS << (NeedPlus ? " + " : "") << BaseOffs, NeedPlus = true; - - if (BaseReg) { - OS << (NeedPlus ? " + " : "") - << "Base:"; - WriteAsOperand(OS, BaseReg, /*PrintType=*/false); - NeedPlus = true; - } - if (Scale) { - OS << (NeedPlus ? " + " : "") - << Scale << "*"; - WriteAsOperand(OS, ScaledReg, /*PrintType=*/false); - NeedPlus = true; - } - - OS << ']'; -} - -#ifndef NDEBUG -void ExtAddrMode::dump() const { - print(dbgs()); - dbgs() << '\n'; -} -#endif - - -/// MatchScaledValue - Try adding ScaleReg*Scale to the current addressing mode. -/// Return true and update AddrMode if this addr mode is legal for the target, -/// false if not. -bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale, - unsigned Depth) { - // If Scale is 1, then this is the same as adding ScaleReg to the addressing - // mode. Just process that directly. - if (Scale == 1) - return MatchAddr(ScaleReg, Depth); - - // If the scale is 0, it takes nothing to add this. - if (Scale == 0) - return true; - - // If we already have a scale of this value, we can add to it, otherwise, we - // need an available scale field. - if (AddrMode.Scale != 0 && AddrMode.ScaledReg != ScaleReg) - return false; - - ExtAddrMode TestAddrMode = AddrMode; - - // Add scale to turn X*4+X*3 -> X*7. This could also do things like - // [A+B + A*7] -> [B+A*8]. - TestAddrMode.Scale += Scale; - TestAddrMode.ScaledReg = ScaleReg; - - // If the new address isn't legal, bail out. - if (!TLI.isLegalAddressingMode(TestAddrMode, AccessTy)) - return false; - - // It was legal, so commit it. - AddrMode = TestAddrMode; - - // Okay, we decided that we can add ScaleReg+Scale to AddrMode. Check now - // to see if ScaleReg is actually X+C. If so, we can turn this into adding - // X*Scale + C*Scale to addr mode. - ConstantInt *CI = 0; Value *AddLHS = 0; - if (isa<Instruction>(ScaleReg) && // not a constant expr. - match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI)))) { - TestAddrMode.ScaledReg = AddLHS; - TestAddrMode.BaseOffs += CI->getSExtValue()*TestAddrMode.Scale; - - // If this addressing mode is legal, commit it and remember that we folded - // this instruction. 
- if (TLI.isLegalAddressingMode(TestAddrMode, AccessTy)) { - AddrModeInsts.push_back(cast<Instruction>(ScaleReg)); - AddrMode = TestAddrMode; - return true; - } - } - - // Otherwise, not (x+c)*scale, just return what we have. - return true; -} - -/// MightBeFoldableInst - This is a little filter, which returns true if an -/// addressing computation involving I might be folded into a load/store -/// accessing it. This doesn't need to be perfect, but needs to accept at least -/// the set of instructions that MatchOperationAddr can. -static bool MightBeFoldableInst(Instruction *I) { - switch (I->getOpcode()) { - case Instruction::BitCast: - // Don't touch identity bitcasts. - if (I->getType() == I->getOperand(0)->getType()) - return false; - return I->getType()->isPointerTy() || I->getType()->isIntegerTy(); - case Instruction::PtrToInt: - // PtrToInt is always a noop, as we know that the int type is pointer sized. - return true; - case Instruction::IntToPtr: - // We know the input is intptr_t, so this is foldable. - return true; - case Instruction::Add: - return true; - case Instruction::Mul: - case Instruction::Shl: - // Can only handle X*C and X << C. - return isa<ConstantInt>(I->getOperand(1)); - case Instruction::GetElementPtr: - return true; - default: - return false; - } -} - - -/// MatchOperationAddr - Given an instruction or constant expr, see if we can -/// fold the operation into the addressing mode. If so, update the addressing -/// mode and return true, otherwise return false without modifying AddrMode. -bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, - unsigned Depth) { - // Avoid exponential behavior on extremely deep expression trees. - if (Depth >= 5) return false; - - switch (Opcode) { - case Instruction::PtrToInt: - // PtrToInt is always a noop, as we know that the int type is pointer sized. - return MatchAddr(AddrInst->getOperand(0), Depth); - case Instruction::IntToPtr: - // This inttoptr is a no-op if the integer type is pointer sized. - if (TLI.getValueType(AddrInst->getOperand(0)->getType()) == - TLI.getPointerTy()) - return MatchAddr(AddrInst->getOperand(0), Depth); - return false; - case Instruction::BitCast: - // BitCast is always a noop, and we can handle it as long as it is - // int->int or pointer->pointer (we don't want int<->fp or something). - if ((AddrInst->getOperand(0)->getType()->isPointerTy() || - AddrInst->getOperand(0)->getType()->isIntegerTy()) && - // Don't touch identity bitcasts. These were probably put here by LSR, - // and we don't want to mess around with them. Assume it knows what it - // is doing. - AddrInst->getOperand(0)->getType() != AddrInst->getType()) - return MatchAddr(AddrInst->getOperand(0), Depth); - return false; - case Instruction::Add: { - // Check to see if we can merge in the RHS then the LHS. If so, we win. - ExtAddrMode BackupAddrMode = AddrMode; - unsigned OldSize = AddrModeInsts.size(); - if (MatchAddr(AddrInst->getOperand(1), Depth+1) && - MatchAddr(AddrInst->getOperand(0), Depth+1)) - return true; - - // Restore the old addr mode info. - AddrMode = BackupAddrMode; - AddrModeInsts.resize(OldSize); - - // Otherwise this was over-aggressive. Try merging in the LHS then the RHS. - if (MatchAddr(AddrInst->getOperand(0), Depth+1) && - MatchAddr(AddrInst->getOperand(1), Depth+1)) - return true; - - // Otherwise we definitely can't merge the ADD in. 
- AddrMode = BackupAddrMode; - AddrModeInsts.resize(OldSize); - break; - } - //case Instruction::Or: - // TODO: We can handle "Or Val, Imm" iff this OR is equivalent to an ADD. - //break; - case Instruction::Mul: - case Instruction::Shl: { - // Can only handle X*C and X << C. - ConstantInt *RHS = dyn_cast<ConstantInt>(AddrInst->getOperand(1)); - if (!RHS) return false; - int64_t Scale = RHS->getSExtValue(); - if (Opcode == Instruction::Shl) - Scale = 1LL << Scale; - - return MatchScaledValue(AddrInst->getOperand(0), Scale, Depth); - } - case Instruction::GetElementPtr: { - // Scan the GEP. We check it if it contains constant offsets and at most - // one variable offset. - int VariableOperand = -1; - unsigned VariableScale = 0; - - int64_t ConstantOffset = 0; - const TargetData *TD = TLI.getTargetData(); - gep_type_iterator GTI = gep_type_begin(AddrInst); - for (unsigned i = 1, e = AddrInst->getNumOperands(); i != e; ++i, ++GTI) { - if (StructType *STy = dyn_cast<StructType>(*GTI)) { - const StructLayout *SL = TD->getStructLayout(STy); - unsigned Idx = - cast<ConstantInt>(AddrInst->getOperand(i))->getZExtValue(); - ConstantOffset += SL->getElementOffset(Idx); - } else { - uint64_t TypeSize = TD->getTypeAllocSize(GTI.getIndexedType()); - if (ConstantInt *CI = dyn_cast<ConstantInt>(AddrInst->getOperand(i))) { - ConstantOffset += CI->getSExtValue()*TypeSize; - } else if (TypeSize) { // Scales of zero don't do anything. - // We only allow one variable index at the moment. - if (VariableOperand != -1) - return false; - - // Remember the variable index. - VariableOperand = i; - VariableScale = TypeSize; - } - } - } - - // A common case is for the GEP to only do a constant offset. In this case, - // just add it to the disp field and check validity. - if (VariableOperand == -1) { - AddrMode.BaseOffs += ConstantOffset; - if (ConstantOffset == 0 || TLI.isLegalAddressingMode(AddrMode, AccessTy)){ - // Check to see if we can fold the base pointer in too. - if (MatchAddr(AddrInst->getOperand(0), Depth+1)) - return true; - } - AddrMode.BaseOffs -= ConstantOffset; - return false; - } - - // Save the valid addressing mode in case we can't match. - ExtAddrMode BackupAddrMode = AddrMode; - unsigned OldSize = AddrModeInsts.size(); - - // See if the scale and offset amount is valid for this target. - AddrMode.BaseOffs += ConstantOffset; - - // Match the base operand of the GEP. - if (!MatchAddr(AddrInst->getOperand(0), Depth+1)) { - // If it couldn't be matched, just stuff the value in a register. - if (AddrMode.HasBaseReg) { - AddrMode = BackupAddrMode; - AddrModeInsts.resize(OldSize); - return false; - } - AddrMode.HasBaseReg = true; - AddrMode.BaseReg = AddrInst->getOperand(0); - } - - // Match the remaining variable portion of the GEP. - if (!MatchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale, - Depth)) { - // If it couldn't be matched, try stuffing the base into a register - // instead of matching it, and retrying the match of the scale. - AddrMode = BackupAddrMode; - AddrModeInsts.resize(OldSize); - if (AddrMode.HasBaseReg) - return false; - AddrMode.HasBaseReg = true; - AddrMode.BaseReg = AddrInst->getOperand(0); - AddrMode.BaseOffs += ConstantOffset; - if (!MatchScaledValue(AddrInst->getOperand(VariableOperand), - VariableScale, Depth)) { - // If even that didn't work, bail. 
- AddrMode = BackupAddrMode; - AddrModeInsts.resize(OldSize); - return false; - } - } - - return true; - } - } - return false; -} - -/// MatchAddr - If we can, try to add the value of 'Addr' into the current -/// addressing mode. If Addr can't be added to AddrMode this returns false and -/// leaves AddrMode unmodified. This assumes that Addr is either a pointer type -/// or intptr_t for the target. -/// -bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) { - if (ConstantInt *CI = dyn_cast<ConstantInt>(Addr)) { - // Fold in immediates if legal for the target. - AddrMode.BaseOffs += CI->getSExtValue(); - if (TLI.isLegalAddressingMode(AddrMode, AccessTy)) - return true; - AddrMode.BaseOffs -= CI->getSExtValue(); - } else if (GlobalValue *GV = dyn_cast<GlobalValue>(Addr)) { - // If this is a global variable, try to fold it into the addressing mode. - if (AddrMode.BaseGV == 0) { - AddrMode.BaseGV = GV; - if (TLI.isLegalAddressingMode(AddrMode, AccessTy)) - return true; - AddrMode.BaseGV = 0; - } - } else if (Instruction *I = dyn_cast<Instruction>(Addr)) { - ExtAddrMode BackupAddrMode = AddrMode; - unsigned OldSize = AddrModeInsts.size(); - - // Check to see if it is possible to fold this operation. - if (MatchOperationAddr(I, I->getOpcode(), Depth)) { - // Okay, it's possible to fold this. Check to see if it is actually - // *profitable* to do so. We use a simple cost model to avoid increasing - // register pressure too much. - if (I->hasOneUse() || - IsProfitableToFoldIntoAddressingMode(I, BackupAddrMode, AddrMode)) { - AddrModeInsts.push_back(I); - return true; - } - - // It isn't profitable to do this, roll back. - //cerr << "NOT FOLDING: " << *I; - AddrMode = BackupAddrMode; - AddrModeInsts.resize(OldSize); - } - } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr)) { - if (MatchOperationAddr(CE, CE->getOpcode(), Depth)) - return true; - } else if (isa<ConstantPointerNull>(Addr)) { - // Null pointer gets folded without affecting the addressing mode. - return true; - } - - // Worse case, the target should support [reg] addressing modes. :) - if (!AddrMode.HasBaseReg) { - AddrMode.HasBaseReg = true; - AddrMode.BaseReg = Addr; - // Still check for legality in case the target supports [imm] but not [i+r]. - if (TLI.isLegalAddressingMode(AddrMode, AccessTy)) - return true; - AddrMode.HasBaseReg = false; - AddrMode.BaseReg = 0; - } - - // If the base register is already taken, see if we can do [r+r]. - if (AddrMode.Scale == 0) { - AddrMode.Scale = 1; - AddrMode.ScaledReg = Addr; - if (TLI.isLegalAddressingMode(AddrMode, AccessTy)) - return true; - AddrMode.Scale = 0; - AddrMode.ScaledReg = 0; - } - // Couldn't match. - return false; -} - - -/// IsOperandAMemoryOperand - Check to see if all uses of OpVal by the specified -/// inline asm call are due to memory operands. If so, return true, otherwise -/// return false. -static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal, - const TargetLowering &TLI) { - TargetLowering::AsmOperandInfoVector TargetConstraints = TLI.ParseConstraints(ImmutableCallSite(CI)); - for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { - TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; - - // Compute the constraint code and ConstraintType to use. - TLI.ComputeConstraintToUse(OpInfo, SDValue()); - - // If this asm operand is our Value*, and if it isn't an indirect memory - // operand, we can't fold it! 
- if (OpInfo.CallOperandVal == OpVal && - (OpInfo.ConstraintType != TargetLowering::C_Memory || - !OpInfo.isIndirect)) - return false; - } - - return true; -} - - -/// FindAllMemoryUses - Recursively walk all the uses of I until we find a -/// memory use. If we find an obviously non-foldable instruction, return true. -/// Add the ultimately found memory instructions to MemoryUses. -static bool FindAllMemoryUses(Instruction *I, - SmallVectorImpl<std::pair<Instruction*,unsigned> > &MemoryUses, - SmallPtrSet<Instruction*, 16> &ConsideredInsts, - const TargetLowering &TLI) { - // If we already considered this instruction, we're done. - if (!ConsideredInsts.insert(I)) - return false; - - // If this is an obviously unfoldable instruction, bail out. - if (!MightBeFoldableInst(I)) - return true; - - // Loop over all the uses, recursively processing them. - for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); - UI != E; ++UI) { - User *U = *UI; - - if (LoadInst *LI = dyn_cast<LoadInst>(U)) { - MemoryUses.push_back(std::make_pair(LI, UI.getOperandNo())); - continue; - } - - if (StoreInst *SI = dyn_cast<StoreInst>(U)) { - unsigned opNo = UI.getOperandNo(); - if (opNo == 0) return true; // Storing addr, not into addr. - MemoryUses.push_back(std::make_pair(SI, opNo)); - continue; - } - - if (CallInst *CI = dyn_cast<CallInst>(U)) { - InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue()); - if (!IA) return true; - - // If this is a memory operand, we're cool, otherwise bail out. - if (!IsOperandAMemoryOperand(CI, IA, I, TLI)) - return true; - continue; - } - - if (FindAllMemoryUses(cast<Instruction>(U), MemoryUses, ConsideredInsts, - TLI)) - return true; - } - - return false; -} - - -/// ValueAlreadyLiveAtInst - Retrn true if Val is already known to be live at -/// the use site that we're folding it into. If so, there is no cost to -/// include it in the addressing mode. KnownLive1 and KnownLive2 are two values -/// that we know are live at the instruction already. -bool AddressingModeMatcher::ValueAlreadyLiveAtInst(Value *Val,Value *KnownLive1, - Value *KnownLive2) { - // If Val is either of the known-live values, we know it is live! - if (Val == 0 || Val == KnownLive1 || Val == KnownLive2) - return true; - - // All values other than instructions and arguments (e.g. constants) are live. - if (!isa<Instruction>(Val) && !isa<Argument>(Val)) return true; - - // If Val is a constant sized alloca in the entry block, it is live, this is - // true because it is just a reference to the stack/frame pointer, which is - // live for the whole function. - if (AllocaInst *AI = dyn_cast<AllocaInst>(Val)) - if (AI->isStaticAlloca()) - return true; - - // Check to see if this value is already used in the memory instruction's - // block. If so, it's already live into the block at the very least, so we - // can reasonably fold it. - return Val->isUsedInBasicBlock(MemoryInst->getParent()); -} - - - -/// IsProfitableToFoldIntoAddressingMode - It is possible for the addressing -/// mode of the machine to fold the specified instruction into a load or store -/// that ultimately uses it. However, the specified instruction has multiple -/// uses. Given this, it may actually increase register pressure to fold it -/// into the load. For example, consider this code: -/// -/// X = ... -/// Y = X+1 -/// use(Y) -> nonload/store -/// Z = Y+1 -/// load Z -/// -/// In this case, Y has multiple uses, and can be folded into the load of Z -/// (yielding load [X+2]). 
However, doing this will cause both "X" and "X+1" to -/// be live at the use(Y) line. If we don't fold Y into load Z, we use one -/// fewer register. Since Y can't be folded into "use(Y)" we don't increase the -/// number of computations either. -/// -/// Note that this (like most of CodeGenPrepare) is just a rough heuristic. If -/// X was live across 'load Z' for other reasons, we actually *would* want to -/// fold the addressing mode in the Z case. This would make Y die earlier. -bool AddressingModeMatcher:: -IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, - ExtAddrMode &AMAfter) { - if (IgnoreProfitability) return true; - - // AMBefore is the addressing mode before this instruction was folded into it, - // and AMAfter is the addressing mode after the instruction was folded. Get - // the set of registers referenced by AMAfter and subtract out those - // referenced by AMBefore: this is the set of values which folding in this - // address extends the lifetime of. - // - // Note that there are only two potential values being referenced here, - // BaseReg and ScaleReg (global addresses are always available, as are any - // folded immediates). - Value *BaseReg = AMAfter.BaseReg, *ScaledReg = AMAfter.ScaledReg; - - // If the BaseReg or ScaledReg was referenced by the previous addrmode, their - // lifetime wasn't extended by adding this instruction. - if (ValueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg)) - BaseReg = 0; - if (ValueAlreadyLiveAtInst(ScaledReg, AMBefore.BaseReg, AMBefore.ScaledReg)) - ScaledReg = 0; - - // If folding this instruction (and it's subexprs) didn't extend any live - // ranges, we're ok with it. - if (BaseReg == 0 && ScaledReg == 0) - return true; - - // If all uses of this instruction are ultimately load/store/inlineasm's, - // check to see if their addressing modes will include this instruction. If - // so, we can fold it into all uses, so it doesn't matter if it has multiple - // uses. - SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses; - SmallPtrSet<Instruction*, 16> ConsideredInsts; - if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI)) - return false; // Has a non-memory, non-foldable use! - - // Now that we know that all uses of this instruction are part of a chain of - // computation involving only operations that could theoretically be folded - // into a memory use, loop over each of these uses and see if they could - // *actually* fold the instruction. - SmallVector<Instruction*, 32> MatchedAddrModeInsts; - for (unsigned i = 0, e = MemoryUses.size(); i != e; ++i) { - Instruction *User = MemoryUses[i].first; - unsigned OpNo = MemoryUses[i].second; - - // Get the access type of this use. If the use isn't a pointer, we don't - // know what it accesses. - Value *Address = User->getOperand(OpNo); - if (!Address->getType()->isPointerTy()) - return false; - Type *AddressAccessTy = - cast<PointerType>(Address->getType())->getElementType(); - - // Do a match against the root of this address, ignoring profitability. This - // will tell us if the addressing mode for the memory operation will - // *actually* cover the shared instruction. - ExtAddrMode Result; - AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, AddressAccessTy, - MemoryInst, Result); - Matcher.IgnoreProfitability = true; - bool Success = Matcher.MatchAddr(Address, 0); - (void)Success; assert(Success && "Couldn't select *anything*?"); - - // If the match didn't cover I, then it won't be shared by it. 
- if (std::find(MatchedAddrModeInsts.begin(), MatchedAddrModeInsts.end(), - I) == MatchedAddrModeInsts.end()) - return false; - - MatchedAddrModeInsts.clear(); - } - - return true; -} diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp index 75a7817..8330e84 100644 --- a/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -13,20 +13,20 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Function.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/Constant.h" -#include "llvm/Type.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Scalar.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Type.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/ValueHandle.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> using namespace llvm; @@ -687,3 +687,42 @@ ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB, return cast<ReturnInst>(NewRet); } +/// SplitBlockAndInsertIfThen - Split the containing block at the +/// specified instruction - everything before and including Cmp stays +/// in the old basic block, and everything after Cmp is moved to a +/// new block. The two blocks are connected by a conditional branch +/// (with value of Cmp being the condition). +/// Before: +/// Head +/// Cmp +/// Tail +/// After: +/// Head +/// Cmp +/// if (Cmp) +/// ThenBlock +/// Tail +/// +/// If Unreachable is true, then ThenBlock ends with +/// UnreachableInst, otherwise it branches to Tail. +/// Returns the NewBasicBlock's terminator. 
+ +TerminatorInst *llvm::SplitBlockAndInsertIfThen(Instruction *Cmp, + bool Unreachable, MDNode *BranchWeights) { + Instruction *SplitBefore = Cmp->getNextNode(); + BasicBlock *Head = SplitBefore->getParent(); + BasicBlock *Tail = Head->splitBasicBlock(SplitBefore); + TerminatorInst *HeadOldTerm = Head->getTerminator(); + LLVMContext &C = Head->getContext(); + BasicBlock *ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail); + TerminatorInst *CheckTerm; + if (Unreachable) + CheckTerm = new UnreachableInst(C, ThenBlock); + else + CheckTerm = BranchInst::Create(Tail, ThenBlock); + BranchInst *HeadNewTerm = + BranchInst::Create(/*ifTrue*/ThenBlock, /*ifFalse*/Tail, Cmp); + HeadNewTerm->setMetadata(LLVMContext::MD_prof, BranchWeights); + ReplaceInstWithInst(HeadOldTerm, HeadNewTerm); + return CheckTerm; +} diff --git a/lib/Transforms/Utils/BreakCriticalEdges.cpp b/lib/Transforms/Utils/BreakCriticalEdges.cpp index 6b04e3d..8513772 100644 --- a/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -17,17 +17,17 @@ #define DEBUG_TYPE "break-crit-edges" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ProfileInfo.h" -#include "llvm/Function.h" -#include "llvm/Instructions.h" -#include "llvm/Type.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Type.h" #include "llvm/Support/CFG.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" using namespace llvm; STATISTIC(NumBroken, "Number of blocks inserted"); diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp index e13fd71..bf540b0 100644 --- a/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/lib/Transforms/Utils/BuildLibCalls.cpp @@ -12,17 +12,15 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/BuildLibCalls.h" -#include "llvm/Constants.h" -#include "llvm/Function.h" -#include "llvm/IRBuilder.h" -#include "llvm/Intrinsics.h" -#include "llvm/Intrinsics.h" -#include "llvm/LLVMContext.h" -#include "llvm/LLVMContext.h" -#include "llvm/Module.h" -#include "llvm/Type.h" #include "llvm/ADT/SmallString.h" -#include "llvm/Target/TargetData.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" #include "llvm/Target/TargetLibraryInfo.h" using namespace llvm; @@ -34,19 +32,22 @@ Value *llvm::CastToCStr(Value *V, IRBuilder<> &B) { /// EmitStrLen - Emit a call to the strlen function to the builder, for the /// specified pointer. This always returns an integer value of size intptr_t. 
-Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const TargetData *TD, +Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout *TD, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::strlen)) return 0; Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeWithIndex AWI[2]; - AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture); - AWI[1] = AttributeWithIndex::get(~0u, Attribute::ReadOnly | - Attribute::NoUnwind); + AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attribute::NoCapture); + Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind }; + AWI[1] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, + ArrayRef<Attribute::AttrKind>(AVs, 2)); LLVMContext &Context = B.GetInsertBlock()->getContext(); - Constant *StrLen = M->getOrInsertFunction("strlen", AttrListPtr::get(AWI), + Constant *StrLen = M->getOrInsertFunction("strlen", + AttributeSet::get(M->getContext(), + AWI), TD->getIntPtrType(Context), B.getInt8PtrTy(), NULL); @@ -61,18 +62,21 @@ Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const TargetData *TD, /// specified pointer. Ptr is required to be some pointer type, MaxLen must /// be of size_t type, and the return value has 'intptr_t' type. Value *llvm::EmitStrNLen(Value *Ptr, Value *MaxLen, IRBuilder<> &B, - const TargetData *TD, const TargetLibraryInfo *TLI) { + const DataLayout *TD, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::strnlen)) return 0; Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeWithIndex AWI[2]; - AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture); - AWI[1] = AttributeWithIndex::get(~0u, Attribute::ReadOnly | - Attribute::NoUnwind); + AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attribute::NoCapture); + Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind }; + AWI[1] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, + ArrayRef<Attribute::AttrKind>(AVs, 2)); LLVMContext &Context = B.GetInsertBlock()->getContext(); - Constant *StrNLen = M->getOrInsertFunction("strnlen", AttrListPtr::get(AWI), + Constant *StrNLen = M->getOrInsertFunction("strnlen", + AttributeSet::get(M->getContext(), + AWI), TD->getIntPtrType(Context), B.getInt8PtrTy(), TD->getIntPtrType(Context), @@ -88,17 +92,21 @@ Value *llvm::EmitStrNLen(Value *Ptr, Value *MaxLen, IRBuilder<> &B, /// specified pointer and character. Ptr is required to be some pointer type, /// and the return value has 'i8*' type. 
Value *llvm::EmitStrChr(Value *Ptr, char C, IRBuilder<> &B, - const TargetData *TD, const TargetLibraryInfo *TLI) { + const DataLayout *TD, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::strchr)) return 0; Module *M = B.GetInsertBlock()->getParent()->getParent(); + Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind }; AttributeWithIndex AWI = - AttributeWithIndex::get(~0u, Attribute::ReadOnly | Attribute::NoUnwind); + AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, + ArrayRef<Attribute::AttrKind>(AVs, 2)); Type *I8Ptr = B.getInt8PtrTy(); Type *I32Ty = B.getInt32Ty(); - Constant *StrChr = M->getOrInsertFunction("strchr", AttrListPtr::get(AWI), + Constant *StrChr = M->getOrInsertFunction("strchr", + AttributeSet::get(M->getContext(), + AWI), I8Ptr, I8Ptr, I32Ty, NULL); CallInst *CI = B.CreateCall2(StrChr, CastToCStr(Ptr, B), ConstantInt::get(I32Ty, C), "strchr"); @@ -109,20 +117,23 @@ Value *llvm::EmitStrChr(Value *Ptr, char C, IRBuilder<> &B, /// EmitStrNCmp - Emit a call to the strncmp function to the builder. Value *llvm::EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, - IRBuilder<> &B, const TargetData *TD, + IRBuilder<> &B, const DataLayout *TD, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::strncmp)) return 0; Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeWithIndex AWI[3]; - AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture); - AWI[1] = AttributeWithIndex::get(2, Attribute::NoCapture); - AWI[2] = AttributeWithIndex::get(~0u, Attribute::ReadOnly | - Attribute::NoUnwind); + AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attribute::NoCapture); + AWI[1] = AttributeWithIndex::get(M->getContext(), 2, Attribute::NoCapture); + Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind }; + AWI[2] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, + ArrayRef<Attribute::AttrKind>(AVs, 2)); LLVMContext &Context = B.GetInsertBlock()->getContext(); - Value *StrNCmp = M->getOrInsertFunction("strncmp", AttrListPtr::get(AWI), + Value *StrNCmp = M->getOrInsertFunction("strncmp", + AttributeSet::get(M->getContext(), + AWI), B.getInt32Ty(), B.getInt8PtrTy(), B.getInt8PtrTy(), @@ -139,17 +150,19 @@ Value *llvm::EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, /// EmitStrCpy - Emit a call to the strcpy function to the builder, for the /// specified pointer arguments. Value *llvm::EmitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B, - const TargetData *TD, const TargetLibraryInfo *TLI, + const DataLayout *TD, const TargetLibraryInfo *TLI, StringRef Name) { if (!TLI->has(LibFunc::strcpy)) return 0; Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeWithIndex AWI[2]; - AWI[0] = AttributeWithIndex::get(2, Attribute::NoCapture); - AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind); + AWI[0] = AttributeWithIndex::get(M->getContext(), 2, Attribute::NoCapture); + AWI[1] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, + Attribute::NoUnwind); Type *I8Ptr = B.getInt8PtrTy(); - Value *StrCpy = M->getOrInsertFunction(Name, AttrListPtr::get(AWI), + Value *StrCpy = M->getOrInsertFunction(Name, + AttributeSet::get(M->getContext(), AWI), I8Ptr, I8Ptr, I8Ptr, NULL); CallInst *CI = B.CreateCall2(StrCpy, CastToCStr(Dst, B), CastToCStr(Src, B), Name); @@ -161,17 +174,20 @@ Value *llvm::EmitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B, /// EmitStrNCpy - Emit a call to the strncpy function to the builder, for the /// specified pointer arguments. 
Value *llvm::EmitStrNCpy(Value *Dst, Value *Src, Value *Len, - IRBuilder<> &B, const TargetData *TD, + IRBuilder<> &B, const DataLayout *TD, const TargetLibraryInfo *TLI, StringRef Name) { if (!TLI->has(LibFunc::strncpy)) return 0; Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeWithIndex AWI[2]; - AWI[0] = AttributeWithIndex::get(2, Attribute::NoCapture); - AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind); + AWI[0] = AttributeWithIndex::get(M->getContext(), 2, Attribute::NoCapture); + AWI[1] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, + Attribute::NoUnwind); Type *I8Ptr = B.getInt8PtrTy(); - Value *StrNCpy = M->getOrInsertFunction(Name, AttrListPtr::get(AWI), + Value *StrNCpy = M->getOrInsertFunction(Name, + AttributeSet::get(M->getContext(), + AWI), I8Ptr, I8Ptr, I8Ptr, Len->getType(), NULL); CallInst *CI = B.CreateCall3(StrNCpy, CastToCStr(Dst, B), CastToCStr(Src, B), @@ -185,17 +201,18 @@ Value *llvm::EmitStrNCpy(Value *Dst, Value *Src, Value *Len, /// This expects that the Len and ObjSize have type 'intptr_t' and Dst/Src /// are pointers. Value *llvm::EmitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize, - IRBuilder<> &B, const TargetData *TD, + IRBuilder<> &B, const DataLayout *TD, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::memcpy_chk)) return 0; Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeWithIndex AWI; - AWI = AttributeWithIndex::get(~0u, Attribute::NoUnwind); + AWI = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, + Attribute::NoUnwind); LLVMContext &Context = B.GetInsertBlock()->getContext(); Value *MemCpy = M->getOrInsertFunction("__memcpy_chk", - AttrListPtr::get(AWI), + AttributeSet::get(M->getContext(), AWI), B.getInt8PtrTy(), B.getInt8PtrTy(), B.getInt8PtrTy(), @@ -212,16 +229,19 @@ Value *llvm::EmitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize, /// EmitMemChr - Emit a call to the memchr function. This assumes that Ptr is /// a pointer, Val is an i32 value, and Len is an 'intptr_t' value. Value *llvm::EmitMemChr(Value *Ptr, Value *Val, - Value *Len, IRBuilder<> &B, const TargetData *TD, + Value *Len, IRBuilder<> &B, const DataLayout *TD, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::memchr)) return 0; Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeWithIndex AWI; - AWI = AttributeWithIndex::get(~0u, Attribute::ReadOnly | Attribute::NoUnwind); + Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind }; + AWI = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, + ArrayRef<Attribute::AttrKind>(AVs, 2)); LLVMContext &Context = B.GetInsertBlock()->getContext(); - Value *MemChr = M->getOrInsertFunction("memchr", AttrListPtr::get(AWI), + Value *MemChr = M->getOrInsertFunction("memchr", + AttributeSet::get(M->getContext(), AWI), B.getInt8PtrTy(), B.getInt8PtrTy(), B.getInt32Ty(), @@ -237,20 +257,22 @@ Value *llvm::EmitMemChr(Value *Ptr, Value *Val, /// EmitMemCmp - Emit a call to the memcmp function. 
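(Aside, not from the patch: written out on its own, the attribute-construction pattern these BuildLibCalls hunks migrate to looks like the sketch below. Per-parameter attributes now take the LLVMContext and a 1-based index, and the old ~0u "function" index becomes AttributeSet::FunctionIndex. The helper name and the IntPtrTy parameter are illustrative.)

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Module.h"
using namespace llvm;

static Constant *declareMemCmpLike(Module *M, Type *IntPtrTy) {
  LLVMContext &Ctx = M->getContext();
  AttributeWithIndex AWI[3];
  // Parameters 1 and 2 do not capture their pointer arguments.
  AWI[0] = AttributeWithIndex::get(Ctx, 1, Attribute::NoCapture);
  AWI[1] = AttributeWithIndex::get(Ctx, 2, Attribute::NoCapture);
  // Function-level attributes hang off AttributeSet::FunctionIndex.
  Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
  AWI[2] = AttributeWithIndex::get(Ctx, AttributeSet::FunctionIndex,
                                   ArrayRef<Attribute::AttrKind>(AVs, 2));
  return M->getOrInsertFunction("memcmp", AttributeSet::get(Ctx, AWI),
                                Type::getInt32Ty(Ctx),
                                Type::getInt8PtrTy(Ctx),
                                Type::getInt8PtrTy(Ctx),
                                IntPtrTy, NULL);
}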
Value *llvm::EmitMemCmp(Value *Ptr1, Value *Ptr2, - Value *Len, IRBuilder<> &B, const TargetData *TD, + Value *Len, IRBuilder<> &B, const DataLayout *TD, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::memcmp)) return 0; Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeWithIndex AWI[3]; - AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture); - AWI[1] = AttributeWithIndex::get(2, Attribute::NoCapture); - AWI[2] = AttributeWithIndex::get(~0u, Attribute::ReadOnly | - Attribute::NoUnwind); + AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attribute::NoCapture); + AWI[1] = AttributeWithIndex::get(M->getContext(), 2, Attribute::NoCapture); + Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind }; + AWI[2] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, + ArrayRef<Attribute::AttrKind>(AVs, 2)); LLVMContext &Context = B.GetInsertBlock()->getContext(); - Value *MemCmp = M->getOrInsertFunction("memcmp", AttrListPtr::get(AWI), + Value *MemCmp = M->getOrInsertFunction("memcmp", + AttributeSet::get(M->getContext(), AWI), B.getInt32Ty(), B.getInt8PtrTy(), B.getInt8PtrTy(), @@ -269,7 +291,7 @@ Value *llvm::EmitMemCmp(Value *Ptr1, Value *Ptr2, /// returns one value with the same type. If 'Op' is a long double, 'l' is /// added as the suffix of name, if 'Op' is a float, we add a 'f' suffix. Value *llvm::EmitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B, - const AttrListPtr &Attrs) { + const AttributeSet &Attrs) { SmallString<20> NameBuffer; if (!Op->getType()->isDoubleTy()) { // If we need to add a suffix, copy into NameBuffer. @@ -294,7 +316,7 @@ Value *llvm::EmitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B, /// EmitPutChar - Emit a call to the putchar function. This assumes that Char /// is an integer. -Value *llvm::EmitPutChar(Value *Char, IRBuilder<> &B, const TargetData *TD, +Value *llvm::EmitPutChar(Value *Char, IRBuilder<> &B, const DataLayout *TD, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::putchar)) return 0; @@ -316,17 +338,19 @@ Value *llvm::EmitPutChar(Value *Char, IRBuilder<> &B, const TargetData *TD, /// EmitPutS - Emit a call to the puts function. This assumes that Str is /// some pointer. -Value *llvm::EmitPutS(Value *Str, IRBuilder<> &B, const TargetData *TD, +Value *llvm::EmitPutS(Value *Str, IRBuilder<> &B, const DataLayout *TD, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::puts)) return 0; Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeWithIndex AWI[2]; - AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture); - AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind); + AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attribute::NoCapture); + AWI[1] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, + Attribute::NoUnwind); - Value *PutS = M->getOrInsertFunction("puts", AttrListPtr::get(AWI), + Value *PutS = M->getOrInsertFunction("puts", + AttributeSet::get(M->getContext(), AWI), B.getInt32Ty(), B.getInt8PtrTy(), NULL); @@ -339,17 +363,19 @@ Value *llvm::EmitPutS(Value *Str, IRBuilder<> &B, const TargetData *TD, /// EmitFPutC - Emit a call to the fputc function. This assumes that Char is /// an integer and File is a pointer to FILE. 
Value *llvm::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B, - const TargetData *TD, const TargetLibraryInfo *TLI) { + const DataLayout *TD, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::fputc)) return 0; Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeWithIndex AWI[2]; - AWI[0] = AttributeWithIndex::get(2, Attribute::NoCapture); - AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind); + AWI[0] = AttributeWithIndex::get(M->getContext(), 2, Attribute::NoCapture); + AWI[1] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, + Attribute::NoUnwind); Constant *F; if (File->getType()->isPointerTy()) - F = M->getOrInsertFunction("fputc", AttrListPtr::get(AWI), + F = M->getOrInsertFunction("fputc", + AttributeSet::get(M->getContext(), AWI), B.getInt32Ty(), B.getInt32Ty(), File->getType(), NULL); @@ -370,19 +396,21 @@ Value *llvm::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B, /// EmitFPutS - Emit a call to the puts function. Str is required to be a /// pointer and File is a pointer to FILE. Value *llvm::EmitFPutS(Value *Str, Value *File, IRBuilder<> &B, - const TargetData *TD, const TargetLibraryInfo *TLI) { + const DataLayout *TD, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::fputs)) return 0; Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeWithIndex AWI[3]; - AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture); - AWI[1] = AttributeWithIndex::get(2, Attribute::NoCapture); - AWI[2] = AttributeWithIndex::get(~0u, Attribute::NoUnwind); + AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attribute::NoCapture); + AWI[1] = AttributeWithIndex::get(M->getContext(), 2, Attribute::NoCapture); + AWI[2] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, + Attribute::NoUnwind); StringRef FPutsName = TLI->getName(LibFunc::fputs); Constant *F; if (File->getType()->isPointerTy()) - F = M->getOrInsertFunction(FPutsName, AttrListPtr::get(AWI), + F = M->getOrInsertFunction(FPutsName, + AttributeSet::get(M->getContext(), AWI), B.getInt32Ty(), B.getInt8PtrTy(), File->getType(), NULL); @@ -400,21 +428,23 @@ Value *llvm::EmitFPutS(Value *Str, Value *File, IRBuilder<> &B, /// EmitFWrite - Emit a call to the fwrite function. This assumes that Ptr is /// a pointer, Size is an 'intptr_t', and File is a pointer to FILE. 
Value *llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File, - IRBuilder<> &B, const TargetData *TD, + IRBuilder<> &B, const DataLayout *TD, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::fwrite)) return 0; Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeWithIndex AWI[3]; - AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture); - AWI[1] = AttributeWithIndex::get(4, Attribute::NoCapture); - AWI[2] = AttributeWithIndex::get(~0u, Attribute::NoUnwind); + AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attribute::NoCapture); + AWI[1] = AttributeWithIndex::get(M->getContext(), 4, Attribute::NoCapture); + AWI[2] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, + Attribute::NoUnwind); LLVMContext &Context = B.GetInsertBlock()->getContext(); StringRef FWriteName = TLI->getName(LibFunc::fwrite); Constant *F; if (File->getType()->isPointerTy()) - F = M->getOrInsertFunction(FWriteName, AttrListPtr::get(AWI), + F = M->getOrInsertFunction(FWriteName, + AttributeSet::get(M->getContext(), AWI), TD->getIntPtrType(Context), B.getInt8PtrTy(), TD->getIntPtrType(Context), @@ -436,9 +466,9 @@ Value *llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File, SimplifyFortifiedLibCalls::~SimplifyFortifiedLibCalls() { } -bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const TargetData *TD, +bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const DataLayout *TD, const TargetLibraryInfo *TLI) { - // We really need TargetData for later. + // We really need DataLayout for later. if (!TD) return false; this->CI = CI; diff --git a/lib/Transforms/Utils/BypassSlowDivision.cpp b/lib/Transforms/Utils/BypassSlowDivision.cpp index 30d60be..00cda8e 100644 --- a/lib/Transforms/Utils/BypassSlowDivision.cpp +++ b/lib/Transforms/Utils/BypassSlowDivision.cpp @@ -16,11 +16,11 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "bypass-slow-division" -#include "llvm/Instructions.h" -#include "llvm/Function.h" -#include "llvm/IRBuilder.h" -#include "llvm/ADT/DenseMap.h" #include "llvm/Transforms/Utils/BypassSlowDivision.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" using namespace llvm; @@ -221,7 +221,7 @@ static bool reuseOrInsertFastDiv(Function &F, // be profitably bypassed and carried out with a shorter, faster divide. 
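(Aside: the bypassSlowDivision change in the hunk that follows replaces the Type-to-Type bypass map with a bit-width map. A caller-side sketch under that new interface; the wrapper function and the 64-to-32 choice are illustrative, not taken from the patch.)

#include "llvm/ADT/DenseMap.h"
#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/BypassSlowDivision.h"
using namespace llvm;

static bool bypassDivisionIn(Function &F) {
  // Request a 32-bit fast path for every 64-bit udiv/urem that is seen.
  DenseMap<unsigned, unsigned> BypassWidths;
  BypassWidths[64] = 32;
  bool Changed = false;
  for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I)
    Changed |= bypassSlowDivision(F, I, BypassWidths);
  return Changed;
}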
bool llvm::bypassSlowDivision(Function &F, Function::iterator &I, - const DenseMap<Type *, Type *> &BypassTypeMap) { + const DenseMap<unsigned int, unsigned int> &BypassWidths) { DivCacheTy DivCache; bool MadeChange = false; @@ -238,14 +238,23 @@ bool llvm::bypassSlowDivision(Function &F, if (!UseDivOp && !UseRemOp) continue; - // Continue if div/rem type is not bypassed - DenseMap<Type *, Type *>::const_iterator BT = - BypassTypeMap.find(J->getType()); - if (BT == BypassTypeMap.end()) + // Skip division on vector types, only optimize integer instructions + if (!J->getType()->isIntegerTy()) + continue; + + // Get bitwidth of div/rem instruction + IntegerType *T = cast<IntegerType>(J->getType()); + int bitwidth = T->getBitWidth(); + + // Continue if bitwidth is not bypassed + DenseMap<unsigned int, unsigned int>::const_iterator BI = BypassWidths.find(bitwidth); + if (BI == BypassWidths.end()) continue; - IntegerType *BypassType = cast<IntegerType>(BT->second); - MadeChange |= reuseOrInsertFastDiv(F, I, J, BypassType, UseDivOp, + // Get type for div/rem instruction with bypass bitwidth + IntegerType *BT = IntegerType::get(J->getContext(), BI->second); + + MadeChange |= reuseOrInsertFastDiv(F, I, J, BT, UseDivOp, UseSignedOp, DivCache); } diff --git a/lib/Transforms/Utils/CMakeLists.txt b/lib/Transforms/Utils/CMakeLists.txt index 215a16f..b71628b 100644 --- a/lib/Transforms/Utils/CMakeLists.txt +++ b/lib/Transforms/Utils/CMakeLists.txt @@ -1,5 +1,4 @@ add_llvm_library(LLVMTransformUtils - AddrModeMatcher.cpp BasicBlockUtils.cpp BreakCriticalEdges.cpp BuildLibCalls.cpp @@ -11,6 +10,7 @@ add_llvm_library(LLVMTransformUtils DemoteRegToStack.cpp InlineFunction.cpp InstructionNamer.cpp + IntegerDivision.cpp LCSSA.cpp Local.cpp LoopSimplify.cpp @@ -20,12 +20,14 @@ add_llvm_library(LLVMTransformUtils LowerInvoke.cpp LowerSwitch.cpp Mem2Reg.cpp + MetaRenamer.cpp ModuleUtils.cpp PromoteMemoryToRegister.cpp SSAUpdater.cpp SimplifyCFG.cpp SimplifyIndVar.cpp SimplifyInstructions.cpp + SimplifyLibCalls.cpp UnifyFunctionExitNodes.cpp Utils.cpp ValueMapper.cpp diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp index 99237b8..ccc3eae 100644 --- a/lib/Transforms/Utils/CloneFunction.cpp +++ b/lib/Transforms/Utils/CloneFunction.cpp @@ -14,22 +14,22 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Constants.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/DebugInfo.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/GlobalVariable.h" -#include "llvm/Function.h" -#include "llvm/LLVMContext.h" -#include "llvm/Metadata.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" #include "llvm/Support/CFG.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ValueMapper.h" -#include "llvm/Analysis/ConstantFolding.h" -#include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/ADT/SmallVector.h" #include <map> using namespace llvm; @@ -98,10 +98,14 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, Anew->addAttr( 
OldFunc->getAttributes() .getParamAttributes(I->getArgNo() + 1)); NewFunc->setAttributes(NewFunc->getAttributes() - .addAttr(0, OldFunc->getAttributes() + .addAttr(NewFunc->getContext(), + AttributeSet::ReturnIndex, + OldFunc->getAttributes() .getRetAttributes())); NewFunc->setAttributes(NewFunc->getAttributes() - .addAttr(~0, OldFunc->getAttributes() + .addAttr(NewFunc->getContext(), + AttributeSet::FunctionIndex, + OldFunc->getAttributes() .getFnAttributes())); } @@ -202,14 +206,14 @@ namespace { bool ModuleLevelChanges; const char *NameSuffix; ClonedCodeInfo *CodeInfo; - const TargetData *TD; + const DataLayout *TD; public: PruningFunctionCloner(Function *newFunc, const Function *oldFunc, ValueToValueMapTy &valueMap, bool moduleLevelChanges, const char *nameSuffix, ClonedCodeInfo *codeInfo, - const TargetData *td) + const DataLayout *td) : NewFunc(newFunc), OldFunc(oldFunc), VMap(valueMap), ModuleLevelChanges(moduleLevelChanges), NameSuffix(nameSuffix), CodeInfo(codeInfo), TD(td) { @@ -365,7 +369,7 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, SmallVectorImpl<ReturnInst*> &Returns, const char *NameSuffix, ClonedCodeInfo *CodeInfo, - const TargetData *TD, + const DataLayout *TD, Instruction *TheCall) { assert(NameSuffix && "NameSuffix cannot be null!"); diff --git a/lib/Transforms/Utils/CloneModule.cpp b/lib/Transforms/Utils/CloneModule.cpp index 1dac6b5..64df089 100644 --- a/lib/Transforms/Utils/CloneModule.cpp +++ b/lib/Transforms/Utils/CloneModule.cpp @@ -13,9 +13,9 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Module.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Constant.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Module.h" #include "llvm/Transforms/Utils/ValueMapper.h" using namespace llvm; @@ -38,10 +38,6 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) { New->setTargetTriple(M->getTargetTriple()); New->setModuleInlineAsm(M->getModuleInlineAsm()); - // Copy all of the dependent libraries over. - for (Module::lib_iterator I = M->lib_begin(), E = M->lib_end(); I != E; ++I) - New->addLibrary(*I); - // Loop over all of the global variables, making corresponding globals in the // new module. Here we add them to the VMap and to the new Module. We // don't worry about attributes or initializers, they will come later. 
diff --git a/lib/Transforms/Utils/CmpInstAnalysis.cpp b/lib/Transforms/Utils/CmpInstAnalysis.cpp index 9b09915..8fa412a 100644 --- a/lib/Transforms/Utils/CmpInstAnalysis.cpp +++ b/lib/Transforms/Utils/CmpInstAnalysis.cpp @@ -13,8 +13,8 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/CmpInstAnalysis.h" -#include "llvm/Constants.h" -#include "llvm/Instructions.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" using namespace llvm; diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp index c545cd6..3a21528 100644 --- a/lib/Transforms/Utils/CodeExtractor.cpp +++ b/lib/Transforms/Utils/CodeExtractor.cpp @@ -14,25 +14,25 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/CodeExtractor.h" -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Instructions.h" -#include "llvm/Intrinsics.h" -#include "llvm/LLVMContext.h" -#include "llvm/Module.h" -#include "llvm/Pass.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/RegionInfo.h" #include "llvm/Analysis/RegionIterator.h" #include "llvm/Analysis/Verifier.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/SetVector.h" -#include "llvm/ADT/StringExtras.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include <algorithm> #include <set> using namespace llvm; @@ -346,7 +346,7 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, header->getName(), M); // If the old function is no-throw, so is the new one. if (oldFunction->doesNotThrow()) - newFunction->setDoesNotThrow(true); + newFunction->setDoesNotThrow(); newFunction->getBasicBlockList().push_back(newRootNode); diff --git a/lib/Transforms/Utils/DemoteRegToStack.cpp b/lib/Transforms/Utils/DemoteRegToStack.cpp index 99b5830..d5c41f5 100644 --- a/lib/Transforms/Utils/DemoteRegToStack.cpp +++ b/lib/Transforms/Utils/DemoteRegToStack.cpp @@ -8,10 +8,10 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/Local.h" -#include "llvm/Function.h" -#include "llvm/Instructions.h" -#include "llvm/Type.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Type.h" using namespace llvm; /// DemoteRegToStack - This function takes a virtual register computed by an @@ -124,7 +124,12 @@ AllocaInst *llvm::DemotePHIToStack(PHINode *P, Instruction *AllocaPoint) { } // Insert a load in place of the PHI and replace all uses. - Value *V = new LoadInst(Slot, P->getName()+".reload", P); + BasicBlock::iterator InsertPt = P; + + for (; isa<PHINode>(InsertPt) || isa<LandingPadInst>(InsertPt); ++InsertPt) + /* empty */; // Don't insert before PHI nodes or landingpad instrs. + + Value *V = new LoadInst(Slot, P->getName()+".reload", InsertPt); P->replaceAllUsesWith(V); // Delete PHI. 
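(Aside: the DemotePHIToStack change above encodes a general rule, namely that a load replacing a PHI may not be materialized ahead of other PHI nodes or of a landingpad. Stated as a standalone helper purely for illustration; BasicBlock::getFirstInsertionPt offers a similar skip.)

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Walk past leading PHIs and a landingpad to find a legal insertion point.
static BasicBlock::iterator firstNonPHIOrLandingPad(BasicBlock *BB) {
  BasicBlock::iterator It = BB->begin();
  while (isa<PHINode>(It) || isa<LandingPadInst>(It))
    ++It;
  return It;
}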
diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp index 89e89e7..0d2598a 100644 --- a/lib/Transforms/Utils/InlineFunction.cpp +++ b/lib/Transforms/Utils/InlineFunction.cpp @@ -13,21 +13,21 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Attributes.h" -#include "llvm/Constants.h" -#include "llvm/DebugInfo.h" -#include "llvm/DerivedTypes.h" -#include "llvm/IRBuilder.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/Intrinsics.h" -#include "llvm/Module.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/DebugInfo.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" #include "llvm/Support/CallSite.h" -#include "llvm/Target/TargetData.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -357,7 +357,7 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall, Type *VoidPtrTy = Type::getInt8PtrTy(Context); - // Create the alloca. If we have TargetData, use nice alignment. + // Create the alloca. If we have DataLayout, use nice alignment. unsigned Align = 1; if (IFI.TD) Align = IFI.TD->getPrefTypeAlignment(AggTy); @@ -668,10 +668,29 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, if (hasLifetimeMarkers(AI)) continue; - builder.CreateLifetimeStart(AI); + // Try to determine the size of the allocation. + ConstantInt *AllocaSize = 0; + if (ConstantInt *AIArraySize = + dyn_cast<ConstantInt>(AI->getArraySize())) { + if (IFI.TD) { + Type *AllocaType = AI->getAllocatedType(); + uint64_t AllocaTypeSize = IFI.TD->getTypeAllocSize(AllocaType); + uint64_t AllocaArraySize = AIArraySize->getLimitedValue(); + assert(AllocaArraySize > 0 && "array size of AllocaInst is zero"); + // Check that array size doesn't saturate uint64_t and doesn't + // overflow when it's multiplied by type size. 
+ if (AllocaArraySize != ~0ULL && + UINT64_MAX / AllocaArraySize >= AllocaTypeSize) { + AllocaSize = ConstantInt::get(Type::getInt64Ty(AI->getContext()), + AllocaArraySize * AllocaTypeSize); + } + } + } + + builder.CreateLifetimeStart(AI, AllocaSize); for (unsigned ri = 0, re = Returns.size(); ri != re; ++ri) { IRBuilder<> builder(Returns[ri]); - builder.CreateLifetimeEnd(AI); + builder.CreateLifetimeEnd(AI, AllocaSize); } } } diff --git a/lib/Transforms/Utils/InstructionNamer.cpp b/lib/Transforms/Utils/InstructionNamer.cpp index 45c15de..a020bc7 100644 --- a/lib/Transforms/Utils/InstructionNamer.cpp +++ b/lib/Transforms/Utils/InstructionNamer.cpp @@ -15,9 +15,9 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar.h" -#include "llvm/Function.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Type.h" #include "llvm/Pass.h" -#include "llvm/Type.h" using namespace llvm; namespace { diff --git a/lib/Transforms/Utils/IntegerDivision.cpp b/lib/Transforms/Utils/IntegerDivision.cpp new file mode 100644 index 0000000..5187d7c --- /dev/null +++ b/lib/Transforms/Utils/IntegerDivision.cpp @@ -0,0 +1,420 @@ +//===-- IntegerDivision.cpp - Expand integer division ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains an implementation of 32-bit scalar integer division for +// targets that don't have native support. It's largely derived from +// compiler-rt's implementation of __udivsi3, but hand-tuned to reduce the +// amount of control flow. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "integer-division" +#include "llvm/Transforms/Utils/IntegerDivision.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" + +using namespace llvm; + +/// Generate code to compute the remainder of two signed integers. Returns the +/// remainder, which will have the sign of the dividend. Builder's insert point +/// should be pointing where the caller wants code generated, e.g. at the srem +/// instruction. This will generate a urem in the process, and Builder's insert +/// point will be pointing at the urem (if present, i.e.
not folded), ready to +/// be expanded if the user wishes +static Value *generateSignedRemainderCode(Value *Dividend, Value *Divisor, + IRBuilder<> &Builder) { + ConstantInt *ThirtyOne = Builder.getInt32(31); + + // ; %dividend_sgn = ashr i32 %dividend, 31 + // ; %divisor_sgn = ashr i32 %divisor, 31 + // ; %dvd_xor = xor i32 %dividend, %dividend_sgn + // ; %dvs_xor = xor i32 %divisor, %divisor_sgn + // ; %u_dividend = sub i32 %dvd_xor, %dividend_sgn + // ; %u_divisor = sub i32 %dvs_xor, %divisor_sgn + // ; %urem = urem i32 %dividend, %divisor + // ; %xored = xor i32 %urem, %dividend_sgn + // ; %srem = sub i32 %xored, %dividend_sgn + Value *DividendSign = Builder.CreateAShr(Dividend, ThirtyOne); + Value *DivisorSign = Builder.CreateAShr(Divisor, ThirtyOne); + Value *DvdXor = Builder.CreateXor(Dividend, DividendSign); + Value *DvsXor = Builder.CreateXor(Divisor, DivisorSign); + Value *UDividend = Builder.CreateSub(DvdXor, DividendSign); + Value *UDivisor = Builder.CreateSub(DvsXor, DivisorSign); + Value *URem = Builder.CreateURem(UDividend, UDivisor); + Value *Xored = Builder.CreateXor(URem, DividendSign); + Value *SRem = Builder.CreateSub(Xored, DividendSign); + + if (Instruction *URemInst = dyn_cast<Instruction>(URem)) + Builder.SetInsertPoint(URemInst); + + return SRem; +} + + +/// Generate code to compute the remainder of two unsigned integers. Returns the +/// remainder. Builder's insert point should be pointing where the caller wants +/// code generated, e.g. at the urem instruction. This will generate a udiv in +/// the process, and Builder's insert point will be pointing at the udiv (if +/// present, i.e. not folded), ready to be expanded if the user wishes +static Value *generatedUnsignedRemainderCode(Value *Dividend, Value *Divisor, + IRBuilder<> &Builder) { + // Remainder = Dividend - Quotient*Divisor + + // ; %quotient = udiv i32 %dividend, %divisor + // ; %product = mul i32 %divisor, %quotient + // ; %remainder = sub i32 %dividend, %product + Value *Quotient = Builder.CreateUDiv(Dividend, Divisor); + Value *Product = Builder.CreateMul(Divisor, Quotient); + Value *Remainder = Builder.CreateSub(Dividend, Product); + + if (Instruction *UDiv = dyn_cast<Instruction>(Quotient)) + Builder.SetInsertPoint(UDiv); + + return Remainder; +} + +/// Generate code to divide two signed integers. Returns the quotient, rounded +/// towards 0. Builder's insert point should be pointing where the caller wants +/// code generated, e.g. at the sdiv instruction. This will generate a udiv in +/// the process, and Builder's insert point will be pointing at the udiv (if +/// present, i.e. not folded), ready to be expanded if the user wishes. 
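(Aside, illustrative only: the xor/sub sequences in both signed helpers are the usual branch-free sign-stripping trick. Rendered in plain C++ for clarity; the function below is not part of the patch, and the signed right shift mirrors the IR ashr.)

static int sremViaUrem(int Dividend, int Divisor) {
  // Shifting right by 31 yields 0 for non-negative values and all-ones
  // otherwise, so (x ^ sign) - sign computes |x|; the same xor/sub pair
  // re-applies the sign at the end, giving the remainder the dividend's sign.
  unsigned DSign = (unsigned)(Dividend >> 31);
  unsigned VSign = (unsigned)(Divisor >> 31);
  unsigned UDividend = ((unsigned)Dividend ^ DSign) - DSign;
  unsigned UDivisor = ((unsigned)Divisor ^ VSign) - VSign;
  unsigned URem = UDividend % UDivisor; // the urem the expansion emits
  return (int)((URem ^ DSign) - DSign);
}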
+static Value *generateSignedDivisionCode(Value *Dividend, Value *Divisor, + IRBuilder<> &Builder) { + // Implementation taken from compiler-rt's __divsi3 + + ConstantInt *ThirtyOne = Builder.getInt32(31); + + // ; %tmp = ashr i32 %dividend, 31 + // ; %tmp1 = ashr i32 %divisor, 31 + // ; %tmp2 = xor i32 %tmp, %dividend + // ; %u_dvnd = sub nsw i32 %tmp2, %tmp + // ; %tmp3 = xor i32 %tmp1, %divisor + // ; %u_dvsr = sub nsw i32 %tmp3, %tmp1 + // ; %q_sgn = xor i32 %tmp1, %tmp + // ; %q_mag = udiv i32 %u_dvnd, %u_dvsr + // ; %tmp4 = xor i32 %q_mag, %q_sgn + // ; %q = sub i32 %tmp4, %q_sgn + Value *Tmp = Builder.CreateAShr(Dividend, ThirtyOne); + Value *Tmp1 = Builder.CreateAShr(Divisor, ThirtyOne); + Value *Tmp2 = Builder.CreateXor(Tmp, Dividend); + Value *U_Dvnd = Builder.CreateSub(Tmp2, Tmp); + Value *Tmp3 = Builder.CreateXor(Tmp1, Divisor); + Value *U_Dvsr = Builder.CreateSub(Tmp3, Tmp1); + Value *Q_Sgn = Builder.CreateXor(Tmp1, Tmp); + Value *Q_Mag = Builder.CreateUDiv(U_Dvnd, U_Dvsr); + Value *Tmp4 = Builder.CreateXor(Q_Mag, Q_Sgn); + Value *Q = Builder.CreateSub(Tmp4, Q_Sgn); + + if (Instruction *UDiv = dyn_cast<Instruction>(Q_Mag)) + Builder.SetInsertPoint(UDiv); + + return Q; +} + +/// Generates code to divide two unsigned scalar 32-bit integers. Returns the +/// quotient, rounded towards 0. Builder's insert point should be pointing where +/// the caller wants code generated, e.g. at the udiv instruction. +static Value *generateUnsignedDivisionCode(Value *Dividend, Value *Divisor, + IRBuilder<> &Builder) { + // The basic algorithm can be found in the compiler-rt project's + // implementation of __udivsi3.c. Here, we do a lower-level IR based approach + // that's been hand-tuned to lessen the amount of control flow involved. + + // Some helper values + IntegerType *I32Ty = Builder.getInt32Ty(); + + ConstantInt *Zero = Builder.getInt32(0); + ConstantInt *One = Builder.getInt32(1); + ConstantInt *ThirtyOne = Builder.getInt32(31); + ConstantInt *NegOne = ConstantInt::getSigned(I32Ty, -1); + ConstantInt *True = Builder.getTrue(); + + BasicBlock *IBB = Builder.GetInsertBlock(); + Function *F = IBB->getParent(); + Function *CTLZi32 = Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz, + I32Ty); + + // Our CFG is going to look like: + // +---------------------+ + // | special-cases | + // | ... | + // +---------------------+ + // | | + // | +----------+ + // | | bb1 | + // | | ... | + // | +----------+ + // | | | + // | | +------------+ + // | | | preheader | + // | | | ... | + // | | +------------+ + // | | | + // | | | +---+ + // | | | | | + // | | +------------+ | + // | | | do-while | | + // | | | ... | | + // | | +------------+ | + // | | | | | + // | +-----------+ +---+ + // | | loop-exit | + // | | ... | + // | +-----------+ + // | | + // +-------+ + // | ... 
| + // | end | + // +-------+ + BasicBlock *SpecialCases = Builder.GetInsertBlock(); + SpecialCases->setName(Twine(SpecialCases->getName(), "_udiv-special-cases")); + BasicBlock *End = SpecialCases->splitBasicBlock(Builder.GetInsertPoint(), + "udiv-end"); + BasicBlock *LoopExit = BasicBlock::Create(Builder.getContext(), + "udiv-loop-exit", F, End); + BasicBlock *DoWhile = BasicBlock::Create(Builder.getContext(), + "udiv-do-while", F, End); + BasicBlock *Preheader = BasicBlock::Create(Builder.getContext(), + "udiv-preheader", F, End); + BasicBlock *BB1 = BasicBlock::Create(Builder.getContext(), + "udiv-bb1", F, End); + + // We'll be overwriting the terminator to insert our extra blocks + SpecialCases->getTerminator()->eraseFromParent(); + + // First off, check for special cases: dividend or divisor is zero, divisor + // is greater than dividend, and divisor is 1. + // ; special-cases: + // ; %ret0_1 = icmp eq i32 %divisor, 0 + // ; %ret0_2 = icmp eq i32 %dividend, 0 + // ; %ret0_3 = or i1 %ret0_1, %ret0_2 + // ; %tmp0 = tail call i32 @llvm.ctlz.i32(i32 %divisor, i1 true) + // ; %tmp1 = tail call i32 @llvm.ctlz.i32(i32 %dividend, i1 true) + // ; %sr = sub nsw i32 %tmp0, %tmp1 + // ; %ret0_4 = icmp ugt i32 %sr, 31 + // ; %ret0 = or i1 %ret0_3, %ret0_4 + // ; %retDividend = icmp eq i32 %sr, 31 + // ; %retVal = select i1 %ret0, i32 0, i32 %dividend + // ; %earlyRet = or i1 %ret0, %retDividend + // ; br i1 %earlyRet, label %end, label %bb1 + Builder.SetInsertPoint(SpecialCases); + Value *Ret0_1 = Builder.CreateICmpEQ(Divisor, Zero); + Value *Ret0_2 = Builder.CreateICmpEQ(Dividend, Zero); + Value *Ret0_3 = Builder.CreateOr(Ret0_1, Ret0_2); + Value *Tmp0 = Builder.CreateCall2(CTLZi32, Divisor, True); + Value *Tmp1 = Builder.CreateCall2(CTLZi32, Dividend, True); + Value *SR = Builder.CreateSub(Tmp0, Tmp1); + Value *Ret0_4 = Builder.CreateICmpUGT(SR, ThirtyOne); + Value *Ret0 = Builder.CreateOr(Ret0_3, Ret0_4); + Value *RetDividend = Builder.CreateICmpEQ(SR, ThirtyOne); + Value *RetVal = Builder.CreateSelect(Ret0, Zero, Dividend); + Value *EarlyRet = Builder.CreateOr(Ret0, RetDividend); + Builder.CreateCondBr(EarlyRet, End, BB1); + + // ; bb1: ; preds = %special-cases + // ; %sr_1 = add i32 %sr, 1 + // ; %tmp2 = sub i32 31, %sr + // ; %q = shl i32 %dividend, %tmp2 + // ; %skipLoop = icmp eq i32 %sr_1, 0 + // ; br i1 %skipLoop, label %loop-exit, label %preheader + Builder.SetInsertPoint(BB1); + Value *SR_1 = Builder.CreateAdd(SR, One); + Value *Tmp2 = Builder.CreateSub(ThirtyOne, SR); + Value *Q = Builder.CreateShl(Dividend, Tmp2); + Value *SkipLoop = Builder.CreateICmpEQ(SR_1, Zero); + Builder.CreateCondBr(SkipLoop, LoopExit, Preheader); + + // ; preheader: ; preds = %bb1 + // ; %tmp3 = lshr i32 %dividend, %sr_1 + // ; %tmp4 = add i32 %divisor, -1 + // ; br label %do-while + Builder.SetInsertPoint(Preheader); + Value *Tmp3 = Builder.CreateLShr(Dividend, SR_1); + Value *Tmp4 = Builder.CreateAdd(Divisor, NegOne); + Builder.CreateBr(DoWhile); + + // ; do-while: ; preds = %do-while, %preheader + // ; %carry_1 = phi i32 [ 0, %preheader ], [ %carry, %do-while ] + // ; %sr_3 = phi i32 [ %sr_1, %preheader ], [ %sr_2, %do-while ] + // ; %r_1 = phi i32 [ %tmp3, %preheader ], [ %r, %do-while ] + // ; %q_2 = phi i32 [ %q, %preheader ], [ %q_1, %do-while ] + // ; %tmp5 = shl i32 %r_1, 1 + // ; %tmp6 = lshr i32 %q_2, 31 + // ; %tmp7 = or i32 %tmp5, %tmp6 + // ; %tmp8 = shl i32 %q_2, 1 + // ; %q_1 = or i32 %carry_1, %tmp8 + // ; %tmp9 = sub i32 %tmp4, %tmp7 + // ; %tmp10 = ashr i32 %tmp9, 31 + // ; %carry = and 
i32 %tmp10, 1 + // ; %tmp11 = and i32 %tmp10, %divisor + // ; %r = sub i32 %tmp7, %tmp11 + // ; %sr_2 = add i32 %sr_3, -1 + // ; %tmp12 = icmp eq i32 %sr_2, 0 + // ; br i1 %tmp12, label %loop-exit, label %do-while + Builder.SetInsertPoint(DoWhile); + PHINode *Carry_1 = Builder.CreatePHI(I32Ty, 2); + PHINode *SR_3 = Builder.CreatePHI(I32Ty, 2); + PHINode *R_1 = Builder.CreatePHI(I32Ty, 2); + PHINode *Q_2 = Builder.CreatePHI(I32Ty, 2); + Value *Tmp5 = Builder.CreateShl(R_1, One); + Value *Tmp6 = Builder.CreateLShr(Q_2, ThirtyOne); + Value *Tmp7 = Builder.CreateOr(Tmp5, Tmp6); + Value *Tmp8 = Builder.CreateShl(Q_2, One); + Value *Q_1 = Builder.CreateOr(Carry_1, Tmp8); + Value *Tmp9 = Builder.CreateSub(Tmp4, Tmp7); + Value *Tmp10 = Builder.CreateAShr(Tmp9, 31); + Value *Carry = Builder.CreateAnd(Tmp10, One); + Value *Tmp11 = Builder.CreateAnd(Tmp10, Divisor); + Value *R = Builder.CreateSub(Tmp7, Tmp11); + Value *SR_2 = Builder.CreateAdd(SR_3, NegOne); + Value *Tmp12 = Builder.CreateICmpEQ(SR_2, Zero); + Builder.CreateCondBr(Tmp12, LoopExit, DoWhile); + + // ; loop-exit: ; preds = %do-while, %bb1 + // ; %carry_2 = phi i32 [ 0, %bb1 ], [ %carry, %do-while ] + // ; %q_3 = phi i32 [ %q, %bb1 ], [ %q_1, %do-while ] + // ; %tmp13 = shl i32 %q_3, 1 + // ; %q_4 = or i32 %carry_2, %tmp13 + // ; br label %end + Builder.SetInsertPoint(LoopExit); + PHINode *Carry_2 = Builder.CreatePHI(I32Ty, 2); + PHINode *Q_3 = Builder.CreatePHI(I32Ty, 2); + Value *Tmp13 = Builder.CreateShl(Q_3, One); + Value *Q_4 = Builder.CreateOr(Carry_2, Tmp13); + Builder.CreateBr(End); + + // ; end: ; preds = %loop-exit, %special-cases + // ; %q_5 = phi i32 [ %q_4, %loop-exit ], [ %retVal, %special-cases ] + // ; ret i32 %q_5 + Builder.SetInsertPoint(End, End->begin()); + PHINode *Q_5 = Builder.CreatePHI(I32Ty, 2); + + // Populate the Phis, since all values have now been created. Our Phis were: + // ; %carry_1 = phi i32 [ 0, %preheader ], [ %carry, %do-while ] + Carry_1->addIncoming(Zero, Preheader); + Carry_1->addIncoming(Carry, DoWhile); + // ; %sr_3 = phi i32 [ %sr_1, %preheader ], [ %sr_2, %do-while ] + SR_3->addIncoming(SR_1, Preheader); + SR_3->addIncoming(SR_2, DoWhile); + // ; %r_1 = phi i32 [ %tmp3, %preheader ], [ %r, %do-while ] + R_1->addIncoming(Tmp3, Preheader); + R_1->addIncoming(R, DoWhile); + // ; %q_2 = phi i32 [ %q, %preheader ], [ %q_1, %do-while ] + Q_2->addIncoming(Q, Preheader); + Q_2->addIncoming(Q_1, DoWhile); + // ; %carry_2 = phi i32 [ 0, %bb1 ], [ %carry, %do-while ] + Carry_2->addIncoming(Zero, BB1); + Carry_2->addIncoming(Carry, DoWhile); + // ; %q_3 = phi i32 [ %q, %bb1 ], [ %q_1, %do-while ] + Q_3->addIncoming(Q, BB1); + Q_3->addIncoming(Q_1, DoWhile); + // ; %q_5 = phi i32 [ %q_4, %loop-exit ], [ %retVal, %special-cases ] + Q_5->addIncoming(Q_4, LoopExit); + Q_5->addIncoming(RetVal, SpecialCases); + + return Q_5; +} + +/// Generate code to calculate the remainder of two integers, replacing Rem with +/// the generated code. This currently generates code using the udiv expansion, +/// but future work includes generating more specialized code, e.g. when more +/// information about the operands are known. Currently only implements 32bit +/// scalar division (due to udiv's limitation), but future work is removing this +/// limitation. +/// +/// @brief Replace Rem with generated code. 
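(Aside: expandRemainder and expandDivision, defined next, erase the original instruction and split its block, so a driver should collect candidates before expanding. A usage sketch under that assumption, restricted to the i32 case the expansion currently supports; the pass-less wrapper is illustrative.)

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/IntegerDivision.h"
using namespace llvm;

static bool expandAllDivRem(Function &F) {
  // Gather i32 div/rem instructions first; the expansion mutates the CFG.
  SmallVector<BinaryOperator*, 8> Worklist;
  for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE; ++BB)
    for (BasicBlock::iterator I = BB->begin(), IE = BB->end(); I != IE; ++I)
      if (BinaryOperator *BO = dyn_cast<BinaryOperator>(&*I))
        if (BO->getType()->isIntegerTy(32))
          switch (BO->getOpcode()) {
          case Instruction::SDiv: case Instruction::UDiv:
          case Instruction::SRem: case Instruction::URem:
            Worklist.push_back(BO);
            break;
          default:
            break;
          }

  bool Changed = false;
  for (unsigned i = 0, e = Worklist.size(); i != e; ++i) {
    BinaryOperator *BO = Worklist[i];
    if (BO->getOpcode() == Instruction::SDiv ||
        BO->getOpcode() == Instruction::UDiv)
      Changed |= expandDivision(BO);
    else
      Changed |= expandRemainder(BO);
  }
  return Changed;
}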
+bool llvm::expandRemainder(BinaryOperator *Rem) { + assert((Rem->getOpcode() == Instruction::SRem || + Rem->getOpcode() == Instruction::URem) && + "Trying to expand remainder from a non-remainder function"); + + IRBuilder<> Builder(Rem); + + // First prepare the sign if it's a signed remainder + if (Rem->getOpcode() == Instruction::SRem) { + Value *Remainder = generateSignedRemainderCode(Rem->getOperand(0), + Rem->getOperand(1), Builder); + + Rem->replaceAllUsesWith(Remainder); + Rem->dropAllReferences(); + Rem->eraseFromParent(); + + // If we didn't actually generate a urem instruction, we're done + BinaryOperator *BO = dyn_cast<BinaryOperator>(Builder.GetInsertPoint()); + if (!BO || BO->getOpcode() != Instruction::URem) + return true; + + Rem = BO; + } + + Value *Remainder = generatedUnsignedRemainderCode(Rem->getOperand(0), + Rem->getOperand(1), + Builder); + + Rem->replaceAllUsesWith(Remainder); + Rem->dropAllReferences(); + Rem->eraseFromParent(); + + // Expand the udiv + if (BinaryOperator *UDiv = dyn_cast<BinaryOperator>(Builder.GetInsertPoint())) { + assert(UDiv->getOpcode() == Instruction::UDiv && "Non-udiv in expansion?"); + expandDivision(UDiv); + } + + return true; +} + + +/// Generate code to divide two integers, replacing Div with the generated +/// code. This currently generates code similar to compiler-rt's +/// implementations, but future work includes generating more specialized code +/// when more information about the operands is known. Currently only +/// implements 32-bit scalar division, but future work includes removing this +/// limitation. +/// +/// @brief Replace Div with generated code. +bool llvm::expandDivision(BinaryOperator *Div) { + assert((Div->getOpcode() == Instruction::SDiv || + Div->getOpcode() == Instruction::UDiv) && + "Trying to expand division from a non-division function"); + + IRBuilder<> Builder(Div); + + if (Div->getType()->isVectorTy()) + llvm_unreachable("Div over vectors not supported"); + + // First prepare the sign if it's a signed division + if (Div->getOpcode() == Instruction::SDiv) { + // Lower the code to unsigned division, and reset Div to point to the udiv.
+ Value *Quotient = generateSignedDivisionCode(Div->getOperand(0), + Div->getOperand(1), Builder); + Div->replaceAllUsesWith(Quotient); + Div->dropAllReferences(); + Div->eraseFromParent(); + + // If we didn't actually generate a udiv instruction, we're done + BinaryOperator *BO = dyn_cast<BinaryOperator>(Builder.GetInsertPoint()); + if (!BO || BO->getOpcode() != Instruction::UDiv) + return true; + + Div = BO; + } + + // Insert the unsigned division code + Value *Quotient = generateUnsignedDivisionCode(Div->getOperand(0), + Div->getOperand(1), + Builder); + Div->replaceAllUsesWith(Quotient); + Div->dropAllReferences(); + Div->eraseFromParent(); + + return true; +} diff --git a/lib/Transforms/Utils/LCSSA.cpp b/lib/Transforms/Utils/LCSSA.cpp index b654111..2d1b166 100644 --- a/lib/Transforms/Utils/LCSSA.cpp +++ b/lib/Transforms/Utils/LCSSA.cpp @@ -29,17 +29,17 @@ #define DEBUG_TYPE "lcssa" #include "llvm/Transforms/Scalar.h" -#include "llvm/Constants.h" -#include "llvm/Pass.h" -#include "llvm/Function.h" -#include "llvm/Instructions.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Transforms/Utils/SSAUpdater.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Pass.h" #include "llvm/Support/PredIteratorCache.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" using namespace llvm; STATISTIC(NumLCSSA, "Number of live out of a loop variables"); @@ -53,6 +53,8 @@ namespace { // Cached analysis information for the current function. DominatorTree *DT; + LoopInfo *LI; + ScalarEvolution *SE; std::vector<BasicBlock*> LoopBlocks; PredIteratorCache PredCache; Loop *L; @@ -117,6 +119,8 @@ bool LCSSA::runOnLoop(Loop *TheLoop, LPPassManager &LPM) { L = TheLoop; DT = &getAnalysis<DominatorTree>(); + LI = &getAnalysis<LoopInfo>(); + SE = getAnalysisIfAvailable<ScalarEvolution>(); // Get the set of exiting blocks. SmallVector<BasicBlock*, 8> ExitBlocks; @@ -156,6 +160,12 @@ bool LCSSA::runOnLoop(Loop *TheLoop, LPPassManager &LPM) { MadeChange |= ProcessInstruction(I, ExitBlocks); } } + + // If we modified the code, remove any caches about the loop from SCEV to + // avoid dangling entries. + // FIXME: This is a big hammer, can we clear the cache more selectively? + if (SE && MadeChange) + SE->forgetLoop(L); assert(L->isLCSSAForm(*DT)); PredCache.clear(); @@ -245,7 +255,7 @@ bool LCSSA::ProcessInstruction(Instruction *Inst, // Remember that this phi makes the value alive in this block. SSAUpdate.AddAvailableValue(ExitBB, PN); } - + // Rewrite all uses outside the loop in terms of the new PHIs we just // inserted. for (unsigned i = 0, e = UsesToRewrite.size(); i != e; ++i) { @@ -260,6 +270,9 @@ bool LCSSA::ProcessInstruction(Instruction *Inst, if (isa<PHINode>(UserBB->begin()) && isExitBlock(UserBB, ExitBlocks)) { + // Tell the VHs that the uses changed. This updates SCEV's caches. 
+ if (UsesToRewrite[i]->get()->hasValueHandle()) + ValueHandleBase::ValueIsRAUWd(*UsesToRewrite[i], UserBB->begin()); UsesToRewrite[i]->set(UserBB->begin()); continue; } diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index 0601433..a54ee08 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -13,32 +13,34 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/Local.h" -#include "llvm/Constants.h" -#include "llvm/DIBuilder.h" -#include "llvm/DebugInfo.h" -#include "llvm/DerivedTypes.h" -#include "llvm/GlobalAlias.h" -#include "llvm/GlobalVariable.h" -#include "llvm/IRBuilder.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/Intrinsics.h" -#include "llvm/Metadata.h" -#include "llvm/Operator.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/ProfileInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/DIBuilder.h" +#include "llvm/DebugInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Operator.h" #include "llvm/Support/CFG.h" #include "llvm/Support/Debug.h" #include "llvm/Support/GetElementPtrTypeIterator.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetData.h" using namespace llvm; //===----------------------------------------------------------------------===// @@ -122,6 +124,27 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, // Check to see if this branch is going to the same place as the default // dest. If so, eliminate it as an explicit compare. if (i.getCaseSuccessor() == DefaultDest) { + MDNode* MD = SI->getMetadata(LLVMContext::MD_prof); + // MD should have 2 + NumCases operands. + if (MD && MD->getNumOperands() == 2 + SI->getNumCases()) { + // Collect branch weights into a vector. + SmallVector<uint32_t, 8> Weights; + for (unsigned MD_i = 1, MD_e = MD->getNumOperands(); MD_i < MD_e; + ++MD_i) { + ConstantInt* CI = dyn_cast<ConstantInt>(MD->getOperand(MD_i)); + assert(CI); + Weights.push_back(CI->getValue().getZExtValue()); + } + // Merge weight of this case to the default weight. + unsigned idx = i.getCaseIndex(); + Weights[0] += Weights[idx+1]; + // Remove weight for this case. + std::swap(Weights[idx+1], Weights.back()); + Weights.pop_back(); + SI->setMetadata(LLVMContext::MD_prof, + MDBuilder(BB->getContext()). + createBranchWeights(Weights)); + } // Remove this entry. DefaultDest->removePredecessor(SI->getParent()); SI->removeCase(i); @@ -178,8 +201,20 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, "cond"); // Insert the new branch. 
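(Aside: both prof-metadata fixups in this Local.cpp hunk go through MDBuilder; in isolation, attaching branch weights to a conditional branch looks like the snippet below. The helper name is illustrative.)

#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
using namespace llvm;

static void setBranchWeights(BranchInst *Br, unsigned TrueWeight,
                             unsigned FalseWeight) {
  // branch_weights metadata is attached under the LLVMContext::MD_prof kind.
  MDBuilder MDB(Br->getContext());
  Br->setMetadata(LLVMContext::MD_prof,
                  MDB.createBranchWeights(TrueWeight, FalseWeight));
}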
- Builder.CreateCondBr(Cond, FirstCase.getCaseSuccessor(), - SI->getDefaultDest()); + BranchInst *NewBr = Builder.CreateCondBr(Cond, + FirstCase.getCaseSuccessor(), + SI->getDefaultDest()); + MDNode* MD = SI->getMetadata(LLVMContext::MD_prof); + if (MD && MD->getNumOperands() == 3) { + ConstantInt *SICase = dyn_cast<ConstantInt>(MD->getOperand(2)); + ConstantInt *SIDef = dyn_cast<ConstantInt>(MD->getOperand(1)); + assert(SICase && SIDef); + // The TrueWeight should be the weight for the single case of SI. + NewBr->setMetadata(LLVMContext::MD_prof, + MDBuilder(BB->getContext()). + createBranchWeights(SICase->getValue().getZExtValue(), + SIDef->getValue().getZExtValue())); + } // Delete the old switch. SI->eraseFromParent(); @@ -363,7 +398,7 @@ bool llvm::RecursivelyDeleteDeadPHINode(PHINode *PN, /// /// This returns true if it changed the code, note that it can delete /// instructions in other blocks as well in this block. -bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, const TargetData *TD, +bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, const DataLayout *TD, const TargetLibraryInfo *TLI) { bool MadeChange = false; @@ -411,7 +446,7 @@ bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, const TargetData *TD, /// .. and delete the predecessor corresponding to the '1', this will attempt to /// recursively fold the and to 0. void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred, - TargetData *TD) { + DataLayout *TD) { // This only adjusts blocks with PHI nodes. if (!isa<PHINode>(BB->begin())) return; @@ -570,7 +605,7 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB) { // possible to handle such cases, but difficult: it requires checking whether // BB dominates Succ, which is non-trivial to calculate in the case where // Succ has multiple predecessors. Also, it requires checking whether - // constructing the necessary self-referential PHI node doesn't intoduce any + // constructing the necessary self-referential PHI node doesn't introduce any // conflicts; this isn't too difficult, but the previous code for doing this // was incorrect. // @@ -726,7 +761,7 @@ bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) { /// their preferred alignment from the beginning. /// static unsigned enforceKnownAlignment(Value *V, unsigned Align, - unsigned PrefAlign, const TargetData *TD) { + unsigned PrefAlign, const DataLayout *TD) { V = V->stripPointerCasts(); if (AllocaInst *AI = dyn_cast<AllocaInst>(V)) { @@ -769,7 +804,7 @@ static unsigned enforceKnownAlignment(Value *V, unsigned Align, /// and it is more than the alignment of the ultimate object, see if we can /// increase the alignment of the ultimate object, making this check succeed. unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign, - const TargetData *TD) { + const DataLayout *TD) { assert(V->getType()->isPointerTy() && "getOrEnforceKnownAlignment expects a pointer!"); unsigned BitWidth = TD ? 
TD->getPointerSizeInBits() : 64; @@ -894,3 +929,78 @@ DbgDeclareInst *llvm::FindAllocaDbgDeclare(Value *V) { return 0; } + +bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, + DIBuilder &Builder) { + DbgDeclareInst *DDI = FindAllocaDbgDeclare(AI); + if (!DDI) + return false; + DIVariable DIVar(DDI->getVariable()); + if (!DIVar.Verify()) + return false; + + // Create a copy of the original DIDescriptor for user variable, appending + // "deref" operation to a list of address elements, as new llvm.dbg.declare + // will take a value storing address of the memory for variable, not + // alloca itself. + Type *Int64Ty = Type::getInt64Ty(AI->getContext()); + SmallVector<Value*, 4> NewDIVarAddress; + if (DIVar.hasComplexAddress()) { + for (unsigned i = 0, n = DIVar.getNumAddrElements(); i < n; ++i) { + NewDIVarAddress.push_back( + ConstantInt::get(Int64Ty, DIVar.getAddrElement(i))); + } + } + NewDIVarAddress.push_back(ConstantInt::get(Int64Ty, DIBuilder::OpDeref)); + DIVariable NewDIVar = Builder.createComplexVariable( + DIVar.getTag(), DIVar.getContext(), DIVar.getName(), + DIVar.getFile(), DIVar.getLineNumber(), DIVar.getType(), + NewDIVarAddress, DIVar.getArgNumber()); + + // Insert llvm.dbg.declare in the same basic block as the original alloca, + // and remove old llvm.dbg.declare. + BasicBlock *BB = AI->getParent(); + Builder.insertDeclare(NewAllocaAddress, NewDIVar, BB); + DDI->eraseFromParent(); + return true; +} + +bool llvm::removeUnreachableBlocks(Function &F) { + SmallPtrSet<BasicBlock*, 16> Reachable; + SmallVector<BasicBlock*, 128> Worklist; + Worklist.push_back(&F.getEntryBlock()); + Reachable.insert(&F.getEntryBlock()); + do { + BasicBlock *BB = Worklist.pop_back_val(); + for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) + if (Reachable.insert(*SI)) + Worklist.push_back(*SI); + } while (!Worklist.empty()); + + if (Reachable.size() == F.size()) + return false; + + assert(Reachable.size() < F.size()); + for (Function::iterator I = llvm::next(F.begin()), E = F.end(); I != E; ++I) { + if (Reachable.count(I)) + continue; + + // Remove the block as predecessor of all its reachable successors. + // Unreachable successors don't matter as they'll soon be removed, too. + for (succ_iterator SI = succ_begin(I), SE = succ_end(I); SI != SE; ++SI) + if (Reachable.count(*SI)) + (*SI)->removePredecessor(I); + + // Zap all instructions in this basic block. 
+ while (!I->empty()) { + Instruction &Inst = I->back(); + if (!Inst.use_empty()) + Inst.replaceAllUsesWith(UndefValue::get(Inst.getType())); + I->getInstList().pop_back(); + } + + --I; + llvm::next(I)->eraseFromParent(); + } + return true; +} diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp index 0bc185d..37819cc 100644 --- a/lib/Transforms/Utils/LoopSimplify.cpp +++ b/lib/Transforms/Utils/LoopSimplify.cpp @@ -39,25 +39,26 @@ #define DEBUG_TYPE "loop-simplify" #include "llvm/Transforms/Scalar.h" -#include "llvm/Constants.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/Function.h" -#include "llvm/LLVMContext.h" -#include "llvm/Type.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SetOperations.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/DependenceAnalysis.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Type.h" #include "llvm/Support/CFG.h" #include "llvm/Support/Debug.h" -#include "llvm/ADT/SetOperations.h" -#include "llvm/ADT/SetVector.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; STATISTIC(NumInserted, "Number of pre-header or exit blocks inserted"); @@ -89,6 +90,7 @@ namespace { AU.addPreserved<AliasAnalysis>(); AU.addPreserved<ScalarEvolution>(); + AU.addPreserved<DependenceAnalysis>(); AU.addPreservedID(BreakCriticalEdgesID); // No critical edges added. } @@ -194,6 +196,11 @@ ReprocessLoop: BI->setCondition(ConstantInt::get(Cond->getType(), !L->contains(BI->getSuccessor(0)))); + + // This may make the loop analyzable, force SCEV recomputation. 
+ if (SE) + SE->forgetLoop(L); + Changed = true; } } diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp index 2023750..cb581b3 100644 --- a/lib/Transforms/Utils/LoopUnroll.cpp +++ b/lib/Transforms/Utils/LoopUnroll.cpp @@ -18,12 +18,12 @@ #define DEBUG_TYPE "loop-unroll" #include "llvm/Transforms/Utils/UnrollLoop.h" -#include "llvm/BasicBlock.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp index 67e17f4..d801d5f 100644 --- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -23,12 +23,12 @@ #define DEBUG_TYPE "loop-unroll" #include "llvm/Transforms/Utils/UnrollLoop.h" -#include "llvm/BasicBlock.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" diff --git a/lib/Transforms/Utils/LowerExpectIntrinsic.cpp b/lib/Transforms/Utils/LowerExpectIntrinsic.cpp index 02bdcda..4aee8ff 100644 --- a/lib/Transforms/Utils/LowerExpectIntrinsic.cpp +++ b/lib/Transforms/Utils/LowerExpectIntrinsic.cpp @@ -12,17 +12,17 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "lower-expect-intrinsic" -#include "llvm/BasicBlock.h" -#include "llvm/Constants.h" -#include "llvm/Function.h" -#include "llvm/Instructions.h" -#include "llvm/Intrinsics.h" -#include "llvm/LLVMContext.h" -#include "llvm/MDBuilder.h" -#include "llvm/Metadata.h" -#include "llvm/Pass.h" -#include "llvm/ADT/Statistic.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Metadata.h" +#include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include <vector> diff --git a/lib/Transforms/Utils/LowerInvoke.cpp b/lib/Transforms/Utils/LowerInvoke.cpp index 9305554..9ec84d7 100644 --- a/lib/Transforms/Utils/LowerInvoke.cpp +++ b/lib/Transforms/Utils/LowerInvoke.cpp @@ -36,19 +36,19 @@ #define DEBUG_TYPE "lowerinvoke" #include "llvm/Transforms/Scalar.h" -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Instructions.h" -#include "llvm/Intrinsics.h" -#include "llvm/LLVMContext.h" -#include "llvm/Module.h" -#include "llvm/Pass.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetLowering.h" +#include 
"llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include <csetjmp> #include <set> using namespace llvm; diff --git a/lib/Transforms/Utils/LowerSwitch.cpp b/lib/Transforms/Utils/LowerSwitch.cpp index 1547439..955b853 100644 --- a/lib/Transforms/Utils/LowerSwitch.cpp +++ b/lib/Transforms/Utils/LowerSwitch.cpp @@ -14,16 +14,16 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" -#include "llvm/Constants.h" -#include "llvm/Function.h" -#include "llvm/Instructions.h" -#include "llvm/LLVMContext.h" -#include "llvm/Pass.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Pass.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" #include <algorithm> using namespace llvm; diff --git a/lib/Transforms/Utils/Mem2Reg.cpp b/lib/Transforms/Utils/Mem2Reg.cpp index f4ca81a..61b3965 100644 --- a/lib/Transforms/Utils/Mem2Reg.cpp +++ b/lib/Transforms/Utils/Mem2Reg.cpp @@ -14,12 +14,12 @@ #define DEBUG_TYPE "mem2reg" #include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" -#include "llvm/Analysis/Dominators.h" -#include "llvm/Instructions.h" -#include "llvm/Function.h" -#include "llvm/ADT/Statistic.h" using namespace llvm; STATISTIC(NumPromoted, "Number of alloca's promoted"); diff --git a/lib/Transforms/Utils/MetaRenamer.cpp b/lib/Transforms/Utils/MetaRenamer.cpp new file mode 100644 index 0000000..d519fb7 --- /dev/null +++ b/lib/Transforms/Utils/MetaRenamer.cpp @@ -0,0 +1,131 @@ +//===- MetaRenamer.cpp - Rename everything with metasyntatic names --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass renames everything with metasyntatic names. The intent is to use +// this pass after bugpoint reduction to conceal the nature of the original +// program. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/IPO.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/TypeFinder.h" +#include "llvm/Pass.h" +using namespace llvm; + +namespace { + + // This PRNG is from the ISO C spec. It is intentionally simple and + // unsuitable for cryptographic use. We're just looking for enough + // variety to surprise and delight users. 
+ struct PRNG { + unsigned long next; + + void srand(unsigned int seed) { + next = seed; + } + + int rand() { + next = next * 1103515245 + 12345; + return (unsigned int)(next / 65536) % 32768; + } + }; + + struct MetaRenamer : public ModulePass { + static char ID; // Pass identification, replacement for typeid + MetaRenamer() : ModulePass(ID) { + initializeMetaRenamerPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + } + + bool runOnModule(Module &M) { + static const char *metaNames[] = { + // See http://en.wikipedia.org/wiki/Metasyntactic_variable + "foo", "bar", "baz", "quux", "barney", "snork", "zot", "blam", "hoge", + "wibble", "wobble", "widget", "wombat", "ham", "eggs", "pluto", "spam" + }; + + // Seed our PRNG with simple additive sum of ModuleID. We're looking to + // simply avoid always having the same function names, and we need to + // remain deterministic. + unsigned int randSeed = 0; + for (std::string::const_iterator I = M.getModuleIdentifier().begin(), + E = M.getModuleIdentifier().end(); I != E; ++I) + randSeed += *I; + + PRNG prng; + prng.srand(randSeed); + + // Rename all aliases + for (Module::alias_iterator AI = M.alias_begin(), AE = M.alias_end(); + AI != AE; ++AI) + AI->setName("alias"); + + // Rename all global variables + for (Module::global_iterator GI = M.global_begin(), GE = M.global_end(); + GI != GE; ++GI) + GI->setName("global"); + + // Rename all struct types + TypeFinder StructTypes; + StructTypes.run(M, true); + for (unsigned i = 0, e = StructTypes.size(); i != e; ++i) { + StructType *STy = StructTypes[i]; + if (STy->isLiteral() || STy->getName().empty()) continue; + + SmallString<128> NameStorage; + STy->setName((Twine("struct.") + metaNames[prng.rand() % + array_lengthof(metaNames)]).toStringRef(NameStorage)); + } + + // Rename all functions + for (Module::iterator FI = M.begin(), FE = M.end(); + FI != FE; ++FI) { + FI->setName(metaNames[prng.rand() % array_lengthof(metaNames)]); + runOnFunction(*FI); + } + return true; + } + + bool runOnFunction(Function &F) { + for (Function::arg_iterator AI = F.arg_begin(), AE = F.arg_end(); + AI != AE; ++AI) + if (!AI->getType()->isVoidTy()) + AI->setName("arg"); + + for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { + BB->setName("bb"); + + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) + if (!I->getType()->isVoidTy()) + I->setName("tmp"); + } + return true; + } + }; +} + +char MetaRenamer::ID = 0; +INITIALIZE_PASS(MetaRenamer, "metarenamer", + "Assign new names to everything", false, false) +//===----------------------------------------------------------------------===// +// +// MetaRenamer - Rename everything with metasyntactic names. 
+// +ModulePass *llvm::createMetaRenamerPass() { + return new MetaRenamer(); +} diff --git a/lib/Transforms/Utils/ModuleUtils.cpp b/lib/Transforms/Utils/ModuleUtils.cpp index dbcf3b2..d090b48 100644 --- a/lib/Transforms/Utils/ModuleUtils.cpp +++ b/lib/Transforms/Utils/ModuleUtils.cpp @@ -12,10 +12,10 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/ModuleUtils.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Function.h" -#include "llvm/IRBuilder.h" -#include "llvm/Module.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" using namespace llvm; diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index dd5e20e..de335ec 100644 --- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -27,26 +27,26 @@ #define DEBUG_TYPE "mem2reg" #include "llvm/Transforms/Utils/PromoteMemToReg.h" -#include "llvm/Constants.h" -#include "llvm/DebugInfo.h" -#include "llvm/DerivedTypes.h" -#include "llvm/DIBuilder.h" -#include "llvm/Function.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/Metadata.h" -#include "llvm/Analysis/AliasSetTracker.h" -#include "llvm/Analysis/Dominators.h" -#include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Hashing.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/DIBuilder.h" +#include "llvm/DebugInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Metadata.h" #include "llvm/Support/CFG.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> #include <queue> using namespace llvm; @@ -212,9 +212,13 @@ namespace { /// DenseMap<AllocaInst*, unsigned> AllocaLookup; - /// NewPhiNodes - The PhiNodes we're adding. + /// NewPhiNodes - The PhiNodes we're adding. That map is used to simplify + /// some Phi nodes as we iterate over it, so it should have deterministic + /// iterators. We could use a MapVector, but since we already maintain a + /// map from BasicBlock* to a stable numbering (BBNumbers), the DenseMap is + /// more efficient (also supports removal). /// - DenseMap<std::pair<BasicBlock*, unsigned>, PHINode*> NewPhiNodes; + DenseMap<std::pair<unsigned, unsigned>, PHINode*> NewPhiNodes; /// PhiToAllocaMap - For each PHI node, keep track of which entry in Allocas /// it corresponds to. @@ -588,7 +592,11 @@ void PromoteMem2Reg::run() { while (EliminatedAPHI) { EliminatedAPHI = false; - for (DenseMap<std::pair<BasicBlock*, unsigned>, PHINode*>::iterator I = + // Iterating over NewPhiNodes is deterministic, so it is safe to try to + // simplify and RAUW them as we go. If it was not, we could add uses to + // the values we replace with in a non deterministic order, thus creating + // non deterministic def->use chains. 
+ for (DenseMap<std::pair<unsigned, unsigned>, PHINode*>::iterator I = NewPhiNodes.begin(), E = NewPhiNodes.end(); I != E;) { PHINode *PN = I->second; @@ -612,7 +620,7 @@ void PromoteMem2Reg::run() { // have incoming values for all predecessors. Loop over all PHI nodes we have // created, inserting undef values if they are missing any incoming values. // - for (DenseMap<std::pair<BasicBlock*, unsigned>, PHINode*>::iterator I = + for (DenseMap<std::pair<unsigned, unsigned>, PHINode*>::iterator I = NewPhiNodes.begin(), E = NewPhiNodes.end(); I != E; ++I) { // We want to do this once per basic block. As such, only process a block // when we find the PHI that is the first entry in the block. @@ -992,7 +1000,7 @@ void PromoteMem2Reg::PromoteSingleBlockAlloca(AllocaInst *AI, AllocaInfo &Info, bool PromoteMem2Reg::QueuePhiNode(BasicBlock *BB, unsigned AllocaNo, unsigned &Version) { // Look up the basic-block in question. - PHINode *&PN = NewPhiNodes[std::make_pair(BB, AllocaNo)]; + PHINode *&PN = NewPhiNodes[std::make_pair(BBNumbers[BB], AllocaNo)]; // If the BB already has a phi node added for the i'th alloca then we're done! if (PN) return false; diff --git a/lib/Transforms/Utils/SSAUpdater.cpp b/lib/Transforms/Utils/SSAUpdater.cpp index 72d4199..9d90fbe 100644 --- a/lib/Transforms/Utils/SSAUpdater.cpp +++ b/lib/Transforms/Utils/SSAUpdater.cpp @@ -12,12 +12,13 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "ssaupdater" -#include "llvm/Constants.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/TinyPtrVector.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/AlignOf.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/CFG.h" @@ -25,7 +26,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Utils/SSAUpdater.h" #include "llvm/Transforms/Utils/SSAUpdaterImpl.h" using namespace llvm; diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index 32d7fa1..f10c35f 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -13,18 +13,6 @@ #define DEBUG_TYPE "simplifycfg" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" -#include "llvm/GlobalVariable.h" -#include "llvm/IRBuilder.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/LLVMContext.h" -#include "llvm/MDBuilder.h" -#include "llvm/Metadata.h" -#include "llvm/Module.h" -#include "llvm/Operator.h" -#include "llvm/Type.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" @@ -32,18 +20,31 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" 
+#include "llvm/IR/Operator.h" +#include "llvm/IR/Type.h" #include "llvm/Support/CFG.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ConstantRange.h" #include "llvm/Support/Debug.h" #include "llvm/Support/NoFolder.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetData.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include <algorithm> -#include <set> #include <map> +#include <set> using namespace llvm; static cl::opt<unsigned> @@ -54,8 +55,14 @@ static cl::opt<bool> DupRet("simplifycfg-dup-ret", cl::Hidden, cl::init(false), cl::desc("Duplicate return instructions into unconditional branches")); -STATISTIC(NumSpeculations, "Number of speculative executed instructions"); +static cl::opt<bool> +SinkCommon("simplifycfg-sink-common", cl::Hidden, cl::init(true), + cl::desc("Sink common instructions down to the end block")); + +STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps"); STATISTIC(NumLookupTables, "Number of switch instructions turned into lookup tables"); +STATISTIC(NumSinkCommons, "Number of common instructions sunk down to the end block"); +STATISTIC(NumSpeculations, "Number of speculative executed instructions"); namespace { /// ValueEqualityComparisonCase - Represents a case of a switch. @@ -70,10 +77,13 @@ namespace { // Comparing pointers is ok as we only rely on the order for uniquing. return Value < RHS.Value; } + + bool operator==(BasicBlock *RHSDest) const { return Dest == RHSDest; } }; class SimplifyCFGOpt { - const TargetData *const TD; + const TargetTransformInfo &TTI; + const DataLayout *const TD; Value *isValueEqualityComparison(TerminatorInst *TI); BasicBlock *GetValueEqualityComparisonCases(TerminatorInst *TI, @@ -93,7 +103,8 @@ class SimplifyCFGOpt { bool SimplifyCondBranch(BranchInst *BI, IRBuilder <>&Builder); public: - explicit SimplifyCFGOpt(const TargetData *td) : TD(td) {} + SimplifyCFGOpt(const TargetTransformInfo &TTI, const DataLayout *TD) + : TTI(TTI), TD(TD) {} bool run(BasicBlock *BB); }; } @@ -376,7 +387,7 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB, /// GetConstantInt - Extract ConstantInt from value, looking through IntToPtr /// and PointerNullValue. Return NULL if value is not a constant int. -static ConstantInt *GetConstantInt(Value *V, const TargetData *TD) { +static ConstantInt *GetConstantInt(Value *V, const DataLayout *TD) { // Normal constant int. ConstantInt *CI = dyn_cast<ConstantInt>(V); if (CI || !TD || !isa<Constant>(V) || !V->getType()->isPointerTy()) @@ -384,7 +395,7 @@ static ConstantInt *GetConstantInt(Value *V, const TargetData *TD) { // This is some kind of pointer constant. Turn it into a pointer-sized // ConstantInt if possible. - IntegerType *PtrTy = TD->getIntPtrType(V->getContext()); + IntegerType *PtrTy = cast<IntegerType>(TD->getIntPtrType(V->getType())); // Null pointer means 0, see SelectionDAGBuilder::getValue(const Value*). if (isa<ConstantPointerNull>(V)) @@ -410,7 +421,7 @@ static ConstantInt *GetConstantInt(Value *V, const TargetData *TD) { /// Values vector. static Value * GatherConstantCompares(Value *V, std::vector<ConstantInt*> &Vals, Value *&Extra, - const TargetData *TD, bool isEQ, unsigned &UsedICmps) { + const DataLayout *TD, bool isEQ, unsigned &UsedICmps) { Instruction *I = dyn_cast<Instruction>(V); if (I == 0) return 0; @@ -558,11 +569,7 @@ GetValueEqualityComparisonCases(TerminatorInst *TI, /// in the list that match the specified block. 
static void EliminateBlockCases(BasicBlock *BB, std::vector<ValueEqualityComparisonCase> &Cases) { - for (unsigned i = 0, e = Cases.size(); i != e; ++i) - if (Cases[i].Dest == BB) { - Cases.erase(Cases.begin()+i); - --i; --e; - } + Cases.erase(std::remove(Cases.begin(), Cases.end(), BB), Cases.end()); } /// ValuesOverlap - Return true if there are any keys in C1 that exist in C2 as @@ -667,13 +674,32 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator() << "Through successor TI: " << *TI); + // Collect branch weights into a vector. + SmallVector<uint32_t, 8> Weights; + MDNode* MD = SI->getMetadata(LLVMContext::MD_prof); + bool HasWeight = MD && (MD->getNumOperands() == 2 + SI->getNumCases()); + if (HasWeight) + for (unsigned MD_i = 1, MD_e = MD->getNumOperands(); MD_i < MD_e; + ++MD_i) { + ConstantInt* CI = dyn_cast<ConstantInt>(MD->getOperand(MD_i)); + assert(CI); + Weights.push_back(CI->getValue().getZExtValue()); + } for (SwitchInst::CaseIt i = SI->case_end(), e = SI->case_begin(); i != e;) { --i; if (DeadCases.count(i.getCaseValue())) { + if (HasWeight) { + std::swap(Weights[i.getCaseIndex()+1], Weights.back()); + Weights.pop_back(); + } i.getCaseSuccessor()->removePredecessor(TI->getParent()); SI->removeCase(i); } } + if (HasWeight && Weights.size() >= 2) + SI->setMetadata(LLVMContext::MD_prof, + MDBuilder(SI->getParent()->getContext()). + createBranchWeights(Weights)); DEBUG(dbgs() << "Leaving: " << *TI << "\n"); return true; @@ -752,38 +778,27 @@ static inline bool HasBranchWeights(const Instruction* I) { return false; } -/// Tries to get a branch weight for the given instruction, returns NULL if it -/// can't. Pos starts at 0. -static ConstantInt* GetWeight(Instruction* I, int Pos) { - MDNode* ProfMD = I->getMetadata(LLVMContext::MD_prof); - if (ProfMD && ProfMD->getOperand(0)) { - if (MDString* MDS = dyn_cast<MDString>(ProfMD->getOperand(0))) { - if (MDS->getString().equals("branch_weights")) { - assert(ProfMD->getNumOperands() >= 3); - return dyn_cast<ConstantInt>(ProfMD->getOperand(1 + Pos)); - } - } - } - - return 0; -} - -/// Scale the given weights based on the successor TI's metadata. Scaling is -/// done by multiplying every weight by the sum of the successor's weights. -static void ScaleWeights(Instruction* STI, MutableArrayRef<uint64_t> Weights) { - // Sum the successor's weights - assert(HasBranchWeights(STI)); - unsigned Scale = 0; - MDNode* ProfMD = STI->getMetadata(LLVMContext::MD_prof); - for (unsigned i = 1; i < ProfMD->getNumOperands(); ++i) { - ConstantInt* CI = dyn_cast<ConstantInt>(ProfMD->getOperand(i)); +/// Get Weights of a given TerminatorInst, the default weight is at the front +/// of the vector. If TI is a conditional eq, we need to swap the branch-weight +/// metadata. +static void GetBranchWeights(TerminatorInst *TI, + SmallVectorImpl<uint64_t> &Weights) { + MDNode* MD = TI->getMetadata(LLVMContext::MD_prof); + assert(MD); + for (unsigned i = 1, e = MD->getNumOperands(); i < e; ++i) { + ConstantInt* CI = dyn_cast<ConstantInt>(MD->getOperand(i)); assert(CI); - Scale += CI->getValue().getZExtValue(); + Weights.push_back(CI->getValue().getZExtValue()); } - // Skip default, as it's replaced during the folding - for (unsigned i = 1; i < Weights.size(); ++i) { - Weights[i] *= Scale; + // If TI is a conditional eq, the default case is the false case, + // and the corresponding branch-weight data is at index 2. We swap the + // default weight to be the first entry. 
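The EliminateBlockCases rewrite above leans on the new ValueEqualityComparisonCase::operator==(BasicBlock*) so that std::remove can compact the case vector in a single pass, replacing the old index-juggling erase loop. The same erase-remove idiom on plain values (standalone C++, illustrative only, not part of the patch):

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  // The ints stand in for cases; 7 stands in for "case whose destination is BB".
  std::vector<int> cases = {1, 7, 7, 3, 7, 5};
  cases.erase(std::remove(cases.begin(), cases.end(), 7), cases.end());
  assert((cases == std::vector<int>{1, 3, 5}));  // relative order is preserved
  return 0;
}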
+ if (BranchInst* BI = dyn_cast<BranchInst>(TI)) { + assert(Weights.size() == 2); + ICmpInst *ICI = cast<ICmpInst>(BI->getCondition()); + if (ICI->getPredicate() == ICmpInst::ICMP_EQ) + std::swap(Weights.front(), Weights.back()); } } @@ -838,52 +853,28 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, // Update the branch weight metadata along the way SmallVector<uint64_t, 8> Weights; - uint64_t PredDefaultWeight = 0; bool PredHasWeights = HasBranchWeights(PTI); bool SuccHasWeights = HasBranchWeights(TI); if (PredHasWeights) { - MDNode* MD = PTI->getMetadata(LLVMContext::MD_prof); - assert(MD); - for (unsigned i = 1, e = MD->getNumOperands(); i < e; ++i) { - ConstantInt* CI = dyn_cast<ConstantInt>(MD->getOperand(i)); - assert(CI); - Weights.push_back(CI->getValue().getZExtValue()); - } - - // If the predecessor is a conditional eq, then swap the default weight - // to be the first entry. - if (BranchInst* BI = dyn_cast<BranchInst>(PTI)) { - assert(Weights.size() == 2); - ICmpInst *ICI = cast<ICmpInst>(BI->getCondition()); - - if (ICI->getPredicate() == ICmpInst::ICMP_EQ) { - std::swap(Weights.front(), Weights.back()); - } - } - - PredDefaultWeight = Weights.front(); - } else if (SuccHasWeights) { + GetBranchWeights(PTI, Weights); + // branch-weight metadata is inconsistent here. + if (Weights.size() != 1 + PredCases.size()) + PredHasWeights = SuccHasWeights = false; + } else if (SuccHasWeights) // If there are no predecessor weights but there are successor weights, // populate Weights with 1, which will later be scaled to the sum of // successor's weights Weights.assign(1 + PredCases.size(), 1); - PredDefaultWeight = 1; - } - uint64_t SuccDefaultWeight = 0; + SmallVector<uint64_t, 8> SuccWeights; if (SuccHasWeights) { - int Index = 0; - if (BranchInst* BI = dyn_cast<BranchInst>(TI)) { - ICmpInst* ICI = dyn_cast<ICmpInst>(BI->getCondition()); - assert(ICI); - - if (ICI->getPredicate() == ICmpInst::ICMP_EQ) - Index = 1; - } - - SuccDefaultWeight = GetWeight(TI, Index)->getValue().getZExtValue(); - } + GetBranchWeights(TI, SuccWeights); + // branch-weight metadata is inconsistent here. + if (SuccWeights.size() != 1 + BBCases.size()) + PredHasWeights = SuccHasWeights = false; + } else if (PredHasWeights) + SuccWeights.assign(1 + BBCases.size(), 1); if (PredDefault == BB) { // If this is the default destination from PTI, only the edges in TI @@ -896,7 +887,9 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, // The default destination is BB, we don't need explicit targets. std::swap(PredCases[i], PredCases.back()); - if (PredHasWeights) { + if (PredHasWeights || SuccHasWeights) { + // Increase weight for the default case. 
+ Weights[0] += Weights[i+1]; std::swap(Weights[i+1], Weights.back()); Weights.pop_back(); } @@ -912,40 +905,46 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, NewSuccessors.push_back(BBDefault); } - if (SuccHasWeights) { - ScaleWeights(TI, Weights); - Weights.front() *= SuccDefaultWeight; - } else if (PredHasWeights) { - Weights.front() /= (1 + BBCases.size()); - } - + unsigned CasesFromPred = Weights.size(); + uint64_t ValidTotalSuccWeight = 0; for (unsigned i = 0, e = BBCases.size(); i != e; ++i) if (!PTIHandled.count(BBCases[i].Value) && BBCases[i].Dest != BBDefault) { PredCases.push_back(BBCases[i]); NewSuccessors.push_back(BBCases[i].Dest); - if (SuccHasWeights) { - Weights.push_back(PredDefaultWeight * - GetWeight(TI, i)->getValue().getZExtValue()); - } else if (PredHasWeights) { - // Split the old default's weight amongst the children - Weights.push_back(PredDefaultWeight / (1 + BBCases.size())); + if (SuccHasWeights || PredHasWeights) { + // The default weight is at index 0, so weight for the ith case + // should be at index i+1. Scale the cases from successor by + // PredDefaultWeight (Weights[0]). + Weights.push_back(Weights[0] * SuccWeights[i+1]); + ValidTotalSuccWeight += SuccWeights[i+1]; } } + if (SuccHasWeights || PredHasWeights) { + ValidTotalSuccWeight += SuccWeights[0]; + // Scale the cases from predecessor by ValidTotalSuccWeight. + for (unsigned i = 1; i < CasesFromPred; ++i) + Weights[i] *= ValidTotalSuccWeight; + // Scale the default weight by SuccDefaultWeight (SuccWeights[0]). + Weights[0] *= SuccWeights[0]; + } } else { - // FIXME: preserve branch weight metadata, similarly to the 'then' - // above. For now, drop it. - PredHasWeights = false; - SuccHasWeights = false; - // If this is not the default destination from PSI, only the edges // in SI that occur in PSI with a destination of BB will be // activated. std::set<ConstantInt*, ConstantIntOrdering> PTIHandled; + std::map<ConstantInt*, uint64_t> WeightsForHandled; for (unsigned i = 0, e = PredCases.size(); i != e; ++i) if (PredCases[i].Dest == BB) { PTIHandled.insert(PredCases[i].Value); + + if (PredHasWeights || SuccHasWeights) { + WeightsForHandled[PredCases[i].Value] = Weights[i+1]; + std::swap(Weights[i+1], Weights.back()); + Weights.pop_back(); + } + std::swap(PredCases[i], PredCases.back()); PredCases.pop_back(); --i; --e; @@ -956,6 +955,8 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, for (unsigned i = 0, e = BBCases.size(); i != e; ++i) if (PTIHandled.count(BBCases[i].Value)) { // If this is one we are capable of getting... + if (PredHasWeights || SuccHasWeights) + Weights.push_back(WeightsForHandled[BBCases[i].Value]); PredCases.push_back(BBCases[i]); NewSuccessors.push_back(BBCases[i].Dest); PTIHandled.erase(BBCases[i].Value);// This constant is taken care of @@ -966,6 +967,8 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, for (std::set<ConstantInt*, ConstantIntOrdering>::iterator I = PTIHandled.begin(), E = PTIHandled.end(); I != E; ++I) { + if (PredHasWeights || SuccHasWeights) + Weights.push_back(WeightsForHandled[*I]); PredCases.push_back(ValueEqualityComparisonCase(*I, BBDefault)); NewSuccessors.push_back(BBDefault); } @@ -980,7 +983,7 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, Builder.SetInsertPoint(PTI); // Convert pointer to int before we switch. 
if (CV->getType()->isPointerTy()) { - assert(TD && "Cannot switch on pointer without TargetData"); + assert(TD && "Cannot switch on pointer without DataLayout"); CV = Builder.CreatePtrToInt(CV, TD->getIntPtrType(CV->getContext()), "magicptr"); } @@ -1160,6 +1163,175 @@ HoistTerminator: return true; } +/// SinkThenElseCodeToEnd - Given an unconditional branch that goes to BBEnd, +/// check whether BBEnd has only two predecessors and the other predecessor +/// ends with an unconditional branch. If it is true, sink any common code +/// in the two predecessors to BBEnd. +static bool SinkThenElseCodeToEnd(BranchInst *BI1) { + assert(BI1->isUnconditional()); + BasicBlock *BB1 = BI1->getParent(); + BasicBlock *BBEnd = BI1->getSuccessor(0); + + // Check that BBEnd has two predecessors and the other predecessor ends with + // an unconditional branch. + pred_iterator PI = pred_begin(BBEnd), PE = pred_end(BBEnd); + BasicBlock *Pred0 = *PI++; + if (PI == PE) // Only one predecessor. + return false; + BasicBlock *Pred1 = *PI++; + if (PI != PE) // More than two predecessors. + return false; + BasicBlock *BB2 = (Pred0 == BB1) ? Pred1 : Pred0; + BranchInst *BI2 = dyn_cast<BranchInst>(BB2->getTerminator()); + if (!BI2 || !BI2->isUnconditional()) + return false; + + // Gather the PHI nodes in BBEnd. + std::map<Value*, std::pair<Value*, PHINode*> > MapValueFromBB1ToBB2; + Instruction *FirstNonPhiInBBEnd = 0; + for (BasicBlock::iterator I = BBEnd->begin(), E = BBEnd->end(); + I != E; ++I) { + if (PHINode *PN = dyn_cast<PHINode>(I)) { + Value *BB1V = PN->getIncomingValueForBlock(BB1); + Value *BB2V = PN->getIncomingValueForBlock(BB2); + MapValueFromBB1ToBB2[BB1V] = std::make_pair(BB2V, PN); + } else { + FirstNonPhiInBBEnd = &*I; + break; + } + } + if (!FirstNonPhiInBBEnd) + return false; + + + // This does very trivial matching, with limited scanning, to find identical + // instructions in the two blocks. We scan backward for obviously identical + // instructions in an identical order. + BasicBlock::InstListType::reverse_iterator RI1 = BB1->getInstList().rbegin(), + RE1 = BB1->getInstList().rend(), RI2 = BB2->getInstList().rbegin(), + RE2 = BB2->getInstList().rend(); + // Skip debug info. + while (RI1 != RE1 && isa<DbgInfoIntrinsic>(&*RI1)) ++RI1; + if (RI1 == RE1) + return false; + while (RI2 != RE2 && isa<DbgInfoIntrinsic>(&*RI2)) ++RI2; + if (RI2 == RE2) + return false; + // Skip the unconditional branches. + ++RI1; + ++RI2; + + bool Changed = false; + while (RI1 != RE1 && RI2 != RE2) { + // Skip debug info. + while (RI1 != RE1 && isa<DbgInfoIntrinsic>(&*RI1)) ++RI1; + if (RI1 == RE1) + return Changed; + while (RI2 != RE2 && isa<DbgInfoIntrinsic>(&*RI2)) ++RI2; + if (RI2 == RE2) + return Changed; + + Instruction *I1 = &*RI1, *I2 = &*RI2; + // I1 and I2 should have a single use in the same PHI node, and they + // perform the same operation. + // Cannot move control-flow-involving, volatile loads, vaarg, etc. + if (isa<PHINode>(I1) || isa<PHINode>(I2) || + isa<TerminatorInst>(I1) || isa<TerminatorInst>(I2) || + isa<LandingPadInst>(I1) || isa<LandingPadInst>(I2) || + isa<AllocaInst>(I1) || isa<AllocaInst>(I2) || + I1->mayHaveSideEffects() || I2->mayHaveSideEffects() || + I1->mayReadOrWriteMemory() || I2->mayReadOrWriteMemory() || + !I1->hasOneUse() || !I2->hasOneUse() || + MapValueFromBB1ToBB2.find(I1) == MapValueFromBB1ToBB2.end() || + MapValueFromBB1ToBB2[I1].first != I2) + return Changed; + + // Check whether we should swap the operands of ICmpInst. 
+ ICmpInst *ICmp1 = dyn_cast<ICmpInst>(I1), *ICmp2 = dyn_cast<ICmpInst>(I2); + bool SwapOpnds = false; + if (ICmp1 && ICmp2 && + ICmp1->getOperand(0) != ICmp2->getOperand(0) && + ICmp1->getOperand(1) != ICmp2->getOperand(1) && + (ICmp1->getOperand(0) == ICmp2->getOperand(1) || + ICmp1->getOperand(1) == ICmp2->getOperand(0))) { + ICmp2->swapOperands(); + SwapOpnds = true; + } + if (!I1->isSameOperationAs(I2)) { + if (SwapOpnds) + ICmp2->swapOperands(); + return Changed; + } + + // The operands should be either the same or they need to be generated + // with a PHI node after sinking. We only handle the case where there is + // a single pair of different operands. + Value *DifferentOp1 = 0, *DifferentOp2 = 0; + unsigned Op1Idx = 0; + for (unsigned I = 0, E = I1->getNumOperands(); I != E; ++I) { + if (I1->getOperand(I) == I2->getOperand(I)) + continue; + // Early exit if we have more-than one pair of different operands or + // the different operand is already in MapValueFromBB1ToBB2. + // Early exit if we need a PHI node to replace a constant. + if (DifferentOp1 || + MapValueFromBB1ToBB2.find(I1->getOperand(I)) != + MapValueFromBB1ToBB2.end() || + isa<Constant>(I1->getOperand(I)) || + isa<Constant>(I2->getOperand(I))) { + // If we can't sink the instructions, undo the swapping. + if (SwapOpnds) + ICmp2->swapOperands(); + return Changed; + } + DifferentOp1 = I1->getOperand(I); + Op1Idx = I; + DifferentOp2 = I2->getOperand(I); + } + + // We insert the pair of different operands to MapValueFromBB1ToBB2 and + // remove (I1, I2) from MapValueFromBB1ToBB2. + if (DifferentOp1) { + PHINode *NewPN = PHINode::Create(DifferentOp1->getType(), 2, + DifferentOp1->getName() + ".sink", + BBEnd->begin()); + MapValueFromBB1ToBB2[DifferentOp1] = std::make_pair(DifferentOp2, NewPN); + // I1 should use NewPN instead of DifferentOp1. + I1->setOperand(Op1Idx, NewPN); + NewPN->addIncoming(DifferentOp1, BB1); + NewPN->addIncoming(DifferentOp2, BB2); + DEBUG(dbgs() << "Create PHI node " << *NewPN << "\n";); + } + PHINode *OldPN = MapValueFromBB1ToBB2[I1].second; + MapValueFromBB1ToBB2.erase(I1); + + DEBUG(dbgs() << "SINK common instructions " << *I1 << "\n";); + DEBUG(dbgs() << " " << *I2 << "\n";); + // We need to update RE1 and RE2 if we are going to sink the first + // instruction in the basic block down. + bool UpdateRE1 = (I1 == BB1->begin()), UpdateRE2 = (I2 == BB2->begin()); + // Sink the instruction. + BBEnd->getInstList().splice(FirstNonPhiInBBEnd, BB1->getInstList(), I1); + if (!OldPN->use_empty()) + OldPN->replaceAllUsesWith(I1); + OldPN->eraseFromParent(); + + if (!I2->use_empty()) + I2->replaceAllUsesWith(I1); + I1->intersectOptionalDataWith(I2); + I2->eraseFromParent(); + + if (UpdateRE1) + RE1 = BB1->getInstList().rend(); + if (UpdateRE2) + RE2 = BB2->getInstList().rend(); + FirstNonPhiInBBEnd = I1; + NumSinkCommons++; + Changed = true; + } + return Changed; +} + /// SpeculativelyExecuteBB - Given a conditional branch that goes to BB1 /// and an BB2 and the only successor of BB1 is BB2, hoist simple code /// (for now, restricted to a single instruction that's side effect free) from @@ -1243,7 +1415,7 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *BB1) { if (BB1V == BIParentV) continue; - // Check for saftey. + // Check for safety. if (ConstantExpr *CE = dyn_cast<ConstantExpr>(BB1V)) { // An unfolded ConstantExpr could end up getting expanded into // Instructions. 
Don't speculate this and another instruction at @@ -1339,7 +1511,7 @@ static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) { /// that is defined in the same block as the branch and if any PHI entries are /// constants, thread edges corresponding to that entry to be branches to their /// ultimate destination. -static bool FoldCondBranchOnPHI(BranchInst *BI, const TargetData *TD) { +static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout *TD) { BasicBlock *BB = BI->getParent(); PHINode *PN = dyn_cast<PHINode>(BI->getCondition()); // NOTE: we currently cannot transform this case if the PHI node is used @@ -1435,7 +1607,7 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const TargetData *TD) { /// FoldTwoEntryPHINode - Given a BB that starts with the specified two-entry /// PHI node, see if we can eliminate it. -static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) { +static bool FoldTwoEntryPHINode(PHINode *PN, const DataLayout *TD) { // Ok, this is a two entry PHI node. Check to see if this is a simple "if // statement", which has a very simple dominance structure. Basically, we // are trying to find the condition that is being branched on, which @@ -1662,7 +1834,7 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI, /// parameters and return true, or returns false if no or invalid metadata was /// found. static bool ExtractBranchMetadata(BranchInst *BI, - APInt &ProbTrue, APInt &ProbFalse) { + uint64_t &ProbTrue, uint64_t &ProbFalse) { assert(BI->isConditional() && "Looking for probabilities on unconditional branch?"); MDNode *ProfileData = BI->getMetadata(LLVMContext::MD_prof); @@ -1670,35 +1842,11 @@ static bool ExtractBranchMetadata(BranchInst *BI, ConstantInt *CITrue = dyn_cast<ConstantInt>(ProfileData->getOperand(1)); ConstantInt *CIFalse = dyn_cast<ConstantInt>(ProfileData->getOperand(2)); if (!CITrue || !CIFalse) return false; - ProbTrue = CITrue->getValue(); - ProbFalse = CIFalse->getValue(); - assert(ProbTrue.getBitWidth() == 32 && ProbFalse.getBitWidth() == 32 && - "Branch probability metadata must be 32-bit integers"); + ProbTrue = CITrue->getValue().getZExtValue(); + ProbFalse = CIFalse->getValue().getZExtValue(); return true; } -/// MultiplyAndLosePrecision - Multiplies A and B, then returns the result. In -/// the event of overflow, logically-shifts all four inputs right until the -/// multiply fits. -static APInt MultiplyAndLosePrecision(APInt &A, APInt &B, APInt &C, APInt &D, - unsigned &BitsLost) { - BitsLost = 0; - bool Overflow = false; - APInt Result = A.umul_ov(B, Overflow); - if (Overflow) { - APInt MaxB = APInt::getMaxValue(A.getBitWidth()).udiv(A); - do { - B = B.lshr(1); - ++BitsLost; - } while (B.ugt(MaxB)); - A = A.lshr(BitsLost); - C = C.lshr(BitsLost); - D = D.lshr(BitsLost); - Result = A * B; - } - return Result; -} - /// checkCSEInPredecessor - Return true if the given instruction is available /// in its predecessor block. If yes, the instruction will be removed. /// @@ -1824,7 +1972,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { continue; // Determine if the two branches share a common destination. 
- Instruction::BinaryOps Opc; + Instruction::BinaryOps Opc = Instruction::BinaryOpsEnd; bool InvertPredCond = false; if (BI->isConditional()) { @@ -1923,14 +2071,53 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { New, "or.cond")); PBI->setCondition(NewCond); + uint64_t PredTrueWeight, PredFalseWeight, SuccTrueWeight, SuccFalseWeight; + bool PredHasWeights = ExtractBranchMetadata(PBI, PredTrueWeight, + PredFalseWeight); + bool SuccHasWeights = ExtractBranchMetadata(BI, SuccTrueWeight, + SuccFalseWeight); + SmallVector<uint64_t, 8> NewWeights; + if (PBI->getSuccessor(0) == BB) { + if (PredHasWeights && SuccHasWeights) { + // PBI: br i1 %x, BB, FalseDest + // BI: br i1 %y, TrueDest, FalseDest + //TrueWeight is TrueWeight for PBI * TrueWeight for BI. + NewWeights.push_back(PredTrueWeight * SuccTrueWeight); + //FalseWeight is FalseWeight for PBI * TotalWeight for BI + + // TrueWeight for PBI * FalseWeight for BI. + // We assume that total weights of a BranchInst can fit into 32 bits. + // Therefore, we will not have overflow using 64-bit arithmetic. + NewWeights.push_back(PredFalseWeight * (SuccFalseWeight + + SuccTrueWeight) + PredTrueWeight * SuccFalseWeight); + } AddPredecessorToBlock(TrueDest, PredBlock, BB); PBI->setSuccessor(0, TrueDest); } if (PBI->getSuccessor(1) == BB) { + if (PredHasWeights && SuccHasWeights) { + // PBI: br i1 %x, TrueDest, BB + // BI: br i1 %y, TrueDest, FalseDest + //TrueWeight is TrueWeight for PBI * TotalWeight for BI + + // FalseWeight for PBI * TrueWeight for BI. + NewWeights.push_back(PredTrueWeight * (SuccFalseWeight + + SuccTrueWeight) + PredFalseWeight * SuccTrueWeight); + //FalseWeight is FalseWeight for PBI * FalseWeight for BI. + NewWeights.push_back(PredFalseWeight * SuccFalseWeight); + } AddPredecessorToBlock(FalseDest, PredBlock, BB); PBI->setSuccessor(1, FalseDest); } + if (NewWeights.size() == 2) { + // Halve the weights if any of them cannot fit in an uint32_t + FitWeights(NewWeights); + + SmallVector<uint32_t, 8> MDWeights(NewWeights.begin(),NewWeights.end()); + PBI->setMetadata(LLVMContext::MD_prof, + MDBuilder(BI->getContext()). + createBranchWeights(MDWeights)); + } else + PBI->setMetadata(LLVMContext::MD_prof, NULL); } else { // Update PHI nodes in the common successors. for (unsigned i = 0, e = PHIs.size(); i != e; ++i) { @@ -1985,90 +2172,6 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { // TODO: If BB is reachable from all paths through PredBlock, then we // could replace PBI's branch probabilities with BI's. - // Merge probability data into PredBlock's branch. - APInt A, B, C, D; - if (PBI->isConditional() && BI->isConditional() && - ExtractBranchMetadata(PBI, C, D) && ExtractBranchMetadata(BI, A, B)) { - // Given IR which does: - // bbA: - // br i1 %x, label %bbB, label %bbC - // bbB: - // br i1 %y, label %bbD, label %bbC - // Let's call the probability that we take the edge from %bbA to %bbB - // 'a', from %bbA to %bbC, 'b', from %bbB to %bbD 'c' and from %bbB to - // %bbC probability 'd'. - // - // We transform the IR into: - // bbA: - // br i1 %z, label %bbD, label %bbC - // where the probability of going to %bbD is (a*c) and going to bbC is - // (b+a*d). - // - // Probabilities aren't stored as ratios directly. Using branch weights, - // we get: - // (a*c)% = A*C, (b+(a*d))% = A*D+B*C+B*D. - - // In the event of overflow, we want to drop the LSB of the input - // probabilities. - unsigned BitsLost; - - // Ignore overflow result on ProbTrue. 
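A small numeric check may help with the weight combination introduced above for the PBI-targets-BB case: with PBI weights 3:1 and BI weights 1:1, the path through both true edges has probability 3/4 · 1/2 = 3/8, and the formulas reproduce exactly that ratio as 3:5. A standalone check of the arithmetic (the concrete numbers are made up; not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  // PBI: br i1 %x, BB, FalseDest       (weights PredTrue:PredFalse)
  // BI:  br i1 %y, TrueDest, FalseDest (weights SuccTrue:SuccFalse)
  uint64_t PredTrue = 3, PredFalse = 1;
  uint64_t SuccTrue = 1, SuccFalse = 1;
  uint64_t NewTrue  = PredTrue * SuccTrue;
  uint64_t NewFalse = PredFalse * (SuccTrue + SuccFalse) + PredTrue * SuccFalse;
  // The merged weights keep the original total, so the ratios are unchanged.
  assert(NewTrue + NewFalse == (PredTrue + PredFalse) * (SuccTrue + SuccFalse));
  assert(NewTrue == 3 && NewFalse == 5);  // P(TrueDest) stays 3/8
  return 0;
}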
- APInt ProbTrue = MultiplyAndLosePrecision(A, C, B, D, BitsLost); - - APInt Tmp1 = MultiplyAndLosePrecision(B, D, A, C, BitsLost); - if (BitsLost) { - ProbTrue = ProbTrue.lshr(BitsLost*2); - } - - APInt Tmp2 = MultiplyAndLosePrecision(A, D, C, B, BitsLost); - if (BitsLost) { - ProbTrue = ProbTrue.lshr(BitsLost*2); - Tmp1 = Tmp1.lshr(BitsLost*2); - } - - APInt Tmp3 = MultiplyAndLosePrecision(B, C, A, D, BitsLost); - if (BitsLost) { - ProbTrue = ProbTrue.lshr(BitsLost*2); - Tmp1 = Tmp1.lshr(BitsLost*2); - Tmp2 = Tmp2.lshr(BitsLost*2); - } - - bool Overflow1 = false, Overflow2 = false; - APInt Tmp4 = Tmp2.uadd_ov(Tmp3, Overflow1); - APInt ProbFalse = Tmp4.uadd_ov(Tmp1, Overflow2); - - if (Overflow1 || Overflow2) { - ProbTrue = ProbTrue.lshr(1); - Tmp1 = Tmp1.lshr(1); - Tmp2 = Tmp2.lshr(1); - Tmp3 = Tmp3.lshr(1); - Tmp4 = Tmp2 + Tmp3; - ProbFalse = Tmp4 + Tmp1; - } - - // The sum of branch weights must fit in 32-bits. - if (ProbTrue.isNegative() && ProbFalse.isNegative()) { - ProbTrue = ProbTrue.lshr(1); - ProbFalse = ProbFalse.lshr(1); - } - - if (ProbTrue != ProbFalse) { - // Normalize the result. - APInt GCD = APIntOps::GreatestCommonDivisor(ProbTrue, ProbFalse); - ProbTrue = ProbTrue.udiv(GCD); - ProbFalse = ProbFalse.udiv(GCD); - - MDBuilder MDB(BI->getContext()); - MDNode *N = MDB.createBranchWeights(ProbTrue.getZExtValue(), - ProbFalse.getZExtValue()); - PBI->setMetadata(LLVMContext::MD_prof, N); - } else { - PBI->setMetadata(LLVMContext::MD_prof, NULL); - } - } else { - PBI->setMetadata(LLVMContext::MD_prof, NULL); - } - // Copy any debug value intrinsics into the end of PredBlock. for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) if (isa<DbgInfoIntrinsic>(*I)) @@ -2223,6 +2326,33 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) { PBI->setSuccessor(0, CommonDest); PBI->setSuccessor(1, OtherDest); + // Update branch weight for PBI. + uint64_t PredTrueWeight, PredFalseWeight, SuccTrueWeight, SuccFalseWeight; + bool PredHasWeights = ExtractBranchMetadata(PBI, PredTrueWeight, + PredFalseWeight); + bool SuccHasWeights = ExtractBranchMetadata(BI, SuccTrueWeight, + SuccFalseWeight); + if (PredHasWeights && SuccHasWeights) { + uint64_t PredCommon = PBIOp ? PredFalseWeight : PredTrueWeight; + uint64_t PredOther = PBIOp ?PredTrueWeight : PredFalseWeight; + uint64_t SuccCommon = BIOp ? SuccFalseWeight : SuccTrueWeight; + uint64_t SuccOther = BIOp ? SuccTrueWeight : SuccFalseWeight; + // The weight to CommonDest should be PredCommon * SuccTotal + + // PredOther * SuccCommon. + // The weight to OtherDest should be PredOther * SuccOther. + SmallVector<uint64_t, 2> NewWeights; + NewWeights.push_back(PredCommon * (SuccCommon + SuccOther) + + PredOther * SuccCommon); + NewWeights.push_back(PredOther * SuccOther); + // Halve the weights if any of them cannot fit in an uint32_t + FitWeights(NewWeights); + + SmallVector<uint32_t, 2> MDWeights(NewWeights.begin(),NewWeights.end()); + PBI->setMetadata(LLVMContext::MD_prof, + MDBuilder(BI->getContext()). + createBranchWeights(MDWeights)); + } + // OtherDest may have phi nodes. If so, add an entry from PBI's // block that are identical to the entries for BI's block. AddPredecessorToBlock(OtherDest, PBI->getParent(), BB); @@ -2259,7 +2389,9 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) { // Also makes sure not to introduce new successors by assuming that edges to // non-successor TrueBBs and FalseBBs aren't reachable. 
static bool SimplifyTerminatorOnSelect(TerminatorInst *OldTerm, Value *Cond, - BasicBlock *TrueBB, BasicBlock *FalseBB){ + BasicBlock *TrueBB, BasicBlock *FalseBB, + uint32_t TrueWeight, + uint32_t FalseWeight){ // Remove any superfluous successor edges from the CFG. // First, figure out which successors to preserve. // If TrueBB and FalseBB are equal, only try to preserve one copy of that @@ -2288,10 +2420,15 @@ static bool SimplifyTerminatorOnSelect(TerminatorInst *OldTerm, Value *Cond, // We were only looking for one successor, and it was present. // Create an unconditional branch to it. Builder.CreateBr(TrueBB); - else + else { // We found both of the successors we were looking for. // Create a conditional branch sharing the condition of the select. - Builder.CreateCondBr(Cond, TrueBB, FalseBB); + BranchInst *NewBI = Builder.CreateCondBr(Cond, TrueBB, FalseBB); + if (TrueWeight != FalseWeight) + NewBI->setMetadata(LLVMContext::MD_prof, + MDBuilder(OldTerm->getContext()). + createBranchWeights(TrueWeight, FalseWeight)); + } } else if (KeepEdge1 && (KeepEdge2 || TrueBB == FalseBB)) { // Neither of the selected blocks were successors, so this // terminator must be unreachable. @@ -2328,8 +2465,23 @@ static bool SimplifySwitchOnSelect(SwitchInst *SI, SelectInst *Select) { BasicBlock *TrueBB = SI->findCaseValue(TrueVal).getCaseSuccessor(); BasicBlock *FalseBB = SI->findCaseValue(FalseVal).getCaseSuccessor(); + // Get weight for TrueBB and FalseBB. + uint32_t TrueWeight = 0, FalseWeight = 0; + SmallVector<uint64_t, 8> Weights; + bool HasWeights = HasBranchWeights(SI); + if (HasWeights) { + GetBranchWeights(SI, Weights); + if (Weights.size() == 1 + SI->getNumCases()) { + TrueWeight = (uint32_t)Weights[SI->findCaseValue(TrueVal). + getSuccessorIndex()]; + FalseWeight = (uint32_t)Weights[SI->findCaseValue(FalseVal). + getSuccessorIndex()]; + } + } + // Perform the actual simplification. - return SimplifyTerminatorOnSelect(SI, Condition, TrueBB, FalseBB); + return SimplifyTerminatorOnSelect(SI, Condition, TrueBB, FalseBB, + TrueWeight, FalseWeight); } // SimplifyIndirectBrOnSelect - Replaces @@ -2349,7 +2501,8 @@ static bool SimplifyIndirectBrOnSelect(IndirectBrInst *IBI, SelectInst *SI) { BasicBlock *FalseBB = FBA->getBasicBlock(); // Perform the actual simplification. - return SimplifyTerminatorOnSelect(IBI, SI->getCondition(), TrueBB, FalseBB); + return SimplifyTerminatorOnSelect(IBI, SI->getCondition(), TrueBB, FalseBB, + 0, 0); } /// TryToSimplifyUncondBranchWithICmpInIt - This is called when we find an icmp @@ -2369,9 +2522,9 @@ static bool SimplifyIndirectBrOnSelect(IndirectBrInst *IBI, SelectInst *SI) { /// /// We prefer to split the edge to 'end' so that there is a true/false entry to /// the PHI, merging the third icmp into the switch. -static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI, - const TargetData *TD, - IRBuilder<> &Builder) { +static bool TryToSimplifyUncondBranchWithICmpInIt( + ICmpInst *ICI, IRBuilder<> &Builder, const TargetTransformInfo &TTI, + const DataLayout *TD) { BasicBlock *BB = ICI->getParent(); // If the block has any PHIs in it or the icmp has multiple uses, it is too @@ -2404,7 +2557,7 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI, ICI->eraseFromParent(); } // BB is now empty, so it is likely to simplify away. - return SimplifyCFG(BB) | true; + return SimplifyCFG(BB, TTI, TD) | true; } // Ok, the block is reachable from the default dest. 
If the constant we're @@ -2420,7 +2573,7 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI, ICI->replaceAllUsesWith(V); ICI->eraseFromParent(); // BB is now empty, so it is likely to simplify away. - return SimplifyCFG(BB) | true; + return SimplifyCFG(BB, TTI, TD) | true; } // The use of the icmp has to be in the 'end' block, by the only PHI node in @@ -2448,6 +2601,21 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI, // the switch to the merge point on the compared value. BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "switch.edge", BB->getParent(), BB); + SmallVector<uint64_t, 8> Weights; + bool HasWeights = HasBranchWeights(SI); + if (HasWeights) { + GetBranchWeights(SI, Weights); + if (Weights.size() == 1 + SI->getNumCases()) { + // Split weight for default case to case for "Cst". + Weights[0] = (Weights[0]+1) >> 1; + Weights.push_back(Weights[0]); + + SmallVector<uint32_t, 8> MDWeights(Weights.begin(), Weights.end()); + SI->setMetadata(LLVMContext::MD_prof, + MDBuilder(SI->getContext()). + createBranchWeights(MDWeights)); + } + } SI->addCase(Cst, NewBB); // NewBB branches to the phi block, add the uncond branch and the phi entry. @@ -2461,7 +2629,7 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI, /// SimplifyBranchOnICmpChain - The specified branch is a conditional branch. /// Check to see if it is branching on an or/and chain of icmp instructions, and /// fold it into a switch instruction if so. -static bool SimplifyBranchOnICmpChain(BranchInst *BI, const TargetData *TD, +static bool SimplifyBranchOnICmpChain(BranchInst *BI, const DataLayout *TD, IRBuilder<> &Builder) { Instruction *Cond = dyn_cast<Instruction>(BI->getCondition()); if (Cond == 0) return false; @@ -2542,7 +2710,7 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, const TargetData *TD, Builder.SetInsertPoint(BI); // Convert pointer to int before we switch. if (CompVal->getType()->isPointerTy()) { - assert(TD && "Cannot switch on pointer without TargetData"); + assert(TD && "Cannot switch on pointer without DataLayout"); CompVal = Builder.CreatePtrToInt(CompVal, TD->getIntPtrType(CompVal->getContext()), "magicptr"); @@ -2861,9 +3029,28 @@ static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) { if (!Offset->isNullValue()) Sub = Builder.CreateAdd(Sub, Offset, Sub->getName()+".off"); Value *Cmp = Builder.CreateICmpULT(Sub, NumCases, "switch"); - Builder.CreateCondBr( + BranchInst *NewBI = Builder.CreateCondBr( Cmp, SI->case_begin().getCaseSuccessor(), SI->getDefaultDest()); + // Update weight for the newly-created conditional branch. + SmallVector<uint64_t, 8> Weights; + bool HasWeights = HasBranchWeights(SI); + if (HasWeights) { + GetBranchWeights(SI, Weights); + if (Weights.size() == 1 + SI->getNumCases()) { + // Combine all weights for the cases to be the true weight of NewBI. + // We assume that the sum of all weights for a Terminator can fit into 32 + // bits. + uint32_t NewTrueWeight = 0; + for (unsigned I = 1, E = Weights.size(); I != E; ++I) + NewTrueWeight += (uint32_t)Weights[I]; + NewBI->setMetadata(LLVMContext::MD_prof, + MDBuilder(SI->getContext()). + createBranchWeights(NewTrueWeight, + (uint32_t)Weights[0])); + } + } + // Prune obsolete incoming values off the successor's PHI nodes. 
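TurnSwitchRangeIntoICmp, which above also learns to fold the per-case weights into the new conditional branch, relies on the usual contiguous-range trick: offset the switch value so the smallest case maps to zero, then one unsigned compare against the number of cases covers the whole range (values below the range wrap around and fail the compare). A standalone check of that trick for a hypothetical case set {5, 6, 7} (not part of the patch):

#include <cassert>
#include <cstdint>

// (x - 5) u< 3 is true exactly for x in {5, 6, 7}; smaller x wraps around.
static bool inRange(uint32_t x) { return x - 5u < 3u; }

int main() {
  for (uint32_t x = 0; x < 10; ++x)
    assert(inRange(x) == (x == 5 || x == 6 || x == 7));
  return 0;
}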
for (BasicBlock::iterator BBI = SI->case_begin().getCaseSuccessor()->begin(); isa<PHINode>(BBI); ++BBI) { @@ -2894,15 +3081,33 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI) { } } + SmallVector<uint64_t, 8> Weights; + bool HasWeight = HasBranchWeights(SI); + if (HasWeight) { + GetBranchWeights(SI, Weights); + HasWeight = (Weights.size() == 1 + SI->getNumCases()); + } + // Remove dead cases from the switch. for (unsigned I = 0, E = DeadCases.size(); I != E; ++I) { SwitchInst::CaseIt Case = SI->findCaseValue(DeadCases[I]); assert(Case != SI->case_default() && "Case was not found. Probably mistake in DeadCases forming."); + if (HasWeight) { + std::swap(Weights[Case.getCaseIndex()+1], Weights.back()); + Weights.pop_back(); + } + // Prune unused values from PHI nodes. Case.getCaseSuccessor()->removePredecessor(SI->getParent()); SI->removeCase(Case); } + if (HasWeight) { + SmallVector<uint32_t, 8> MDWeights(Weights.begin(), Weights.end()); + SI->setMetadata(LLVMContext::MD_prof, + MDBuilder(SI->getParent()->getContext()). + createBranchWeights(MDWeights)); + } return !DeadCases.empty(); } @@ -2991,26 +3196,95 @@ static bool ValidLookupTableConstant(Constant *C) { isa<UndefValue>(C); } -/// GetCaseResulsts - Try to determine the resulting constant values in phi -/// nodes at the common destination basic block for one of the case -/// destinations of a switch instruction. +/// LookupConstant - If V is a Constant, return it. Otherwise, try to look up +/// its constant value in ConstantPool, returning 0 if it's not there. +static Constant *LookupConstant(Value *V, + const SmallDenseMap<Value*, Constant*>& ConstantPool) { + if (Constant *C = dyn_cast<Constant>(V)) + return C; + return ConstantPool.lookup(V); +} + +/// ConstantFold - Try to fold instruction I into a constant. This works for +/// simple instructions such as binary operations where both operands are +/// constant or can be replaced by constants from the ConstantPool. Returns the +/// resulting constant on success, 0 otherwise. +static Constant *ConstantFold(Instruction *I, + const SmallDenseMap<Value*, Constant*>& ConstantPool) { + if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) { + Constant *A = LookupConstant(BO->getOperand(0), ConstantPool); + if (!A) + return 0; + Constant *B = LookupConstant(BO->getOperand(1), ConstantPool); + if (!B) + return 0; + return ConstantExpr::get(BO->getOpcode(), A, B); + } + + if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) { + Constant *A = LookupConstant(I->getOperand(0), ConstantPool); + if (!A) + return 0; + Constant *B = LookupConstant(I->getOperand(1), ConstantPool); + if (!B) + return 0; + return ConstantExpr::getCompare(Cmp->getPredicate(), A, B); + } + + if (SelectInst *Select = dyn_cast<SelectInst>(I)) { + Constant *A = LookupConstant(Select->getCondition(), ConstantPool); + if (!A) + return 0; + if (A->isAllOnesValue()) + return LookupConstant(Select->getTrueValue(), ConstantPool); + if (A->isNullValue()) + return LookupConstant(Select->getFalseValue(), ConstantPool); + return 0; + } + + if (CastInst *Cast = dyn_cast<CastInst>(I)) { + Constant *A = LookupConstant(I->getOperand(0), ConstantPool); + if (!A) + return 0; + return ConstantExpr::getCast(Cast->getOpcode(), A, Cast->getDestTy()); + } + + return 0; +} + +/// GetCaseResults - Try to determine the resulting constant values in phi nodes +/// at the common destination basic block, *CommonDest, for one of the case +/// destionations CaseDest corresponding to value CaseVal (0 for the default +/// case), of a switch instruction SI. 
static bool GetCaseResults(SwitchInst *SI, + ConstantInt *CaseVal, BasicBlock *CaseDest, BasicBlock **CommonDest, SmallVector<std::pair<PHINode*,Constant*>, 4> &Res) { // The block from which we enter the common destination. BasicBlock *Pred = SI->getParent(); - // If CaseDest is empty, continue to its successor. - if (CaseDest->getFirstNonPHIOrDbg() == CaseDest->getTerminator() && - !isa<PHINode>(CaseDest->begin())) { - - TerminatorInst *Terminator = CaseDest->getTerminator(); - if (Terminator->getNumSuccessors() != 1) - return false; - - Pred = CaseDest; - CaseDest = Terminator->getSuccessor(0); + // If CaseDest is empty except for some side-effect free instructions through + // which we can constant-propagate the CaseVal, continue to its successor. + SmallDenseMap<Value*, Constant*> ConstantPool; + ConstantPool.insert(std::make_pair(SI->getCondition(), CaseVal)); + for (BasicBlock::iterator I = CaseDest->begin(), E = CaseDest->end(); I != E; + ++I) { + if (TerminatorInst *T = dyn_cast<TerminatorInst>(I)) { + // If the terminator is a simple branch, continue to the next block. + if (T->getNumSuccessors() != 1) + return false; + Pred = CaseDest; + CaseDest = T->getSuccessor(0); + } else if (isa<DbgInfoIntrinsic>(I)) { + // Skip debug intrinsic. + continue; + } else if (Constant *C = ConstantFold(I, ConstantPool)) { + // Instruction is side-effect free and constant. + ConstantPool.insert(std::make_pair(I, C)); + } else { + break; + } } // If we did not have a CommonDest before, use the current one. @@ -3027,10 +3301,17 @@ static bool GetCaseResults(SwitchInst *SI, if (Idx == -1) continue; - Constant *ConstVal = dyn_cast<Constant>(PHI->getIncomingValue(Idx)); + Constant *ConstVal = LookupConstant(PHI->getIncomingValue(Idx), + ConstantPool); if (!ConstVal) return false; + // Note: If the constant comes from constant-propagating the case value + // through the CaseDest basic block, it will be safe to remove the + // instructions in that block. They cannot be used (except in the phi nodes + // we visit) outside CaseDest, because that block does not dominate its + // successor. If it did, we would not be in this phi node. + // Be conservative about which kinds of constants we support. if (!ValidLookupTableConstant(ConstVal)) return false; @@ -3041,83 +3322,255 @@ static bool GetCaseResults(SwitchInst *SI, return true; } -/// BuildLookupTable - Build a lookup table with the contents of Results, using -/// DefaultResult to fill the holes in the table. If the table ends up -/// containing the same result in each element, set *SingleResult to that value -/// and return NULL. -static GlobalVariable *BuildLookupTable(Module &M, - uint64_t TableSize, - ConstantInt *Offset, - const SmallVector<std::pair<ConstantInt*, Constant*>, 4>& Results, - Constant *DefaultResult, - Constant **SingleResult) { - assert(Results.size() && "Need values to build lookup table"); - assert(TableSize >= Results.size() && "Table needs to hold all values"); +namespace { + /// SwitchLookupTable - This class represents a lookup table that can be used + /// to replace a switch. + class SwitchLookupTable { + public: + /// SwitchLookupTable - Create a lookup table to use as a switch replacement + /// with the contents of Values, using DefaultValue to fill any holes in the + /// table. 
+ SwitchLookupTable(Module &M, + uint64_t TableSize, + ConstantInt *Offset, + const SmallVector<std::pair<ConstantInt*, Constant*>, 4>& Values, + Constant *DefaultValue, + const DataLayout *TD); + + /// BuildLookup - Build instructions with Builder to retrieve the value at + /// the position given by Index in the lookup table. + Value *BuildLookup(Value *Index, IRBuilder<> &Builder); + + /// WouldFitInRegister - Return true if a table with TableSize elements of + /// type ElementType would fit in a target-legal register. + static bool WouldFitInRegister(const DataLayout *TD, + uint64_t TableSize, + const Type *ElementType); + + private: + // Depending on the contents of the table, it can be represented in + // different ways. + enum { + // For tables where each element contains the same value, we just have to + // store that single value and return it for each lookup. + SingleValueKind, + + // For small tables with integer elements, we can pack them into a bitmap + // that fits into a target-legal register. Values are retrieved by + // shift and mask operations. + BitMapKind, + + // The table is stored as an array of values. Values are retrieved by load + // instructions from the table. + ArrayKind + } Kind; + + // For SingleValueKind, this is the single value. + Constant *SingleValue; + + // For BitMapKind, this is the bitmap. + ConstantInt *BitMap; + IntegerType *BitMapElementTy; + + // For ArrayKind, this is the array. + GlobalVariable *Array; + }; +} + +SwitchLookupTable::SwitchLookupTable(Module &M, + uint64_t TableSize, + ConstantInt *Offset, + const SmallVector<std::pair<ConstantInt*, Constant*>, 4>& Values, + Constant *DefaultValue, + const DataLayout *TD) { + assert(Values.size() && "Can't build lookup table without values!"); + assert(TableSize >= Values.size() && "Can't fit values in table!"); // If all values in the table are equal, this is that value. - Constant *SameResult = Results.begin()->second; + SingleValue = Values.begin()->second; // Build up the table contents. - std::vector<Constant*> TableContents(TableSize); - for (size_t I = 0, E = Results.size(); I != E; ++I) { - ConstantInt *CaseVal = Results[I].first; - Constant *CaseRes = Results[I].second; - - uint64_t Idx = (CaseVal->getValue() - Offset->getValue()).getLimitedValue(); + SmallVector<Constant*, 64> TableContents(TableSize); + for (size_t I = 0, E = Values.size(); I != E; ++I) { + ConstantInt *CaseVal = Values[I].first; + Constant *CaseRes = Values[I].second; + assert(CaseRes->getType() == DefaultValue->getType()); + + uint64_t Idx = (CaseVal->getValue() - Offset->getValue()) + .getLimitedValue(); TableContents[Idx] = CaseRes; - if (CaseRes != SameResult) - SameResult = NULL; + if (CaseRes != SingleValue) + SingleValue = 0; } // Fill in any holes in the table with the default result. - if (Results.size() < TableSize) { - for (unsigned i = 0; i < TableSize; ++i) { - if (!TableContents[i]) - TableContents[i] = DefaultResult; + if (Values.size() < TableSize) { + for (uint64_t I = 0; I < TableSize; ++I) { + if (!TableContents[I]) + TableContents[I] = DefaultValue; } - if (DefaultResult != SameResult) - SameResult = NULL; + if (DefaultValue != SingleValue) + SingleValue = 0; } - // Same result was used in the entire table; just return that. - if (SameResult) { - *SingleResult = SameResult; - return NULL; + // If each element in the table contains the same value, we only need to store + // that single value. 
+ if (SingleValue) { + Kind = SingleValueKind; + return; } - ArrayType *ArrayTy = ArrayType::get(DefaultResult->getType(), TableSize); + // If the type is integer and the table fits in a register, build a bitmap. + if (WouldFitInRegister(TD, TableSize, DefaultValue->getType())) { + IntegerType *IT = cast<IntegerType>(DefaultValue->getType()); + APInt TableInt(TableSize * IT->getBitWidth(), 0); + for (uint64_t I = TableSize; I > 0; --I) { + TableInt <<= IT->getBitWidth(); + // Insert values into the bitmap. Undef values are set to zero. + if (!isa<UndefValue>(TableContents[I - 1])) { + ConstantInt *Val = cast<ConstantInt>(TableContents[I - 1]); + TableInt |= Val->getValue().zext(TableInt.getBitWidth()); + } + } + BitMap = ConstantInt::get(M.getContext(), TableInt); + BitMapElementTy = IT; + Kind = BitMapKind; + ++NumBitMaps; + return; + } + + // Store the table in an array. + ArrayType *ArrayTy = ArrayType::get(DefaultValue->getType(), TableSize); Constant *Initializer = ConstantArray::get(ArrayTy, TableContents); - GlobalVariable *GV = new GlobalVariable(M, ArrayTy, /*constant=*/ true, - GlobalVariable::PrivateLinkage, - Initializer, - "switch.table"); - GV->setUnnamedAddr(true); - return GV; + Array = new GlobalVariable(M, ArrayTy, /*constant=*/ true, + GlobalVariable::PrivateLinkage, + Initializer, + "switch.table"); + Array->setUnnamedAddr(true); + Kind = ArrayKind; +} + +Value *SwitchLookupTable::BuildLookup(Value *Index, IRBuilder<> &Builder) { + switch (Kind) { + case SingleValueKind: + return SingleValue; + case BitMapKind: { + // Type of the bitmap (e.g. i59). + IntegerType *MapTy = BitMap->getType(); + + // Cast Index to the same type as the bitmap. + // Note: The Index is <= the number of elements in the table, so + // truncating it to the width of the bitmask is safe. + Value *ShiftAmt = Builder.CreateZExtOrTrunc(Index, MapTy, "switch.cast"); + + // Multiply the shift amount by the element width. + ShiftAmt = Builder.CreateMul(ShiftAmt, + ConstantInt::get(MapTy, BitMapElementTy->getBitWidth()), + "switch.shiftamt"); + + // Shift down. + Value *DownShifted = Builder.CreateLShr(BitMap, ShiftAmt, + "switch.downshift"); + // Mask off. + return Builder.CreateTrunc(DownShifted, BitMapElementTy, + "switch.masked"); + } + case ArrayKind: { + Value *GEPIndices[] = { Builder.getInt32(0), Index }; + Value *GEP = Builder.CreateInBoundsGEP(Array, GEPIndices, + "switch.gep"); + return Builder.CreateLoad(GEP, "switch.load"); + } + } + llvm_unreachable("Unknown lookup table kind!"); +} + +bool SwitchLookupTable::WouldFitInRegister(const DataLayout *TD, + uint64_t TableSize, + const Type *ElementType) { + if (!TD) + return false; + const IntegerType *IT = dyn_cast<IntegerType>(ElementType); + if (!IT) + return false; + // FIXME: If the type is wider than it needs to be, e.g. i8 but all values + // are <= 15, we could try to narrow the type. + + // Avoid overflow, fitsInLegalInteger uses unsigned int for the width. + if (TableSize >= UINT_MAX/IT->getBitWidth()) + return false; + return TD->fitsInLegalInteger(TableSize * IT->getBitWidth()); +} + +/// ShouldBuildLookupTable - Determine whether a lookup table should be built +/// for this switch, based on the number of caes, size of the table and the +/// types of the results. 
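// Illustration (not part of the patch): the BitMapKind representation above,
// reduced to standalone C++. Table elements are packed into fixed-width fields
// of one wide integer and fetched with a shift and a mask, which is what
// BuildLookup emits as IR. Assumes ElementBits is in (0, 64) and the whole
// table fits in 64 bits; helper names are hypothetical.
#include <cassert>
#include <cstdint>
#include <vector>

static uint64_t packTable(const std::vector<uint64_t> &Table,
                          unsigned ElementBits) {
  assert(ElementBits > 0 && ElementBits < 64 && "element width out of range");
  assert(Table.size() * ElementBits <= 64 && "table does not fit in a register");
  uint64_t BitMap = 0;
  for (size_t I = Table.size(); I > 0; --I) {
    BitMap <<= ElementBits;                           // make room, high to low
    BitMap |= Table[I - 1] & ((1ULL << ElementBits) - 1);
  }
  return BitMap;                 // element 0 ends up in the lowest bits
}

static uint64_t lookupBitMap(uint64_t BitMap, unsigned ElementBits,
                             uint64_t Index) {
  uint64_t DownShifted = BitMap >> (Index * ElementBits);   // "switch.downshift"
  return DownShifted & ((1ULL << ElementBits) - 1);         // truncate/mask
}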
+static bool ShouldBuildLookupTable(SwitchInst *SI, + uint64_t TableSize, + const TargetTransformInfo &TTI, + const DataLayout *TD, + const SmallDenseMap<PHINode*, Type*>& ResultTypes) { + if (SI->getNumCases() > TableSize || TableSize >= UINT64_MAX / 10) + return false; // TableSize overflowed, or mul below might overflow. + + bool AllTablesFitInRegister = true; + bool HasIllegalType = false; + for (SmallDenseMap<PHINode*, Type*>::const_iterator I = ResultTypes.begin(), + E = ResultTypes.end(); I != E; ++I) { + Type *Ty = I->second; + + // Saturate this flag to true. + HasIllegalType = HasIllegalType || !TTI.isTypeLegal(Ty); + + // Saturate this flag to false. + AllTablesFitInRegister = AllTablesFitInRegister && + SwitchLookupTable::WouldFitInRegister(TD, TableSize, Ty); + + // If both flags saturate, we're done. NOTE: This *only* works with + // saturating flags, and all flags have to saturate first due to the + // non-deterministic behavior of iterating over a dense map. + if (HasIllegalType && !AllTablesFitInRegister) + break; + } + + // If each table would fit in a register, we should build it anyway. + if (AllTablesFitInRegister) + return true; + + // Don't build a table that doesn't fit in-register if it has illegal types. + if (HasIllegalType) + return false; + + // The table density should be at least 40%. This is the same criterion as for + // jump tables, see SelectionDAGBuilder::handleJTSwitchCase. + // FIXME: Find the best cut-off. + return SI->getNumCases() * 10 >= TableSize * 4; } /// SwitchToLookupTable - If the switch is only used to initialize one or more /// phi nodes in a common successor block with different constant values, /// replace the switch with lookup tables. static bool SwitchToLookupTable(SwitchInst *SI, - IRBuilder<> &Builder) { + IRBuilder<> &Builder, + const TargetTransformInfo &TTI, + const DataLayout* TD) { assert(SI->getNumCases() > 1 && "Degenerate switch?"); - // FIXME: Handle unreachable cases. + + // Only build lookup table when we have a target that supports it. + if (!TTI.shouldBuildLookupTables()) + return false; // FIXME: If the switch is too sparse for a lookup table, perhaps we could // split off a dense part and build a lookup table for that. - // FIXME: If the results are all integers and the lookup table would fit in a - // target-legal register, we should store them as a bitmap and use shift/mask - // to look up the result. - // FIXME: This creates arrays of GEPs to constant strings, which means each // GEP needs a runtime relocation in PIC code. We should just build one big // string and lookup indices into that. - // Ignore the switch if the number of cases are too small. + // Ignore the switch if the number of cases is too small. // This is similar to the check when building jump tables in // SelectionDAGBuilder::handleJTSwitchCase. // FIXME: Determine the best cut-off. @@ -3131,7 +3584,7 @@ static bool SwitchToLookupTable(SwitchInst *SI, ConstantInt *MinCaseVal = CI.getCaseValue(); ConstantInt *MaxCaseVal = CI.getCaseValue(); - BasicBlock *CommonDest = NULL; + BasicBlock *CommonDest = 0; typedef SmallVector<std::pair<ConstantInt*, Constant*>, 4> ResultListTy; SmallDenseMap<PHINode*, ResultListTy> ResultLists; SmallDenseMap<PHINode*, Constant*> DefaultResults; @@ -3148,7 +3601,8 @@ static bool SwitchToLookupTable(SwitchInst *SI, // Resulting value at phi nodes for this case value. 
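// Illustration (not part of the patch): the ShouldBuildLookupTable density
// cut-off above as a standalone C++ predicate, keeping the same overflow guard
// and the same "at least 40% of the index range is covered" criterion.
#include <cstdint>

static bool denseEnoughForLookupTable(uint64_t NumCases, uint64_t TableSize) {
  if (NumCases > TableSize || TableSize >= UINT64_MAX / 10)
    return false;                         // range too wide or product may overflow
  return NumCases * 10 >= TableSize * 4;  // density >= 40%
}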
typedef SmallVector<std::pair<PHINode*, Constant*>, 4> ResultsTy; ResultsTy Results; - if (!GetCaseResults(SI, CI.getCaseSuccessor(), &CommonDest, Results)) + if (!GetCaseResults(SI, CaseVal, CI.getCaseSuccessor(), &CommonDest, + Results)) return false; // Append the result from this case to the list for each phi. @@ -3161,7 +3615,8 @@ static bool SwitchToLookupTable(SwitchInst *SI, // Get the resulting values for the default case. SmallVector<std::pair<PHINode*, Constant*>, 4> DefaultResultsList; - if (!GetCaseResults(SI, SI->getDefaultDest(), &CommonDest, DefaultResultsList)) + if (!GetCaseResults(SI, 0, SI->getDefaultDest(), &CommonDest, + DefaultResultsList)) return false; for (size_t I = 0, E = DefaultResultsList.size(); I != E; ++I) { PHINode *PHI = DefaultResultsList[I].first; @@ -3171,33 +3626,12 @@ static bool SwitchToLookupTable(SwitchInst *SI, } APInt RangeSpread = MaxCaseVal->getValue() - MinCaseVal->getValue(); - // The table density should be at lest 40%. This is the same criterion as for - // jump tables, see SelectionDAGBuilder::handleJTSwitchCase. - // FIXME: Find the best cut-off. - // Be careful to avoid overlow in the density computation. - if (RangeSpread.zextOrSelf(64).ugt(UINT64_MAX / 4 - 1)) - return false; uint64_t TableSize = RangeSpread.getLimitedValue() + 1; - if (SI->getNumCases() * 10 < TableSize * 4) + if (!ShouldBuildLookupTable(SI, TableSize, TTI, TD, ResultTypes)) return false; - // Build the lookup tables. - SmallDenseMap<PHINode*, GlobalVariable*> LookupTables; - SmallDenseMap<PHINode*, Constant*> SingleResults; - - Module &Mod = *CommonDest->getParent()->getParent(); - for (SmallVector<PHINode*, 4>::iterator I = PHIs.begin(), E = PHIs.end(); - I != E; ++I) { - PHINode *PHI = *I; - - Constant *SingleResult = NULL; - LookupTables[PHI] = BuildLookupTable(Mod, TableSize, MinCaseVal, - ResultLists[PHI], DefaultResults[PHI], - &SingleResult); - SingleResults[PHI] = SingleResult; - } - // Create the BB that does the lookups. + Module &Mod = *CommonDest->getParent()->getParent(); BasicBlock *LookupBB = BasicBlock::Create(Mod.getContext(), "switch.lookup", CommonDest->getParent(), @@ -3215,31 +3649,24 @@ static bool SwitchToLookupTable(SwitchInst *SI, // Populate the BB that does the lookups. Builder.SetInsertPoint(LookupBB); bool ReturnedEarly = false; - for (SmallVector<PHINode*, 4>::iterator I = PHIs.begin(), E = PHIs.end(); - I != E; ++I) { - PHINode *PHI = *I; - // There was a single result for this phi; just use that. - if (Constant *SingleResult = SingleResults[PHI]) { - PHI->addIncoming(SingleResult, LookupBB); - continue; - } + for (size_t I = 0, E = PHIs.size(); I != E; ++I) { + PHINode *PHI = PHIs[I]; - Value *GEPIndices[] = { Builder.getInt32(0), TableIndex }; - Value *GEP = Builder.CreateInBoundsGEP(LookupTables[PHI], GEPIndices, - "switch.gep"); - Value *Result = Builder.CreateLoad(GEP, "switch.load"); - - // If the result is only going to be used to return from the function, - // we want to do that right here. - if (PHI->hasOneUse() && isa<ReturnInst>(*PHI->use_begin())) { - if (CommonDest->getFirstNonPHIOrDbg() == CommonDest->getTerminator()) { - Builder.CreateRet(Result); - ReturnedEarly = true; - } + SwitchLookupTable Table(Mod, TableSize, MinCaseVal, ResultLists[PHI], + DefaultResults[PHI], TD); + + Value *Result = Table.BuildLookup(TableIndex, Builder); + + // If the result is used to return immediately from the function, we want to + // do that right here. 
+ if (PHI->hasOneUse() && isa<ReturnInst>(*PHI->use_begin()) && + *PHI->use_begin() == CommonDest->getFirstNonPHIOrDbg()) { + Builder.CreateRet(Result); + ReturnedEarly = true; + break; } - if (!ReturnedEarly) - PHI->addIncoming(Result, LookupBB); + PHI->addIncoming(Result, LookupBB); } if (!ReturnedEarly) @@ -3258,46 +3685,44 @@ static bool SwitchToLookupTable(SwitchInst *SI, } bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { - // If this switch is too complex to want to look at, ignore it. - if (!isValueEqualityComparison(SI)) - return false; - BasicBlock *BB = SI->getParent(); - // If we only have one predecessor, and if it is a branch on this value, - // see if that predecessor totally determines the outcome of this switch. - if (BasicBlock *OnlyPred = BB->getSinglePredecessor()) - if (SimplifyEqualityComparisonWithOnlyPredecessor(SI, OnlyPred, Builder)) - return SimplifyCFG(BB) | true; + if (isValueEqualityComparison(SI)) { + // If we only have one predecessor, and if it is a branch on this value, + // see if that predecessor totally determines the outcome of this switch. + if (BasicBlock *OnlyPred = BB->getSinglePredecessor()) + if (SimplifyEqualityComparisonWithOnlyPredecessor(SI, OnlyPred, Builder)) + return SimplifyCFG(BB, TTI, TD) | true; - Value *Cond = SI->getCondition(); - if (SelectInst *Select = dyn_cast<SelectInst>(Cond)) - if (SimplifySwitchOnSelect(SI, Select)) - return SimplifyCFG(BB) | true; + Value *Cond = SI->getCondition(); + if (SelectInst *Select = dyn_cast<SelectInst>(Cond)) + if (SimplifySwitchOnSelect(SI, Select)) + return SimplifyCFG(BB, TTI, TD) | true; - // If the block only contains the switch, see if we can fold the block - // away into any preds. - BasicBlock::iterator BBI = BB->begin(); - // Ignore dbg intrinsics. - while (isa<DbgInfoIntrinsic>(BBI)) - ++BBI; - if (SI == &*BBI) - if (FoldValueComparisonIntoPredecessors(SI, Builder)) - return SimplifyCFG(BB) | true; + // If the block only contains the switch, see if we can fold the block + // away into any preds. + BasicBlock::iterator BBI = BB->begin(); + // Ignore dbg intrinsics. + while (isa<DbgInfoIntrinsic>(BBI)) + ++BBI; + if (SI == &*BBI) + if (FoldValueComparisonIntoPredecessors(SI, Builder)) + return SimplifyCFG(BB, TTI, TD) | true; + } // Try to transform the switch into an icmp and a branch. if (TurnSwitchRangeIntoICmp(SI, Builder)) - return SimplifyCFG(BB) | true; + return SimplifyCFG(BB, TTI, TD) | true; // Remove unreachable cases. if (EliminateDeadSwitchCases(SI)) - return SimplifyCFG(BB) | true; + return SimplifyCFG(BB, TTI, TD) | true; if (ForwardSwitchConditionToPHI(SI)) - return SimplifyCFG(BB) | true; + return SimplifyCFG(BB, TTI, TD) | true; - if (SwitchToLookupTable(SI, Builder)) - return SimplifyCFG(BB) | true; + if (SwitchToLookupTable(SI, Builder, TTI, TD)) + return SimplifyCFG(BB, TTI, TD) | true; return false; } @@ -3334,7 +3759,7 @@ bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) { if (SelectInst *SI = dyn_cast<SelectInst>(IBI->getAddress())) { if (SimplifyIndirectBrOnSelect(IBI, SI)) - return SimplifyCFG(BB) | true; + return SimplifyCFG(BB, TTI, TD) | true; } return Changed; } @@ -3342,6 +3767,9 @@ bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) { bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){ BasicBlock *BB = BI->getParent(); + if (SinkCommon && SinkThenElseCodeToEnd(BI)) + return true; + // If the Terminator is the only non-phi instruction, simplify the block. 
BasicBlock::iterator I = BB->getFirstNonPHIOrDbgOrLifetime(); if (I->isTerminator() && BB != &BB->getParent()->getEntryBlock() && @@ -3355,7 +3783,7 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){ for (++I; isa<DbgInfoIntrinsic>(I); ++I) ; if (I->isTerminator() && - TryToSimplifyUncondBranchWithICmpInIt(ICI, TD, Builder)) + TryToSimplifyUncondBranchWithICmpInIt(ICI, Builder, TTI, TD)) return true; } @@ -3364,7 +3792,7 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){ // predecessor and use logical operations to update the incoming value // for PHI nodes in common successor. if (FoldBranchToCommonDest(BI)) - return SimplifyCFG(BB) | true; + return SimplifyCFG(BB, TTI, TD) | true; return false; } @@ -3379,7 +3807,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { // switch. if (BasicBlock *OnlyPred = BB->getSinglePredecessor()) if (SimplifyEqualityComparisonWithOnlyPredecessor(BI, OnlyPred, Builder)) - return SimplifyCFG(BB) | true; + return SimplifyCFG(BB, TTI, TD) | true; // This block must be empty, except for the setcond inst, if it exists. // Ignore dbg intrinsics. @@ -3389,14 +3817,14 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { ++I; if (&*I == BI) { if (FoldValueComparisonIntoPredecessors(BI, Builder)) - return SimplifyCFG(BB) | true; + return SimplifyCFG(BB, TTI, TD) | true; } else if (&*I == cast<Instruction>(BI->getCondition())){ ++I; // Ignore dbg intrinsics. while (isa<DbgInfoIntrinsic>(I)) ++I; if (&*I == BI && FoldValueComparisonIntoPredecessors(BI, Builder)) - return SimplifyCFG(BB) | true; + return SimplifyCFG(BB, TTI, TD) | true; } } @@ -3408,7 +3836,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { // branches to us and one of our successors, fold the comparison into the // predecessor and use logical operations to pick the right destination. if (FoldBranchToCommonDest(BI)) - return SimplifyCFG(BB) | true; + return SimplifyCFG(BB, TTI, TD) | true; // We have a conditional branch to two blocks that are only reachable // from BI. We know that the condbr dominates the two blocks, so see if @@ -3417,7 +3845,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { if (BI->getSuccessor(0)->getSinglePredecessor() != 0) { if (BI->getSuccessor(1)->getSinglePredecessor() != 0) { if (HoistThenElseCodeToIf(BI)) - return SimplifyCFG(BB) | true; + return SimplifyCFG(BB, TTI, TD) | true; } else { // If Successor #1 has multiple preds, we may be able to conditionally // execute Successor #0 if it branches to successor #1. 
@@ -3425,7 +3853,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { if (Succ0TI->getNumSuccessors() == 1 && Succ0TI->getSuccessor(0) == BI->getSuccessor(1)) if (SpeculativelyExecuteBB(BI, BI->getSuccessor(0))) - return SimplifyCFG(BB) | true; + return SimplifyCFG(BB, TTI, TD) | true; } } else if (BI->getSuccessor(1)->getSinglePredecessor() != 0) { // If Successor #0 has multiple preds, we may be able to conditionally @@ -3434,7 +3862,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { if (Succ1TI->getNumSuccessors() == 1 && Succ1TI->getSuccessor(0) == BI->getSuccessor(0)) if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1))) - return SimplifyCFG(BB) | true; + return SimplifyCFG(BB, TTI, TD) | true; } // If this is a branch on a phi node in the current block, thread control @@ -3442,14 +3870,14 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { if (PHINode *PN = dyn_cast<PHINode>(BI->getCondition())) if (PN->getParent() == BI->getParent()) if (FoldCondBranchOnPHI(BI, TD)) - return SimplifyCFG(BB) | true; + return SimplifyCFG(BB, TTI, TD) | true; // Scan predecessor blocks for conditional branches. for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator())) if (PBI != BI && PBI->isConditional()) if (SimplifyCondBranchToCondBranch(PBI, BI)) - return SimplifyCFG(BB) | true; + return SimplifyCFG(BB, TTI, TD) | true; return false; } @@ -3460,11 +3888,12 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I) { if (!C) return false; - if (!I->hasOneUse()) // Only look at single-use instructions, for compile time + if (I->use_empty()) return false; if (C->isNullValue()) { - Instruction *Use = I->use_back(); + // Only look at the first use, avoid hurting compile time with long uselists + User *Use = *I->use_begin(); // Now make sure that there are no instructions in between that can alter // control flow (eg. calls) @@ -3589,6 +4018,7 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) { /// eliminates unreachable basic blocks, and does other "peephole" optimization /// of the CFG. It returns true if a modification was made. 
/// -bool llvm::SimplifyCFG(BasicBlock *BB, const TargetData *TD) { - return SimplifyCFGOpt(TD).run(BB); +bool llvm::SimplifyCFG(BasicBlock *BB, const TargetTransformInfo &TTI, + const DataLayout *TD) { + return SimplifyCFGOpt(TTI, TD).run(BB); } diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp index 5d673f1..41c207c 100644 --- a/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -15,18 +15,18 @@ #define DEBUG_TYPE "indvars" -#include "llvm/Instructions.h" +#include "llvm/Transforms/Utils/SimplifyIndVar.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/IVUsers.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Instructions.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Utils/SimplifyIndVar.h" -#include "llvm/Target/TargetData.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" using namespace llvm; @@ -44,7 +44,7 @@ namespace { Loop *L; LoopInfo *LI; ScalarEvolution *SE; - const TargetData *TD; // May be NULL + const DataLayout *TD; // May be NULL SmallVectorImpl<WeakVH> &DeadInsts; @@ -56,7 +56,7 @@ namespace { L(Loop), LI(LPM->getAnalysisIfAvailable<LoopInfo>()), SE(SE), - TD(LPM->getAnalysisIfAvailable<TargetData>()), + TD(LPM->getAnalysisIfAvailable<DataLayout>()), DeadInsts(Dead), Changed(false) { assert(LI && "IV simplification requires LoopInfo"); diff --git a/lib/Transforms/Utils/SimplifyInstructions.cpp b/lib/Transforms/Utils/SimplifyInstructions.cpp index 528e6a1..f9687e4 100644 --- a/lib/Transforms/Utils/SimplifyInstructions.cpp +++ b/lib/Transforms/Utils/SimplifyInstructions.cpp @@ -15,17 +15,17 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "instsimplify" -#include "llvm/Function.h" -#include "llvm/Pass.h" -#include "llvm/Type.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Target/TargetData.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Type.h" +#include "llvm/Pass.h" #include "llvm/Target/TargetLibraryInfo.h" -#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -46,7 +46,7 @@ namespace { /// runOnFunction - Remove instructions that simplify. bool runOnFunction(Function &F) { const DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>(); - const TargetData *TD = getAnalysisIfAvailable<TargetData>(); + const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); SmallPtrSet<const Instruction*, 8> S1, S2, *ToSimplify = &S1, *Next = &S2; bool Changed = false; diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp new file mode 100644 index 0000000..83c74e7 --- /dev/null +++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -0,0 +1,1894 @@ +//===------ SimplifyLibCalls.cpp - Library calls simplifier ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This is a utility pass used for testing the InstructionSimplify analysis. +// The analysis is applied to every instruction, and if it simplifies then the +// instruction is replaced by the simplification. If you are looking for a pass +// that performs serious instruction folding, use the instcombine pass instead. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/SimplifyLibCalls.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/BuildLibCalls.h" + +using namespace llvm; + +/// This class is the abstract base class for the set of optimizations that +/// corresponds to one library call. +namespace { +class LibCallOptimization { +protected: + Function *Caller; + const DataLayout *TD; + const TargetLibraryInfo *TLI; + const LibCallSimplifier *LCS; + LLVMContext* Context; +public: + LibCallOptimization() { } + virtual ~LibCallOptimization() {} + + /// callOptimizer - This pure virtual method is implemented by base classes to + /// do various optimizations. If this returns null then no transformation was + /// performed. If it returns CI, then it transformed the call and CI is to be + /// deleted. If it returns something else, replace CI with the new value and + /// delete CI. + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) + =0; + + Value *optimizeCall(CallInst *CI, const DataLayout *TD, + const TargetLibraryInfo *TLI, + const LibCallSimplifier *LCS, IRBuilder<> &B) { + Caller = CI->getParent()->getParent(); + this->TD = TD; + this->TLI = TLI; + this->LCS = LCS; + if (CI->getCalledFunction()) + Context = &CI->getCalledFunction()->getContext(); + + // We never change the calling convention. + if (CI->getCallingConv() != llvm::CallingConv::C) + return NULL; + + return callOptimizer(CI->getCalledFunction(), CI, B); + } +}; + +//===----------------------------------------------------------------------===// +// Helper Functions +//===----------------------------------------------------------------------===// + +/// isOnlyUsedInZeroEqualityComparison - Return true if it only matters that the +/// value is equal or not-equal to zero. +static bool isOnlyUsedInZeroEqualityComparison(Value *V) { + for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); + UI != E; ++UI) { + if (ICmpInst *IC = dyn_cast<ICmpInst>(*UI)) + if (IC->isEquality()) + if (Constant *C = dyn_cast<Constant>(IC->getOperand(1))) + if (C->isNullValue()) + continue; + // Unknown instruction. + return false; + } + return true; +} + +/// isOnlyUsedInEqualityComparison - Return true if it is only used in equality +/// comparisons with With. +static bool isOnlyUsedInEqualityComparison(Value *V, Value *With) { + for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); + UI != E; ++UI) { + if (ICmpInst *IC = dyn_cast<ICmpInst>(*UI)) + if (IC->isEquality() && IC->getOperand(1) == With) + continue; + // Unknown instruction. 
+ return false; + } + return true; +} + +static bool callHasFloatingPointArgument(const CallInst *CI) { + for (CallInst::const_op_iterator it = CI->op_begin(), e = CI->op_end(); + it != e; ++it) { + if ((*it)->getType()->isFloatingPointTy()) + return true; + } + return false; +} + +//===----------------------------------------------------------------------===// +// Fortified Library Call Optimizations +//===----------------------------------------------------------------------===// + +struct FortifiedLibCallOptimization : public LibCallOptimization { +protected: + virtual bool isFoldable(unsigned SizeCIOp, unsigned SizeArgOp, + bool isString) const = 0; +}; + +struct InstFortifiedLibCallOptimization : public FortifiedLibCallOptimization { + CallInst *CI; + + bool isFoldable(unsigned SizeCIOp, unsigned SizeArgOp, bool isString) const { + if (CI->getArgOperand(SizeCIOp) == CI->getArgOperand(SizeArgOp)) + return true; + if (ConstantInt *SizeCI = + dyn_cast<ConstantInt>(CI->getArgOperand(SizeCIOp))) { + if (SizeCI->isAllOnesValue()) + return true; + if (isString) { + uint64_t Len = GetStringLength(CI->getArgOperand(SizeArgOp)); + // If the length is 0 we don't know how long it is and so we can't + // remove the check. + if (Len == 0) return false; + return SizeCI->getZExtValue() >= Len; + } + if (ConstantInt *Arg = dyn_cast<ConstantInt>( + CI->getArgOperand(SizeArgOp))) + return SizeCI->getZExtValue() >= Arg->getZExtValue(); + } + return false; + } +}; + +struct MemCpyChkOpt : public InstFortifiedLibCallOptimization { + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + this->CI = CI; + FunctionType *FT = Callee->getFunctionType(); + LLVMContext &Context = CI->getParent()->getContext(); + + // Check if this has the right signature. + if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) || + !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isPointerTy() || + FT->getParamType(2) != TD->getIntPtrType(Context) || + FT->getParamType(3) != TD->getIntPtrType(Context)) + return 0; + + if (isFoldable(3, 2, false)) { + B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), 1); + return CI->getArgOperand(0); + } + return 0; + } +}; + +struct MemMoveChkOpt : public InstFortifiedLibCallOptimization { + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + this->CI = CI; + FunctionType *FT = Callee->getFunctionType(); + LLVMContext &Context = CI->getParent()->getContext(); + + // Check if this has the right signature. + if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) || + !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isPointerTy() || + FT->getParamType(2) != TD->getIntPtrType(Context) || + FT->getParamType(3) != TD->getIntPtrType(Context)) + return 0; + + if (isFoldable(3, 2, false)) { + B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), 1); + return CI->getArgOperand(0); + } + return 0; + } +}; + +struct MemSetChkOpt : public InstFortifiedLibCallOptimization { + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + this->CI = CI; + FunctionType *FT = Callee->getFunctionType(); + LLVMContext &Context = CI->getParent()->getContext(); + + // Check if this has the right signature. 
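// Illustration (not part of the patch): the isFoldable rule above, restated as
// a standalone C++ predicate. ObjSize is the compile-time destination size the
// front end passed to the __*_chk call (~0 meaning "unknown", in which case the
// runtime check can never fire), and CopyLen is the byte count, if it is a
// compile-time constant. Names are hypothetical.
#include <cstdint>
#include <optional>

static bool chkCallIsFoldable(uint64_t ObjSize,
                              std::optional<uint64_t> CopyLen) {
  if (ObjSize == UINT64_MAX)   // object size unknown: the check is a no-op
    return true;
  if (!CopyLen)                // length is not a constant: keep the check
    return false;
  return ObjSize >= *CopyLen;  // the write provably fits, so drop the check
}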
+ if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) || + !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isIntegerTy() || + FT->getParamType(2) != TD->getIntPtrType(Context) || + FT->getParamType(3) != TD->getIntPtrType(Context)) + return 0; + + if (isFoldable(3, 2, false)) { + Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), + false); + B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1); + return CI->getArgOperand(0); + } + return 0; + } +}; + +struct StrCpyChkOpt : public InstFortifiedLibCallOptimization { + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + this->CI = CI; + StringRef Name = Callee->getName(); + FunctionType *FT = Callee->getFunctionType(); + LLVMContext &Context = CI->getParent()->getContext(); + + // Check if this has the right signature. + if (FT->getNumParams() != 3 || + FT->getReturnType() != FT->getParamType(0) || + FT->getParamType(0) != FT->getParamType(1) || + FT->getParamType(0) != Type::getInt8PtrTy(Context) || + FT->getParamType(2) != TD->getIntPtrType(Context)) + return 0; + + Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); + if (Dst == Src) // __strcpy_chk(x,x) -> x + return Src; + + // If a) we don't have any length information, or b) we know this will + // fit then just lower to a plain strcpy. Otherwise we'll keep our + // strcpy_chk call which may fail at runtime if the size is too long. + // TODO: It might be nice to get a maximum length out of the possible + // string lengths for varying. + if (isFoldable(2, 1, true)) { + Value *Ret = EmitStrCpy(Dst, Src, B, TD, TLI, Name.substr(2, 6)); + return Ret; + } else { + // Maybe we can stil fold __strcpy_chk to __memcpy_chk. + uint64_t Len = GetStringLength(Src); + if (Len == 0) return 0; + + // This optimization require DataLayout. + if (!TD) return 0; + + Value *Ret = + EmitMemCpyChk(Dst, Src, + ConstantInt::get(TD->getIntPtrType(Context), Len), + CI->getArgOperand(2), B, TD, TLI); + return Ret; + } + return 0; + } +}; + +struct StpCpyChkOpt : public InstFortifiedLibCallOptimization { + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + this->CI = CI; + StringRef Name = Callee->getName(); + FunctionType *FT = Callee->getFunctionType(); + LLVMContext &Context = CI->getParent()->getContext(); + + // Check if this has the right signature. + if (FT->getNumParams() != 3 || + FT->getReturnType() != FT->getParamType(0) || + FT->getParamType(0) != FT->getParamType(1) || + FT->getParamType(0) != Type::getInt8PtrTy(Context) || + FT->getParamType(2) != TD->getIntPtrType(FT->getParamType(0))) + return 0; + + Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); + if (Dst == Src) { // stpcpy(x,x) -> x+strlen(x) + Value *StrLen = EmitStrLen(Src, B, TD, TLI); + return StrLen ? B.CreateInBoundsGEP(Dst, StrLen) : 0; + } + + // If a) we don't have any length information, or b) we know this will + // fit then just lower to a plain stpcpy. Otherwise we'll keep our + // stpcpy_chk call which may fail at runtime if the size is too long. + // TODO: It might be nice to get a maximum length out of the possible + // string lengths for varying. + if (isFoldable(2, 1, true)) { + Value *Ret = EmitStrCpy(Dst, Src, B, TD, TLI, Name.substr(2, 6)); + return Ret; + } else { + // Maybe we can stil fold __stpcpy_chk to __memcpy_chk. + uint64_t Len = GetStringLength(Src); + if (Len == 0) return 0; + + // This optimization require DataLayout. 
+ if (!TD) return 0; + + Type *PT = FT->getParamType(0); + Value *LenV = ConstantInt::get(TD->getIntPtrType(PT), Len); + Value *DstEnd = B.CreateGEP(Dst, + ConstantInt::get(TD->getIntPtrType(PT), + Len - 1)); + if (!EmitMemCpyChk(Dst, Src, LenV, CI->getArgOperand(2), B, TD, TLI)) + return 0; + return DstEnd; + } + return 0; + } +}; + +struct StrNCpyChkOpt : public InstFortifiedLibCallOptimization { + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + this->CI = CI; + StringRef Name = Callee->getName(); + FunctionType *FT = Callee->getFunctionType(); + LLVMContext &Context = CI->getParent()->getContext(); + + // Check if this has the right signature. + if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) || + FT->getParamType(0) != FT->getParamType(1) || + FT->getParamType(0) != Type::getInt8PtrTy(Context) || + !FT->getParamType(2)->isIntegerTy() || + FT->getParamType(3) != TD->getIntPtrType(Context)) + return 0; + + if (isFoldable(3, 2, false)) { + Value *Ret = EmitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), B, TD, TLI, + Name.substr(2, 7)); + return Ret; + } + return 0; + } +}; + +//===----------------------------------------------------------------------===// +// String and Memory Library Call Optimizations +//===----------------------------------------------------------------------===// + +struct StrCatOpt : public LibCallOptimization { + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Verify the "strcat" function prototype. + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || + FT->getReturnType() != B.getInt8PtrTy() || + FT->getParamType(0) != FT->getReturnType() || + FT->getParamType(1) != FT->getReturnType()) + return 0; + + // Extract some information from the instruction + Value *Dst = CI->getArgOperand(0); + Value *Src = CI->getArgOperand(1); + + // See if we can get the length of the input string. + uint64_t Len = GetStringLength(Src); + if (Len == 0) return 0; + --Len; // Unbias length. + + // Handle the simple, do-nothing case: strcat(x, "") -> x + if (Len == 0) + return Dst; + + // These optimizations require DataLayout. + if (!TD) return 0; + + return emitStrLenMemCpy(Src, Dst, Len, B); + } + + Value *emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len, + IRBuilder<> &B) { + // We need to find the end of the destination string. That's where the + // memory is to be moved to. We just generate a call to strlen. + Value *DstLen = EmitStrLen(Dst, B, TD, TLI); + if (!DstLen) + return 0; + + // Now that we have the destination's length, we must index into the + // destination's pointer to get the actual memcpy destination (end of + // the string .. we're concatenating). + Value *CpyDst = B.CreateGEP(Dst, DstLen, "endptr"); + + // We have enough information to now generate the memcpy call to do the + // concatenation for us. Make a memcpy to copy the nul byte with align = 1. + B.CreateMemCpy(CpyDst, Src, + ConstantInt::get(TD->getIntPtrType(*Context), Len + 1), 1); + return Dst; + } +}; + +struct StrNCatOpt : public StrCatOpt { + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Verify the "strncat" function prototype. 
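// Illustration (not part of the patch): what emitStrLenMemCpy above lowers a
// strcat call to, written as plain C++. Len is the unbiased source length
// (strlen(Src)), as computed from GetStringLength; the +1 copies the nul.
// The helper name is hypothetical.
#include <cstring>

static char *strcatViaStrlenMemcpy(char *Dst, const char *Src, size_t Len) {
  char *CpyDst = Dst + std::strlen(Dst);  // find the end of the destination
  std::memcpy(CpyDst, Src, Len + 1);      // copy the source including its nul
  return Dst;                             // strcat returns Dst
}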
+ FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 3 || + FT->getReturnType() != B.getInt8PtrTy() || + FT->getParamType(0) != FT->getReturnType() || + FT->getParamType(1) != FT->getReturnType() || + !FT->getParamType(2)->isIntegerTy()) + return 0; + + // Extract some information from the instruction + Value *Dst = CI->getArgOperand(0); + Value *Src = CI->getArgOperand(1); + uint64_t Len; + + // We don't do anything if length is not constant + if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2))) + Len = LengthArg->getZExtValue(); + else + return 0; + + // See if we can get the length of the input string. + uint64_t SrcLen = GetStringLength(Src); + if (SrcLen == 0) return 0; + --SrcLen; // Unbias length. + + // Handle the simple, do-nothing cases: + // strncat(x, "", c) -> x + // strncat(x, c, 0) -> x + if (SrcLen == 0 || Len == 0) return Dst; + + // These optimizations require DataLayout. + if (!TD) return 0; + + // We don't optimize this case + if (Len < SrcLen) return 0; + + // strncat(x, s, c) -> strcat(x, s) + // s is constant so the strcat can be optimized further + return emitStrLenMemCpy(Src, Dst, SrcLen, B); + } +}; + +struct StrChrOpt : public LibCallOptimization { + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Verify the "strchr" function prototype. + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || + FT->getReturnType() != B.getInt8PtrTy() || + FT->getParamType(0) != FT->getReturnType() || + !FT->getParamType(1)->isIntegerTy(32)) + return 0; + + Value *SrcStr = CI->getArgOperand(0); + + // If the second operand is non-constant, see if we can compute the length + // of the input string and turn this into memchr. + ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); + if (CharC == 0) { + // These optimizations require DataLayout. + if (!TD) return 0; + + uint64_t Len = GetStringLength(SrcStr); + if (Len == 0 || !FT->getParamType(1)->isIntegerTy(32))// memchr needs i32. + return 0; + + return EmitMemChr(SrcStr, CI->getArgOperand(1), // include nul. + ConstantInt::get(TD->getIntPtrType(*Context), Len), + B, TD, TLI); + } + + // Otherwise, the character is a constant, see if the first argument is + // a string literal. If so, we can constant fold. + StringRef Str; + if (!getConstantStringInfo(SrcStr, Str)) + return 0; + + // Compute the offset, make sure to handle the case when we're searching for + // zero (a weird way to spell strlen). + size_t I = CharC->getSExtValue() == 0 ? + Str.size() : Str.find(CharC->getSExtValue()); + if (I == StringRef::npos) // Didn't find the char. strchr returns null. + return Constant::getNullValue(CI->getType()); + + // strchr(s+n,c) -> gep(s+n+i,c) + return B.CreateGEP(SrcStr, B.getInt64(I), "strchr"); + } +}; + +struct StrRChrOpt : public LibCallOptimization { + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Verify the "strrchr" function prototype. + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || + FT->getReturnType() != B.getInt8PtrTy() || + FT->getParamType(0) != FT->getReturnType() || + !FT->getParamType(1)->isIntegerTy(32)) + return 0; + + Value *SrcStr = CI->getArgOperand(0); + ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); + + // Cannot fold anything if we're not looking for a constant. 
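// Illustration (not part of the patch): the strchr constant fold above in
// standalone C++. With a known string and a known character, the offset is a
// simple find(); searching for '\0' resolves to the end of the string, which
// is the "weird way to spell strlen" mentioned in the comment. npos stands in
// for the null result. The helper name is hypothetical.
#include <cstddef>
#include <string_view>

static size_t foldStrChrOffset(std::string_view Str, char C) {
  return C == '\0' ? Str.size() : Str.find(C);  // npos => strchr returns null
}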
+ if (!CharC) + return 0; + + StringRef Str; + if (!getConstantStringInfo(SrcStr, Str)) { + // strrchr(s, 0) -> strchr(s, 0) + if (TD && CharC->isZero()) + return EmitStrChr(SrcStr, '\0', B, TD, TLI); + return 0; + } + + // Compute the offset. + size_t I = CharC->getSExtValue() == 0 ? + Str.size() : Str.rfind(CharC->getSExtValue()); + if (I == StringRef::npos) // Didn't find the char. Return null. + return Constant::getNullValue(CI->getType()); + + // strrchr(s+n,c) -> gep(s+n+i,c) + return B.CreateGEP(SrcStr, B.getInt64(I), "strrchr"); + } +}; + +struct StrCmpOpt : public LibCallOptimization { + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Verify the "strcmp" function prototype. + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || + !FT->getReturnType()->isIntegerTy(32) || + FT->getParamType(0) != FT->getParamType(1) || + FT->getParamType(0) != B.getInt8PtrTy()) + return 0; + + Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1); + if (Str1P == Str2P) // strcmp(x,x) -> 0 + return ConstantInt::get(CI->getType(), 0); + + StringRef Str1, Str2; + bool HasStr1 = getConstantStringInfo(Str1P, Str1); + bool HasStr2 = getConstantStringInfo(Str2P, Str2); + + // strcmp(x, y) -> cnst (if both x and y are constant strings) + if (HasStr1 && HasStr2) + return ConstantInt::get(CI->getType(), Str1.compare(Str2)); + + if (HasStr1 && Str1.empty()) // strcmp("", x) -> -*x + return B.CreateNeg(B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), + CI->getType())); + + if (HasStr2 && Str2.empty()) // strcmp(x,"") -> *x + return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType()); + + // strcmp(P, "x") -> memcmp(P, "x", 2) + uint64_t Len1 = GetStringLength(Str1P); + uint64_t Len2 = GetStringLength(Str2P); + if (Len1 && Len2) { + // These optimizations require DataLayout. + if (!TD) return 0; + + return EmitMemCmp(Str1P, Str2P, + ConstantInt::get(TD->getIntPtrType(*Context), + std::min(Len1, Len2)), B, TD, TLI); + } + + return 0; + } +}; + +struct StrNCmpOpt : public LibCallOptimization { + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Verify the "strncmp" function prototype. + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 3 || + !FT->getReturnType()->isIntegerTy(32) || + FT->getParamType(0) != FT->getParamType(1) || + FT->getParamType(0) != B.getInt8PtrTy() || + !FT->getParamType(2)->isIntegerTy()) + return 0; + + Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1); + if (Str1P == Str2P) // strncmp(x,x,n) -> 0 + return ConstantInt::get(CI->getType(), 0); + + // Get the length argument if it is constant. 
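// Illustration (not part of the patch): two of the strcmp folds above as
// standalone C++. Comparing against the empty string reduces to the (negated)
// first byte of the other operand, and two known strings fold to the literal
// comparison, matching Str1.compare(Str2). Helper names are hypothetical.
#include <string_view>

static int foldStrCmpEmptyFirst(const char *X) {  // strcmp("", x) -> -*x
  return -static_cast<int>(static_cast<unsigned char>(*X));
}

static int foldStrCmpConstants(std::string_view A, std::string_view B) {
  return A.compare(B);
}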
+ uint64_t Length; + if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2))) + Length = LengthArg->getZExtValue(); + else + return 0; + + if (Length == 0) // strncmp(x,y,0) -> 0 + return ConstantInt::get(CI->getType(), 0); + + if (TD && Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1) + return EmitMemCmp(Str1P, Str2P, CI->getArgOperand(2), B, TD, TLI); + + StringRef Str1, Str2; + bool HasStr1 = getConstantStringInfo(Str1P, Str1); + bool HasStr2 = getConstantStringInfo(Str2P, Str2); + + // strncmp(x, y) -> cnst (if both x and y are constant strings) + if (HasStr1 && HasStr2) { + StringRef SubStr1 = Str1.substr(0, Length); + StringRef SubStr2 = Str2.substr(0, Length); + return ConstantInt::get(CI->getType(), SubStr1.compare(SubStr2)); + } + + if (HasStr1 && Str1.empty()) // strncmp("", x, n) -> -*x + return B.CreateNeg(B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), + CI->getType())); + + if (HasStr2 && Str2.empty()) // strncmp(x, "", n) -> *x + return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType()); + + return 0; + } +}; + +struct StrCpyOpt : public LibCallOptimization { + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Verify the "strcpy" function prototype. + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || + FT->getReturnType() != FT->getParamType(0) || + FT->getParamType(0) != FT->getParamType(1) || + FT->getParamType(0) != B.getInt8PtrTy()) + return 0; + + Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); + if (Dst == Src) // strcpy(x,x) -> x + return Src; + + // These optimizations require DataLayout. + if (!TD) return 0; + + // See if we can get the length of the input string. + uint64_t Len = GetStringLength(Src); + if (Len == 0) return 0; + + // We have enough information to now generate the memcpy call to do the + // copy for us. Make a memcpy to copy the nul byte with align = 1. + B.CreateMemCpy(Dst, Src, + ConstantInt::get(TD->getIntPtrType(*Context), Len), 1); + return Dst; + } +}; + +struct StpCpyOpt: public LibCallOptimization { + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Verify the "stpcpy" function prototype. + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || + FT->getReturnType() != FT->getParamType(0) || + FT->getParamType(0) != FT->getParamType(1) || + FT->getParamType(0) != B.getInt8PtrTy()) + return 0; + + // These optimizations require DataLayout. + if (!TD) return 0; + + Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); + if (Dst == Src) { // stpcpy(x,x) -> x+strlen(x) + Value *StrLen = EmitStrLen(Src, B, TD, TLI); + return StrLen ? B.CreateInBoundsGEP(Dst, StrLen) : 0; + } + + // See if we can get the length of the input string. + uint64_t Len = GetStringLength(Src); + if (Len == 0) return 0; + + Type *PT = FT->getParamType(0); + Value *LenV = ConstantInt::get(TD->getIntPtrType(PT), Len); + Value *DstEnd = B.CreateGEP(Dst, + ConstantInt::get(TD->getIntPtrType(PT), + Len - 1)); + + // We have enough information to now generate the memcpy call to do the + // copy for us. Make a memcpy to copy the nul byte with align = 1. 
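// Illustration (not part of the patch): the strcpy/stpcpy lowering above in
// standalone C++. LenWithNul is the value GetStringLength returns (strlen + 1);
// both calls become a memcpy of that many bytes, and stpcpy additionally
// returns a pointer to the copied nul (Dst + LenWithNul - 1). Helper names are
// hypothetical.
#include <cstring>

static char *strcpyViaMemcpy(char *Dst, const char *Src, size_t LenWithNul) {
  std::memcpy(Dst, Src, LenWithNul);
  return Dst;                          // strcpy returns the destination
}

static char *stpcpyViaMemcpy(char *Dst, const char *Src, size_t LenWithNul) {
  std::memcpy(Dst, Src, LenWithNul);
  return Dst + LenWithNul - 1;         // stpcpy returns a pointer to the nul
}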
+ B.CreateMemCpy(Dst, Src, LenV, 1); + return DstEnd; + } +}; + +struct StrNCpyOpt : public LibCallOptimization { + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || + FT->getParamType(0) != FT->getParamType(1) || + FT->getParamType(0) != B.getInt8PtrTy() || + !FT->getParamType(2)->isIntegerTy()) + return 0; + + Value *Dst = CI->getArgOperand(0); + Value *Src = CI->getArgOperand(1); + Value *LenOp = CI->getArgOperand(2); + + // See if we can get the length of the input string. + uint64_t SrcLen = GetStringLength(Src); + if (SrcLen == 0) return 0; + --SrcLen; + + if (SrcLen == 0) { + // strncpy(x, "", y) -> memset(x, '\0', y, 1) + B.CreateMemSet(Dst, B.getInt8('\0'), LenOp, 1); + return Dst; + } + + uint64_t Len; + if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(LenOp)) + Len = LengthArg->getZExtValue(); + else + return 0; + + if (Len == 0) return Dst; // strncpy(x, y, 0) -> x + + // These optimizations require DataLayout. + if (!TD) return 0; + + // Let strncpy handle the zero padding + if (Len > SrcLen+1) return 0; + + Type *PT = FT->getParamType(0); + // strncpy(x, s, c) -> memcpy(x, s, c, 1) [s and c are constant] + B.CreateMemCpy(Dst, Src, + ConstantInt::get(TD->getIntPtrType(PT), Len), 1); + + return Dst; + } +}; + +struct StrLenOpt : public LibCallOptimization { + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 1 || + FT->getParamType(0) != B.getInt8PtrTy() || + !FT->getReturnType()->isIntegerTy()) + return 0; + + Value *Src = CI->getArgOperand(0); + + // Constant folding: strlen("xyz") -> 3 + if (uint64_t Len = GetStringLength(Src)) + return ConstantInt::get(CI->getType(), Len-1); + + // strlen(x) != 0 --> *x != 0 + // strlen(x) == 0 --> *x == 0 + if (isOnlyUsedInZeroEqualityComparison(CI)) + return B.CreateZExt(B.CreateLoad(Src, "strlenfirst"), CI->getType()); + return 0; + } +}; + +struct StrPBrkOpt : public LibCallOptimization { + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || + FT->getParamType(0) != B.getInt8PtrTy() || + FT->getParamType(1) != FT->getParamType(0) || + FT->getReturnType() != FT->getParamType(0)) + return 0; + + StringRef S1, S2; + bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1); + bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2); + + // strpbrk(s, "") -> NULL + // strpbrk("", s) -> NULL + if ((HasS1 && S1.empty()) || (HasS2 && S2.empty())) + return Constant::getNullValue(CI->getType()); + + // Constant folding. + if (HasS1 && HasS2) { + size_t I = S1.find_first_of(S2); + if (I == std::string::npos) // No match. 
+ return Constant::getNullValue(CI->getType()); + + return B.CreateGEP(CI->getArgOperand(0), B.getInt64(I), "strpbrk"); + } + + // strpbrk(s, "a") -> strchr(s, 'a') + if (TD && HasS2 && S2.size() == 1) + return EmitStrChr(CI->getArgOperand(0), S2[0], B, TD, TLI); + + return 0; + } +}; + +struct StrToOpt : public LibCallOptimization { + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + FunctionType *FT = Callee->getFunctionType(); + if ((FT->getNumParams() != 2 && FT->getNumParams() != 3) || + !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isPointerTy()) + return 0; + + Value *EndPtr = CI->getArgOperand(1); + if (isa<ConstantPointerNull>(EndPtr)) { + // With a null EndPtr, this function won't capture the main argument. + // It would be readonly too, except that it still may write to errno. + CI->addAttribute(1, Attribute::get(Callee->getContext(), + Attribute::NoCapture)); + } + + return 0; + } +}; + +struct StrSpnOpt : public LibCallOptimization { + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || + FT->getParamType(0) != B.getInt8PtrTy() || + FT->getParamType(1) != FT->getParamType(0) || + !FT->getReturnType()->isIntegerTy()) + return 0; + + StringRef S1, S2; + bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1); + bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2); + + // strspn(s, "") -> 0 + // strspn("", s) -> 0 + if ((HasS1 && S1.empty()) || (HasS2 && S2.empty())) + return Constant::getNullValue(CI->getType()); + + // Constant folding. + if (HasS1 && HasS2) { + size_t Pos = S1.find_first_not_of(S2); + if (Pos == StringRef::npos) Pos = S1.size(); + return ConstantInt::get(CI->getType(), Pos); + } + + return 0; + } +}; + +struct StrCSpnOpt : public LibCallOptimization { + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || + FT->getParamType(0) != B.getInt8PtrTy() || + FT->getParamType(1) != FT->getParamType(0) || + !FT->getReturnType()->isIntegerTy()) + return 0; + + StringRef S1, S2; + bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1); + bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2); + + // strcspn("", s) -> 0 + if (HasS1 && S1.empty()) + return Constant::getNullValue(CI->getType()); + + // Constant folding. + if (HasS1 && HasS2) { + size_t Pos = S1.find_first_of(S2); + if (Pos == StringRef::npos) Pos = S1.size(); + return ConstantInt::get(CI->getType(), Pos); + } + + // strcspn(s, "") -> strlen(s) + if (TD && HasS2 && S2.empty()) + return EmitStrLen(CI->getArgOperand(0), B, TD, TLI); + + return 0; + } +}; + +struct StrStrOpt : public LibCallOptimization { + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || + !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isPointerTy() || + !FT->getReturnType()->isPointerTy()) + return 0; + + // fold strstr(x, x) -> x. 
+ if (CI->getArgOperand(0) == CI->getArgOperand(1)) + return B.CreateBitCast(CI->getArgOperand(0), CI->getType()); + + // fold strstr(a, b) == a -> strncmp(a, b, strlen(b)) == 0 + if (TD && isOnlyUsedInEqualityComparison(CI, CI->getArgOperand(0))) { + Value *StrLen = EmitStrLen(CI->getArgOperand(1), B, TD, TLI); + if (!StrLen) + return 0; + Value *StrNCmp = EmitStrNCmp(CI->getArgOperand(0), CI->getArgOperand(1), + StrLen, B, TD, TLI); + if (!StrNCmp) + return 0; + for (Value::use_iterator UI = CI->use_begin(), UE = CI->use_end(); + UI != UE; ) { + ICmpInst *Old = cast<ICmpInst>(*UI++); + Value *Cmp = B.CreateICmp(Old->getPredicate(), StrNCmp, + ConstantInt::getNullValue(StrNCmp->getType()), + "cmp"); + LCS->replaceAllUsesWith(Old, Cmp); + } + return CI; + } + + // See if either input string is a constant string. + StringRef SearchStr, ToFindStr; + bool HasStr1 = getConstantStringInfo(CI->getArgOperand(0), SearchStr); + bool HasStr2 = getConstantStringInfo(CI->getArgOperand(1), ToFindStr); + + // fold strstr(x, "") -> x. + if (HasStr2 && ToFindStr.empty()) + return B.CreateBitCast(CI->getArgOperand(0), CI->getType()); + + // If both strings are known, constant fold it. + if (HasStr1 && HasStr2) { + std::string::size_type Offset = SearchStr.find(ToFindStr); + + if (Offset == StringRef::npos) // strstr("foo", "bar") -> null + return Constant::getNullValue(CI->getType()); + + // strstr("abcd", "bc") -> gep((char*)"abcd", 1) + Value *Result = CastToCStr(CI->getArgOperand(0), B); + Result = B.CreateConstInBoundsGEP1_64(Result, Offset, "strstr"); + return B.CreateBitCast(Result, CI->getType()); + } + + // fold strstr(x, "y") -> strchr(x, 'y'). + if (HasStr2 && ToFindStr.size() == 1) { + Value *StrChr= EmitStrChr(CI->getArgOperand(0), ToFindStr[0], B, TD, TLI); + return StrChr ? B.CreateBitCast(StrChr, CI->getType()) : 0; + } + return 0; + } +}; + +struct MemCmpOpt : public LibCallOptimization { + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 3 || !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isPointerTy() || + !FT->getReturnType()->isIntegerTy(32)) + return 0; + + Value *LHS = CI->getArgOperand(0), *RHS = CI->getArgOperand(1); + + if (LHS == RHS) // memcmp(s,s,x) -> 0 + return Constant::getNullValue(CI->getType()); + + // Make sure we have a constant length. + ConstantInt *LenC = dyn_cast<ConstantInt>(CI->getArgOperand(2)); + if (!LenC) return 0; + uint64_t Len = LenC->getZExtValue(); + + if (Len == 0) // memcmp(s1,s2,0) -> 0 + return Constant::getNullValue(CI->getType()); + + // memcmp(S1,S2,1) -> *(unsigned char*)LHS - *(unsigned char*)RHS + if (Len == 1) { + Value *LHSV = B.CreateZExt(B.CreateLoad(CastToCStr(LHS, B), "lhsc"), + CI->getType(), "lhsv"); + Value *RHSV = B.CreateZExt(B.CreateLoad(CastToCStr(RHS, B), "rhsc"), + CI->getType(), "rhsv"); + return B.CreateSub(LHSV, RHSV, "chardiff"); + } + + // Constant folding: memcmp(x, y, l) -> cnst (all arguments are constant) + StringRef LHSStr, RHSStr; + if (getConstantStringInfo(LHS, LHSStr) && + getConstantStringInfo(RHS, RHSStr)) { + // Make sure we're not reading out-of-bounds memory. + if (Len > LHSStr.size() || Len > RHSStr.size()) + return 0; + // Fold the memcmp and normalize the result. This way we get consistent + // results across multiple platforms. 
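Host memcmp is only specified up to the sign of its result, so folding with the raw value would bake whichever magnitude the build machine's C library happens to return into the IR; that is what the normalization just below avoids. A standalone sketch of the same clamping (the helper name is invented, not patch code):

    #include <cassert>
    #include <cstring>

    // Clamp an implementation-defined memcmp result to {-1, 0, +1} so a folded
    // constant does not depend on which C library compiled the compiler.
    static int normalizedMemCmp(const void *A, const void *B, std::size_t N) {
      int Raw = std::memcmp(A, B, N); // sign is specified, magnitude is not
      return (Raw > 0) - (Raw < 0);
    }

    int main() {
      assert(normalizedMemCmp("abc", "abd", 3) == -1);
      assert(normalizedMemCmp("abc", "abc", 3) == 0);
      return 0;
    }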
+ uint64_t Ret = 0; + int Cmp = memcmp(LHSStr.data(), RHSStr.data(), Len); + if (Cmp < 0) + Ret = -1; + else if (Cmp > 0) + Ret = 1; + return ConstantInt::get(CI->getType(), Ret); + } + + return 0; + } +}; + +struct MemCpyOpt : public LibCallOptimization { + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // These optimizations require DataLayout. + if (!TD) return 0; + + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || + !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isPointerTy() || + FT->getParamType(2) != TD->getIntPtrType(*Context)) + return 0; + + // memcpy(x, y, n) -> llvm.memcpy(x, y, n, 1) + B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), 1); + return CI->getArgOperand(0); + } +}; + +struct MemMoveOpt : public LibCallOptimization { + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // These optimizations require DataLayout. + if (!TD) return 0; + + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || + !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isPointerTy() || + FT->getParamType(2) != TD->getIntPtrType(*Context)) + return 0; + + // memmove(x, y, n) -> llvm.memmove(x, y, n, 1) + B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), 1); + return CI->getArgOperand(0); + } +}; + +struct MemSetOpt : public LibCallOptimization { + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // These optimizations require DataLayout. + if (!TD) return 0; + + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || + !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isIntegerTy() || + FT->getParamType(2) != TD->getIntPtrType(*Context)) + return 0; + + // memset(p, v, n) -> llvm.memset(p, v, n, 1) + Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false); + B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1); + return CI->getArgOperand(0); + } +}; + +//===----------------------------------------------------------------------===// +// Math Library Optimizations +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Double -> Float Shrinking Optimizations for Unary Functions like 'floor' + +struct UnaryDoubleFPOpt : public LibCallOptimization { + bool CheckRetType; + UnaryDoubleFPOpt(bool CheckReturnType): CheckRetType(CheckReturnType) {} + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 1 || !FT->getReturnType()->isDoubleTy() || + !FT->getParamType(0)->isDoubleTy()) + return 0; + + if (CheckRetType) { + // Check if all the uses for function like 'sin' are converted to float. + for (Value::use_iterator UseI = CI->use_begin(); UseI != CI->use_end(); + ++UseI) { + FPTruncInst *Cast = dyn_cast<FPTruncInst>(*UseI); + if (Cast == 0 || !Cast->getType()->isFloatTy()) + return 0; + } + } + + // If this is something like 'floor((double)floatval)', convert to floorf. 
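In source terms, the shrink described by that comment replaces a widen-then-call-on-double pattern with a call at float precision followed by a single widen of the result; for the rounding and absolute-value functions on the safe list the two forms produce the same value. A hypothetical before/after, not code from the patch:

    #include <cmath>

    // Before: the float argument is widened and floor runs at double precision.
    double roundedDown(float X) { return std::floor(static_cast<double>(X)); }

    // After the shrink: evaluate at float precision (floorf at the libcall
    // level) and widen only the result, as EmitUnaryFloatFnCall + CreateFPExt
    // do in the handler. For floor the result is bit-identical.
    double roundedDownShrunk(float X) {
      return static_cast<double>(std::floor(X)); // float overload == floorf
    }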
+ FPExtInst *Cast = dyn_cast<FPExtInst>(CI->getArgOperand(0)); + if (Cast == 0 || !Cast->getOperand(0)->getType()->isFloatTy()) + return 0; + + // floor((double)floatval) -> (double)floorf(floatval) + Value *V = Cast->getOperand(0); + V = EmitUnaryFloatFnCall(V, Callee->getName(), B, Callee->getAttributes()); + return B.CreateFPExt(V, B.getDoubleTy()); + } +}; + +struct UnsafeFPLibCallOptimization : public LibCallOptimization { + bool UnsafeFPShrink; + UnsafeFPLibCallOptimization(bool UnsafeFPShrink) { + this->UnsafeFPShrink = UnsafeFPShrink; + } +}; + +struct CosOpt : public UnsafeFPLibCallOptimization { + CosOpt(bool UnsafeFPShrink) : UnsafeFPLibCallOptimization(UnsafeFPShrink) {} + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *Ret = NULL; + if (UnsafeFPShrink && Callee->getName() == "cos" && + TLI->has(LibFunc::cosf)) { + UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true); + Ret = UnsafeUnaryDoubleFP.callOptimizer(Callee, CI, B); + } + + FunctionType *FT = Callee->getFunctionType(); + // Just make sure this has 1 argument of FP type, which matches the + // result type. + if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || + !FT->getParamType(0)->isFloatingPointTy()) + return Ret; + + // cos(-x) -> cos(x) + Value *Op1 = CI->getArgOperand(0); + if (BinaryOperator::isFNeg(Op1)) { + BinaryOperator *BinExpr = cast<BinaryOperator>(Op1); + return B.CreateCall(Callee, BinExpr->getOperand(1), "cos"); + } + return Ret; + } +}; + +struct PowOpt : public UnsafeFPLibCallOptimization { + PowOpt(bool UnsafeFPShrink) : UnsafeFPLibCallOptimization(UnsafeFPShrink) {} + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *Ret = NULL; + if (UnsafeFPShrink && Callee->getName() == "pow" && + TLI->has(LibFunc::powf)) { + UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true); + Ret = UnsafeUnaryDoubleFP.callOptimizer(Callee, CI, B); + } + + FunctionType *FT = Callee->getFunctionType(); + // Just make sure this has 2 arguments of the same FP type, which match the + // result type. + if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) || + FT->getParamType(0) != FT->getParamType(1) || + !FT->getParamType(0)->isFloatingPointTy()) + return Ret; + + Value *Op1 = CI->getArgOperand(0), *Op2 = CI->getArgOperand(1); + if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) { + if (Op1C->isExactlyValue(1.0)) // pow(1.0, x) -> 1.0 + return Op1C; + if (Op1C->isExactlyValue(2.0)) // pow(2.0, x) -> exp2(x) + return EmitUnaryFloatFnCall(Op2, "exp2", B, Callee->getAttributes()); + } + + ConstantFP *Op2C = dyn_cast<ConstantFP>(Op2); + if (Op2C == 0) return Ret; + + if (Op2C->getValueAPF().isZero()) // pow(x, 0.0) -> 1.0 + return ConstantFP::get(CI->getType(), 1.0); + + if (Op2C->isExactlyValue(0.5)) { + // Expand pow(x, 0.5) to (x == -infinity ? +infinity : fabs(sqrt(x))). + // This is faster than calling pow, and still handles negative zero + // and negative infinity correctly. + // TODO: In fast-math mode, this could be just sqrt(x). + // TODO: In finite-only mode, this could be just fabs(sqrt(x)). 
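Concretely, C99 defines pow(-0.0, 0.5) as +0.0 and pow(-inf, 0.5) as +inf, while sqrt(-0.0) is -0.0 and sqrt(-inf) is NaN; that is why the expansion built just below wraps the sqrt in fabs and selects +inf explicitly. A self-contained check of the same expansion (illustrative only, not patch code):

    #include <cassert>
    #include <cmath>
    #include <limits>

    // Source-level model of the expansion emitted below:
    //   pow(x, 0.5)  ==>  x == -inf ? +inf : fabs(sqrt(x))
    static double powHalf(double X) {
      const double Inf = std::numeric_limits<double>::infinity();
      return X == -Inf ? Inf : std::fabs(std::sqrt(X));
    }

    int main() {
      const double Inf = std::numeric_limits<double>::infinity();
      assert(powHalf(4.0) == 2.0);
      assert(powHalf(-0.0) == 0.0 && !std::signbit(powHalf(-0.0))); // +0.0, like pow
      assert(powHalf(-Inf) == Inf);                                 // +inf, like pow
      return 0;
    }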
+ Value *Inf = ConstantFP::getInfinity(CI->getType()); + Value *NegInf = ConstantFP::getInfinity(CI->getType(), true); + Value *Sqrt = EmitUnaryFloatFnCall(Op1, "sqrt", B, + Callee->getAttributes()); + Value *FAbs = EmitUnaryFloatFnCall(Sqrt, "fabs", B, + Callee->getAttributes()); + Value *FCmp = B.CreateFCmpOEQ(Op1, NegInf); + Value *Sel = B.CreateSelect(FCmp, Inf, FAbs); + return Sel; + } + + if (Op2C->isExactlyValue(1.0)) // pow(x, 1.0) -> x + return Op1; + if (Op2C->isExactlyValue(2.0)) // pow(x, 2.0) -> x*x + return B.CreateFMul(Op1, Op1, "pow2"); + if (Op2C->isExactlyValue(-1.0)) // pow(x, -1.0) -> 1.0/x + return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), + Op1, "powrecip"); + return 0; + } +}; + +struct Exp2Opt : public UnsafeFPLibCallOptimization { + Exp2Opt(bool UnsafeFPShrink) : UnsafeFPLibCallOptimization(UnsafeFPShrink) {} + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *Ret = NULL; + if (UnsafeFPShrink && Callee->getName() == "exp2" && + TLI->has(LibFunc::exp2)) { + UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true); + Ret = UnsafeUnaryDoubleFP.callOptimizer(Callee, CI, B); + } + + FunctionType *FT = Callee->getFunctionType(); + // Just make sure this has 1 argument of FP type, which matches the + // result type. + if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || + !FT->getParamType(0)->isFloatingPointTy()) + return Ret; + + Value *Op = CI->getArgOperand(0); + // Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x)) if sizeof(x) <= 32 + // Turn exp2(uitofp(x)) -> ldexp(1.0, zext(x)) if sizeof(x) < 32 + Value *LdExpArg = 0; + if (SIToFPInst *OpC = dyn_cast<SIToFPInst>(Op)) { + if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() <= 32) + LdExpArg = B.CreateSExt(OpC->getOperand(0), B.getInt32Ty()); + } else if (UIToFPInst *OpC = dyn_cast<UIToFPInst>(Op)) { + if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() < 32) + LdExpArg = B.CreateZExt(OpC->getOperand(0), B.getInt32Ty()); + } + + if (LdExpArg) { + const char *Name; + if (Op->getType()->isFloatTy()) + Name = "ldexpf"; + else if (Op->getType()->isDoubleTy()) + Name = "ldexp"; + else + Name = "ldexpl"; + + Constant *One = ConstantFP::get(*Context, APFloat(1.0f)); + if (!Op->getType()->isFloatTy()) + One = ConstantExpr::getFPExtend(One, Op->getType()); + + Module *M = Caller->getParent(); + Value *Callee = M->getOrInsertFunction(Name, Op->getType(), + Op->getType(), + B.getInt32Ty(), NULL); + CallInst *CI = B.CreateCall2(Callee, One, LdExpArg); + if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts())) + CI->setCallingConv(F->getCallingConv()); + + return CI; + } + return Ret; + } +}; + +//===----------------------------------------------------------------------===// +// Integer Library Call Optimizations +//===----------------------------------------------------------------------===// + +struct FFSOpt : public LibCallOptimization { + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + FunctionType *FT = Callee->getFunctionType(); + // Just make sure this has 2 arguments of the same FP type, which match the + // result type. + if (FT->getNumParams() != 1 || + !FT->getReturnType()->isIntegerTy(32) || + !FT->getParamType(0)->isIntegerTy()) + return 0; + + Value *Op = CI->getArgOperand(0); + + // Constant fold. + if (ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { + if (CI->isZero()) // ffs(0) -> 0. 
+ return B.getInt32(0); + // ffs(c) -> cttz(c)+1 + return B.getInt32(CI->getValue().countTrailingZeros() + 1); + } + + // ffs(x) -> x != 0 ? (i32)llvm.cttz(x)+1 : 0 + Type *ArgType = Op->getType(); + Value *F = Intrinsic::getDeclaration(Callee->getParent(), + Intrinsic::cttz, ArgType); + Value *V = B.CreateCall2(F, Op, B.getFalse(), "cttz"); + V = B.CreateAdd(V, ConstantInt::get(V->getType(), 1)); + V = B.CreateIntCast(V, B.getInt32Ty(), false); + + Value *Cond = B.CreateICmpNE(Op, Constant::getNullValue(ArgType)); + return B.CreateSelect(Cond, V, B.getInt32(0)); + } +}; + +struct AbsOpt : public LibCallOptimization { + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + FunctionType *FT = Callee->getFunctionType(); + // We require integer(integer) where the types agree. + if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() || + FT->getParamType(0) != FT->getReturnType()) + return 0; + + // abs(x) -> x >s -1 ? x : -x + Value *Op = CI->getArgOperand(0); + Value *Pos = B.CreateICmpSGT(Op, Constant::getAllOnesValue(Op->getType()), + "ispos"); + Value *Neg = B.CreateNeg(Op, "neg"); + return B.CreateSelect(Pos, Op, Neg); + } +}; + +struct IsDigitOpt : public LibCallOptimization { + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + FunctionType *FT = Callee->getFunctionType(); + // We require integer(i32) + if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() || + !FT->getParamType(0)->isIntegerTy(32)) + return 0; + + // isdigit(c) -> (c-'0') <u 10 + Value *Op = CI->getArgOperand(0); + Op = B.CreateSub(Op, B.getInt32('0'), "isdigittmp"); + Op = B.CreateICmpULT(Op, B.getInt32(10), "isdigit"); + return B.CreateZExt(Op, CI->getType()); + } +}; + +struct IsAsciiOpt : public LibCallOptimization { + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + FunctionType *FT = Callee->getFunctionType(); + // We require integer(i32) + if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() || + !FT->getParamType(0)->isIntegerTy(32)) + return 0; + + // isascii(c) -> c <u 128 + Value *Op = CI->getArgOperand(0); + Op = B.CreateICmpULT(Op, B.getInt32(128), "isascii"); + return B.CreateZExt(Op, CI->getType()); + } +}; + +struct ToAsciiOpt : public LibCallOptimization { + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + FunctionType *FT = Callee->getFunctionType(); + // We require i32(i32) + if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || + !FT->getParamType(0)->isIntegerTy(32)) + return 0; + + // toascii(c) -> c & 0x7f + return B.CreateAnd(CI->getArgOperand(0), + ConstantInt::get(CI->getType(),0x7F)); + } +}; + +//===----------------------------------------------------------------------===// +// Formatting and IO Library Call Optimizations +//===----------------------------------------------------------------------===// + +struct PrintFOpt : public LibCallOptimization { + Value *optimizeFixedFormatString(Function *Callee, CallInst *CI, + IRBuilder<> &B) { + // Check for a fixed format string. + StringRef FormatStr; + if (!getConstantStringInfo(CI->getArgOperand(0), FormatStr)) + return 0; + + // Empty format string -> noop. + if (FormatStr.empty()) // Tolerate printf's declared void. + return CI->use_empty() ? 
(Value*)CI : + ConstantInt::get(CI->getType(), 0); + + // Do not do any of the following transformations if the printf return value + // is used, in general the printf return value is not compatible with either + // putchar() or puts(). + if (!CI->use_empty()) + return 0; + + // printf("x") -> putchar('x'), even for '%'. + if (FormatStr.size() == 1) { + Value *Res = EmitPutChar(B.getInt32(FormatStr[0]), B, TD, TLI); + if (CI->use_empty() || !Res) return Res; + return B.CreateIntCast(Res, CI->getType(), true); + } + + // printf("foo\n") --> puts("foo") + if (FormatStr[FormatStr.size()-1] == '\n' && + FormatStr.find('%') == std::string::npos) { // no format characters. + // Create a string literal with no \n on it. We expect the constant merge + // pass to be run after this pass, to merge duplicate strings. + FormatStr = FormatStr.drop_back(); + Value *GV = B.CreateGlobalString(FormatStr, "str"); + Value *NewCI = EmitPutS(GV, B, TD, TLI); + return (CI->use_empty() || !NewCI) ? + NewCI : + ConstantInt::get(CI->getType(), FormatStr.size()+1); + } + + // Optimize specific format strings. + // printf("%c", chr) --> putchar(chr) + if (FormatStr == "%c" && CI->getNumArgOperands() > 1 && + CI->getArgOperand(1)->getType()->isIntegerTy()) { + Value *Res = EmitPutChar(CI->getArgOperand(1), B, TD, TLI); + + if (CI->use_empty() || !Res) return Res; + return B.CreateIntCast(Res, CI->getType(), true); + } + + // printf("%s\n", str) --> puts(str) + if (FormatStr == "%s\n" && CI->getNumArgOperands() > 1 && + CI->getArgOperand(1)->getType()->isPointerTy()) { + return EmitPutS(CI->getArgOperand(1), B, TD, TLI); + } + return 0; + } + + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Require one fixed pointer argument and an integer/void result. + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() || + !(FT->getReturnType()->isIntegerTy() || + FT->getReturnType()->isVoidTy())) + return 0; + + if (Value *V = optimizeFixedFormatString(Callee, CI, B)) { + return V; + } + + // printf(format, ...) -> iprintf(format, ...) if no floating point + // arguments. + if (TLI->has(LibFunc::iprintf) && !callHasFloatingPointArgument(CI)) { + Module *M = B.GetInsertBlock()->getParent()->getParent(); + Constant *IPrintFFn = + M->getOrInsertFunction("iprintf", FT, Callee->getAttributes()); + CallInst *New = cast<CallInst>(CI->clone()); + New->setCalledFunction(IPrintFFn); + B.Insert(New); + return New; + } + return 0; + } +}; + +struct SPrintFOpt : public LibCallOptimization { + Value *OptimizeFixedFormatString(Function *Callee, CallInst *CI, + IRBuilder<> &B) { + // Check for a fixed format string. + StringRef FormatStr; + if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr)) + return 0; + + // If we just have a format string (nothing else crazy) transform it. + if (CI->getNumArgOperands() == 2) { + // Make sure there's no % in the constant array. We could try to handle + // %% -> % in the future if we cared. + for (unsigned i = 0, e = FormatStr.size(); i != e; ++i) + if (FormatStr[i] == '%') + return 0; // we found a format specifier, bail out. + + // These optimizations require DataLayout. + if (!TD) return 0; + + // sprintf(str, fmt) -> llvm.memcpy(str, fmt, strlen(fmt)+1, 1) + B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), + ConstantInt::get(TD->getIntPtrType(*Context), // Copy the + FormatStr.size() + 1), 1); // nul byte. 
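The constant returned next is deliberately one smaller than the number of bytes copied above: sprintf reports the characters produced excluding the terminating nul, while the memcpy must include it. In plain C terms (not patch code):

    #include <cassert>
    #include <cstdio>
    #include <cstring>

    int main() {
      char Buf[16];
      int Ret = std::sprintf(Buf, "hello");      // no '%' in the format string
      assert(Ret == 5);                          // strlen("hello"), the folded result
      assert(std::memcmp(Buf, "hello", 6) == 0); // 6 bytes written, nul included
      return 0;
    }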
+ return ConstantInt::get(CI->getType(), FormatStr.size()); + } + + // The remaining optimizations require the format string to be "%s" or "%c" + // and have an extra operand. + if (FormatStr.size() != 2 || FormatStr[0] != '%' || + CI->getNumArgOperands() < 3) + return 0; + + // Decode the second character of the format string. + if (FormatStr[1] == 'c') { + // sprintf(dst, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0 + if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return 0; + Value *V = B.CreateTrunc(CI->getArgOperand(2), B.getInt8Ty(), "char"); + Value *Ptr = CastToCStr(CI->getArgOperand(0), B); + B.CreateStore(V, Ptr); + Ptr = B.CreateGEP(Ptr, B.getInt32(1), "nul"); + B.CreateStore(B.getInt8(0), Ptr); + + return ConstantInt::get(CI->getType(), 1); + } + + if (FormatStr[1] == 's') { + // These optimizations require DataLayout. + if (!TD) return 0; + + // sprintf(dest, "%s", str) -> llvm.memcpy(dest, str, strlen(str)+1, 1) + if (!CI->getArgOperand(2)->getType()->isPointerTy()) return 0; + + Value *Len = EmitStrLen(CI->getArgOperand(2), B, TD, TLI); + if (!Len) + return 0; + Value *IncLen = B.CreateAdd(Len, + ConstantInt::get(Len->getType(), 1), + "leninc"); + B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(2), IncLen, 1); + + // The sprintf result is the unincremented number of bytes in the string. + return B.CreateIntCast(Len, CI->getType(), false); + } + return 0; + } + + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Require two fixed pointer arguments and an integer result. + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isPointerTy() || + !FT->getReturnType()->isIntegerTy()) + return 0; + + if (Value *V = OptimizeFixedFormatString(Callee, CI, B)) { + return V; + } + + // sprintf(str, format, ...) -> siprintf(str, format, ...) if no floating + // point arguments. + if (TLI->has(LibFunc::siprintf) && !callHasFloatingPointArgument(CI)) { + Module *M = B.GetInsertBlock()->getParent()->getParent(); + Constant *SIPrintFFn = + M->getOrInsertFunction("siprintf", FT, Callee->getAttributes()); + CallInst *New = cast<CallInst>(CI->clone()); + New->setCalledFunction(SIPrintFFn); + B.Insert(New); + return New; + } + return 0; + } +}; + +struct FPrintFOpt : public LibCallOptimization { + Value *optimizeFixedFormatString(Function *Callee, CallInst *CI, + IRBuilder<> &B) { + // All the optimizations depend on the format string. + StringRef FormatStr; + if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr)) + return 0; + + // fprintf(F, "foo") --> fwrite("foo", 3, 1, F) + if (CI->getNumArgOperands() == 2) { + for (unsigned i = 0, e = FormatStr.size(); i != e; ++i) + if (FormatStr[i] == '%') // Could handle %% -> % if we cared. + return 0; // We found a format specifier. + + // These optimizations require DataLayout. + if (!TD) return 0; + + Value *NewCI = EmitFWrite(CI->getArgOperand(1), + ConstantInt::get(TD->getIntPtrType(*Context), + FormatStr.size()), + CI->getArgOperand(0), B, TD, TLI); + return NewCI ? ConstantInt::get(CI->getType(), FormatStr.size()) : 0; + } + + // The remaining optimizations require the format string to be "%s" or "%c" + // and have an extra operand. + if (FormatStr.size() != 2 || FormatStr[0] != '%' || + CI->getNumArgOperands() < 3) + return 0; + + // Decode the second character of the format string. 
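One asymmetry worth noting before the two cases below: for "%c" the original call's result is statically known to be 1, so it can be replaced by a constant, whereas for "%s" it would be the length of the string argument, which fputs does not report; that is why the %s form is only taken when the call's result is unused. A small illustration (the helper is hypothetical):

    #include <cstdio>

    // fputs is a drop-in for fprintf(F, "%s", S) only when the result is
    // ignored: fprintf would return strlen(S), while fputs merely returns a
    // nonnegative value on success.
    void writeLine(std::FILE *F, const char *S) {
      std::fputs(S, F);                       // result unused: replaceable
      // int N = std::fprintf(F, "%s", S);    // N == strlen(S): not replaceable
      std::fputc('\n', F);                    // fprintf(F, "%c", '\n') returns 1,
    }                                         // a constant, so uses are tolerable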
+ if (FormatStr[1] == 'c') { + // fprintf(F, "%c", chr) --> fputc(chr, F) + if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return 0; + Value *NewCI = EmitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, + TD, TLI); + return NewCI ? ConstantInt::get(CI->getType(), 1) : 0; + } + + if (FormatStr[1] == 's') { + // fprintf(F, "%s", str) --> fputs(str, F) + if (!CI->getArgOperand(2)->getType()->isPointerTy() || !CI->use_empty()) + return 0; + return EmitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TD, TLI); + } + return 0; + } + + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Require two fixed paramters as pointers and integer result. + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isPointerTy() || + !FT->getReturnType()->isIntegerTy()) + return 0; + + if (Value *V = optimizeFixedFormatString(Callee, CI, B)) { + return V; + } + + // fprintf(stream, format, ...) -> fiprintf(stream, format, ...) if no + // floating point arguments. + if (TLI->has(LibFunc::fiprintf) && !callHasFloatingPointArgument(CI)) { + Module *M = B.GetInsertBlock()->getParent()->getParent(); + Constant *FIPrintFFn = + M->getOrInsertFunction("fiprintf", FT, Callee->getAttributes()); + CallInst *New = cast<CallInst>(CI->clone()); + New->setCalledFunction(FIPrintFFn); + B.Insert(New); + return New; + } + return 0; + } +}; + +struct FWriteOpt : public LibCallOptimization { + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Require a pointer, an integer, an integer, a pointer, returning integer. + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 4 || !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isIntegerTy() || + !FT->getParamType(2)->isIntegerTy() || + !FT->getParamType(3)->isPointerTy() || + !FT->getReturnType()->isIntegerTy()) + return 0; + + // Get the element size and count. + ConstantInt *SizeC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); + ConstantInt *CountC = dyn_cast<ConstantInt>(CI->getArgOperand(2)); + if (!SizeC || !CountC) return 0; + uint64_t Bytes = SizeC->getZExtValue()*CountC->getZExtValue(); + + // If this is writing zero records, remove the call (it's a noop). + if (Bytes == 0) + return ConstantInt::get(CI->getType(), 0); + + // If this is writing one byte, turn it into fputc. + // This optimisation is only valid, if the return value is unused. + if (Bytes == 1 && CI->use_empty()) { // fwrite(S,1,1,F) -> fputc(S[0],F) + Value *Char = B.CreateLoad(CastToCStr(CI->getArgOperand(0), B), "char"); + Value *NewCI = EmitFPutC(Char, CI->getArgOperand(3), B, TD, TLI); + return NewCI ? ConstantInt::get(CI->getType(), 1) : 0; + } + + return 0; + } +}; + +struct FPutsOpt : public LibCallOptimization { + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // These optimizations require DataLayout. + if (!TD) return 0; + + // Require two pointers. Also, we can't optimize if return value is used. + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isPointerTy() || + !CI->use_empty()) + return 0; + + // fputs(s,F) --> fwrite(s,1,strlen(s),F) + uint64_t Len = GetStringLength(CI->getArgOperand(0)); + if (!Len) return 0; + // Known to have no uses (see above). 
+ return EmitFWrite(CI->getArgOperand(0), + ConstantInt::get(TD->getIntPtrType(*Context), Len-1), + CI->getArgOperand(1), B, TD, TLI); + } +}; + +struct PutsOpt : public LibCallOptimization { + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Require one fixed pointer argument and an integer/void result. + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() || + !(FT->getReturnType()->isIntegerTy() || + FT->getReturnType()->isVoidTy())) + return 0; + + // Check for a constant string. + StringRef Str; + if (!getConstantStringInfo(CI->getArgOperand(0), Str)) + return 0; + + if (Str.empty() && CI->use_empty()) { + // puts("") -> putchar('\n') + Value *Res = EmitPutChar(B.getInt32('\n'), B, TD, TLI); + if (CI->use_empty() || !Res) return Res; + return B.CreateIntCast(Res, CI->getType(), true); + } + + return 0; + } +}; + +} // End anonymous namespace. + +namespace llvm { + +class LibCallSimplifierImpl { + const DataLayout *TD; + const TargetLibraryInfo *TLI; + const LibCallSimplifier *LCS; + bool UnsafeFPShrink; + StringMap<LibCallOptimization*> Optimizations; + + // Fortified library call optimizations. + MemCpyChkOpt MemCpyChk; + MemMoveChkOpt MemMoveChk; + MemSetChkOpt MemSetChk; + StrCpyChkOpt StrCpyChk; + StpCpyChkOpt StpCpyChk; + StrNCpyChkOpt StrNCpyChk; + + // String library call optimizations. + StrCatOpt StrCat; + StrNCatOpt StrNCat; + StrChrOpt StrChr; + StrRChrOpt StrRChr; + StrCmpOpt StrCmp; + StrNCmpOpt StrNCmp; + StrCpyOpt StrCpy; + StpCpyOpt StpCpy; + StrNCpyOpt StrNCpy; + StrLenOpt StrLen; + StrPBrkOpt StrPBrk; + StrToOpt StrTo; + StrSpnOpt StrSpn; + StrCSpnOpt StrCSpn; + StrStrOpt StrStr; + + // Memory library call optimizations. + MemCmpOpt MemCmp; + MemCpyOpt MemCpy; + MemMoveOpt MemMove; + MemSetOpt MemSet; + + // Math library call optimizations. + UnaryDoubleFPOpt UnaryDoubleFP, UnsafeUnaryDoubleFP; + CosOpt Cos; PowOpt Pow; Exp2Opt Exp2; + + // Integer library call optimizations. + FFSOpt FFS; + AbsOpt Abs; + IsDigitOpt IsDigit; + IsAsciiOpt IsAscii; + ToAsciiOpt ToAscii; + + // Formatting and IO library call optimizations. + PrintFOpt PrintF; + SPrintFOpt SPrintF; + FPrintFOpt FPrintF; + FWriteOpt FWrite; + FPutsOpt FPuts; + PutsOpt Puts; + + void initOptimizations(); + void addOpt(LibFunc::Func F, LibCallOptimization* Opt); + void addOpt(LibFunc::Func F1, LibFunc::Func F2, LibCallOptimization* Opt); +public: + LibCallSimplifierImpl(const DataLayout *TD, const TargetLibraryInfo *TLI, + const LibCallSimplifier *LCS, + bool UnsafeFPShrink = false) + : UnaryDoubleFP(false), UnsafeUnaryDoubleFP(true), + Cos(UnsafeFPShrink), Pow(UnsafeFPShrink), Exp2(UnsafeFPShrink) { + this->TD = TD; + this->TLI = TLI; + this->LCS = LCS; + this->UnsafeFPShrink = UnsafeFPShrink; + } + + Value *optimizeCall(CallInst *CI); +}; + +void LibCallSimplifierImpl::initOptimizations() { + // Fortified library call optimizations. + Optimizations["__memcpy_chk"] = &MemCpyChk; + Optimizations["__memmove_chk"] = &MemMoveChk; + Optimizations["__memset_chk"] = &MemSetChk; + Optimizations["__strcpy_chk"] = &StrCpyChk; + Optimizations["__stpcpy_chk"] = &StpCpyChk; + Optimizations["__strncpy_chk"] = &StrNCpyChk; + Optimizations["__stpncpy_chk"] = &StrNCpyChk; + + // String library call optimizations. 
+ addOpt(LibFunc::strcat, &StrCat); + addOpt(LibFunc::strncat, &StrNCat); + addOpt(LibFunc::strchr, &StrChr); + addOpt(LibFunc::strrchr, &StrRChr); + addOpt(LibFunc::strcmp, &StrCmp); + addOpt(LibFunc::strncmp, &StrNCmp); + addOpt(LibFunc::strcpy, &StrCpy); + addOpt(LibFunc::stpcpy, &StpCpy); + addOpt(LibFunc::strncpy, &StrNCpy); + addOpt(LibFunc::strlen, &StrLen); + addOpt(LibFunc::strpbrk, &StrPBrk); + addOpt(LibFunc::strtol, &StrTo); + addOpt(LibFunc::strtod, &StrTo); + addOpt(LibFunc::strtof, &StrTo); + addOpt(LibFunc::strtoul, &StrTo); + addOpt(LibFunc::strtoll, &StrTo); + addOpt(LibFunc::strtold, &StrTo); + addOpt(LibFunc::strtoull, &StrTo); + addOpt(LibFunc::strspn, &StrSpn); + addOpt(LibFunc::strcspn, &StrCSpn); + addOpt(LibFunc::strstr, &StrStr); + + // Memory library call optimizations. + addOpt(LibFunc::memcmp, &MemCmp); + addOpt(LibFunc::memcpy, &MemCpy); + addOpt(LibFunc::memmove, &MemMove); + addOpt(LibFunc::memset, &MemSet); + + // Math library call optimizations. + addOpt(LibFunc::ceil, LibFunc::ceilf, &UnaryDoubleFP); + addOpt(LibFunc::fabs, LibFunc::fabsf, &UnaryDoubleFP); + addOpt(LibFunc::floor, LibFunc::floorf, &UnaryDoubleFP); + addOpt(LibFunc::rint, LibFunc::rintf, &UnaryDoubleFP); + addOpt(LibFunc::round, LibFunc::roundf, &UnaryDoubleFP); + addOpt(LibFunc::nearbyint, LibFunc::nearbyintf, &UnaryDoubleFP); + addOpt(LibFunc::trunc, LibFunc::truncf, &UnaryDoubleFP); + + if(UnsafeFPShrink) { + addOpt(LibFunc::acos, LibFunc::acosf, &UnsafeUnaryDoubleFP); + addOpt(LibFunc::acosh, LibFunc::acoshf, &UnsafeUnaryDoubleFP); + addOpt(LibFunc::asin, LibFunc::asinf, &UnsafeUnaryDoubleFP); + addOpt(LibFunc::asinh, LibFunc::asinhf, &UnsafeUnaryDoubleFP); + addOpt(LibFunc::atan, LibFunc::atanf, &UnsafeUnaryDoubleFP); + addOpt(LibFunc::atanh, LibFunc::atanhf, &UnsafeUnaryDoubleFP); + addOpt(LibFunc::cbrt, LibFunc::cbrtf, &UnsafeUnaryDoubleFP); + addOpt(LibFunc::cosh, LibFunc::coshf, &UnsafeUnaryDoubleFP); + addOpt(LibFunc::exp, LibFunc::expf, &UnsafeUnaryDoubleFP); + addOpt(LibFunc::exp10, LibFunc::exp10f, &UnsafeUnaryDoubleFP); + addOpt(LibFunc::expm1, LibFunc::expm1f, &UnsafeUnaryDoubleFP); + addOpt(LibFunc::log, LibFunc::logf, &UnsafeUnaryDoubleFP); + addOpt(LibFunc::log10, LibFunc::log10f, &UnsafeUnaryDoubleFP); + addOpt(LibFunc::log1p, LibFunc::log1pf, &UnsafeUnaryDoubleFP); + addOpt(LibFunc::log2, LibFunc::log2f, &UnsafeUnaryDoubleFP); + addOpt(LibFunc::logb, LibFunc::logbf, &UnsafeUnaryDoubleFP); + addOpt(LibFunc::sin, LibFunc::sinf, &UnsafeUnaryDoubleFP); + addOpt(LibFunc::sinh, LibFunc::sinhf, &UnsafeUnaryDoubleFP); + addOpt(LibFunc::sqrt, LibFunc::sqrtf, &UnsafeUnaryDoubleFP); + addOpt(LibFunc::tan, LibFunc::tanf, &UnsafeUnaryDoubleFP); + addOpt(LibFunc::tanh, LibFunc::tanhf, &UnsafeUnaryDoubleFP); + } + + addOpt(LibFunc::cosf, &Cos); + addOpt(LibFunc::cos, &Cos); + addOpt(LibFunc::cosl, &Cos); + addOpt(LibFunc::powf, &Pow); + addOpt(LibFunc::pow, &Pow); + addOpt(LibFunc::powl, &Pow); + Optimizations["llvm.pow.f32"] = &Pow; + Optimizations["llvm.pow.f64"] = &Pow; + Optimizations["llvm.pow.f80"] = &Pow; + Optimizations["llvm.pow.f128"] = &Pow; + Optimizations["llvm.pow.ppcf128"] = &Pow; + addOpt(LibFunc::exp2l, &Exp2); + addOpt(LibFunc::exp2, &Exp2); + addOpt(LibFunc::exp2f, &Exp2); + Optimizations["llvm.exp2.ppcf128"] = &Exp2; + Optimizations["llvm.exp2.f128"] = &Exp2; + Optimizations["llvm.exp2.f80"] = &Exp2; + Optimizations["llvm.exp2.f64"] = &Exp2; + Optimizations["llvm.exp2.f32"] = &Exp2; + + // Integer library call optimizations. 
+ addOpt(LibFunc::ffs, &FFS); + addOpt(LibFunc::ffsl, &FFS); + addOpt(LibFunc::ffsll, &FFS); + addOpt(LibFunc::abs, &Abs); + addOpt(LibFunc::labs, &Abs); + addOpt(LibFunc::llabs, &Abs); + addOpt(LibFunc::isdigit, &IsDigit); + addOpt(LibFunc::isascii, &IsAscii); + addOpt(LibFunc::toascii, &ToAscii); + + // Formatting and IO library call optimizations. + addOpt(LibFunc::printf, &PrintF); + addOpt(LibFunc::sprintf, &SPrintF); + addOpt(LibFunc::fprintf, &FPrintF); + addOpt(LibFunc::fwrite, &FWrite); + addOpt(LibFunc::fputs, &FPuts); + addOpt(LibFunc::puts, &Puts); +} + +Value *LibCallSimplifierImpl::optimizeCall(CallInst *CI) { + if (Optimizations.empty()) + initOptimizations(); + + Function *Callee = CI->getCalledFunction(); + LibCallOptimization *LCO = Optimizations.lookup(Callee->getName()); + if (LCO) { + IRBuilder<> Builder(CI); + return LCO->optimizeCall(CI, TD, TLI, LCS, Builder); + } + return 0; +} + +void LibCallSimplifierImpl::addOpt(LibFunc::Func F, LibCallOptimization* Opt) { + if (TLI->has(F)) + Optimizations[TLI->getName(F)] = Opt; +} + +void LibCallSimplifierImpl::addOpt(LibFunc::Func F1, LibFunc::Func F2, + LibCallOptimization* Opt) { + if (TLI->has(F1) && TLI->has(F2)) + Optimizations[TLI->getName(F1)] = Opt; +} + +LibCallSimplifier::LibCallSimplifier(const DataLayout *TD, + const TargetLibraryInfo *TLI, + bool UnsafeFPShrink) { + Impl = new LibCallSimplifierImpl(TD, TLI, this, UnsafeFPShrink); +} + +LibCallSimplifier::~LibCallSimplifier() { + delete Impl; +} + +Value *LibCallSimplifier::optimizeCall(CallInst *CI) { + return Impl->optimizeCall(CI); +} + +void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) const { + I->replaceAllUsesWith(With); + I->eraseFromParent(); +} + +} diff --git a/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp b/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp index b1cad06..560f581 100644 --- a/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp +++ b/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp @@ -15,12 +15,12 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" -#include "llvm/Transforms/Scalar.h" -#include "llvm/BasicBlock.h" -#include "llvm/Function.h" -#include "llvm/Instructions.h" -#include "llvm/Type.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Type.h" +#include "llvm/Transforms/Scalar.h" using namespace llvm; char UnifyFunctionExitNodes::ID = 0; diff --git a/lib/Transforms/Utils/Utils.cpp b/lib/Transforms/Utils/Utils.cpp index 24e8c8f..5812d46 100644 --- a/lib/Transforms/Utils/Utils.cpp +++ b/lib/Transforms/Utils/Utils.cpp @@ -29,6 +29,7 @@ void llvm::initializeTransformUtils(PassRegistry &Registry) { initializePromotePassPass(Registry); initializeUnifyFunctionExitNodesPass(Registry); initializeInstSimplifierPass(Registry); + initializeMetaRenamerPass(Registry); } /// LLVMInitializeTransformUtils - C binding for initializeTransformUtilsPasses. 
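Stepping back to the LibCallSimplifier interface introduced earlier in this patch: one plausible way to drive it is to construct a simplifier per pass invocation, feed it each direct call, and splice in whatever replacement value optimizeCall hands back. A rough sketch under the assumption that TD/TLI come from the usual analyses and that the post-move header paths apply; the driver function itself is invented, not part of the patch:

    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/Target/TargetLibraryInfo.h"
    #include "llvm/Transforms/Utils/SimplifyLibCalls.h"
    using namespace llvm;

    // Hand every direct call in BB to the simplifier; optimizeCall returns
    // either 0 (no change) or a value to use in place of the call's result.
    static void simplifyLibCallsIn(BasicBlock &BB, const DataLayout *TD,
                                   const TargetLibraryInfo *TLI) {
      LibCallSimplifier Simplifier(TD, TLI, /*UnsafeFPShrink=*/false);
      for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ) {
        Instruction *Inst = &*I;
        ++I;                                   // advance first: Inst may be erased
        CallInst *CI = dyn_cast<CallInst>(Inst);
        if (!CI || !CI->getCalledFunction())
          continue;                            // optimizeCall expects a callee
        if (Value *New = Simplifier.optimizeCall(CI))
          if (New != CI)                       // some folds return CI itself
            Simplifier.replaceAllUsesWith(CI, New); // RAUW + erase, as defined above
      }
    }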
diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp index fc2538d..a5e1643 100644 --- a/lib/Transforms/Utils/ValueMapper.cpp +++ b/lib/Transforms/Utils/ValueMapper.cpp @@ -13,15 +13,15 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/ValueMapper.h" -#include "llvm/Constants.h" -#include "llvm/Function.h" -#include "llvm/InlineAsm.h" -#include "llvm/Instructions.h" -#include "llvm/Metadata.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Metadata.h" using namespace llvm; // Out of line method to get vtable etc for class. -void ValueMapTypeRemapper::Anchor() {} +void ValueMapTypeRemapper::anchor() {} Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, ValueMapTypeRemapper *TypeMapper) { diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp index c09dcd2..d72a4a1 100644 --- a/lib/Transforms/Vectorize/BBVectorize.cpp +++ b/lib/Transforms/Vectorize/BBVectorize.cpp @@ -16,42 +16,54 @@ #define BBV_NAME "bb-vectorize" #define DEBUG_TYPE BBV_NAME -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Function.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/Intrinsics.h" -#include "llvm/LLVMContext.h" -#include "llvm/Metadata.h" -#include "llvm/Pass.h" -#include "llvm/Type.h" +#include "llvm/Transforms/Vectorize.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Type.h" +#include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Support/ValueHandle.h" -#include "llvm/Target/TargetData.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Vectorize.h" #include <algorithm> #include <map> using namespace llvm; +static cl::opt<bool> +IgnoreTargetInfo("bb-vectorize-ignore-target-info", cl::init(false), + cl::Hidden, cl::desc("Ignore target information")); + static cl::opt<unsigned> ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden, cl::desc("The required chain depth for vectorization")); +static cl::opt<bool> +UseChainDepthWithTI("bb-vectorize-use-chain-depth", cl::init(false), + cl::Hidden, cl::desc("Use the chain depth requirement with" + " target information")); + static cl::opt<unsigned> SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden, cl::desc("The maximum search distance for instruction pairs")); @@ -93,8 +105,9 @@ static cl::opt<bool> 
NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize floating-point values")); +// FIXME: This should default to false once pointer vector support works. static cl::opt<bool> -NoPointers("bb-vectorize-no-pointers", cl::init(false), cl::Hidden, +NoPointers("bb-vectorize-no-pointers", cl::init(/*false*/ true), cl::Hidden, cl::desc("Don't try to vectorize pointer values")); static cl::opt<bool> @@ -159,6 +172,12 @@ DebugCycleCheck("bb-vectorize-debug-cycle-check", cl::init(false), cl::Hidden, cl::desc("When debugging is enabled, output information on the" " cycle-checking process")); + +static cl::opt<bool> +PrintAfterEveryPair("bb-vectorize-debug-print-after-every-pair", + cl::init(false), cl::Hidden, + cl::desc("When debugging is enabled, dump the basic block after" + " every pair is fused")); #endif STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize"); @@ -177,13 +196,17 @@ namespace { BBVectorize(Pass *P, const VectorizeConfig &C) : BasicBlockPass(ID), Config(C) { AA = &P->getAnalysis<AliasAnalysis>(); + DT = &P->getAnalysis<DominatorTree>(); SE = &P->getAnalysis<ScalarEvolution>(); - TD = P->getAnalysisIfAvailable<TargetData>(); + TD = P->getAnalysisIfAvailable<DataLayout>(); + TTI = IgnoreTargetInfo ? 0 : &P->getAnalysis<TargetTransformInfo>(); } typedef std::pair<Value *, Value *> ValuePair; + typedef std::pair<ValuePair, int> ValuePairWithCost; typedef std::pair<ValuePair, size_t> ValuePairWithDepth; typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair + typedef std::pair<VPPair, unsigned> VPPairWithType; typedef std::pair<std::multimap<Value *, Value *>::iterator, std::multimap<Value *, Value *>::iterator> VPIteratorPair; typedef std::pair<std::multimap<ValuePair, ValuePair>::iterator, @@ -191,8 +214,10 @@ namespace { VPPIteratorPair; AliasAnalysis *AA; + DominatorTree *DT; ScalarEvolution *SE; - TargetData *TD; + DataLayout *TD; + const TargetTransformInfo *TTI; // FIXME: const correct? @@ -201,11 +226,23 @@ namespace { bool getCandidatePairs(BasicBlock &BB, BasicBlock::iterator &Start, std::multimap<Value *, Value *> &CandidatePairs, + DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<ValuePair, int> &CandidatePairCostSavings, std::vector<Value *> &PairableInsts, bool NonPow2Len); + // FIXME: The current implementation does not account for pairs that + // are connected in multiple ways. 
For example: + // C1 = A1 / A2; C2 = A2 / A1 (which may be both direct and a swap) + enum PairConnectionType { + PairConnectionDirect, + PairConnectionSwap, + PairConnectionSplat + }; + void computeConnectedPairs(std::multimap<Value *, Value *> &CandidatePairs, std::vector<Value *> &PairableInsts, - std::multimap<ValuePair, ValuePair> &ConnectedPairs); + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes); void buildDepMap(BasicBlock &BB, std::multimap<Value *, Value *> &CandidatePairs, @@ -213,19 +250,29 @@ namespace { DenseSet<ValuePair> &PairableInstUsers); void choosePairs(std::multimap<Value *, Value *> &CandidatePairs, + DenseMap<ValuePair, int> &CandidatePairCostSavings, std::vector<Value *> &PairableInsts, + DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, std::multimap<ValuePair, ValuePair> &ConnectedPairs, + std::multimap<ValuePair, ValuePair> &ConnectedPairDeps, DenseSet<ValuePair> &PairableInstUsers, DenseMap<Value *, Value *>& ChosenPairs); void fuseChosenPairs(BasicBlock &BB, std::vector<Value *> &PairableInsts, - DenseMap<Value *, Value *>& ChosenPairs); + DenseMap<Value *, Value *>& ChosenPairs, + DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + std::multimap<ValuePair, ValuePair> &ConnectedPairDeps); + bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore); bool areInstsCompatible(Instruction *I, Instruction *J, - bool IsSimpleLoadStore, bool NonPow2Len); + bool IsSimpleLoadStore, bool NonPow2Len, + int &CostSavings, int &FixedOrder); bool trackUsesOfI(DenseSet<Value *> &Users, AliasSetTracker &WriteSet, Instruction *I, @@ -236,6 +283,7 @@ namespace { std::multimap<Value *, Value *> &CandidatePairs, std::vector<Value *> &PairableInsts, std::multimap<ValuePair, ValuePair> &ConnectedPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, ValuePair P); bool pairsConflict(ValuePair P, ValuePair Q, @@ -267,17 +315,21 @@ namespace { void findBestTreeFor( std::multimap<Value *, Value *> &CandidatePairs, + DenseMap<ValuePair, int> &CandidatePairCostSavings, std::vector<Value *> &PairableInsts, + DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, std::multimap<ValuePair, ValuePair> &ConnectedPairs, + std::multimap<ValuePair, ValuePair> &ConnectedPairDeps, DenseSet<ValuePair> &PairableInstUsers, std::multimap<ValuePair, ValuePair> &PairableInstUserMap, DenseMap<Value *, Value *> &ChosenPairs, DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth, - size_t &BestEffSize, VPIteratorPair ChoiceRange, + int &BestEffSize, VPIteratorPair ChoiceRange, bool UseCycleCheck); Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I, - Instruction *J, unsigned o, bool FlipMemInputs); + Instruction *J, unsigned o); void fillNewShuffleMask(LLVMContext& Context, Instruction *J, unsigned MaskOffset, unsigned NumInElem, @@ -289,20 +341,20 @@ namespace { bool expandIEChain(LLVMContext& Context, Instruction *I, Instruction *J, unsigned o, Value *&LOp, unsigned numElemL, - Type *ArgTypeL, Type *ArgTypeR, + Type *ArgTypeL, Type *ArgTypeR, bool IBeforeJ, unsigned IdxOff = 0); Value *getReplacementInput(LLVMContext& Context, Instruction *I, - Instruction *J, unsigned o, bool FlipMemInputs); + Instruction *J, unsigned o, bool IBeforeJ); void getReplacementInputsForPair(LLVMContext& Context, Instruction *I, Instruction *J, SmallVector<Value *, 3> 
&ReplacedOperands, - bool FlipMemInputs); + bool IBeforeJ); void replaceOutputsOfPair(LLVMContext& Context, Instruction *I, Instruction *J, Instruction *K, Instruction *&InsertionPt, Instruction *&K1, - Instruction *&K2, bool FlipMemInputs); + Instruction *&K2); void collectPairLoadMoveSet(BasicBlock &BB, DenseMap<Value *, Value *> &ChosenPairs, @@ -314,10 +366,6 @@ namespace { DenseMap<Value *, Value *> &ChosenPairs, std::multimap<Value *, Value *> &LoadMoveSet); - void collectPtrInfo(std::vector<Value *> &PairableInsts, - DenseMap<Value *, Value *> &ChosenPairs, - DenseSet<Value *> &LowPtrInsts); - bool canMoveUsesOfIAfterJ(BasicBlock &BB, std::multimap<Value *, Value *> &LoadMoveSet, Instruction *I, Instruction *J); @@ -330,13 +378,22 @@ namespace { void combineMetadata(Instruction *K, const Instruction *J); bool vectorizeBB(BasicBlock &BB) { + if (!DT->isReachableFromEntry(&BB)) { + DEBUG(dbgs() << "BBV: skipping unreachable " << BB.getName() << + " in " << BB.getParent()->getName() << "\n"); + return false; + } + + DEBUG(if (TTI) dbgs() << "BBV: using target information\n"); + bool changed = false; // Iterate a sufficient number of times to merge types of size 1 bit, // then 2 bits, then 4, etc. up to half of the target vector width of the // target vector register. unsigned n = 1; for (unsigned v = 2; - v <= Config.VectorBits && (!Config.MaxIter || n <= Config.MaxIter); + (TTI || v <= Config.VectorBits) && + (!Config.MaxIter || n <= Config.MaxIter); v *= 2, ++n) { DEBUG(dbgs() << "BBV: fusing loop #" << n << " for " << BB.getName() << " in " << @@ -363,8 +420,10 @@ namespace { virtual bool runOnBasicBlock(BasicBlock &BB) { AA = &getAnalysis<AliasAnalysis>(); + DT = &getAnalysis<DominatorTree>(); SE = &getAnalysis<ScalarEvolution>(); - TD = getAnalysisIfAvailable<TargetData>(); + TD = getAnalysisIfAvailable<DataLayout>(); + TTI = IgnoreTargetInfo ? 0 : &getAnalysis<TargetTransformInfo>(); return vectorizeBB(BB); } @@ -372,8 +431,11 @@ namespace { virtual void getAnalysisUsage(AnalysisUsage &AU) const { BasicBlockPass::getAnalysisUsage(AU); AU.addRequired<AliasAnalysis>(); + AU.addRequired<DominatorTree>(); AU.addRequired<ScalarEvolution>(); + AU.addRequired<TargetTransformInfo>(); AU.addPreserved<AliasAnalysis>(); + AU.addPreserved<DominatorTree>(); AU.addPreserved<ScalarEvolution>(); AU.setPreservesCFG(); } @@ -415,6 +477,14 @@ namespace { T2 = cast<CastInst>(I)->getSrcTy(); else T2 = T1; + + if (SelectInst *SI = dyn_cast<SelectInst>(I)) { + T2 = SI->getCondition()->getType(); + } else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(I)) { + T2 = SI->getOperand(0)->getType(); + } else if (CmpInst *CI = dyn_cast<CmpInst>(I)) { + T2 = CI->getOperand(0)->getType(); + } } // Returns the weight associated with the provided value. A chain of @@ -446,6 +516,62 @@ namespace { return 1; } + // Returns the cost of the provided instruction using TTI. + // This does not handle loads and stores. + unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2) { + switch (Opcode) { + default: break; + case Instruction::GetElementPtr: + // We mark this instruction as zero-cost because scalar GEPs are usually + // lowered to the intruction addressing mode. At the moment we don't + // generate vector GEPs. 
+ return 0; + case Instruction::Br: + return TTI->getCFInstrCost(Opcode); + case Instruction::PHI: + return 0; + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + return TTI->getArithmeticInstrCost(Opcode, T1); + case Instruction::Select: + case Instruction::ICmp: + case Instruction::FCmp: + return TTI->getCmpSelInstrCost(Opcode, T1, T2); + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: + case Instruction::ShuffleVector: + return TTI->getCastInstrCost(Opcode, T1, T2); + } + + return 1; + } + // This determines the relative offset of two loads or stores, returning // true if the offset could be determined to be some constant value. // For example, if OffsetInElmts == 1, then J accesses the memory directly @@ -453,20 +579,30 @@ namespace { // directly after J. bool getPairPtrInfo(Instruction *I, Instruction *J, Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment, - int64_t &OffsetInElmts) { + unsigned &IAddressSpace, unsigned &JAddressSpace, + int64_t &OffsetInElmts, bool ComputeOffset = true) { OffsetInElmts = 0; - if (isa<LoadInst>(I)) { - IPtr = cast<LoadInst>(I)->getPointerOperand(); - JPtr = cast<LoadInst>(J)->getPointerOperand(); - IAlignment = cast<LoadInst>(I)->getAlignment(); - JAlignment = cast<LoadInst>(J)->getAlignment(); + if (LoadInst *LI = dyn_cast<LoadInst>(I)) { + LoadInst *LJ = cast<LoadInst>(J); + IPtr = LI->getPointerOperand(); + JPtr = LJ->getPointerOperand(); + IAlignment = LI->getAlignment(); + JAlignment = LJ->getAlignment(); + IAddressSpace = LI->getPointerAddressSpace(); + JAddressSpace = LJ->getPointerAddressSpace(); } else { - IPtr = cast<StoreInst>(I)->getPointerOperand(); - JPtr = cast<StoreInst>(J)->getPointerOperand(); - IAlignment = cast<StoreInst>(I)->getAlignment(); - JAlignment = cast<StoreInst>(J)->getAlignment(); + StoreInst *SI = cast<StoreInst>(I), *SJ = cast<StoreInst>(J); + IPtr = SI->getPointerOperand(); + JPtr = SJ->getPointerOperand(); + IAlignment = SI->getAlignment(); + JAlignment = SJ->getAlignment(); + IAddressSpace = SI->getPointerAddressSpace(); + JAddressSpace = SJ->getPointerAddressSpace(); } + if (!ComputeOffset) + return true; + const SCEV *IPtrSCEV = SE->getSCEV(IPtr); const SCEV *JPtrSCEV = SE->getSCEV(JPtr); @@ -502,7 +638,7 @@ namespace { Function *F = I->getCalledFunction(); if (!F) return false; - unsigned IID = F->getIntrinsicID(); + Intrinsic::ID IID = (Intrinsic::ID) F->getIntrinsicID(); if (!IID) return false; switch(IID) { @@ -520,6 +656,7 @@ namespace { case Intrinsic::pow: return Config.VectorizeMath; case Intrinsic::fma: + case Intrinsic::fmuladd: return Config.VectorizeFMA; } } @@ -536,6 +673,19 @@ namespace { return false; } + + bool isPureIEChain(InsertElementInst *IE) { + InsertElementInst *IENext = IE; + do { + if (!isa<UndefValue>(IENext->getOperand(0)) && + !isa<InsertElementInst>(IENext->getOperand(0))) { + return 
false; + } + } while ((IENext = + dyn_cast<InsertElementInst>(IENext->getOperand(0)))); + + return true; + } }; // This function implements one vectorization iteration on the provided @@ -546,11 +696,18 @@ namespace { std::vector<Value *> AllPairableInsts; DenseMap<Value *, Value *> AllChosenPairs; + DenseSet<ValuePair> AllFixedOrderPairs; + DenseMap<VPPair, unsigned> AllPairConnectionTypes; + std::multimap<ValuePair, ValuePair> AllConnectedPairs, AllConnectedPairDeps; do { std::vector<Value *> PairableInsts; std::multimap<Value *, Value *> CandidatePairs; + DenseSet<ValuePair> FixedOrderPairs; + DenseMap<ValuePair, int> CandidatePairCostSavings; ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs, + FixedOrderPairs, + CandidatePairCostSavings, PairableInsts, NonPow2Len); if (PairableInsts.empty()) continue; @@ -563,10 +720,18 @@ namespace { // Note that it only matters that both members of the second pair use some // element of the first pair (to allow for splatting). - std::multimap<ValuePair, ValuePair> ConnectedPairs; - computeConnectedPairs(CandidatePairs, PairableInsts, ConnectedPairs); + std::multimap<ValuePair, ValuePair> ConnectedPairs, ConnectedPairDeps; + DenseMap<VPPair, unsigned> PairConnectionTypes; + computeConnectedPairs(CandidatePairs, PairableInsts, ConnectedPairs, + PairConnectionTypes); if (ConnectedPairs.empty()) continue; + for (std::multimap<ValuePair, ValuePair>::iterator + I = ConnectedPairs.begin(), IE = ConnectedPairs.end(); + I != IE; ++I) { + ConnectedPairDeps.insert(VPPair(I->second, I->first)); + } + // Build the pairable-instruction dependency map DenseSet<ValuePair> PairableInstUsers; buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers); @@ -578,13 +743,48 @@ namespace { // variables. DenseMap<Value *, Value *> ChosenPairs; - choosePairs(CandidatePairs, PairableInsts, ConnectedPairs, + choosePairs(CandidatePairs, CandidatePairCostSavings, + PairableInsts, FixedOrderPairs, PairConnectionTypes, + ConnectedPairs, ConnectedPairDeps, PairableInstUsers, ChosenPairs); if (ChosenPairs.empty()) continue; AllPairableInsts.insert(AllPairableInsts.end(), PairableInsts.begin(), PairableInsts.end()); AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end()); + + // Only for the chosen pairs, propagate information on fixed-order pairs, + // pair connections, and their types to the data structures used by the + // pair fusion procedures. + for (DenseMap<Value *, Value *>::iterator I = ChosenPairs.begin(), + IE = ChosenPairs.end(); I != IE; ++I) { + if (FixedOrderPairs.count(*I)) + AllFixedOrderPairs.insert(*I); + else if (FixedOrderPairs.count(ValuePair(I->second, I->first))) + AllFixedOrderPairs.insert(ValuePair(I->second, I->first)); + + for (DenseMap<Value *, Value *>::iterator J = ChosenPairs.begin(); + J != IE; ++J) { + DenseMap<VPPair, unsigned>::iterator K = + PairConnectionTypes.find(VPPair(*I, *J)); + if (K != PairConnectionTypes.end()) { + AllPairConnectionTypes.insert(*K); + } else { + K = PairConnectionTypes.find(VPPair(*J, *I)); + if (K != PairConnectionTypes.end()) + AllPairConnectionTypes.insert(*K); + } + } + } + + for (std::multimap<ValuePair, ValuePair>::iterator + I = ConnectedPairs.begin(), IE = ConnectedPairs.end(); + I != IE; ++I) { + if (AllPairConnectionTypes.count(*I)) { + AllConnectedPairs.insert(*I); + AllConnectedPairDeps.insert(VPPair(I->second, I->first)); + } + } } while (ShouldContinue); if (AllChosenPairs.empty()) return false; @@ -597,7 +797,9 @@ namespace { // replaced with a vector_extract on the result. 
Subsequent optimization // passes should coalesce the build/extract combinations. - fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs); + fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs, AllFixedOrderPairs, + AllPairConnectionTypes, + AllConnectedPairs, AllConnectedPairDeps); // It is important to cleanup here so that future iterations of this // function have less work to do. @@ -667,15 +869,22 @@ namespace { !(VectorType::isValidElementType(T2) || T2->isVectorTy())) return false; - if (T1->getScalarSizeInBits() == 1 && T2->getScalarSizeInBits() == 1) { + if (T1->getScalarSizeInBits() == 1) { if (!Config.VectorizeBools) return false; } else { - if (!Config.VectorizeInts - && (T1->isIntOrIntVectorTy() || T2->isIntOrIntVectorTy())) + if (!Config.VectorizeInts && T1->isIntOrIntVectorTy()) return false; } - + + if (T2->getScalarSizeInBits() == 1) { + if (!Config.VectorizeBools) + return false; + } else { + if (!Config.VectorizeInts && T2->isIntOrIntVectorTy()) + return false; + } + if (!Config.VectorizeFloats && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy())) return false; @@ -691,8 +900,8 @@ namespace { T2->getScalarType()->isPointerTy())) return false; - if (T1->getPrimitiveSizeInBits() >= Config.VectorBits || - T2->getPrimitiveSizeInBits() >= Config.VectorBits) + if (!TTI && (T1->getPrimitiveSizeInBits() >= Config.VectorBits || + T2->getPrimitiveSizeInBits() >= Config.VectorBits)) return false; return true; @@ -703,10 +912,14 @@ namespace { // that I has already been determined to be vectorizable and that J is not // in the use tree of I. bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J, - bool IsSimpleLoadStore, bool NonPow2Len) { + bool IsSimpleLoadStore, bool NonPow2Len, + int &CostSavings, int &FixedOrder) { DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I << " <-> " << *J << "\n"); + CostSavings = 0; + FixedOrder = 0; + // Loads and stores can be merged if they have different alignments, // but are otherwise the same. if (!J->isSameOperationAs(I, Instruction::CompareIgnoringAlignment | @@ -719,52 +932,151 @@ namespace { unsigned MaxTypeBits = std::max( IT1->getPrimitiveSizeInBits() + JT1->getPrimitiveSizeInBits(), IT2->getPrimitiveSizeInBits() + JT2->getPrimitiveSizeInBits()); - if (MaxTypeBits > Config.VectorBits) + if (!TTI && MaxTypeBits > Config.VectorBits) return false; // FIXME: handle addsub-type operations! if (IsSimpleLoadStore) { Value *IPtr, *JPtr; - unsigned IAlignment, JAlignment; + unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace; int64_t OffsetInElmts = 0; if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, + IAddressSpace, JAddressSpace, OffsetInElmts) && abs64(OffsetInElmts) == 1) { - if (Config.AlignedOnly) { - Type *aTypeI = isa<StoreInst>(I) ? - cast<StoreInst>(I)->getValueOperand()->getType() : I->getType(); - Type *aTypeJ = isa<StoreInst>(J) ? - cast<StoreInst>(J)->getValueOperand()->getType() : J->getType(); + FixedOrder = (int) OffsetInElmts; + unsigned BottomAlignment = IAlignment; + if (OffsetInElmts < 0) BottomAlignment = JAlignment; + Type *aTypeI = isa<StoreInst>(I) ? + cast<StoreInst>(I)->getValueOperand()->getType() : I->getType(); + Type *aTypeJ = isa<StoreInst>(J) ? + cast<StoreInst>(J)->getValueOperand()->getType() : J->getType(); + Type *VType = getVecTypeForPair(aTypeI, aTypeJ); + + if (Config.AlignedOnly) { // An aligned load or store is possible only if the instruction // with the lower offset has an alignment suitable for the // vector type. 
- unsigned BottomAlignment = IAlignment; - if (OffsetInElmts < 0) BottomAlignment = JAlignment; - - Type *VType = getVecTypeForPair(aTypeI, aTypeJ); unsigned VecAlignment = TD->getPrefTypeAlignment(VType); if (BottomAlignment < VecAlignment) return false; } + + if (TTI) { + unsigned ICost = TTI->getMemoryOpCost(I->getOpcode(), aTypeI, + IAlignment, IAddressSpace); + unsigned JCost = TTI->getMemoryOpCost(J->getOpcode(), aTypeJ, + JAlignment, JAddressSpace); + unsigned VCost = TTI->getMemoryOpCost(I->getOpcode(), VType, + BottomAlignment, + IAddressSpace); + if (VCost > ICost + JCost) + return false; + + // We don't want to fuse to a type that will be split, even + // if the two input types will also be split and there is no other + // associated cost. + unsigned VParts = TTI->getNumberOfParts(VType); + if (VParts > 1) + return false; + else if (!VParts && VCost == ICost + JCost) + return false; + + CostSavings = ICost + JCost - VCost; + } } else { return false; } + } else if (TTI) { + unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2); + unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2); + Type *VT1 = getVecTypeForPair(IT1, JT1), + *VT2 = getVecTypeForPair(IT2, JT2); + unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2); + + if (VCost > ICost + JCost) + return false; + + // We don't want to fuse to a type that will be split, even + // if the two input types will also be split and there is no other + // associated cost. + unsigned VParts1 = TTI->getNumberOfParts(VT1), + VParts2 = TTI->getNumberOfParts(VT2); + if (VParts1 > 1 || VParts2 > 1) + return false; + else if ((!VParts1 || !VParts2) && VCost == ICost + JCost) + return false; + + CostSavings = ICost + JCost - VCost; } // The powi intrinsic is special because only the first argument is // vectorized, the second arguments must be equal. 
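The target-cost gate used in the blocks above, and applied again in the same shape to intrinsic calls just below, boils down to the following sketch (hypothetical helper; the unsigned costs come from TargetTransformInfo queries such as getMemoryOpCost, and NumVectorParts from getNumberOfParts):

bool worthFusing(unsigned ICost, unsigned JCost, unsigned VCost,
                 unsigned NumVectorParts, int &CostSavings) {
  if (VCost > ICost + JCost)
    return false;                  // the fused form is outright more expensive
  if (NumVectorParts > 1)
    return false;                  // the fused type would be split by legalization
  if (NumVectorParts == 0 && VCost == ICost + JCost)
    return false;                  // not natively handled and no cheaper
  CostSavings = (int)(ICost + JCost) - (int)VCost;
  return true;
}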
CallInst *CI = dyn_cast<CallInst>(I); Function *FI; - if (CI && (FI = CI->getCalledFunction()) && - FI->getIntrinsicID() == Intrinsic::powi) { - - Value *A1I = CI->getArgOperand(1), - *A1J = cast<CallInst>(J)->getArgOperand(1); - const SCEV *A1ISCEV = SE->getSCEV(A1I), - *A1JSCEV = SE->getSCEV(A1J); - return (A1ISCEV == A1JSCEV); + if (CI && (FI = CI->getCalledFunction())) { + Intrinsic::ID IID = (Intrinsic::ID) FI->getIntrinsicID(); + if (IID == Intrinsic::powi) { + Value *A1I = CI->getArgOperand(1), + *A1J = cast<CallInst>(J)->getArgOperand(1); + const SCEV *A1ISCEV = SE->getSCEV(A1I), + *A1JSCEV = SE->getSCEV(A1J); + return (A1ISCEV == A1JSCEV); + } + + if (IID && TTI) { + SmallVector<Type*, 4> Tys; + for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) + Tys.push_back(CI->getArgOperand(i)->getType()); + unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, Tys); + + Tys.clear(); + CallInst *CJ = cast<CallInst>(J); + for (unsigned i = 0, ie = CJ->getNumArgOperands(); i != ie; ++i) + Tys.push_back(CJ->getArgOperand(i)->getType()); + unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, Tys); + + Tys.clear(); + assert(CI->getNumArgOperands() == CJ->getNumArgOperands() && + "Intrinsic argument counts differ"); + for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { + if (IID == Intrinsic::powi && i == 1) + Tys.push_back(CI->getArgOperand(i)->getType()); + else + Tys.push_back(getVecTypeForPair(CI->getArgOperand(i)->getType(), + CJ->getArgOperand(i)->getType())); + } + + Type *RetTy = getVecTypeForPair(IT1, JT1); + unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys); + + if (VCost > ICost + JCost) + return false; + + // We don't want to fuse to a type that will be split, even + // if the two input types will also be split and there is no other + // associated cost. + unsigned RetParts = TTI->getNumberOfParts(RetTy); + if (RetParts > 1) + return false; + else if (!RetParts && VCost == ICost + JCost) + return false; + + for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { + if (!Tys[i]->isVectorTy()) + continue; + + unsigned NumParts = TTI->getNumberOfParts(Tys[i]); + if (NumParts > 1) + return false; + else if (!NumParts && VCost == ICost + JCost) + return false; + } + + CostSavings = ICost + JCost - VCost; + } } return true; @@ -833,6 +1145,8 @@ namespace { bool BBVectorize::getCandidatePairs(BasicBlock &BB, BasicBlock::iterator &Start, std::multimap<Value *, Value *> &CandidatePairs, + DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<ValuePair, int> &CandidatePairCostSavings, std::vector<Value *> &PairableInsts, bool NonPow2Len) { BasicBlock::iterator E = BB.end(); if (Start == E) return false; @@ -869,7 +1183,9 @@ namespace { // J does not use I, and comes before the first use of I, so it can be // merged with I if the instructions are compatible. - if (!areInstsCompatible(I, J, IsSimpleLoadStore, NonPow2Len)) continue; + int CostSavings, FixedOrder; + if (!areInstsCompatible(I, J, IsSimpleLoadStore, NonPow2Len, + CostSavings, FixedOrder)) continue; // J is a candidate for merging with I. if (!PairableInsts.size() || @@ -878,6 +1194,14 @@ namespace { } CandidatePairs.insert(ValuePair(I, J)); + if (TTI) + CandidatePairCostSavings.insert(ValuePairWithCost(ValuePair(I, J), + CostSavings)); + + if (FixedOrder == 1) + FixedOrderPairs.insert(ValuePair(I, J)); + else if (FixedOrder == -1) + FixedOrderPairs.insert(ValuePair(J, I)); // The next call to this function must start after the last instruction // selected during this invocation. 
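As a rough sketch of the bookkeeping that getCandidatePairs builds up above (standard containers and integer ids stand in for the pass's DenseMap/DenseSet/multimap over llvm::Value*): each compatible pair is recorded together with its estimated cost savings and, for adjacent memory operations, the order the pair must keep.

#include <map>
#include <set>
#include <utility>

using Inst = int;                                  // stands in for llvm::Value*
using ValuePair = std::pair<Inst, Inst>;

struct CandidateInfo {
  std::multimap<Inst, Inst> CandidatePairs;
  std::map<ValuePair, int> CostSavings;            // only filled when TTI is available
  std::set<ValuePair> FixedOrderPairs;

  void record(Inst I, Inst J, int Savings, int FixedOrder) {
    CandidatePairs.insert({I, J});
    CostSavings.insert({{I, J}, Savings});
    if (FixedOrder == 1)
      FixedOrderPairs.insert({I, J});              // I must stay the lower element
    else if (FixedOrder == -1)
      FixedOrderPairs.insert({J, I});              // J must become the lower element
  }
};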
@@ -887,7 +1211,8 @@ namespace { } DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair " - << *I << " <-> " << *J << "\n"); + << *I << " <-> " << *J << " (cost savings: " << + CostSavings << ")\n"); // If we have already found too many pairs, break here and this function // will be called again starting after the last instruction selected @@ -915,6 +1240,7 @@ namespace { std::multimap<Value *, Value *> &CandidatePairs, std::vector<Value *> &PairableInsts, std::multimap<ValuePair, ValuePair> &ConnectedPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, ValuePair P) { StoreInst *SI, *SJ; @@ -946,12 +1272,18 @@ namespace { VPIteratorPair JPairRange = CandidatePairs.equal_range(*J); // Look for <I, J>: - if (isSecondInIteratorPair<Value*>(*J, IPairRange)) - ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J))); + if (isSecondInIteratorPair<Value*>(*J, IPairRange)) { + VPPair VP(P, ValuePair(*I, *J)); + ConnectedPairs.insert(VP); + PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionDirect)); + } // Look for <J, I>: - if (isSecondInIteratorPair<Value*>(*I, JPairRange)) - ConnectedPairs.insert(VPPair(P, ValuePair(*J, *I))); + if (isSecondInIteratorPair<Value*>(*I, JPairRange)) { + VPPair VP(P, ValuePair(*J, *I)); + ConnectedPairs.insert(VP); + PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSwap)); + } } if (Config.SplatBreaksChain) continue; @@ -962,8 +1294,11 @@ namespace { P.first == SJ->getPointerOperand()) continue; - if (isSecondInIteratorPair<Value*>(*J, IPairRange)) - ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J))); + if (isSecondInIteratorPair<Value*>(*J, IPairRange)) { + VPPair VP(P, ValuePair(*I, *J)); + ConnectedPairs.insert(VP); + PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat)); + } } } @@ -985,8 +1320,11 @@ namespace { P.second == SJ->getPointerOperand()) continue; - if (isSecondInIteratorPair<Value*>(*J, IPairRange)) - ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J))); + if (isSecondInIteratorPair<Value*>(*J, IPairRange)) { + VPPair VP(P, ValuePair(*I, *J)); + ConnectedPairs.insert(VP); + PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat)); + } } } } @@ -997,7 +1335,8 @@ namespace { void BBVectorize::computeConnectedPairs( std::multimap<Value *, Value *> &CandidatePairs, std::vector<Value *> &PairableInsts, - std::multimap<ValuePair, ValuePair> &ConnectedPairs) { + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes) { for (std::vector<Value *>::iterator PI = PairableInsts.begin(), PE = PairableInsts.end(); PI != PE; ++PI) { @@ -1006,7 +1345,7 @@ namespace { for (std::multimap<Value *, Value *>::iterator P = choiceRange.first; P != choiceRange.second; ++P) computePairsConnectedTo(CandidatePairs, PairableInsts, - ConnectedPairs, *P); + ConnectedPairs, PairConnectionTypes, *P); } DEBUG(dbgs() << "BBV: found " << ConnectedPairs.size() @@ -1196,7 +1535,7 @@ namespace { PrunedTree.insert(QTop.first); // Visit each child, pruning as necessary... 
- DenseMap<ValuePair, size_t> BestChildren; + SmallVector<ValuePairWithDepth, 8> BestChildren; VPPIteratorPair QTopRange = ConnectedPairs.equal_range(QTop.first); for (std::multimap<ValuePair, ValuePair>::iterator K = QTopRange.first; K != QTopRange.second; ++K) { @@ -1228,7 +1567,7 @@ namespace { DenseSet<ValuePair> CurrentPairs; bool CanAdd = true; - for (DenseMap<ValuePair, size_t>::iterator C2 + for (SmallVector<ValuePairWithDepth, 8>::iterator C2 = BestChildren.begin(), E2 = BestChildren.end(); C2 != E2; ++C2) { if (C2->first.first == C->first.first || @@ -1313,22 +1652,22 @@ namespace { // to an already-selected child. Check for this here, and if a // conflict is found, then remove the previously-selected child // before adding this one in its place. - for (DenseMap<ValuePair, size_t>::iterator C2 + for (SmallVector<ValuePairWithDepth, 8>::iterator C2 = BestChildren.begin(); C2 != BestChildren.end();) { if (C2->first.first == C->first.first || C2->first.first == C->first.second || C2->first.second == C->first.first || C2->first.second == C->first.second || pairsConflict(C2->first, C->first, PairableInstUsers)) - BestChildren.erase(C2++); + C2 = BestChildren.erase(C2); else ++C2; } - BestChildren.insert(ValuePairWithDepth(C->first, C->second)); + BestChildren.push_back(ValuePairWithDepth(C->first, C->second)); } - for (DenseMap<ValuePair, size_t>::iterator C + for (SmallVector<ValuePairWithDepth, 8>::iterator C = BestChildren.begin(), E2 = BestChildren.end(); C != E2; ++C) { size_t DepthF = getDepthFactor(C->first.first); @@ -1341,13 +1680,17 @@ namespace { // pairs, given the choice of root pairs as an iterator range. void BBVectorize::findBestTreeFor( std::multimap<Value *, Value *> &CandidatePairs, + DenseMap<ValuePair, int> &CandidatePairCostSavings, std::vector<Value *> &PairableInsts, + DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, std::multimap<ValuePair, ValuePair> &ConnectedPairs, + std::multimap<ValuePair, ValuePair> &ConnectedPairDeps, DenseSet<ValuePair> &PairableInstUsers, std::multimap<ValuePair, ValuePair> &PairableInstUserMap, DenseMap<Value *, Value *> &ChosenPairs, DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth, - size_t &BestEffSize, VPIteratorPair ChoiceRange, + int &BestEffSize, VPIteratorPair ChoiceRange, bool UseCycleCheck) { for (std::multimap<Value *, Value *>::iterator J = ChoiceRange.first; J != ChoiceRange.second; ++J) { @@ -1397,17 +1740,289 @@ namespace { PairableInstUsers, PairableInstUserMap, ChosenPairs, Tree, PrunedTree, *J, UseCycleCheck); - size_t EffSize = 0; - for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(), - E = PrunedTree.end(); S != E; ++S) - EffSize += getDepthFactor(S->first); + int EffSize = 0; + if (TTI) { + DenseSet<Value *> PrunedTreeInstrs; + for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(), + E = PrunedTree.end(); S != E; ++S) { + PrunedTreeInstrs.insert(S->first); + PrunedTreeInstrs.insert(S->second); + } + + // The set of pairs that have already contributed to the total cost. + DenseSet<ValuePair> IncomingPairs; + + // If the cost model were perfect, this might not be necessary; but we + // need to make sure that we don't get stuck vectorizing our own + // shuffle chains. + bool HasNontrivialInsts = false; + + // The node weights represent the cost savings associated with + // fusing the pair of instructions. 
+ for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(), + E = PrunedTree.end(); S != E; ++S) { + if (!isa<ShuffleVectorInst>(S->first) && + !isa<InsertElementInst>(S->first) && + !isa<ExtractElementInst>(S->first)) + HasNontrivialInsts = true; + + bool FlipOrder = false; + + if (getDepthFactor(S->first)) { + int ESContrib = CandidatePairCostSavings.find(*S)->second; + DEBUG(if (DebugPairSelection) dbgs() << "\tweight {" + << *S->first << " <-> " << *S->second << "} = " << + ESContrib << "\n"); + EffSize += ESContrib; + } + + // The edge weights contribute in a negative sense: they represent + // the cost of shuffles. + VPPIteratorPair IP = ConnectedPairDeps.equal_range(*S); + if (IP.first != ConnectedPairDeps.end()) { + unsigned NumDepsDirect = 0, NumDepsSwap = 0; + for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first; + Q != IP.second; ++Q) { + if (!PrunedTree.count(Q->second)) + continue; + DenseMap<VPPair, unsigned>::iterator R = + PairConnectionTypes.find(VPPair(Q->second, Q->first)); + assert(R != PairConnectionTypes.end() && + "Cannot find pair connection type"); + if (R->second == PairConnectionDirect) + ++NumDepsDirect; + else if (R->second == PairConnectionSwap) + ++NumDepsSwap; + } + + // If there are more swaps than direct connections, then + // the pair order will be flipped during fusion. So the real + // number of swaps is the minimum number. + FlipOrder = !FixedOrderPairs.count(*S) && + ((NumDepsSwap > NumDepsDirect) || + FixedOrderPairs.count(ValuePair(S->second, S->first))); + + for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first; + Q != IP.second; ++Q) { + if (!PrunedTree.count(Q->second)) + continue; + DenseMap<VPPair, unsigned>::iterator R = + PairConnectionTypes.find(VPPair(Q->second, Q->first)); + assert(R != PairConnectionTypes.end() && + "Cannot find pair connection type"); + Type *Ty1 = Q->second.first->getType(), + *Ty2 = Q->second.second->getType(); + Type *VTy = getVecTypeForPair(Ty1, Ty2); + if ((R->second == PairConnectionDirect && FlipOrder) || + (R->second == PairConnectionSwap && !FlipOrder) || + R->second == PairConnectionSplat) { + int ESContrib = (int) getInstrCost(Instruction::ShuffleVector, + VTy, VTy); + DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << + *Q->second.first << " <-> " << *Q->second.second << + "} -> {" << + *S->first << " <-> " << *S->second << "} = " << + ESContrib << "\n"); + EffSize -= ESContrib; + } + } + } + + // Compute the cost of outgoing edges. We assume that edges outgoing + // to shuffles, inserts or extracts can be merged, and so contribute + // no additional cost. 
+ if (!S->first->getType()->isVoidTy()) { + Type *Ty1 = S->first->getType(), + *Ty2 = S->second->getType(); + Type *VTy = getVecTypeForPair(Ty1, Ty2); + + bool NeedsExtraction = false; + for (Value::use_iterator I = S->first->use_begin(), + IE = S->first->use_end(); I != IE; ++I) { + if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(*I)) { + // Shuffle can be folded if it has no other input + if (isa<UndefValue>(SI->getOperand(1))) + continue; + } + if (isa<ExtractElementInst>(*I)) + continue; + if (PrunedTreeInstrs.count(*I)) + continue; + NeedsExtraction = true; + break; + } + + if (NeedsExtraction) { + int ESContrib; + if (Ty1->isVectorTy()) + ESContrib = (int) getInstrCost(Instruction::ShuffleVector, + Ty1, VTy); + else + ESContrib = (int) TTI->getVectorInstrCost( + Instruction::ExtractElement, VTy, 0); + + DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << + *S->first << "} = " << ESContrib << "\n"); + EffSize -= ESContrib; + } + + NeedsExtraction = false; + for (Value::use_iterator I = S->second->use_begin(), + IE = S->second->use_end(); I != IE; ++I) { + if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(*I)) { + // Shuffle can be folded if it has no other input + if (isa<UndefValue>(SI->getOperand(1))) + continue; + } + if (isa<ExtractElementInst>(*I)) + continue; + if (PrunedTreeInstrs.count(*I)) + continue; + NeedsExtraction = true; + break; + } + + if (NeedsExtraction) { + int ESContrib; + if (Ty2->isVectorTy()) + ESContrib = (int) getInstrCost(Instruction::ShuffleVector, + Ty2, VTy); + else + ESContrib = (int) TTI->getVectorInstrCost( + Instruction::ExtractElement, VTy, 1); + DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << + *S->second << "} = " << ESContrib << "\n"); + EffSize -= ESContrib; + } + } + + // Compute the cost of incoming edges. + if (!isa<LoadInst>(S->first) && !isa<StoreInst>(S->first)) { + Instruction *S1 = cast<Instruction>(S->first), + *S2 = cast<Instruction>(S->second); + for (unsigned o = 0; o < S1->getNumOperands(); ++o) { + Value *O1 = S1->getOperand(o), *O2 = S2->getOperand(o); + + // Combining constants into vector constants (or small vector + // constants into larger ones are assumed free). + if (isa<Constant>(O1) && isa<Constant>(O2)) + continue; + + if (FlipOrder) + std::swap(O1, O2); + + ValuePair VP = ValuePair(O1, O2); + ValuePair VPR = ValuePair(O2, O1); + + // Internal edges are not handled here. + if (PrunedTree.count(VP) || PrunedTree.count(VPR)) + continue; + + Type *Ty1 = O1->getType(), + *Ty2 = O2->getType(); + Type *VTy = getVecTypeForPair(Ty1, Ty2); + + // Combining vector operations of the same type is also assumed + // folded with other operations. + if (Ty1 == Ty2) { + // If both are insert elements, then both can be widened. 
+ InsertElementInst *IEO1 = dyn_cast<InsertElementInst>(O1), + *IEO2 = dyn_cast<InsertElementInst>(O2); + if (IEO1 && IEO2 && isPureIEChain(IEO1) && isPureIEChain(IEO2)) + continue; + // If both are extract elements, and both have the same input + // type, then they can be replaced with a shuffle + ExtractElementInst *EIO1 = dyn_cast<ExtractElementInst>(O1), + *EIO2 = dyn_cast<ExtractElementInst>(O2); + if (EIO1 && EIO2 && + EIO1->getOperand(0)->getType() == + EIO2->getOperand(0)->getType()) + continue; + // If both are a shuffle with equal operand types and only two + // unqiue operands, then they can be replaced with a single + // shuffle + ShuffleVectorInst *SIO1 = dyn_cast<ShuffleVectorInst>(O1), + *SIO2 = dyn_cast<ShuffleVectorInst>(O2); + if (SIO1 && SIO2 && + SIO1->getOperand(0)->getType() == + SIO2->getOperand(0)->getType()) { + SmallSet<Value *, 4> SIOps; + SIOps.insert(SIO1->getOperand(0)); + SIOps.insert(SIO1->getOperand(1)); + SIOps.insert(SIO2->getOperand(0)); + SIOps.insert(SIO2->getOperand(1)); + if (SIOps.size() <= 2) + continue; + } + } + + int ESContrib; + // This pair has already been formed. + if (IncomingPairs.count(VP)) { + continue; + } else if (IncomingPairs.count(VPR)) { + ESContrib = (int) getInstrCost(Instruction::ShuffleVector, + VTy, VTy); + } else if (!Ty1->isVectorTy() && !Ty2->isVectorTy()) { + ESContrib = (int) TTI->getVectorInstrCost( + Instruction::InsertElement, VTy, 0); + ESContrib += (int) TTI->getVectorInstrCost( + Instruction::InsertElement, VTy, 1); + } else if (!Ty1->isVectorTy()) { + // O1 needs to be inserted into a vector of size O2, and then + // both need to be shuffled together. + ESContrib = (int) TTI->getVectorInstrCost( + Instruction::InsertElement, Ty2, 0); + ESContrib += (int) getInstrCost(Instruction::ShuffleVector, + VTy, Ty2); + } else if (!Ty2->isVectorTy()) { + // O2 needs to be inserted into a vector of size O1, and then + // both need to be shuffled together. + ESContrib = (int) TTI->getVectorInstrCost( + Instruction::InsertElement, Ty1, 0); + ESContrib += (int) getInstrCost(Instruction::ShuffleVector, + VTy, Ty1); + } else { + Type *TyBig = Ty1, *TySmall = Ty2; + if (Ty2->getVectorNumElements() > Ty1->getVectorNumElements()) + std::swap(TyBig, TySmall); + + ESContrib = (int) getInstrCost(Instruction::ShuffleVector, + VTy, TyBig); + if (TyBig != TySmall) + ESContrib += (int) getInstrCost(Instruction::ShuffleVector, + TyBig, TySmall); + } + + DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" + << *O1 << " <-> " << *O2 << "} = " << + ESContrib << "\n"); + EffSize -= ESContrib; + IncomingPairs.insert(VP); + } + } + } + + if (!HasNontrivialInsts) { + DEBUG(if (DebugPairSelection) dbgs() << + "\tNo non-trivial instructions in tree;" + " override to zero effective size\n"); + EffSize = 0; + } + } else { + for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(), + E = PrunedTree.end(); S != E; ++S) + EffSize += (int) getDepthFactor(S->first); + } DEBUG(if (DebugPairSelection) dbgs() << "BBV: found pruned Tree for pair {" << *J->first << " <-> " << *J->second << "} of depth " << MaxDepth << " and size " << PrunedTree.size() << " (effective size: " << EffSize << ")\n"); - if (MaxDepth >= Config.ReqChainDepth && EffSize > BestEffSize) { + if (((TTI && !UseChainDepthWithTI) || + MaxDepth >= Config.ReqChainDepth) && + EffSize > 0 && EffSize > BestEffSize) { BestMaxDepth = MaxDepth; BestEffSize = EffSize; BestTree = PrunedTree; @@ -1419,8 +2034,12 @@ namespace { // that will be fused into vector instructions. 
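For reference, the acceptance test applied to each pruned tree above amounts to the following sketch (illustrative helper; UseChainDepthWithTI and Config.ReqChainDepth are the knobs that appear in the code): with a cost model the tree must promise strictly positive savings and beat the best tree seen so far, otherwise the legacy chain-depth requirement applies.

bool treeIsBetter(bool HaveTTI, bool UseChainDepthWithTI, unsigned MaxDepth,
                  unsigned ReqChainDepth, int EffSize, int BestEffSize) {
  bool DepthOK = (HaveTTI && !UseChainDepthWithTI) || MaxDepth >= ReqChainDepth;
  return DepthOK && EffSize > 0 && EffSize > BestEffSize;
}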
void BBVectorize::choosePairs( std::multimap<Value *, Value *> &CandidatePairs, + DenseMap<ValuePair, int> &CandidatePairCostSavings, std::vector<Value *> &PairableInsts, + DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, std::multimap<ValuePair, ValuePair> &ConnectedPairs, + std::multimap<ValuePair, ValuePair> &ConnectedPairDeps, DenseSet<ValuePair> &PairableInstUsers, DenseMap<Value *, Value *>& ChosenPairs) { bool UseCycleCheck = @@ -1435,9 +2054,12 @@ namespace { VPIteratorPair ChoiceRange = CandidatePairs.equal_range(*I); // The best pair to choose and its tree: - size_t BestMaxDepth = 0, BestEffSize = 0; + size_t BestMaxDepth = 0; + int BestEffSize = 0; DenseSet<ValuePair> BestTree; - findBestTreeFor(CandidatePairs, PairableInsts, ConnectedPairs, + findBestTreeFor(CandidatePairs, CandidatePairCostSavings, + PairableInsts, FixedOrderPairs, PairConnectionTypes, + ConnectedPairs, ConnectedPairDeps, PairableInstUsers, PairableInstUserMap, ChosenPairs, BestTree, BestMaxDepth, BestEffSize, ChoiceRange, UseCycleCheck); @@ -1490,24 +2112,19 @@ namespace { // Returns the value that is to be used as the pointer input to the vector // instruction that fuses I with J. Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context, - Instruction *I, Instruction *J, unsigned o, - bool FlipMemInputs) { + Instruction *I, Instruction *J, unsigned o) { Value *IPtr, *JPtr; - unsigned IAlignment, JAlignment; + unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace; int64_t OffsetInElmts; - // Note: the analysis might fail here, that is why FlipMemInputs has + // Note: the analysis might fail here, that is why the pair order has // been precomputed (OffsetInElmts must be unused here). (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, - OffsetInElmts); + IAddressSpace, JAddressSpace, + OffsetInElmts, false); // The pointer value is taken to be the one with the lowest offset. - Value *VPtr; - if (!FlipMemInputs) { - VPtr = IPtr; - } else { - VPtr = JPtr; - } + Value *VPtr = IPtr; Type *ArgTypeI = cast<PointerType>(IPtr->getType())->getElementType(); Type *ArgTypeJ = cast<PointerType>(JPtr->getType())->getElementType(); @@ -1515,7 +2132,7 @@ namespace { Type *VArgPtrType = PointerType::get(VArgType, cast<PointerType>(IPtr->getType())->getAddressSpace()); return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o), - /* insert before */ FlipMemInputs ? J : I); + /* insert before */ I); } void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J, @@ -1585,23 +2202,12 @@ namespace { Instruction *J, unsigned o, Value *&LOp, unsigned numElemL, Type *ArgTypeL, Type *ArgTypeH, - unsigned IdxOff) { + bool IBeforeJ, unsigned IdxOff) { bool ExpandedIEChain = false; if (InsertElementInst *LIE = dyn_cast<InsertElementInst>(LOp)) { // If we have a pure insertelement chain, then this can be rewritten // into a chain that directly builds the larger type. 
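The "pure insertelement chain" test that this rewriting relies on (isPureIEChain in the code above) can be written standalone as follows; this is a sketch using the ordinary LLVM casting helpers, not a copy of the pass's code. A chain is pure when every link feeds operand 0 with either undef or another insertelement, so the whole chain can be rebuilt directly at the wider vector type.

#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static bool isPureInsertElementChain(InsertElementInst *IE) {
  for (InsertElementInst *Cur = IE; Cur;
       Cur = dyn_cast<InsertElementInst>(Cur->getOperand(0)))
    if (!isa<UndefValue>(Cur->getOperand(0)) &&
        !isa<InsertElementInst>(Cur->getOperand(0)))
      return false;   // the chain bottoms out in something other than undef
  return true;        // reached an undef base through insertelements only
}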
- bool PureChain = true; - InsertElementInst *LIENext = LIE; - do { - if (!isa<UndefValue>(LIENext->getOperand(0)) && - !isa<InsertElementInst>(LIENext->getOperand(0))) { - PureChain = false; - break; - } - } while ((LIENext = - dyn_cast<InsertElementInst>(LIENext->getOperand(0)))); - - if (PureChain) { + if (isPureIEChain(LIE)) { SmallVector<Value *, 8> VectElemts(numElemL, UndefValue::get(ArgTypeL->getScalarType())); InsertElementInst *LIENext = LIE; @@ -1619,8 +2225,9 @@ namespace { LIENext = InsertElementInst::Create(LIEPrev, VectElemts[i], ConstantInt::get(Type::getInt32Ty(Context), i + IdxOff), - getReplacementName(I, true, o, i+1)); - LIENext->insertBefore(J); + getReplacementName(IBeforeJ ? I : J, + true, o, i+1)); + LIENext->insertBefore(IBeforeJ ? J : I); LIEPrev = LIENext; } @@ -1635,7 +2242,7 @@ namespace { // Returns the value to be used as the specified operand of the vector // instruction that fuses I with J. Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I, - Instruction *J, unsigned o, bool FlipMemInputs) { + Instruction *J, unsigned o, bool IBeforeJ) { Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1); @@ -1646,12 +2253,6 @@ namespace { Instruction *L = I, *H = J; Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ; - if (FlipMemInputs) { - L = J; - H = I; - ArgTypeL = ArgTypeJ; - ArgTypeH = ArgTypeI; - } unsigned numElemL; if (ArgTypeL->isVectorTy()) @@ -1804,8 +2405,9 @@ namespace { Instruction *S = new ShuffleVectorInst(I1, UndefValue::get(I1T), ConstantVector::get(Mask), - getReplacementName(I, true, o)); - S->insertBefore(J); + getReplacementName(IBeforeJ ? I : J, + true, o)); + S->insertBefore(IBeforeJ ? J : I); return S; } @@ -1826,8 +2428,9 @@ namespace { Instruction *NewI1 = new ShuffleVectorInst(I1, UndefValue::get(I1T), ConstantVector::get(Mask), - getReplacementName(I, true, o, 1)); - NewI1->insertBefore(J); + getReplacementName(IBeforeJ ? I : J, + true, o, 1)); + NewI1->insertBefore(IBeforeJ ? J : I); I1 = NewI1; I1T = I2T; I1Elem = I2Elem; @@ -1842,8 +2445,9 @@ namespace { Instruction *NewI2 = new ShuffleVectorInst(I2, UndefValue::get(I2T), ConstantVector::get(Mask), - getReplacementName(I, true, o, 1)); - NewI2->insertBefore(J); + getReplacementName(IBeforeJ ? I : J, + true, o, 1)); + NewI2->insertBefore(IBeforeJ ? J : I); I2 = NewI2; I2T = I1T; I2Elem = I1Elem; @@ -1863,8 +2467,8 @@ namespace { Instruction *NewOp = new ShuffleVectorInst(I1, I2, ConstantVector::get(Mask), - getReplacementName(I, true, o)); - NewOp->insertBefore(J); + getReplacementName(IBeforeJ ? I : J, true, o)); + NewOp->insertBefore(IBeforeJ ? J : I); return NewOp; } } @@ -1872,17 +2476,17 @@ namespace { Type *ArgType = ArgTypeL; if (numElemL < numElemH) { if (numElemL == 1 && expandIEChain(Context, I, J, o, HOp, numElemH, - ArgTypeL, VArgType, 1)) { + ArgTypeL, VArgType, IBeforeJ, 1)) { // This is another short-circuit case: we're combining a scalar into // a vector that is formed by an IE chain. We've just expanded the IE // chain, now insert the scalar and we're done. Instruction *S = InsertElementInst::Create(HOp, LOp, CV0, - getReplacementName(I, true, o)); - S->insertBefore(J); + getReplacementName(IBeforeJ ? I : J, true, o)); + S->insertBefore(IBeforeJ ? 
J : I); return S; } else if (!expandIEChain(Context, I, J, o, LOp, numElemL, ArgTypeL, - ArgTypeH)) { + ArgTypeH, IBeforeJ)) { // The two vector inputs to the shuffle must be the same length, // so extend the smaller vector to be the same length as the larger one. Instruction *NLOp; @@ -1897,29 +2501,32 @@ namespace { NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL), ConstantVector::get(Mask), - getReplacementName(I, true, o, 1)); + getReplacementName(IBeforeJ ? I : J, + true, o, 1)); } else { NLOp = InsertElementInst::Create(UndefValue::get(ArgTypeH), LOp, CV0, - getReplacementName(I, true, o, 1)); + getReplacementName(IBeforeJ ? I : J, + true, o, 1)); } - NLOp->insertBefore(J); + NLOp->insertBefore(IBeforeJ ? J : I); LOp = NLOp; } ArgType = ArgTypeH; } else if (numElemL > numElemH) { if (numElemH == 1 && expandIEChain(Context, I, J, o, LOp, numElemL, - ArgTypeH, VArgType)) { + ArgTypeH, VArgType, IBeforeJ)) { Instruction *S = InsertElementInst::Create(LOp, HOp, ConstantInt::get(Type::getInt32Ty(Context), numElemL), - getReplacementName(I, true, o)); - S->insertBefore(J); + getReplacementName(IBeforeJ ? I : J, + true, o)); + S->insertBefore(IBeforeJ ? J : I); return S; } else if (!expandIEChain(Context, I, J, o, HOp, numElemH, ArgTypeH, - ArgTypeL)) { + ArgTypeL, IBeforeJ)) { Instruction *NHOp; if (numElemH > 1) { std::vector<Constant *> Mask(numElemL); @@ -1931,13 +2538,15 @@ namespace { NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH), ConstantVector::get(Mask), - getReplacementName(I, true, o, 1)); + getReplacementName(IBeforeJ ? I : J, + true, o, 1)); } else { NHOp = InsertElementInst::Create(UndefValue::get(ArgTypeL), HOp, CV0, - getReplacementName(I, true, o, 1)); + getReplacementName(IBeforeJ ? I : J, + true, o, 1)); } - NHOp->insertBefore(J); + NHOp->insertBefore(IBeforeJ ? J : I); HOp = NHOp; } } @@ -1955,19 +2564,21 @@ namespace { } Instruction *BV = new ShuffleVectorInst(LOp, HOp, - ConstantVector::get(Mask), - getReplacementName(I, true, o)); - BV->insertBefore(J); + ConstantVector::get(Mask), + getReplacementName(IBeforeJ ? I : J, true, o)); + BV->insertBefore(IBeforeJ ? J : I); return BV; } Instruction *BV1 = InsertElementInst::Create( UndefValue::get(VArgType), LOp, CV0, - getReplacementName(I, true, o, 1)); - BV1->insertBefore(I); + getReplacementName(IBeforeJ ? I : J, + true, o, 1)); + BV1->insertBefore(IBeforeJ ? J : I); Instruction *BV2 = InsertElementInst::Create(BV1, HOp, CV1, - getReplacementName(I, true, o, 2)); - BV2->insertBefore(J); + getReplacementName(IBeforeJ ? I : J, + true, o, 2)); + BV2->insertBefore(IBeforeJ ? J : I); return BV2; } @@ -1976,7 +2587,7 @@ namespace { void BBVectorize::getReplacementInputsForPair(LLVMContext& Context, Instruction *I, Instruction *J, SmallVector<Value *, 3> &ReplacedOperands, - bool FlipMemInputs) { + bool IBeforeJ) { unsigned NumOperands = I->getNumOperands(); for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) { @@ -1985,12 +2596,11 @@ namespace { if (isa<LoadInst>(I) || (o == 1 && isa<StoreInst>(I))) { // This is the pointer for a load/store instruction. 
- ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o, - FlipMemInputs); + ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o); continue; } else if (isa<CallInst>(I)) { Function *F = cast<CallInst>(I)->getCalledFunction(); - unsigned IID = F->getIntrinsicID(); + Intrinsic::ID IID = (Intrinsic::ID) F->getIntrinsicID(); if (o == NumOperands-1) { BasicBlock &BB = *I->getParent(); @@ -1999,8 +2609,7 @@ namespace { Type *ArgTypeJ = J->getType(); Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); - ReplacedOperands[o] = Intrinsic::getDeclaration(M, - (Intrinsic::ID) IID, VArgType); + ReplacedOperands[o] = Intrinsic::getDeclaration(M, IID, VArgType); continue; } else if (IID == Intrinsic::powi && o == 1) { // The second argument of powi is a single integer and we've already @@ -2014,8 +2623,7 @@ namespace { continue; } - ReplacedOperands[o] = - getReplacementInput(Context, I, J, o, FlipMemInputs); + ReplacedOperands[o] = getReplacementInput(Context, I, J, o, IBeforeJ); } } @@ -2026,8 +2634,7 @@ namespace { void BBVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I, Instruction *J, Instruction *K, Instruction *&InsertionPt, - Instruction *&K1, Instruction *&K2, - bool FlipMemInputs) { + Instruction *&K1, Instruction *&K2) { if (isa<StoreInst>(I)) { AA->replaceWithNewValue(I, K); AA->replaceWithNewValue(J, K); @@ -2057,13 +2664,11 @@ namespace { } K1 = new ShuffleVectorInst(K, UndefValue::get(VType), - ConstantVector::get( - FlipMemInputs ? Mask2 : Mask1), + ConstantVector::get( Mask1), getReplacementName(K, false, 1)); } else { Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); - Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem-1); - K1 = ExtractElementInst::Create(K, FlipMemInputs ? CV1 : CV0, + K1 = ExtractElementInst::Create(K, CV0, getReplacementName(K, false, 1)); } @@ -2075,13 +2680,11 @@ namespace { } K2 = new ShuffleVectorInst(K, UndefValue::get(VType), - ConstantVector::get( - FlipMemInputs ? Mask1 : Mask2), + ConstantVector::get( Mask2), getReplacementName(K, false, 2)); } else { - Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem-1); - K2 = ExtractElementInst::Create(K, FlipMemInputs ? CV0 : CV1, + K2 = ExtractElementInst::Create(K, CV1, getReplacementName(K, false, 2)); } @@ -2181,36 +2784,6 @@ namespace { } } - // As with the aliasing information, SCEV can also change because of - // vectorization. This information is used to compute relative pointer - // offsets; the necessary information will be cached here prior to - // fusion. - void BBVectorize::collectPtrInfo(std::vector<Value *> &PairableInsts, - DenseMap<Value *, Value *> &ChosenPairs, - DenseSet<Value *> &LowPtrInsts) { - for (std::vector<Value *>::iterator PI = PairableInsts.begin(), - PIE = PairableInsts.end(); PI != PIE; ++PI) { - DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI); - if (P == ChosenPairs.end()) continue; - - Instruction *I = cast<Instruction>(P->first); - Instruction *J = cast<Instruction>(P->second); - - if (!isa<LoadInst>(I) && !isa<StoreInst>(I)) - continue; - - Value *IPtr, *JPtr; - unsigned IAlignment, JAlignment; - int64_t OffsetInElmts; - if (!getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, - OffsetInElmts) || abs64(OffsetInElmts) != 1) - llvm_unreachable("Pre-fusion pointer analysis failed"); - - Value *LowPI = (OffsetInElmts > 0) ? 
I : J; - LowPtrInsts.insert(LowPI); - } - } - // When the first instruction in each pair is cloned, it will inherit its // parent's metadata. This metadata must be combined with that of the other // instruction in a safe way. @@ -2244,27 +2817,27 @@ namespace { // second member). void BBVectorize::fuseChosenPairs(BasicBlock &BB, std::vector<Value *> &PairableInsts, - DenseMap<Value *, Value *> &ChosenPairs) { + DenseMap<Value *, Value *> &ChosenPairs, + DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + std::multimap<ValuePair, ValuePair> &ConnectedPairDeps) { LLVMContext& Context = BB.getContext(); // During the vectorization process, the order of the pairs to be fused // could be flipped. So we'll add each pair, flipped, into the ChosenPairs // list. After a pair is fused, the flipped pair is removed from the list. - std::vector<ValuePair> FlippedPairs; - FlippedPairs.reserve(ChosenPairs.size()); + DenseSet<ValuePair> FlippedPairs; for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(), E = ChosenPairs.end(); P != E; ++P) - FlippedPairs.push_back(ValuePair(P->second, P->first)); - for (std::vector<ValuePair>::iterator P = FlippedPairs.begin(), + FlippedPairs.insert(ValuePair(P->second, P->first)); + for (DenseSet<ValuePair>::iterator P = FlippedPairs.begin(), E = FlippedPairs.end(); P != E; ++P) ChosenPairs.insert(*P); std::multimap<Value *, Value *> LoadMoveSet; collectLoadMoveSet(BB, PairableInsts, ChosenPairs, LoadMoveSet); - DenseSet<Value *> LowPtrInsts; - collectPtrInfo(PairableInsts, ChosenPairs, LowPtrInsts); - DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n"); for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) { @@ -2304,44 +2877,92 @@ namespace { continue; } - bool FlipMemInputs = false; - if (isa<LoadInst>(I) || isa<StoreInst>(I)) - FlipMemInputs = (LowPtrInsts.find(I) == LowPtrInsts.end()); + // If the pair must have the other order, then flip it. + bool FlipPairOrder = FixedOrderPairs.count(ValuePair(J, I)); + if (!FlipPairOrder && !FixedOrderPairs.count(ValuePair(I, J))) { + // This pair does not have a fixed order, and so we might want to + // flip it if that will yield fewer shuffles. We count the number + // of dependencies connected via swaps, and those directly connected, + // and flip the order if the number of swaps is greater. + bool OrigOrder = true; + VPPIteratorPair IP = ConnectedPairDeps.equal_range(ValuePair(I, J)); + if (IP.first == ConnectedPairDeps.end()) { + IP = ConnectedPairDeps.equal_range(ValuePair(J, I)); + OrigOrder = false; + } + + if (IP.first != ConnectedPairDeps.end()) { + unsigned NumDepsDirect = 0, NumDepsSwap = 0; + for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first; + Q != IP.second; ++Q) { + DenseMap<VPPair, unsigned>::iterator R = + PairConnectionTypes.find(VPPair(Q->second, Q->first)); + assert(R != PairConnectionTypes.end() && + "Cannot find pair connection type"); + if (R->second == PairConnectionDirect) + ++NumDepsDirect; + else if (R->second == PairConnectionSwap) + ++NumDepsSwap; + } + + if (!OrigOrder) + std::swap(NumDepsDirect, NumDepsSwap); + + if (NumDepsSwap > NumDepsDirect) { + FlipPairOrder = true; + DEBUG(dbgs() << "BBV: reordering pair: " << *I << + " <-> " << *J << "\n"); + } + } + } + + Instruction *L = I, *H = J; + if (FlipPairOrder) + std::swap(H, L); + + // If the pair being fused uses the opposite order from that in the pair + // connection map, then we need to flip the types. 
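The type flip mentioned above is just a toggle between the direct and swapped connection kinds, with splat connections left alone; a small sketch mirroring the enumerators used by the pass (the flip is needed so that pairs fused later insert their fix-up shuffles on the correct side):

enum PairConnectionType { PairConnectionDirect, PairConnectionSwap,
                          PairConnectionSplat };

PairConnectionType flipConnection(PairConnectionType T) {
  if (T == PairConnectionDirect) return PairConnectionSwap;
  if (T == PairConnectionSwap)   return PairConnectionDirect;
  return PairConnectionSplat;    // splat edges are unaffected by the flip
}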
+ VPPIteratorPair IP = ConnectedPairs.equal_range(ValuePair(H, L)); + for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first; + Q != IP.second; ++Q) { + DenseMap<VPPair, unsigned>::iterator R = PairConnectionTypes.find(*Q); + assert(R != PairConnectionTypes.end() && + "Cannot find pair connection type"); + if (R->second == PairConnectionDirect) + R->second = PairConnectionSwap; + else if (R->second == PairConnectionSwap) + R->second = PairConnectionDirect; + } + bool LBeforeH = !FlipPairOrder; unsigned NumOperands = I->getNumOperands(); SmallVector<Value *, 3> ReplacedOperands(NumOperands); - getReplacementInputsForPair(Context, I, J, ReplacedOperands, - FlipMemInputs); + getReplacementInputsForPair(Context, L, H, ReplacedOperands, + LBeforeH); // Make a copy of the original operation, change its type to the vector // type and replace its operands with the vector operands. - Instruction *K = I->clone(); - if (I->hasName()) K->takeName(I); + Instruction *K = L->clone(); + if (L->hasName()) + K->takeName(L); + else if (H->hasName()) + K->takeName(H); if (!isa<StoreInst>(K)) - K->mutateType(getVecTypeForPair(I->getType(), J->getType())); + K->mutateType(getVecTypeForPair(L->getType(), H->getType())); - combineMetadata(K, J); + combineMetadata(K, H); + K->intersectOptionalDataWith(H); for (unsigned o = 0; o < NumOperands; ++o) K->setOperand(o, ReplacedOperands[o]); - // If we've flipped the memory inputs, make sure that we take the correct - // alignment. - if (FlipMemInputs) { - if (isa<StoreInst>(K)) - cast<StoreInst>(K)->setAlignment(cast<StoreInst>(J)->getAlignment()); - else - cast<LoadInst>(K)->setAlignment(cast<LoadInst>(J)->getAlignment()); - } - K->insertAfter(J); // Instruction insertion point: Instruction *InsertionPt = K; Instruction *K1 = 0, *K2 = 0; - replaceOutputsOfPair(Context, I, J, K, InsertionPt, K1, K2, - FlipMemInputs); + replaceOutputsOfPair(Context, L, H, K, InsertionPt, K1, K2); // The use tree of the first original instruction must be moved to after // the location of the second instruction. The entire use tree of the @@ -2351,10 +2972,10 @@ namespace { moveUsesOfIAfterJ(BB, LoadMoveSet, InsertionPt, I, J); if (!isa<StoreInst>(I)) { - I->replaceAllUsesWith(K1); - J->replaceAllUsesWith(K2); - AA->replaceWithNewValue(I, K1); - AA->replaceWithNewValue(J, K2); + L->replaceAllUsesWith(K1); + H->replaceAllUsesWith(K2); + AA->replaceWithNewValue(L, K1); + AA->replaceWithNewValue(H, K2); } // Instructions that may read from memory may be in the load move set. 
@@ -2387,6 +3008,9 @@ namespace { SE->forgetValue(J); I->eraseFromParent(); J->eraseFromParent(); + + DEBUG(if (PrintAfterEveryPair) dbgs() << "BBV: block is now: \n" << + BB << "\n"); } DEBUG(dbgs() << "BBV: final: \n" << BB << "\n"); @@ -2397,6 +3021,8 @@ char BBVectorize::ID = 0; static const char bb_vectorize_name[] = "Basic-Block Vectorization"; INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt index 06cf1e4..e64034a 100644 --- a/lib/Transforms/Vectorize/CMakeLists.txt +++ b/lib/Transforms/Vectorize/CMakeLists.txt @@ -1,6 +1,7 @@ add_llvm_library(LLVMVectorize BBVectorize.cpp Vectorize.cpp + LoopVectorize.cpp ) add_dependencies(LLVMVectorize intrinsics_gen) diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp new file mode 100644 index 0000000..9c82cb8 --- /dev/null +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -0,0 +1,3080 @@ +//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops +// and generates target-independent LLVM-IR. Legalization of the IR is done +// in the codegen. However, the vectorizes uses (will use) the codegen +// interfaces to generate IR that is likely to result in an optimal binary. +// +// The loop vectorizer combines consecutive loop iteration into a single +// 'wide' iteration. After this transformation the index is incremented +// by the SIMD vector width, and not by one. +// +// This pass has three parts: +// 1. The main loop pass that drives the different parts. +// 2. LoopVectorizationLegality - A unit that checks for the legality +// of the vectorization. +// 3. InnerLoopVectorizer - A unit that performs the actual +// widening of instructions. +// 4. LoopVectorizationCostModel - A unit that checks for the profitability +// of vectorization. It decides on the optimal vector width, which +// can be one, if vectorization is not profitable. +// +//===----------------------------------------------------------------------===// +// +// The reduction-variable vectorization is based on the paper: +// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. +// +// Variable uniformity checks are inspired by: +// Karrenberg, R. and Hack, S. Whole Function Vectorization. +// +// Other ideas/concepts are from: +// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. +// +// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of +// Vectorizing Compilers. 
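As a conceptual illustration of the transformation described in this header comment, written in plain C++ rather than IR, assuming a vectorization factor of 4 and a trip count that is a multiple of it (the remainder is otherwise handled by the epilogue loop mentioned below); the inner "lane" loop stands in for a single <4 x float> operation.

void saxpy_scalar(float *y, const float *x, float a, int n) {
  for (int i = 0; i < n; ++i)
    y[i] = a * x[i] + y[i];
}

void saxpy_widened(float *y, const float *x, float a, int n) {
  for (int i = 0; i < n; i += 4)            // the induction now steps by VF
    for (int lane = 0; lane < 4; ++lane)    // one wide operation per group
      y[i + lane] = a * x[i + lane] + y[i + lane];
}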
+// +//===----------------------------------------------------------------------===// + +#define LV_NAME "loop-vectorize" +#define DEBUG_TYPE LV_NAME + +#include "llvm/Transforms/Vectorize.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopIterator.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/Verifier.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include <algorithm> +#include <map> + +using namespace llvm; + +static cl::opt<unsigned> +VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, + cl::desc("Sets the SIMD width. Zero is autoselect.")); + +static cl::opt<unsigned> +VectorizationUnroll("force-vector-unroll", cl::init(0), cl::Hidden, + cl::desc("Sets the vectorization unroll count. " + "Zero is autoselect.")); + +static cl::opt<bool> +EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, + cl::desc("Enable if-conversion during vectorization.")); + +/// We don't vectorize loops with a known constant trip count below this number. +static const unsigned TinyTripCountVectorThreshold = 16; + +/// We don't unroll loops with a known constant trip count below this number. +static const unsigned TinyTripCountUnrollThreshold = 128; + +/// We don't unroll loops that are larget than this threshold. +static const unsigned MaxLoopSizeThreshold = 32; + +/// When performing a runtime memory check, do not check more than this +/// number of pointers. Notice that the check is quadratic! +static const unsigned RuntimeMemoryCheckThreshold = 4; + +/// This is the highest vector width that we try to generate. +static const unsigned MaxVectorSize = 8; + +/// This is the highest Unroll Factor. +static const unsigned MaxUnrollSize = 4; + +namespace { + +// Forward declarations. +class LoopVectorizationLegality; +class LoopVectorizationCostModel; + +/// InnerLoopVectorizer vectorizes loops which contain only one basic +/// block to a specified vectorization factor (VF). +/// This class performs the widening of scalars into vectors, or multiple +/// scalars. This class also implements the following features: +/// * It inserts an epilogue loop for handling loops that don't have iteration +/// counts that are known to be a multiple of the vectorization factor. +/// * It handles the code generation for reduction variables. +/// * Scalarization (implementation using scalars) of un-vectorizable +/// instructions. 
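Two of the features listed above, the epilogue loop and reduction handling, can be pictured with this plain-C++ sketch (VF = 4 assumed, floating-point reassociation concerns ignored): the scalar accumulator becomes a small array of partial sums that is reduced after the widened body, and the leftover iterations run in a scalar epilogue.

float sum_widened(const float *x, int n) {
  float part[4] = {0.0f, 0.0f, 0.0f, 0.0f};  // stands in for a <4 x float> PHI
  int i = 0;
  for (; i + 4 <= n; i += 4)                 // the widened vector body
    for (int lane = 0; lane < 4; ++lane)
      part[lane] += x[i + lane];
  float sum = part[0] + part[1] + part[2] + part[3];  // horizontal reduction
  for (; i < n; ++i)                         // scalar epilogue for the remainder
    sum += x[i];
  return sum;
}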
+/// InnerLoopVectorizer does not perform any vectorization-legality +/// checks, and relies on the caller to check for the different legality +/// aspects. The InnerLoopVectorizer relies on the +/// LoopVectorizationLegality class to provide information about the induction +/// and reduction variables that were found to a given vectorization factor. +class InnerLoopVectorizer { +public: + InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI, + DominatorTree *DT, DataLayout *DL, unsigned VecWidth, + unsigned UnrollFactor) + : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), VF(VecWidth), + UF(UnrollFactor), Builder(SE->getContext()), Induction(0), + OldInduction(0), WidenMap(UnrollFactor) {} + + // Perform the actual loop widening (vectorization). + void vectorize(LoopVectorizationLegality *Legal) { + // Create a new empty loop. Unlink the old loop and connect the new one. + createEmptyLoop(Legal); + // Widen each instruction in the old loop to a new one in the new loop. + // Use the Legality module to find the induction and reduction variables. + vectorizeLoop(Legal); + // Register the new loop and update the analysis passes. + updateAnalysis(); + } + +private: + /// A small list of PHINodes. + typedef SmallVector<PHINode*, 4> PhiVector; + /// When we unroll loops we have multiple vector values for each scalar. + /// This data structure holds the unrolled and vectorized values that + /// originated from one scalar instruction. + typedef SmallVector<Value*, 2> VectorParts; + + /// Add code that checks at runtime if the accessed arrays overlap. + /// Returns the comparator value or NULL if no check is needed. + Value *addRuntimeCheck(LoopVectorizationLegality *Legal, + Instruction *Loc); + /// Create an empty loop, based on the loop ranges of the old loop. + void createEmptyLoop(LoopVectorizationLegality *Legal); + /// Copy and widen the instructions from the old loop. + void vectorizeLoop(LoopVectorizationLegality *Legal); + + /// A helper function that computes the predicate of the block BB, assuming + /// that the header block of the loop is set to True. It returns the *entry* + /// mask for the block BB. + VectorParts createBlockInMask(BasicBlock *BB); + /// A helper function that computes the predicate of the edge between SRC + /// and DST. + VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst); + + /// A helper function to vectorize a single BB within the innermost loop. + void vectorizeBlockInLoop(LoopVectorizationLegality *Legal, BasicBlock *BB, + PhiVector *PV); + + /// Insert the new loop to the loop hierarchy and pass manager + /// and update the analysis passes. + void updateAnalysis(); + + /// This instruction is un-vectorizable. Implement it as a sequence + /// of scalars. + void scalarizeInstruction(Instruction *Instr); + + /// Create a broadcast instruction. This method generates a broadcast + /// instruction (shuffle) for loop invariant values and for the induction + /// value. If this is the induction variable then we extend it to N, N+1, ... + /// this is needed because each iteration in the loop corresponds to a SIMD + /// element. + Value *getBroadcastInstrs(Value *V); + + /// This function adds 0, 1, 2 ... to each vector element, starting at zero. + /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...). + /// The sequence starts at StartIndex. 
+ Value *getConsecutiveVector(Value* Val, unsigned StartIdx, bool Negate); + + /// When we go over instructions in the basic block we rely on previous + /// values within the current basic block or on loop invariant values. + /// When we widen (vectorize) values we place them in the map. If the values + /// are not within the map, they have to be loop invariant, so we simply + /// broadcast them into a vector. + VectorParts &getVectorValue(Value *V); + + /// Generate a shuffle sequence that will reverse the vector Vec. + Value *reverseVector(Value *Vec); + + /// This is a helper class that holds the vectorizer state. It maps scalar + /// instructions to vector instructions. When the code is 'unrolled' then + /// then a single scalar value is mapped to multiple vector parts. The parts + /// are stored in the VectorPart type. + struct ValueMap { + /// C'tor. UnrollFactor controls the number of vectors ('parts') that + /// are mapped. + ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {} + + /// \return True if 'Key' is saved in the Value Map. + bool has(Value *Key) { return MapStoreage.count(Key); } + + /// Initializes a new entry in the map. Sets all of the vector parts to the + /// save value in 'Val'. + /// \return A reference to a vector with splat values. + VectorParts &splat(Value *Key, Value *Val) { + MapStoreage[Key].clear(); + MapStoreage[Key].append(UF, Val); + return MapStoreage[Key]; + } + + ///\return A reference to the value that is stored at 'Key'. + VectorParts &get(Value *Key) { + if (!has(Key)) + MapStoreage[Key].resize(UF); + return MapStoreage[Key]; + } + + /// The unroll factor. Each entry in the map stores this number of vector + /// elements. + unsigned UF; + + /// Map storage. We use std::map and not DenseMap because insertions to a + /// dense map invalidates its iterators. + std::map<Value*, VectorParts> MapStoreage; + }; + + /// The original loop. + Loop *OrigLoop; + /// Scev analysis to use. + ScalarEvolution *SE; + /// Loop Info. + LoopInfo *LI; + /// Dominator Tree. + DominatorTree *DT; + /// Data Layout. + DataLayout *DL; + /// The vectorization SIMD factor to use. Each vector will have this many + /// vector elements. + unsigned VF; + /// The vectorization unroll factor to use. Each scalar is vectorized to this + /// many different vector instructions. + unsigned UF; + + /// The builder that we use + IRBuilder<> Builder; + + // --- Vectorization state --- + + /// The vector-loop preheader. + BasicBlock *LoopVectorPreHeader; + /// The scalar-loop preheader. + BasicBlock *LoopScalarPreHeader; + /// Middle Block between the vector and the scalar. + BasicBlock *LoopMiddleBlock; + ///The ExitBlock of the scalar loop. + BasicBlock *LoopExitBlock; + ///The vector loop body. + BasicBlock *LoopVectorBody; + ///The scalar loop body. + BasicBlock *LoopScalarBody; + ///The first bypass block. + BasicBlock *LoopBypassBlock; + + /// The new Induction variable which was added to the new block. + PHINode *Induction; + /// The induction variable of the old basic block. + PHINode *OldInduction; + /// Maps scalars to widened vectors. + ValueMap WidenMap; +}; + +/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and +/// to what vectorization factor. +/// This class does not look at the profitability of vectorization, only the +/// legality. 
This class has two main kinds of checks: +/// * Memory checks - The code in canVectorizeMemory checks if vectorization +/// will change the order of memory accesses in a way that will change the +/// correctness of the program. +/// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory +/// checks for a number of different conditions, such as the availability of a +/// single induction variable, that all types are supported and vectorize-able, +/// etc. This code reflects the capabilities of InnerLoopVectorizer. +/// This class is also used by InnerLoopVectorizer for identifying +/// induction variable and the different reduction variables. +class LoopVectorizationLegality { +public: + LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL, + DominatorTree *DT) + : TheLoop(L), SE(SE), DL(DL), DT(DT), Induction(0) {} + + /// This enum represents the kinds of reductions that we support. + enum ReductionKind { + RK_NoReduction, ///< Not a reduction. + RK_IntegerAdd, ///< Sum of integers. + RK_IntegerMult, ///< Product of integers. + RK_IntegerOr, ///< Bitwise or logical OR of numbers. + RK_IntegerAnd, ///< Bitwise or logical AND of numbers. + RK_IntegerXor, ///< Bitwise or logical XOR of numbers. + RK_FloatAdd, ///< Sum of floats. + RK_FloatMult ///< Product of floats. + }; + + /// This enum represents the kinds of inductions that we support. + enum InductionKind { + IK_NoInduction, ///< Not an induction variable. + IK_IntInduction, ///< Integer induction variable. Step = 1. + IK_ReverseIntInduction, ///< Reverse int induction variable. Step = -1. + IK_PtrInduction ///< Pointer induction variable. Step = sizeof(elem). + }; + + /// This POD struct holds information about reduction variables. + struct ReductionDescriptor { + ReductionDescriptor() : StartValue(0), LoopExitInstr(0), + Kind(RK_NoReduction) {} + + ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K) + : StartValue(Start), LoopExitInstr(Exit), Kind(K) {} + + // The starting value of the reduction. + // It does not have to be zero! + Value *StartValue; + // The instruction who's value is used outside the loop. + Instruction *LoopExitInstr; + // The kind of the reduction. + ReductionKind Kind; + }; + + // This POD struct holds information about the memory runtime legality + // check that a group of pointers do not overlap. + struct RuntimePointerCheck { + RuntimePointerCheck() : Need(false) {} + + /// Reset the state of the pointer runtime information. + void reset() { + Need = false; + Pointers.clear(); + Starts.clear(); + Ends.clear(); + } + + /// Insert a pointer and calculate the start and end SCEVs. + void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr); + + /// This flag indicates if we need to add the runtime check. + bool Need; + /// Holds the pointers that we need to check. + SmallVector<Value*, 2> Pointers; + /// Holds the pointer value at the beginning of the loop. + SmallVector<const SCEV*, 2> Starts; + /// Holds the pointer value at the end of the loop. + SmallVector<const SCEV*, 2> Ends; + }; + + /// A POD for saving information about induction variables. + struct InductionInfo { + InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {} + InductionInfo() : StartValue(0), IK(IK_NoInduction) {} + /// Start value. + Value *StartValue; + /// Induction kind. + InductionKind IK; + }; + + /// ReductionList contains the reduction descriptors for all + /// of the reductions that were found in the loop. 
+ typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList; + + /// InductionList saves induction variables and maps them to the + /// induction descriptor. + typedef MapVector<PHINode*, InductionInfo> InductionList; + + /// Returns true if it is legal to vectorize this loop. + /// This does not mean that it is profitable to vectorize this + /// loop, only that it is legal to do so. + bool canVectorize(); + + /// Returns the Induction variable. + PHINode *getInduction() { return Induction; } + + /// Returns the reduction variables found in the loop. + ReductionList *getReductionVars() { return &Reductions; } + + /// Returns the induction variables found in the loop. + InductionList *getInductionVars() { return &Inductions; } + + /// Returns True if V is an induction variable in this loop. + bool isInductionVariable(const Value *V); + + /// Return true if the block BB needs to be predicated in order for the loop + /// to be vectorized. + bool blockNeedsPredication(BasicBlock *BB); + + /// Check if this pointer is consecutive when vectorizing. This happens + /// when the last index of the GEP is the induction variable, or that the + /// pointer itself is an induction variable. + /// This check allows us to vectorize A[idx] into a wide load/store. + /// Returns: + /// 0 - Stride is unknown or non consecutive. + /// 1 - Address is consecutive. + /// -1 - Address is consecutive, and decreasing. + int isConsecutivePtr(Value *Ptr); + + /// Returns true if the value V is uniform within the loop. + bool isUniform(Value *V); + + /// Returns true if this instruction will remain scalar after vectorization. + bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); } + + /// Returns the information that we collected about runtime memory check. + RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; } +private: + /// Check if a single basic block loop is vectorizable. + /// At this point we know that this is a loop with a constant trip count + /// and we only need to check individual instructions. + bool canVectorizeInstrs(); + + /// When we vectorize loops we may change the order in which + /// we read and write from memory. This method checks if it is + /// legal to vectorize the code, considering only memory constrains. + /// Returns true if the loop is vectorizable + bool canVectorizeMemory(); + + /// Return true if we can vectorize this loop using the IF-conversion + /// transformation. + bool canVectorizeWithIfConvert(); + + /// Collect the variables that need to stay uniform after vectorization. + void collectLoopUniforms(); + + /// Return true if all of the instructions in the block can be speculatively + /// executed. + bool blockCanBePredicated(BasicBlock *BB); + + /// Returns True, if 'Phi' is the kind of reduction variable for type + /// 'Kind'. If this is a reduction variable, it adds it to ReductionList. + bool AddReductionVar(PHINode *Phi, ReductionKind Kind); + /// Returns true if the instruction I can be a reduction variable of type + /// 'Kind'. + bool isReductionInstr(Instruction *I, ReductionKind Kind); + /// Returns the induction kind of Phi. This function may return NoInduction + /// if the PHI is not an induction variable. + InductionKind isInductionVariable(PHINode *Phi); + /// Return true if can compute the address bounds of Ptr within the loop. + bool hasComputableBounds(Value *Ptr); + + /// The loop that we evaluate. + Loop *TheLoop; + /// Scev analysis. + ScalarEvolution *SE; + /// DataLayout analysis. + DataLayout *DL; + // Dominators. 
+ DominatorTree *DT; + + // --- vectorization state --- // + + /// Holds the integer induction variable. This is the counter of the + /// loop. + PHINode *Induction; + /// Holds the reduction variables. + ReductionList Reductions; + /// Holds all of the induction variables that we found in the loop. + /// Notice that inductions don't need to start at zero and that induction + /// variables can be pointers. + InductionList Inductions; + + /// Allowed outside users. This holds the reduction + /// vars which can be accessed from outside the loop. + SmallPtrSet<Value*, 4> AllowedExit; + /// This set holds the variables which are known to be uniform after + /// vectorization. + SmallPtrSet<Instruction*, 4> Uniforms; + /// We need to check that all of the pointers in this list are disjoint + /// at runtime. + RuntimePointerCheck PtrRtCheck; +}; + +/// LoopVectorizationCostModel - estimates the expected speedups due to +/// vectorization. +/// In many cases vectorization is not profitable. This can happen because of +/// a number of reasons. In this class we mainly attempt to predict the +/// expected speedup/slowdowns due to the supported instruction set. We use the +/// TargetTransformInfo to query the different backends for the cost of +/// different operations. +class LoopVectorizationCostModel { +public: + LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI, + LoopVectorizationLegality *Legal, + const TargetTransformInfo &TTI) + : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI) {} + + /// \return The most profitable vectorization factor. + /// This method checks every power of two up to VF. If UserVF is not ZERO + /// then this vectorization factor will be selected if vectorization is + /// possible. + unsigned selectVectorizationFactor(bool OptForSize, unsigned UserVF); + + + /// \return The most profitable unroll factor. + /// If UserUF is non-zero then this method finds the best unroll-factor + /// based on register pressure and other parameters. + unsigned selectUnrollFactor(bool OptForSize, unsigned UserUF); + + /// \brief A struct that represents some properties of the register usage + /// of a loop. + struct RegisterUsage { + /// Holds the number of loop invariant values that are used in the loop. + unsigned LoopInvariantRegs; + /// Holds the maximum number of concurrent live intervals in the loop. + unsigned MaxLocalUsers; + /// Holds the number of instructions in the loop. + unsigned NumInstructions; + }; + + /// \return information about the register usage of the loop. + RegisterUsage calculateRegisterUsage(); + +private: + /// Returns the expected execution cost. The unit of the cost does + /// not matter because we use the 'cost' units to compare different + /// vector widths. The cost that is returned is *not* normalized by + /// the factor width. + unsigned expectedCost(unsigned VF); + + /// Returns the execution time cost of an instruction for a given vector + /// width. Vector width of one means scalar. + unsigned getInstructionCost(Instruction *I, unsigned VF); + + /// A helper function for converting Scalar types to vector types. + /// If the incoming type is void, we return void. If the VF is 1, we return + /// the scalar type. + static Type* ToVectorTy(Type *Scalar, unsigned VF); + + /// The loop that we evaluate. + Loop *TheLoop; + /// Scev analysis. + ScalarEvolution *SE; + /// Loop Info analysis. + LoopInfo *LI; + /// Vectorization legality. + LoopVectorizationLegality *Legal; + /// Vector target information. 
+  const TargetTransformInfo &TTI;
+};
+
+/// The LoopVectorize Pass.
+struct LoopVectorize : public LoopPass {
+  /// Pass identification, replacement for typeid
+  static char ID;
+
+  explicit LoopVectorize() : LoopPass(ID) {
+    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
+  }
+
+  ScalarEvolution *SE;
+  DataLayout *DL;
+  LoopInfo *LI;
+  TargetTransformInfo *TTI;
+  DominatorTree *DT;
+
+  virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
+    // We only vectorize innermost loops.
+    if (!L->empty())
+      return false;
+
+    SE = &getAnalysis<ScalarEvolution>();
+    DL = getAnalysisIfAvailable<DataLayout>();
+    LI = &getAnalysis<LoopInfo>();
+    TTI = &getAnalysis<TargetTransformInfo>();
+    DT = &getAnalysis<DominatorTree>();
+
+    DEBUG(dbgs() << "LV: Checking a loop in \"" <<
+          L->getHeader()->getParent()->getName() << "\"\n");
+
+    // Check if it is legal to vectorize the loop.
+    LoopVectorizationLegality LVL(L, SE, DL, DT);
+    if (!LVL.canVectorize()) {
+      DEBUG(dbgs() << "LV: Not vectorizing.\n");
+      return false;
+    }
+
+    // Use the cost model.
+    LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI);
+
+    // Check the function attributes to find out if this function should be
+    // optimized for size.
+    Function *F = L->getHeader()->getParent();
+    Attribute::AttrKind SzAttr = Attribute::OptimizeForSize;
+    Attribute::AttrKind FlAttr = Attribute::NoImplicitFloat;
+    unsigned FnIndex = AttributeSet::FunctionIndex;
+    bool OptForSize = F->getAttributes().hasAttribute(FnIndex, SzAttr);
+    bool NoFloat = F->getAttributes().hasAttribute(FnIndex, FlAttr);
+
+    if (NoFloat) {
+      DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat "
+            "attribute is used.\n");
+      return false;
+    }
+
+    unsigned VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor);
+    unsigned UF = CM.selectUnrollFactor(OptForSize, VectorizationUnroll);
+
+    if (VF == 1) {
+      DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
+      return false;
+    }
+
+    DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<<
+          F->getParent()->getModuleIdentifier()<<"\n");
+    DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n");
+
+    // If we decided that it is *legal* to vectorize the loop then do it.
+    InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF, UF);
+    LB.vectorize(&LVL);
+
+    DEBUG(verifyFunction(*L->getHeader()->getParent()));
+    return true;
+  }
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    LoopPass::getAnalysisUsage(AU);
+    AU.addRequiredID(LoopSimplifyID);
+    AU.addRequiredID(LCSSAID);
+    AU.addRequired<DominatorTree>();
+    AU.addRequired<LoopInfo>();
+    AU.addRequired<ScalarEvolution>();
+    AU.addRequired<TargetTransformInfo>();
+    AU.addPreserved<LoopInfo>();
+    AU.addPreserved<DominatorTree>();
+  }
+
+};
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
+// LoopVectorizationCostModel.
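+//
+// As a rough illustration of the overall transformation (a sketch, not the
+// exact IR produced by this patch; A, B, K and n are made-up names), a
+// scalar loop such as
+//
+//   for (int i = 0; i < n; ++i)
+//     A[i] = B[i] + K;
+//
+// is turned, for VF = 4 and UF = 1, into a vector body that handles four
+// elements per iteration plus a scalar remainder loop for the last n % 4
+// iterations:
+//
+//   for (i = 0; i + 4 <= n; i += 4)
+//     A[i..i+3] = B[i..i+3] + <K,K,K,K>;   // wide loads/stores
+//   for (; i < n; ++i)
+//     A[i] = B[i] + K;                     // scalar remainder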
+//===----------------------------------------------------------------------===// + +void +LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE, + Loop *Lp, Value *Ptr) { + const SCEV *Sc = SE->getSCEV(Ptr); + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc); + assert(AR && "Invalid addrec expression"); + const SCEV *Ex = SE->getExitCount(Lp, Lp->getLoopLatch()); + const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE); + Pointers.push_back(Ptr); + Starts.push_back(AR->getStart()); + Ends.push_back(ScEnd); +} + +Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { + // Save the current insertion location. + Instruction *Loc = Builder.GetInsertPoint(); + + // We need to place the broadcast of invariant variables outside the loop. + Instruction *Instr = dyn_cast<Instruction>(V); + bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody); + bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr; + + // Place the code for broadcasting invariant variables in the new preheader. + if (Invariant) + Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); + + // Broadcast the scalar into all locations in the vector. + Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); + + // Restore the builder insertion point. + if (Invariant) + Builder.SetInsertPoint(Loc); + + return Shuf; +} + +Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, unsigned StartIdx, + bool Negate) { + assert(Val->getType()->isVectorTy() && "Must be a vector"); + assert(Val->getType()->getScalarType()->isIntegerTy() && + "Elem must be an integer"); + // Create the types. + Type *ITy = Val->getType()->getScalarType(); + VectorType *Ty = cast<VectorType>(Val->getType()); + int VLen = Ty->getNumElements(); + SmallVector<Constant*, 8> Indices; + + // Create a vector of consecutive numbers from zero to VF. + for (int i = 0; i < VLen; ++i) { + int Idx = Negate ? (-i): i; + Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx)); + } + + // Add the consecutive indices to the vector value. + Constant *Cv = ConstantVector::get(Indices); + assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); + return Builder.CreateAdd(Val, Cv, "induction"); +} + +int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { + assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr"); + + // If this value is a pointer induction variable we know it is consecutive. + PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr); + if (Phi && Inductions.count(Phi)) { + InductionInfo II = Inductions[Phi]; + if (IK_PtrInduction == II.IK) + return 1; + } + + GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr); + if (!Gep) + return 0; + + unsigned NumOperands = Gep->getNumOperands(); + Value *LastIndex = Gep->getOperand(NumOperands - 1); + + // Check that all of the gep indices are uniform except for the last. + for (unsigned i = 0; i < NumOperands - 1; ++i) + if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) + return 0; + + // We can emit wide load/stores only if the last index is the induction + // variable. + const SCEV *Last = SE->getSCEV(LastIndex); + if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) { + const SCEV *Step = AR->getStepRecurrence(*SE); + + // The memory is consecutive because the last index is consecutive + // and all other indices are loop invariant. 
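+    //
+    // For example (illustration only, the array names are made up), in a
+    // loop such as
+    //   for (i = 0; i < n; ++i) {
+    //     A[i]     = ...;   // last GEP index is {0,+,1}: step  1
+    //     B[n - i] = ...;   // last GEP index is {n,+,-1}: step -1
+    //   }
+    // the first access is consecutive and the second is consecutive but
+    // decreasing.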
+ if (Step->isOne()) + return 1; + if (Step->isAllOnesValue()) + return -1; + } + + return 0; +} + +bool LoopVectorizationLegality::isUniform(Value *V) { + return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop)); +} + +InnerLoopVectorizer::VectorParts& +InnerLoopVectorizer::getVectorValue(Value *V) { + assert(V != Induction && "The new induction variable should not be used."); + assert(!V->getType()->isVectorTy() && "Can't widen a vector"); + + // If we have this scalar in the map, return it. + if (WidenMap.has(V)) + return WidenMap.get(V); + + // If this scalar is unknown, assume that it is a constant or that it is + // loop invariant. Broadcast V and save the value for future uses. + Value *B = getBroadcastInstrs(V); + WidenMap.splat(V, B); + return WidenMap.get(V); +} + +Value *InnerLoopVectorizer::reverseVector(Value *Vec) { + assert(Vec->getType()->isVectorTy() && "Invalid type"); + SmallVector<Constant*, 8> ShuffleMask; + for (unsigned i = 0; i < VF; ++i) + ShuffleMask.push_back(Builder.getInt32(VF - i - 1)); + + return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), + ConstantVector::get(ShuffleMask), + "reverse"); +} + +void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) { + assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); + // Holds vector parameters or scalars, in case of uniform vals. + SmallVector<VectorParts, 4> Params; + + // Find all of the vectorized parameters. + for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { + Value *SrcOp = Instr->getOperand(op); + + // If we are accessing the old induction variable, use the new one. + if (SrcOp == OldInduction) { + Params.push_back(getVectorValue(SrcOp)); + continue; + } + + // Try using previously calculated values. + Instruction *SrcInst = dyn_cast<Instruction>(SrcOp); + + // If the src is an instruction that appeared earlier in the basic block + // then it should already be vectorized. + if (SrcInst && OrigLoop->contains(SrcInst)) { + assert(WidenMap.has(SrcInst) && "Source operand is unavailable"); + // The parameter is a vector value from earlier. + Params.push_back(WidenMap.get(SrcInst)); + } else { + // The parameter is a scalar from outside the loop. Maybe even a constant. + VectorParts Scalars; + Scalars.append(UF, SrcOp); + Params.push_back(Scalars); + } + } + + assert(Params.size() == Instr->getNumOperands() && + "Invalid number of operands"); + + // Does this instruction return a value ? + bool IsVoidRetTy = Instr->getType()->isVoidTy(); + + Value *UndefVec = IsVoidRetTy ? 0 : + UndefValue::get(VectorType::get(Instr->getType(), VF)); + // Create a new entry in the WidenMap and initialize it to Undef or Null. + VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); + + // For each scalar that we create: + for (unsigned Width = 0; Width < VF; ++Width) { + // For each vector unroll 'part': + for (unsigned Part = 0; Part < UF; ++Part) { + Instruction *Cloned = Instr->clone(); + if (!IsVoidRetTy) + Cloned->setName(Instr->getName() + ".cloned"); + // Replace the operands of the cloned instrucions with extracted scalars. + for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { + Value *Op = Params[op][Part]; + // Param is a vector. Need to extract the right lane. + if (Op->getType()->isVectorTy()) + Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width)); + Cloned->setOperand(op, Op); + } + + // Place the cloned scalar in the new loop. 
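+      // Taken together with the insertelement below, this produces for VF=2,
+      // per unroll part, roughly (a sketch with invented value names):
+      //   %e0 = extractelement <2 x i32> %op, i32 0
+      //   %c0 = <clone of the scalar instruction using %e0>
+      //   %r  = insertelement <2 x i32> undef, i32 %c0, i32 0
+      // and likewise for lane 1.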
+ Builder.Insert(Cloned); + + // If the original scalar returns a value we need to place it in a vector + // so that future users will be able to use it. + if (!IsVoidRetTy) + VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned, + Builder.getInt32(Width)); + } + } +} + +Value* +InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, + Instruction *Loc) { + LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck = + Legal->getRuntimePointerCheck(); + + if (!PtrRtCheck->Need) + return NULL; + + Value *MemoryRuntimeCheck = 0; + unsigned NumPointers = PtrRtCheck->Pointers.size(); + SmallVector<Value* , 2> Starts; + SmallVector<Value* , 2> Ends; + + SCEVExpander Exp(*SE, "induction"); + + // Use this type for pointer arithmetic. + Type* PtrArithTy = Type::getInt8PtrTy(Loc->getContext(), 0); + + for (unsigned i = 0; i < NumPointers; ++i) { + Value *Ptr = PtrRtCheck->Pointers[i]; + const SCEV *Sc = SE->getSCEV(Ptr); + + if (SE->isLoopInvariant(Sc, OrigLoop)) { + DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" << + *Ptr <<"\n"); + Starts.push_back(Ptr); + Ends.push_back(Ptr); + } else { + DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n"); + + Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc); + Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc); + Starts.push_back(Start); + Ends.push_back(End); + } + } + + for (unsigned i = 0; i < NumPointers; ++i) { + for (unsigned j = i+1; j < NumPointers; ++j) { + Instruction::CastOps Op = Instruction::BitCast; + Value *Start0 = CastInst::Create(Op, Starts[i], PtrArithTy, "bc", Loc); + Value *Start1 = CastInst::Create(Op, Starts[j], PtrArithTy, "bc", Loc); + Value *End0 = CastInst::Create(Op, Ends[i], PtrArithTy, "bc", Loc); + Value *End1 = CastInst::Create(Op, Ends[j], PtrArithTy, "bc", Loc); + + Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, + Start0, End1, "bound0", Loc); + Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, + Start1, End0, "bound1", Loc); + Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1, + "found.conflict", Loc); + if (MemoryRuntimeCheck) + MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or, + MemoryRuntimeCheck, + IsConflict, + "conflict.rdx", Loc); + else + MemoryRuntimeCheck = IsConflict; + + } + } + + return MemoryRuntimeCheck; +} + +void +InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { + /* + In this function we generate a new loop. The new loop will contain + the vectorized instructions while the old loop will continue to run the + scalar remainder. + + [ ] <-- vector loop bypass. + / | + / v + | [ ] <-- vector pre header. + | | + | v + | [ ] \ + | [ ]_| <-- vector loop. + | | + \ v + >[ ] <--- middle-block. + / | + / v + | [ ] <--- new preheader. + | | + | v + | [ ] \ + | [ ]_| <-- old scalar loop to handle remainder. + \ | + \ v + >[ ] <-- exit block. + ... + */ + + BasicBlock *OldBasicBlock = OrigLoop->getHeader(); + BasicBlock *BypassBlock = OrigLoop->getLoopPreheader(); + BasicBlock *ExitBlock = OrigLoop->getExitBlock(); + assert(ExitBlock && "Must have an exit block"); + + // Some loops have a single integer induction variable, while other loops + // don't. One example is c++ iterators that often have multiple pointer + // induction variables. In the code below we also support a case where we + // don't have a single induction variable. + OldInduction = Legal->getInduction(); + Type *IdxTy = OldInduction ? 
OldInduction->getType() : + DL->getIntPtrType(SE->getContext()); + + // Find the loop boundaries. + const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getLoopLatch()); + assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); + + // Get the total trip count from the count by adding 1. + ExitCount = SE->getAddExpr(ExitCount, + SE->getConstant(ExitCount->getType(), 1)); + + // Expand the trip count and place the new instructions in the preheader. + // Notice that the pre-header does not change, only the loop body. + SCEVExpander Exp(*SE, "induction"); + + // Count holds the overall loop count (N). + Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(), + BypassBlock->getTerminator()); + + // The loop index does not have to start at Zero. Find the original start + // value from the induction PHI node. If we don't have an induction variable + // then we know that it starts at zero. + Value *StartIdx = OldInduction ? + OldInduction->getIncomingValueForBlock(BypassBlock): + ConstantInt::get(IdxTy, 0); + + assert(BypassBlock && "Invalid loop structure"); + + // Generate the code that checks in runtime if arrays overlap. + Value *MemoryRuntimeCheck = addRuntimeCheck(Legal, + BypassBlock->getTerminator()); + + // Split the single block loop into the two loop structure described above. + BasicBlock *VectorPH = + BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph"); + BasicBlock *VecBody = + VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); + BasicBlock *MiddleBlock = + VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block"); + BasicBlock *ScalarPH = + MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); + + // This is the location in which we add all of the logic for bypassing + // the new vector loop. + Instruction *Loc = BypassBlock->getTerminator(); + + // Use this IR builder to create the loop instructions (Phi, Br, Cmp) + // inside the loop. + Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); + + // Generate the induction variable. + Induction = Builder.CreatePHI(IdxTy, 2, "index"); + // The loop step is equal to the vectorization factor (num of SIMD elements) + // times the unroll factor (num of SIMD instructions). + Constant *Step = ConstantInt::get(IdxTy, VF * UF); + + // We may need to extend the index in case there is a type mismatch. + // We know that the count starts at zero and does not overflow. + if (Count->getType() != IdxTy) { + // The exit count can be of pointer type. Convert it to the correct + // integer type. + if (ExitCount->getType()->isPointerTy()) + Count = CastInst::CreatePointerCast(Count, IdxTy, "ptrcnt.to.int", Loc); + else + Count = CastInst::CreateZExtOrBitCast(Count, IdxTy, "zext.cnt", Loc); + } + + // Add the start index to the loop count to get the new end index. + Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc); + + // Now we need to generate the expression for N - (N % VF), which is + // the part that the vectorized body will execute. + Value *R = BinaryOperator::CreateURem(Count, Step, "n.mod.vf", Loc); + Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc); + Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx, + "end.idx.rnd.down", Loc); + + // Now, compare the new count to zero. If it is zero skip the vector loop and + // jump to the scalar loop. 
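+  // The emitted bypass logic is roughly (a sketch; the value names and the
+  // i64 index type are illustrative, and the memory runtime check, if any,
+  // is OR'd into the compare before the branch):
+  //   %cmp.zero = icmp eq i64 %end.idx.rnd.down, %start.idx
+  //   br i1 %cmp.zero, label %middle.block, label %vector.ph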
+ Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, + IdxEndRoundDown, + StartIdx, + "cmp.zero", Loc); + + // If we are using memory runtime checks, include them in. + if (MemoryRuntimeCheck) + Cmp = BinaryOperator::Create(Instruction::Or, Cmp, MemoryRuntimeCheck, + "CntOrMem", Loc); + + BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc); + // Remove the old terminator. + Loc->eraseFromParent(); + + // We are going to resume the execution of the scalar loop. + // Go over all of the induction variables that we found and fix the + // PHIs that are left in the scalar version of the loop. + // The starting values of PHI nodes depend on the counter of the last + // iteration in the vectorized loop. + // If we come from a bypass edge then we need to start from the original + // start value. + + // This variable saves the new starting index for the scalar loop. + PHINode *ResumeIndex = 0; + LoopVectorizationLegality::InductionList::iterator I, E; + LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); + for (I = List->begin(), E = List->end(); I != E; ++I) { + PHINode *OrigPhi = I->first; + LoopVectorizationLegality::InductionInfo II = I->second; + PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val", + MiddleBlock->getTerminator()); + Value *EndValue = 0; + switch (II.IK) { + case LoopVectorizationLegality::IK_NoInduction: + llvm_unreachable("Unknown induction"); + case LoopVectorizationLegality::IK_IntInduction: { + // Handle the integer induction counter: + assert(OrigPhi->getType()->isIntegerTy() && "Invalid type"); + assert(OrigPhi == OldInduction && "Unknown integer PHI"); + // We know what the end value is. + EndValue = IdxEndRoundDown; + // We also know which PHI node holds it. + ResumeIndex = ResumeVal; + break; + } + case LoopVectorizationLegality::IK_ReverseIntInduction: { + // Convert the CountRoundDown variable to the PHI size. + unsigned CRDSize = CountRoundDown->getType()->getScalarSizeInBits(); + unsigned IISize = II.StartValue->getType()->getScalarSizeInBits(); + Value *CRD = CountRoundDown; + if (CRDSize > IISize) + CRD = CastInst::Create(Instruction::Trunc, CountRoundDown, + II.StartValue->getType(), + "tr.crd", BypassBlock->getTerminator()); + else if (CRDSize < IISize) + CRD = CastInst::Create(Instruction::SExt, CountRoundDown, + II.StartValue->getType(), + "sext.crd", BypassBlock->getTerminator()); + // Handle reverse integer induction counter: + EndValue = BinaryOperator::CreateSub(II.StartValue, CRD, "rev.ind.end", + BypassBlock->getTerminator()); + break; + } + case LoopVectorizationLegality::IK_PtrInduction: { + // For pointer induction variables, calculate the offset using + // the end index. + EndValue = GetElementPtrInst::Create(II.StartValue, CountRoundDown, + "ptr.ind.end", + BypassBlock->getTerminator()); + break; + } + }// end of case + + // The new PHI merges the original incoming value, in case of a bypass, + // or the value at the end of the vectorized loop. + ResumeVal->addIncoming(II.StartValue, BypassBlock); + ResumeVal->addIncoming(EndValue, VecBody); + + // Fix the scalar body counter (PHI node). + unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH); + OrigPhi->setIncomingValue(BlockIdx, ResumeVal); + } + + // If we are generating a new induction variable then we also need to + // generate the code that calculates the exit value. This value is not + // simply the end of the counter because we may skip the vectorized body + // in case of a runtime check. 
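+  // In that case the resume value is a new PHI in the middle block that
+  // merges the original start index (bypass edge) with the rounded-down end
+  // index (vector-loop edge), roughly (sketch, type and labels illustrative):
+  //   %new.indc.resume.val = phi i64 [ %start.idx, %bypass ],
+  //                                  [ %end.idx.rnd.down, %vector.body ]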
+ if (!OldInduction){ + assert(!ResumeIndex && "Unexpected resume value found"); + ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val", + MiddleBlock->getTerminator()); + ResumeIndex->addIncoming(StartIdx, BypassBlock); + ResumeIndex->addIncoming(IdxEndRoundDown, VecBody); + } + + // Make sure that we found the index where scalar loop needs to continue. + assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() && + "Invalid resume Index"); + + // Add a check in the middle block to see if we have completed + // all of the iterations in the first vector loop. + // If (N - N%VF) == N, then we *don't* need to run the remainder. + Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd, + ResumeIndex, "cmp.n", + MiddleBlock->getTerminator()); + + BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator()); + // Remove the old terminator. + MiddleBlock->getTerminator()->eraseFromParent(); + + // Create i+1 and fill the PHINode. + Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next"); + Induction->addIncoming(StartIdx, VectorPH); + Induction->addIncoming(NextIdx, VecBody); + // Create the compare. + Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown); + Builder.CreateCondBr(ICmp, MiddleBlock, VecBody); + + // Now we have two terminators. Remove the old one from the block. + VecBody->getTerminator()->eraseFromParent(); + + // Get ready to start creating new instructions into the vectorized body. + Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); + + // Create and register the new vector loop. + Loop* Lp = new Loop(); + Loop *ParentLoop = OrigLoop->getParentLoop(); + + // Insert the new loop into the loop nest and register the new basic blocks. + if (ParentLoop) { + ParentLoop->addChildLoop(Lp); + ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase()); + ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); + ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); + } else { + LI->addTopLevelLoop(Lp); + } + + Lp->addBasicBlockToLoop(VecBody, LI->getBase()); + + // Save the state. + LoopVectorPreHeader = VectorPH; + LoopScalarPreHeader = ScalarPH; + LoopMiddleBlock = MiddleBlock; + LoopExitBlock = ExitBlock; + LoopVectorBody = VecBody; + LoopScalarBody = OldBasicBlock; + LoopBypassBlock = BypassBlock; +} + +/// This function returns the identity element (or neutral element) for +/// the operation K. +static Constant* +getReductionIdentity(LoopVectorizationLegality::ReductionKind K, Type *Tp) { + switch (K) { + case LoopVectorizationLegality:: RK_IntegerXor: + case LoopVectorizationLegality:: RK_IntegerAdd: + case LoopVectorizationLegality:: RK_IntegerOr: + // Adding, Xoring, Oring zero to a number does not change it. + return ConstantInt::get(Tp, 0); + case LoopVectorizationLegality:: RK_IntegerMult: + // Multiplying a number by 1 does not change it. + return ConstantInt::get(Tp, 1); + case LoopVectorizationLegality:: RK_IntegerAnd: + // AND-ing a number with an all-1 value does not change it. + return ConstantInt::get(Tp, -1, true); + case LoopVectorizationLegality:: RK_FloatMult: + // Multiplying a number by 1 does not change it. + return ConstantFP::get(Tp, 1.0L); + case LoopVectorizationLegality:: RK_FloatAdd: + // Adding zero to a number does not change it. 
+ return ConstantFP::get(Tp, 0.0L); + default: + llvm_unreachable("Unknown reduction kind"); + } +} + +static bool +isTriviallyVectorizableIntrinsic(Instruction *Inst) { + IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst); + if (!II) + return false; + switch (II->getIntrinsicID()) { + case Intrinsic::sqrt: + case Intrinsic::sin: + case Intrinsic::cos: + case Intrinsic::exp: + case Intrinsic::exp2: + case Intrinsic::log: + case Intrinsic::log10: + case Intrinsic::log2: + case Intrinsic::fabs: + case Intrinsic::floor: + case Intrinsic::ceil: + case Intrinsic::trunc: + case Intrinsic::rint: + case Intrinsic::nearbyint: + case Intrinsic::pow: + case Intrinsic::fma: + case Intrinsic::fmuladd: + return true; + default: + return false; + } + return false; +} + +/// This function translates the reduction kind to an LLVM binary operator. +static Instruction::BinaryOps +getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) { + switch (Kind) { + case LoopVectorizationLegality::RK_IntegerAdd: + return Instruction::Add; + case LoopVectorizationLegality::RK_IntegerMult: + return Instruction::Mul; + case LoopVectorizationLegality::RK_IntegerOr: + return Instruction::Or; + case LoopVectorizationLegality::RK_IntegerAnd: + return Instruction::And; + case LoopVectorizationLegality::RK_IntegerXor: + return Instruction::Xor; + case LoopVectorizationLegality::RK_FloatMult: + return Instruction::FMul; + case LoopVectorizationLegality::RK_FloatAdd: + return Instruction::FAdd; + default: + llvm_unreachable("Unknown reduction operation"); + } +} + +void +InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { + //===------------------------------------------------===// + // + // Notice: any optimization or new instruction that go + // into the code below should be also be implemented in + // the cost-model. + // + //===------------------------------------------------===// + BasicBlock &BB = *OrigLoop->getHeader(); + Constant *Zero = + ConstantInt::get(IntegerType::getInt32Ty(BB.getContext()), 0); + + // In order to support reduction variables we need to be able to vectorize + // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two + // stages. First, we create a new vector PHI node with no incoming edges. + // We use this value when we vectorize all of the instructions that use the + // PHI. Next, after all of the instructions in the block are complete we + // add the new incoming edges to the PHI. At this point all of the + // instructions in the basic block are vectorized, so we can use them to + // construct the PHI. + PhiVector RdxPHIsToFix; + + // Scan the loop in a topological order to ensure that defs are vectorized + // before users. + LoopBlocksDFS DFS(OrigLoop); + DFS.perform(LI); + + // Vectorize all of the blocks in the original loop. + for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), + be = DFS.endRPO(); bb != be; ++bb) + vectorizeBlockInLoop(Legal, *bb, &RdxPHIsToFix); + + // At this point every instruction in the original loop is widened to + // a vector form. We are almost done. Now, we need to fix the PHI nodes + // that we vectorized. The PHI nodes are currently empty because we did + // not want to introduce cycles. Notice that the remaining PHI nodes + // that we need to fix are reduction variables. + + // Create the 'reduced' values for each of the induction vars. + // The reduced values are the vector values that we scalarize and combine + // after the loop is finished. 
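+  //
+  // As a rough example, a sum reduction vectorized with VF=4 keeps a
+  // <4 x i32> of partial sums in the vector loop. Below, lane 0 of the
+  // incoming vector is seeded with the scalar start value and the other
+  // lanes with the identity element (0 for integer add), e.g. (sketch):
+  //   %vec.start = insertelement <4 x i32> zeroinitializer, i32 %start, i32 0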
+  for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
+       it != e; ++it) {
+    PHINode *RdxPhi = *it;
+    assert(RdxPhi && "Unable to recover vectorized PHI");
+
+    // Find the reduction variable descriptor.
+    assert(Legal->getReductionVars()->count(RdxPhi) &&
+           "Unable to find the reduction variable");
+    LoopVectorizationLegality::ReductionDescriptor RdxDesc =
+      (*Legal->getReductionVars())[RdxPhi];
+
+    // We need to generate a reduction vector from the incoming scalar.
+    // To do so, we need to generate the 'identity' vector and override
+    // one of the elements with the incoming scalar reduction. We need
+    // to do it in the vector-loop preheader.
+    Builder.SetInsertPoint(LoopBypassBlock->getTerminator());
+
+    // This is the vector-clone of the value that leaves the loop.
+    VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
+    Type *VecTy = VectorExit[0]->getType();
+
+    // Find the reduction identity variable. Zero for addition, or, xor,
+    // one for multiplication, -1 for And.
+    Constant *Iden = getReductionIdentity(RdxDesc.Kind, VecTy->getScalarType());
+    Constant *Identity = ConstantVector::getSplat(VF, Iden);
+
+    // This vector is the Identity vector where the first element is the
+    // incoming scalar reduction.
+    Value *VectorStart = Builder.CreateInsertElement(Identity,
+                                                     RdxDesc.StartValue, Zero);
+
+    // Fix the vector-loop phi.
+    // We created the induction variable so we know that the
+    // preheader is the first entry.
+    BasicBlock *VecPreheader = Induction->getIncomingBlock(0);
+
+    // Reductions do not have to start at zero. They can start with
+    // any loop invariant values.
+    VectorParts &VecRdxPhi = WidenMap.get(RdxPhi);
+    BasicBlock *Latch = OrigLoop->getLoopLatch();
+    Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch);
+    VectorParts &Val = getVectorValue(LoopVal);
+    for (unsigned part = 0; part < UF; ++part) {
+      // Make sure to add the reduction start value only to the
+      // first unroll part.
+      Value *StartVal = (part == 0) ? VectorStart : Identity;
+      cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, VecPreheader);
+      cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part], LoopVectorBody);
+    }
+
+    // Before each round, move the insertion point right between
+    // the PHIs and the values we are going to write.
+    // This allows us to write both PHINodes and the extractelement
+    // instructions.
+    Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
+
+    VectorParts RdxParts;
+    for (unsigned part = 0; part < UF; ++part) {
+      // This PHINode contains the vectorized reduction variable, or
+      // the initial value vector, if we bypass the vector loop.
+      VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr);
+      PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
+      Value *StartVal = (part == 0) ? VectorStart : Identity;
+      NewPhi->addIncoming(StartVal, LoopBypassBlock);
+      NewPhi->addIncoming(RdxExitVal[part], LoopVectorBody);
+      RdxParts.push_back(NewPhi);
+    }
+
+    // Reduce all of the unrolled parts into a single vector.
+    Value *ReducedPartRdx = RdxParts[0];
+    for (unsigned part = 1; part < UF; ++part) {
+      Instruction::BinaryOps Op = getReductionBinOp(RdxDesc.Kind);
+      ReducedPartRdx = Builder.CreateBinOp(Op, RdxParts[part], ReducedPartRdx,
+                                           "bin.rdx");
+    }
+
+    // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
+    // and vector ops, reducing the set of values being computed by half each
+    // round.
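+    //
+    // For example, for VF=4 and an integer add reduction this emits roughly
+    // (a sketch with invented value names):
+    //   %s1 = shufflevector <4 x i32> %rdx, <4 x i32> undef,
+    //                       <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+    //   %r1 = add <4 x i32> %rdx, %s1
+    //   %s2 = shufflevector <4 x i32> %r1, <4 x i32> undef,
+    //                       <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+    //   %r2 = add <4 x i32> %r1, %s2    ; the final sum ends up in lane 0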
+ assert(isPowerOf2_32(VF) && + "Reduction emission only supported for pow2 vectors!"); + Value *TmpVec = ReducedPartRdx; + SmallVector<Constant*, 32> ShuffleMask(VF, 0); + for (unsigned i = VF; i != 1; i >>= 1) { + // Move the upper half of the vector to the lower half. + for (unsigned j = 0; j != i/2; ++j) + ShuffleMask[j] = Builder.getInt32(i/2 + j); + + // Fill the rest of the mask with undef. + std::fill(&ShuffleMask[i/2], ShuffleMask.end(), + UndefValue::get(Builder.getInt32Ty())); + + Value *Shuf = + Builder.CreateShuffleVector(TmpVec, + UndefValue::get(TmpVec->getType()), + ConstantVector::get(ShuffleMask), + "rdx.shuf"); + + Instruction::BinaryOps Op = getReductionBinOp(RdxDesc.Kind); + TmpVec = Builder.CreateBinOp(Op, TmpVec, Shuf, "bin.rdx"); + } + + // The result is in the first element of the vector. + Value *Scalar0 = Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); + + // Now, we need to fix the users of the reduction variable + // inside and outside of the scalar remainder loop. + // We know that the loop is in LCSSA form. We need to update the + // PHI nodes in the exit blocks. + for (BasicBlock::iterator LEI = LoopExitBlock->begin(), + LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { + PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); + if (!LCSSAPhi) continue; + + // All PHINodes need to have a single entry edge, or two if + // we already fixed them. + assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); + + // We found our reduction value exit-PHI. Update it with the + // incoming bypass edge. + if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) { + // Add an edge coming from the bypass. + LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock); + break; + } + }// end of the LCSSA phi scan. + + // Fix the scalar loop reduction variable with the incoming reduction sum + // from the vector body and from the backedge value. + int IncomingEdgeBlockIdx = + (RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch()); + assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); + // Pick the other block. + int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); + (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0); + (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr); + }// end of for each redux variable. + + // The Loop exit block may have single value PHI nodes where the incoming + // value is 'undef'. While vectorizing we only handled real values that + // were defined inside the loop. Here we handle the 'undef case'. + // See PR14725. + for (BasicBlock::iterator LEI = LoopExitBlock->begin(), + LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { + PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); + if (!LCSSAPhi) continue; + if (LCSSAPhi->getNumIncomingValues() == 1) + LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()), + LoopMiddleBlock); + } +} + +InnerLoopVectorizer::VectorParts +InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { + assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) && + "Invalid edge"); + + VectorParts SrcMask = createBlockInMask(Src); + + // The terminator has to be a branch inst! 
+ BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); + assert(BI && "Unexpected terminator found"); + + if (BI->isConditional()) { + VectorParts EdgeMask = getVectorValue(BI->getCondition()); + + if (BI->getSuccessor(0) != Dst) + for (unsigned part = 0; part < UF; ++part) + EdgeMask[part] = Builder.CreateNot(EdgeMask[part]); + + for (unsigned part = 0; part < UF; ++part) + EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]); + return EdgeMask; + } + + return SrcMask; +} + +InnerLoopVectorizer::VectorParts +InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { + assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); + + // Loop incoming mask is all-one. + if (OrigLoop->getHeader() == BB) { + Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1); + return getVectorValue(C); + } + + // This is the block mask. We OR all incoming edges, and with zero. + Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0); + VectorParts BlockMask = getVectorValue(Zero); + + // For each pred: + for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) { + VectorParts EM = createEdgeMask(*it, BB); + for (unsigned part = 0; part < UF; ++part) + BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]); + } + + return BlockMask; +} + +void +InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, + BasicBlock *BB, PhiVector *PV) { + Constant *Zero = Builder.getInt32(0); + + // For each instruction in the old loop. + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + VectorParts &Entry = WidenMap.get(it); + switch (it->getOpcode()) { + case Instruction::Br: + // Nothing to do for PHIs and BR, since we already took care of the + // loop control flow instructions. + continue; + case Instruction::PHI:{ + PHINode* P = cast<PHINode>(it); + // Handle reduction variables: + if (Legal->getReductionVars()->count(P)) { + for (unsigned part = 0; part < UF; ++part) { + // This is phase one of vectorizing PHIs. + Type *VecTy = VectorType::get(it->getType(), VF); + Entry[part] = PHINode::Create(VecTy, 2, "vec.phi", + LoopVectorBody-> getFirstInsertionPt()); + } + PV->push_back(P); + continue; + } + + // Check for PHI nodes that are lowered to vector selects. + if (P->getParent() != OrigLoop->getHeader()) { + // We know that all PHIs in non header blocks are converted into + // selects, so we don't have to worry about the insertion order and we + // can just use the builder. + + // At this point we generate the predication tree. There may be + // duplications since this is a simple recursive scan, but future + // optimizations will clean it up. + VectorParts Cond = createEdgeMask(P->getIncomingBlock(0), + P->getParent()); + + for (unsigned part = 0; part < UF; ++part) { + VectorParts &In0 = getVectorValue(P->getIncomingValue(0)); + VectorParts &In1 = getVectorValue(P->getIncomingValue(1)); + Entry[part] = Builder.CreateSelect(Cond[part], In0[part], In1[part], + "predphi"); + } + continue; + } + + // This PHINode must be an induction variable. + // Make sure that we know about it. 
+ assert(Legal->getInductionVars()->count(P) && + "Not an induction variable"); + + LoopVectorizationLegality::InductionInfo II = + Legal->getInductionVars()->lookup(P); + + switch (II.IK) { + case LoopVectorizationLegality::IK_NoInduction: + llvm_unreachable("Unknown induction"); + case LoopVectorizationLegality::IK_IntInduction: { + assert(P == OldInduction && "Unexpected PHI"); + Value *Broadcasted = getBroadcastInstrs(Induction); + // After broadcasting the induction variable we need to make the + // vector consecutive by adding 0, 1, 2 ... + for (unsigned part = 0; part < UF; ++part) + Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false); + continue; + } + case LoopVectorizationLegality::IK_ReverseIntInduction: + case LoopVectorizationLegality::IK_PtrInduction: + // Handle reverse integer and pointer inductions. + Value *StartIdx = 0; + // If we have a single integer induction variable then use it. + // Otherwise, start counting at zero. + if (OldInduction) { + LoopVectorizationLegality::InductionInfo OldII = + Legal->getInductionVars()->lookup(OldInduction); + StartIdx = OldII.StartValue; + } else { + StartIdx = ConstantInt::get(Induction->getType(), 0); + } + // This is the normalized GEP that starts counting at zero. + Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx, + "normalized.idx"); + + // Handle the reverse integer induction variable case. + if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) { + IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType()); + Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy, + "resize.norm.idx"); + Value *ReverseInd = Builder.CreateSub(II.StartValue, CNI, + "reverse.idx"); + + // This is a new value so do not hoist it out. + Value *Broadcasted = getBroadcastInstrs(ReverseInd); + // After broadcasting the induction variable we need to make the + // vector consecutive by adding ... -3, -2, -1, 0. + for (unsigned part = 0; part < UF; ++part) + Entry[part] = getConsecutiveVector(Broadcasted, -VF * part, true); + continue; + } + + // Handle the pointer induction variable case. + assert(P->getType()->isPointerTy() && "Unexpected type."); + + // This is the vector of results. Notice that we don't generate + // vector geps because scalar geps result in better code. + for (unsigned part = 0; part < UF; ++part) { + Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); + for (unsigned int i = 0; i < VF; ++i) { + Constant *Idx = ConstantInt::get(Induction->getType(), + i + part * VF); + Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, + "gep.idx"); + Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, + "next.gep"); + VecVal = Builder.CreateInsertElement(VecVal, SclrGep, + Builder.getInt32(i), + "insert.gep"); + } + Entry[part] = VecVal; + } + continue; + } + + }// End of PHI. + + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + // Just widen binops. 
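+      // For example, with VF=4 and UF=2 a single scalar 'add i32' becomes two
+      // 'add <4 x i32>' instructions, one per unroll part (rough sketch).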
+ BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it); + VectorParts &A = getVectorValue(it->getOperand(0)); + VectorParts &B = getVectorValue(it->getOperand(1)); + + // Use this vector value for all users of the original instruction. + for (unsigned Part = 0; Part < UF; ++Part) { + Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]); + + // Update the NSW, NUW and Exact flags. + BinaryOperator *VecOp = cast<BinaryOperator>(V); + if (isa<OverflowingBinaryOperator>(BinOp)) { + VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap()); + VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap()); + } + if (isa<PossiblyExactOperator>(VecOp)) + VecOp->setIsExact(BinOp->isExact()); + + Entry[Part] = V; + } + break; + } + case Instruction::Select: { + // Widen selects. + // If the selector is loop invariant we can create a select + // instruction with a scalar condition. Otherwise, use vector-select. + bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)), + OrigLoop); + + // The condition can be loop invariant but still defined inside the + // loop. This means that we can't just use the original 'cond' value. + // We have to take the 'vectorized' value and pick the first lane. + // Instcombine will make this a no-op. + VectorParts &Cond = getVectorValue(it->getOperand(0)); + VectorParts &Op0 = getVectorValue(it->getOperand(1)); + VectorParts &Op1 = getVectorValue(it->getOperand(2)); + Value *ScalarCond = Builder.CreateExtractElement(Cond[0], + Builder.getInt32(0)); + for (unsigned Part = 0; Part < UF; ++Part) { + Entry[Part] = Builder.CreateSelect( + InvariantCond ? ScalarCond : Cond[Part], + Op0[Part], + Op1[Part]); + } + break; + } + + case Instruction::ICmp: + case Instruction::FCmp: { + // Widen compares. Generate vector compares. + bool FCmp = (it->getOpcode() == Instruction::FCmp); + CmpInst *Cmp = dyn_cast<CmpInst>(it); + VectorParts &A = getVectorValue(it->getOperand(0)); + VectorParts &B = getVectorValue(it->getOperand(1)); + for (unsigned Part = 0; Part < UF; ++Part) { + Value *C = 0; + if (FCmp) + C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]); + else + C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]); + Entry[Part] = C; + } + break; + } + + case Instruction::Store: { + // Attempt to issue a wide store. + StoreInst *SI = dyn_cast<StoreInst>(it); + Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF); + Value *Ptr = SI->getPointerOperand(); + unsigned Alignment = SI->getAlignment(); + + assert(!Legal->isUniform(Ptr) && + "We do not allow storing to uniform addresses"); + + + int Stride = Legal->isConsecutivePtr(Ptr); + bool Reverse = Stride < 0; + if (Stride == 0) { + scalarizeInstruction(it); + break; + } + + // Handle consecutive stores. + + GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); + if (Gep) { + // The last index does not have to be the induction. It can be + // consecutive and be a function of the index. For example A[I+1]; + unsigned NumOperands = Gep->getNumOperands(); + + Value *LastGepOperand = Gep->getOperand(NumOperands - 1); + VectorParts &GEPParts = getVectorValue(LastGepOperand); + Value *LastIndex = GEPParts[0]; + LastIndex = Builder.CreateExtractElement(LastIndex, Zero); + + // Create the new GEP with the new induction variable. + GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); + Gep2->setOperand(NumOperands - 1, LastIndex); + Ptr = Builder.Insert(Gep2); + } else { + // Use the induction element ptr. 
+ assert(isa<PHINode>(Ptr) && "Invalid induction ptr"); + VectorParts &PtrVal = getVectorValue(Ptr); + Ptr = Builder.CreateExtractElement(PtrVal[0], Zero); + } + + VectorParts &StoredVal = getVectorValue(SI->getValueOperand()); + for (unsigned Part = 0; Part < UF; ++Part) { + // Calculate the pointer for the specific unroll-part. + Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); + + if (Reverse) { + // If we store to reverse consecutive memory locations then we need + // to reverse the order of elements in the stored value. + StoredVal[Part] = reverseVector(StoredVal[Part]); + // If the address is consecutive but reversed, then the + // wide store needs to start at the last vector element. + PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); + PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); + } + + Value *VecPtr = Builder.CreateBitCast(PartPtr, StTy->getPointerTo()); + Builder.CreateStore(StoredVal[Part], VecPtr)->setAlignment(Alignment); + } + break; + } + case Instruction::Load: { + // Attempt to issue a wide load. + LoadInst *LI = dyn_cast<LoadInst>(it); + Type *RetTy = VectorType::get(LI->getType(), VF); + Value *Ptr = LI->getPointerOperand(); + unsigned Alignment = LI->getAlignment(); + + // If the pointer is loop invariant or if it is non consecutive, + // scalarize the load. + int Stride = Legal->isConsecutivePtr(Ptr); + bool Reverse = Stride < 0; + if (Legal->isUniform(Ptr) || Stride == 0) { + scalarizeInstruction(it); + break; + } + + GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); + if (Gep) { + // The last index does not have to be the induction. It can be + // consecutive and be a function of the index. For example A[I+1]; + unsigned NumOperands = Gep->getNumOperands(); + + Value *LastGepOperand = Gep->getOperand(NumOperands - 1); + VectorParts &GEPParts = getVectorValue(LastGepOperand); + Value *LastIndex = GEPParts[0]; + LastIndex = Builder.CreateExtractElement(LastIndex, Zero); + + // Create the new GEP with the new induction variable. + GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); + Gep2->setOperand(NumOperands - 1, LastIndex); + Ptr = Builder.Insert(Gep2); + } else { + // Use the induction element ptr. + assert(isa<PHINode>(Ptr) && "Invalid induction ptr"); + VectorParts &PtrVal = getVectorValue(Ptr); + Ptr = Builder.CreateExtractElement(PtrVal[0], Zero); + } + + for (unsigned Part = 0; Part < UF; ++Part) { + // Calculate the pointer for the specific unroll-part. + Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); + + if (Reverse) { + // If the address is consecutive but reversed, then the + // wide store needs to start at the last vector element. + PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); + PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); + } + + Value *VecPtr = Builder.CreateBitCast(PartPtr, RetTy->getPointerTo()); + Value *LI = Builder.CreateLoad(VecPtr, "wide.load"); + cast<LoadInst>(LI)->setAlignment(Alignment); + Entry[Part] = Reverse ? reverseVector(LI) : LI; + } + break; + } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + CastInst *CI = dyn_cast<CastInst>(it); + /// Optimize the special case where the source is the induction + /// variable. 
Notice that we can only optimize the 'trunc' case + /// because: a. FP conversions lose precision, b. sext/zext may wrap, + /// c. other casts depend on pointer size. + if (CI->getOperand(0) == OldInduction && + it->getOpcode() == Instruction::Trunc) { + Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction, + CI->getType()); + Value *Broadcasted = getBroadcastInstrs(ScalarCast); + for (unsigned Part = 0; Part < UF; ++Part) + Entry[Part] = getConsecutiveVector(Broadcasted, VF * Part, false); + break; + } + /// Vectorize casts. + Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF); + + VectorParts &A = getVectorValue(it->getOperand(0)); + for (unsigned Part = 0; Part < UF; ++Part) + Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy); + break; + } + + case Instruction::Call: { + assert(isTriviallyVectorizableIntrinsic(it)); + Module *M = BB->getParent()->getParent(); + IntrinsicInst *II = cast<IntrinsicInst>(it); + Intrinsic::ID ID = II->getIntrinsicID(); + for (unsigned Part = 0; Part < UF; ++Part) { + SmallVector<Value*, 4> Args; + for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i) { + VectorParts &Arg = getVectorValue(II->getArgOperand(i)); + Args.push_back(Arg[Part]); + } + Type *Tys[] = { VectorType::get(II->getType()->getScalarType(), VF) }; + Function *F = Intrinsic::getDeclaration(M, ID, Tys); + Entry[Part] = Builder.CreateCall(F, Args); + } + break; + } + + default: + // All other instructions are unsupported. Scalarize them. + scalarizeInstruction(it); + break; + }// end of switch. + }// end of for_each instr. +} + +void InnerLoopVectorizer::updateAnalysis() { + // Forget the original basic block. + SE->forgetLoop(OrigLoop); + + // Update the dominator tree information. + assert(DT->properlyDominates(LoopBypassBlock, LoopExitBlock) && + "Entry does not dominate exit."); + + DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlock); + DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader); + DT->addNewBlock(LoopMiddleBlock, LoopBypassBlock); + DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock); + DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); + DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); + + DEBUG(DT->verifyAnalysis()); +} + +bool LoopVectorizationLegality::canVectorizeWithIfConvert() { + if (!EnableIfConversion) + return false; + + assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable"); + std::vector<BasicBlock*> &LoopBlocks = TheLoop->getBlocksVector(); + + // Collect the blocks that need predication. + for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) { + BasicBlock *BB = LoopBlocks[i]; + + // We don't support switch statements inside loops. + if (!isa<BranchInst>(BB->getTerminator())) + return false; + + // We must have at most two predecessors because we need to convert + // all PHIs to selects. + unsigned Preds = std::distance(pred_begin(BB), pred_end(BB)); + if (Preds > 2) + return false; + + // We must be able to predicate all blocks that need to be predicated. + if (blockNeedsPredication(BB) && !blockCanBePredicated(BB)) + return false; + } + + // We can if-convert this loop. + return true; +} + +bool LoopVectorizationLegality::canVectorize() { + assert(TheLoop->getLoopPreheader() && "No preheader!!"); + + // We can only vectorize innermost loops. + if (TheLoop->getSubLoopsVector().size()) + return false; + + // We must have a single backedge. + if (TheLoop->getNumBackEdges() != 1) + return false; + + // We must have a single exiting block. 
+ if (!TheLoop->getExitingBlock()) + return false; + + unsigned NumBlocks = TheLoop->getNumBlocks(); + + // Check if we can if-convert non single-bb loops. + if (NumBlocks != 1 && !canVectorizeWithIfConvert()) { + DEBUG(dbgs() << "LV: Can't if-convert the loop.\n"); + return false; + } + + // We need to have a loop header. + BasicBlock *Latch = TheLoop->getLoopLatch(); + DEBUG(dbgs() << "LV: Found a loop: " << + TheLoop->getHeader()->getName() << "\n"); + + // ScalarEvolution needs to be able to find the exit count. + const SCEV *ExitCount = SE->getExitCount(TheLoop, Latch); + if (ExitCount == SE->getCouldNotCompute()) { + DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); + return false; + } + + // Do not loop-vectorize loops with a tiny trip count. + unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch); + if (TC > 0u && TC < TinyTripCountVectorThreshold) { + DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " << + "This loop is not worth vectorizing.\n"); + return false; + } + + // Check if we can vectorize the instructions and CFG in this loop. + if (!canVectorizeInstrs()) { + DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n"); + return false; + } + + // Go over each instruction and look at memory deps. + if (!canVectorizeMemory()) { + DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n"); + return false; + } + + // Collect all of the variables that remain uniform after vectorization. + collectLoopUniforms(); + + DEBUG(dbgs() << "LV: We can vectorize this loop" << + (PtrRtCheck.Need ? " (with a runtime bound check)" : "") + <<"!\n"); + + // Okay! We can vectorize. At this point we don't have any other mem analysis + // which may limit our maximum vectorization factor, so just return true with + // no restrictions. + return true; +} + +bool LoopVectorizationLegality::canVectorizeInstrs() { + BasicBlock *PreHeader = TheLoop->getLoopPreheader(); + BasicBlock *Header = TheLoop->getHeader(); + + // For each block in the loop. + for (Loop::block_iterator bb = TheLoop->block_begin(), + be = TheLoop->block_end(); bb != be; ++bb) { + + // Scan the instructions in the block and look for hazards. + for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; + ++it) { + + if (PHINode *Phi = dyn_cast<PHINode>(it)) { + // This should not happen because the loop should be normalized. + if (Phi->getNumIncomingValues() != 2) { + DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); + return false; + } + + // Check that this PHI type is allowed. + if (!Phi->getType()->isIntegerTy() && + !Phi->getType()->isFloatingPointTy() && + !Phi->getType()->isPointerTy()) { + DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); + return false; + } + + // If this PHINode is not in the header block, then we know that we + // can convert it to select during if-conversion. No need to check if + // the PHIs in this block are induction or reduction variables. + if (*bb != Header) + continue; + + // This is the value coming from the preheader. + Value *StartValue = Phi->getIncomingValueForBlock(PreHeader); + // Check if this is an induction variable. + InductionKind IK = isInductionVariable(Phi); + + if (IK_NoInduction != IK) { + // Int inductions are special because we only allow one IV. 
+ if (IK == IK_IntInduction) { + if (Induction) { + DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n"); + return false; + } + Induction = Phi; + } + + DEBUG(dbgs() << "LV: Found an induction variable.\n"); + Inductions[Phi] = InductionInfo(StartValue, IK); + continue; + } + + if (AddReductionVar(Phi, RK_IntegerAdd)) { + DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n"); + continue; + } + if (AddReductionVar(Phi, RK_IntegerMult)) { + DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n"); + continue; + } + if (AddReductionVar(Phi, RK_IntegerOr)) { + DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n"); + continue; + } + if (AddReductionVar(Phi, RK_IntegerAnd)) { + DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n"); + continue; + } + if (AddReductionVar(Phi, RK_IntegerXor)) { + DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n"); + continue; + } + if (AddReductionVar(Phi, RK_FloatMult)) { + DEBUG(dbgs() << "LV: Found an FMult reduction PHI."<< *Phi <<"\n"); + continue; + } + if (AddReductionVar(Phi, RK_FloatAdd)) { + DEBUG(dbgs() << "LV: Found an FAdd reduction PHI."<< *Phi <<"\n"); + continue; + } + + DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n"); + return false; + }// end of PHI handling + + // We still don't handle functions. + CallInst *CI = dyn_cast<CallInst>(it); + if (CI && !isTriviallyVectorizableIntrinsic(it)) { + DEBUG(dbgs() << "LV: Found a call site.\n"); + return false; + } + + // Check that the instruction return type is vectorizable. + if (!VectorType::isValidElementType(it->getType()) && + !it->getType()->isVoidTy()) { + DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n"); + return false; + } + + // Check that the stored type is vectorizable. + if (StoreInst *ST = dyn_cast<StoreInst>(it)) { + Type *T = ST->getValueOperand()->getType(); + if (!VectorType::isValidElementType(T)) + return false; + } + + // Reduction instructions are allowed to have exit users. + // All other instructions must not have external users. + if (!AllowedExit.count(it)) + //Check that all of the users of the loop are inside the BB. + for (Value::use_iterator I = it->use_begin(), E = it->use_end(); + I != E; ++I) { + Instruction *U = cast<Instruction>(*I); + // This user may be a reduction exit value. + if (!TheLoop->contains(U)) { + DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n"); + return false; + } + } + } // next instr. + + } + + if (!Induction) { + DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); + assert(getInductionVars()->size() && "No induction variables"); + } + + return true; +} + +void LoopVectorizationLegality::collectLoopUniforms() { + // We now know that the loop is vectorizable! + // Collect variables that will remain uniform after vectorization. + std::vector<Value*> Worklist; + BasicBlock *Latch = TheLoop->getLoopLatch(); + + // Start with the conditional branch and walk up the block. + Worklist.push_back(Latch->getTerminator()->getOperand(0)); + + while (Worklist.size()) { + Instruction *I = dyn_cast<Instruction>(Worklist.back()); + Worklist.pop_back(); + + // Look at instructions inside this loop. + // Stop when reaching PHI nodes. + // TODO: we need to follow values all over the loop, not only in this block. + if (!I || !TheLoop->contains(I) || isa<PHINode>(I)) + continue; + + // This is a known uniform. + Uniforms.insert(I); + + // Insert all operands. 
+ for (int i = 0, Op = I->getNumOperands(); i < Op; ++i) { + Worklist.push_back(I->getOperand(i)); + } + } +} + +bool LoopVectorizationLegality::canVectorizeMemory() { + typedef SmallVector<Value*, 16> ValueVector; + typedef SmallPtrSet<Value*, 16> ValueSet; + // Holds the Load and Store *instructions*. + ValueVector Loads; + ValueVector Stores; + PtrRtCheck.Pointers.clear(); + PtrRtCheck.Need = false; + + // For each block. + for (Loop::block_iterator bb = TheLoop->block_begin(), + be = TheLoop->block_end(); bb != be; ++bb) { + + // Scan the BB and collect legal loads and stores. + for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; + ++it) { + + // If this is a load, save it. If this instruction can read from memory + // but is not a load, then we quit. Notice that we don't handle function + // calls that read or write. + if (it->mayReadFromMemory()) { + LoadInst *Ld = dyn_cast<LoadInst>(it); + if (!Ld) return false; + if (!Ld->isSimple()) { + DEBUG(dbgs() << "LV: Found a non-simple load.\n"); + return false; + } + Loads.push_back(Ld); + continue; + } + + // Save 'store' instructions. Abort if other instructions write to memory. + if (it->mayWriteToMemory()) { + StoreInst *St = dyn_cast<StoreInst>(it); + if (!St) return false; + if (!St->isSimple()) { + DEBUG(dbgs() << "LV: Found a non-simple store.\n"); + return false; + } + Stores.push_back(St); + } + } // next instr. + } // next block. + + // Now we have two lists that hold the loads and the stores. + // Next, we find the pointers that they use. + + // Check if we see any stores. If there are no stores, then we don't + // care if the pointers are *restrict*. + if (!Stores.size()) { + DEBUG(dbgs() << "LV: Found a read-only loop!\n"); + return true; + } + + // Holds the read and read-write *pointers* that we find. + ValueVector Reads; + ValueVector ReadWrites; + + // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects + // multiple times on the same object. If the ptr is accessed twice, once + // for read and once for write, it will only appear once (on the write + // list). This is okay, since we are going to check for conflicts between + // writes and between reads and writes, but not between reads and reads. + ValueSet Seen; + + ValueVector::iterator I, IE; + for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) { + StoreInst *ST = cast<StoreInst>(*I); + Value* Ptr = ST->getPointerOperand(); + + if (isUniform(Ptr)) { + DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n"); + return false; + } + + // If we did *not* see this pointer before, insert it to + // the read-write list. At this phase it is only a 'write' list. + if (Seen.insert(Ptr)) + ReadWrites.push_back(Ptr); + } + + for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) { + LoadInst *LD = cast<LoadInst>(*I); + Value* Ptr = LD->getPointerOperand(); + // If we did *not* see this pointer before, insert it to the + // read list. If we *did* see it before, then it is already in + // the read-write list. This allows us to vectorize expressions + // such as A[i] += x; Because the address of A[i] is a read-write + // pointer. This only works if the index of A[i] is consecutive. + // If the address of i is unknown (for example A[B[i]]) then we may + // read a few words, modify, and write a few words, and some of the + // words may be written to the same address. 
+ if (Seen.insert(Ptr) || 0 == isConsecutivePtr(Ptr)) + Reads.push_back(Ptr); + } + + // If we write (or read-write) to a single destination and there are no + // other reads in this loop then it is safe to vectorize. + if (ReadWrites.size() == 1 && Reads.size() == 0) { + DEBUG(dbgs() << "LV: Found a write-only loop!\n"); + return true; + } + + // Find pointers with computable bounds. We are going to use this information + // to place a runtime bound check. + bool CanDoRT = true; + for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) + if (hasComputableBounds(*I)) { + PtrRtCheck.insert(SE, TheLoop, *I); + DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n"); + } else { + CanDoRT = false; + break; + } + for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) + if (hasComputableBounds(*I)) { + PtrRtCheck.insert(SE, TheLoop, *I); + DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n"); + } else { + CanDoRT = false; + break; + } + + // Check that we did not collect too many pointers or found an + // unsizeable pointer. + if (!CanDoRT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) { + PtrRtCheck.reset(); + CanDoRT = false; + } + + if (CanDoRT) { + DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n"); + } + + bool NeedRTCheck = false; + + // Now that the pointers are in two lists (Reads and ReadWrites), we + // can check that there are no conflicts between each of the writes, and + // between the writes and the reads. + ValueSet WriteObjects; + ValueVector TempObjects; + + // Check that the read-writes do not conflict with other read-write + // pointers. + bool AllWritesIdentified = true; + for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) { + GetUnderlyingObjects(*I, TempObjects, DL); + for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end(); + it != e; ++it) { + if (!isIdentifiedObject(*it)) { + DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n"); + NeedRTCheck = true; + AllWritesIdentified = false; + } + if (!WriteObjects.insert(*it)) { + DEBUG(dbgs() << "LV: Found a possible write-write reorder:" + << **it <<"\n"); + return false; + } + } + TempObjects.clear(); + } + + // Check that the reads don't conflict with the read-writes. + for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) { + GetUnderlyingObjects(*I, TempObjects, DL); + for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end(); + it != e; ++it) { + // If all of the writes are identified then we don't care if the read + // pointer is identified or not. + if (!AllWritesIdentified && !isIdentifiedObject(*it)) { + DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n"); + NeedRTCheck = true; + } + if (WriteObjects.count(*it)) { + DEBUG(dbgs() << "LV: Found a possible read/write reorder:" + << **it <<"\n"); + return false; + } + } + TempObjects.clear(); + } + + PtrRtCheck.Need = NeedRTCheck; + if (NeedRTCheck && !CanDoRT) { + DEBUG(dbgs() << "LV: We can't vectorize because we can't find " << + "the array bounds.\n"); + PtrRtCheck.reset(); + return false; + } + + DEBUG(dbgs() << "LV: We "<< (NeedRTCheck ? "" : "don't") << + " need a runtime memory check.\n"); + return true; +} + +bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, + ReductionKind Kind) { + if (Phi->getNumIncomingValues() != 2) + return false; + + // Reduction variables are only found in the loop header block.
+ if (Phi->getParent() != TheLoop->getHeader()) + return false; + + // Obtain the reduction start value from the value that comes from the loop + // preheader. + Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader()); + + // ExitInstruction is the single value which is used outside the loop. + // We only allow for a single reduction value to be used outside the loop. + // This includes users of the reduction and variables that form a cycle + // which ends in the phi node. + Instruction *ExitInstruction = 0; + // Indicates that we found a binary operation in our scan. + bool FoundBinOp = false; + + // Iter is our iterator. We start with the PHI node and scan for all of the + // users of this instruction. All users must be instructions that can be + // used as reduction variables (such as ADD). We may have a single + // out-of-loop user. The cycle must end with the original PHI. + Instruction *Iter = Phi; + while (true) { + // If the instruction has no users then this is a broken + // chain and can't be a reduction variable. + if (Iter->use_empty()) + return false; + + // Did we find a user inside this loop already? + bool FoundInBlockUser = false; + // Did we reach the initial PHI node already? + bool FoundStartPHI = false; + + // Is this a bin op? + FoundBinOp |= !isa<PHINode>(Iter); + + // For each of the *users* of Iter. + for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end(); + it != e; ++it) { + Instruction *U = cast<Instruction>(*it); + // We already know that the PHI is a user. + if (U == Phi) { + FoundStartPHI = true; + continue; + } + + // Check if we found the exit user. + BasicBlock *Parent = U->getParent(); + if (!TheLoop->contains(Parent)) { + // Exit if you find multiple outside users. + if (ExitInstruction != 0) + return false; + ExitInstruction = Iter; + } + + // We allow in-loop PHINodes which are not the original reduction PHI + // node. If this PHI is the only user of Iter (happens in IF w/ no ELSE + // structure) then don't skip this PHI. + if (isa<PHINode>(Iter) && isa<PHINode>(U) && + U->getParent() != TheLoop->getHeader() && + TheLoop->contains(U) && + Iter->getNumUses() > 1) + continue; + + // We can't have multiple inside users. + if (FoundInBlockUser) + return false; + FoundInBlockUser = true; + + // Any reduction instr must be of one of the allowed kinds. + if (!isReductionInstr(U, Kind)) + return false; + + // Reductions of instructions such as Div and Sub are only + // possible if the LHS is the reduction variable. + if (!U->isCommutative() && !isa<PHINode>(U) && U->getOperand(0) != Iter) + return false; + + Iter = U; + } + + // We found a reduction var if we have reached the original + // phi node and we only have a single instruction with out-of-loop + // users. + if (FoundStartPHI) { + // This instruction is allowed to have out-of-loop users. + AllowedExit.insert(ExitInstruction); + + // Save the description of this reduction variable. + ReductionDescriptor RD(RdxStart, ExitInstruction, Kind); + Reductions[Phi] = RD; + // We've ended the cycle. This is a reduction variable if we have an + // outside user and it has a binary op.
+ return FoundBinOp && ExitInstruction; + } + } +} + +bool +LoopVectorizationLegality::isReductionInstr(Instruction *I, + ReductionKind Kind) { + bool FP = I->getType()->isFloatingPointTy(); + bool FastMath = (FP && I->isCommutative() && I->isAssociative()); + + switch (I->getOpcode()) { + default: + return false; + case Instruction::PHI: + if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd)) + return false; + // possibly. + return true; + case Instruction::Sub: + case Instruction::Add: + return Kind == RK_IntegerAdd; + case Instruction::SDiv: + case Instruction::UDiv: + case Instruction::Mul: + return Kind == RK_IntegerMult; + case Instruction::And: + return Kind == RK_IntegerAnd; + case Instruction::Or: + return Kind == RK_IntegerOr; + case Instruction::Xor: + return Kind == RK_IntegerXor; + case Instruction::FMul: + return Kind == RK_FloatMult && FastMath; + case Instruction::FAdd: + return Kind == RK_FloatAdd && FastMath; + } +} + +LoopVectorizationLegality::InductionKind +LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { + Type *PhiTy = Phi->getType(); + // We only handle integer and pointer inductions variables. + if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy()) + return IK_NoInduction; + + // Check that the PHI is consecutive and starts at zero. + const SCEV *PhiScev = SE->getSCEV(Phi); + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev); + if (!AR) { + DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n"); + return IK_NoInduction; + } + const SCEV *Step = AR->getStepRecurrence(*SE); + + // Integer inductions need to have a stride of one. + if (PhiTy->isIntegerTy()) { + if (Step->isOne()) + return IK_IntInduction; + if (Step->isAllOnesValue()) + return IK_ReverseIntInduction; + return IK_NoInduction; + } + + // Calculate the pointer stride and check if it is consecutive. + const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); + if (!C) + return IK_NoInduction; + + assert(PhiTy->isPointerTy() && "The PHI must be a pointer"); + uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType()); + if (C->getValue()->equalsInt(Size)) + return IK_PtrInduction; + + return IK_NoInduction; +} + +bool LoopVectorizationLegality::isInductionVariable(const Value *V) { + Value *In0 = const_cast<Value*>(V); + PHINode *PN = dyn_cast_or_null<PHINode>(In0); + if (!PN) + return false; + + return Inductions.count(PN); +} + +bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { + assert(TheLoop->contains(BB) && "Unknown block used"); + + // Blocks that do not dominate the latch need predication. + BasicBlock* Latch = TheLoop->getLoopLatch(); + return !DT->dominates(BB, Latch); +} + +bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) { + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + // We don't predicate loads/stores at the moment. + if (it->mayReadFromMemory() || it->mayWriteToMemory() || it->mayThrow()) + return false; + + // The instructions below can trap. 
+ switch (it->getOpcode()) { + default: continue; + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: + return false; + } + } + + return true; +} + +bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) { + const SCEV *PhiScev = SE->getSCEV(Ptr); + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev); + if (!AR) + return false; + + return AR->isAffine(); +} + +unsigned +LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, + unsigned UserVF) { + if (OptForSize && Legal->getRuntimePointerCheck()->Need) { + DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n"); + return 1; + } + + // Find the trip count. + unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch()); + DEBUG(dbgs() << "LV: Found trip count:"<<TC<<"\n"); + + unsigned VF = MaxVectorSize; + + // If we optimize the program for size, avoid creating the tail loop. + if (OptForSize) { + // If we are unable to calculate the trip count then don't try to vectorize. + if (TC < 2) { + DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n"); + return 1; + } + + // Find the maximum SIMD width that can fit within the trip count. + VF = TC % MaxVectorSize; + + if (VF == 0) + VF = MaxVectorSize; + + // If the trip count that we found modulo the vectorization factor is not + // zero then we require a tail. + if (VF < 2) { + DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n"); + return 1; + } + } + + if (UserVF != 0) { + assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); + DEBUG(dbgs() << "LV: Using user VF "<<UserVF<<".\n"); + + return UserVF; + } + + float Cost = expectedCost(1); + unsigned Width = 1; + DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n"); + for (unsigned i=2; i <= VF; i*=2) { + // Notice that the vector loop needs to be executed less times, so + // we need to divide the cost of the vector loops by the width of + // the vector elements. + float VectorCost = expectedCost(i) / (float)i; + DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " << + (int)VectorCost << ".\n"); + if (VectorCost < Cost) { + Cost = VectorCost; + Width = i; + } + } + + DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n"); + return Width; +} + +unsigned +LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, + unsigned UserUF) { + // Use the user preference, unless 'auto' is selected. + if (UserUF != 0) + return UserUF; + + // When we optimize for size we don't unroll. + if (OptForSize) + return 1; + + // Do not unroll loops with a relatively small trip count. + unsigned TC = SE->getSmallConstantTripCount(TheLoop, + TheLoop->getLoopLatch()); + if (TC > 1 && TC < TinyTripCountUnrollThreshold) + return 1; + + unsigned TargetVectorRegisters = TTI.getNumberOfRegisters(true); + DEBUG(dbgs() << "LV: The target has " << TargetVectorRegisters << + " vector registers\n"); + + LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage(); + // We divide by these constants so assume that we have at least one + // instruction that uses at least one register. + R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); + R.NumInstructions = std::max(R.NumInstructions, 1U); + + // We calculate the unroll factor using the following formula. + // Subtract the number of loop invariants from the number of available + // registers. These registers are used by all of the unrolled instances. 
+ // Next, divide the remaining registers by the number of registers that is + // required by the loop, in order to estimate how many parallel instances + // fit without causing spills. + unsigned UF = (TargetVectorRegisters - R.LoopInvariantRegs) / R.MaxLocalUsers; + + // We don't want to unroll the loops to the point where they do not fit into + // the decoded cache. Assume that we only allow 32 IR instructions. + UF = std::min(UF, (MaxLoopSizeThreshold / R.NumInstructions)); + + // Clamp the unroll factor to a reasonable range. + if (UF > MaxUnrollSize) + UF = MaxUnrollSize; + else if (UF < 1) + UF = 1; + + return UF; +} + +LoopVectorizationCostModel::RegisterUsage +LoopVectorizationCostModel::calculateRegisterUsage() { + // This function calculates the register usage by measuring the highest number + // of values that are alive at a single location. Obviously, this is a very + // rough estimation. We scan the loop in topological order and + // assign a number to each instruction. We use RPO to ensure that defs are + // met before their users. We assume that each instruction that has in-loop + // users starts an interval. We record every time that an in-loop value is + // used, so we have a list of the first and last occurrences of each + // instruction. Next, we transpose this data structure into a multi map that + // holds the list of intervals that *end* at a specific location. This multi + // map allows us to perform a linear search. We scan the instructions linearly + // and record each time that a new interval starts, by placing it in a set. + // If we find this value in the multi-map then we remove it from the set. + // The max register usage is the maximum size of the set. + // We also search for instructions that are defined outside the loop, but are + // used inside the loop. We need this number separately from the max-interval + // usage number because when we unroll, loop-invariant values do not take + // more registers. + LoopBlocksDFS DFS(TheLoop); + DFS.perform(LI); + + RegisterUsage R; + R.NumInstructions = 0; + + // Each 'key' in the map opens a new interval. The values + // of the map are the index of the 'last seen' usage of the + // instruction that is the key. + typedef DenseMap<Instruction*, unsigned> IntervalMap; + // Maps instruction to its index. + DenseMap<unsigned, Instruction*> IdxToInstr; + // Marks the end of each interval. + IntervalMap EndPoint; + // Saves the instructions that are used in the loop. + SmallSet<Instruction*, 8> Ends; + // Saves the list of values that are used in the loop but are + // defined outside the loop, such as arguments and constants. + SmallPtrSet<Value*, 8> LoopInvariants; + + unsigned Index = 0; + for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), + be = DFS.endRPO(); bb != be; ++bb) { + R.NumInstructions += (*bb)->size(); + for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; + ++it) { + Instruction *I = it; + IdxToInstr[Index++] = I; + + // Save the end location of each USE. + for (unsigned i = 0; i < I->getNumOperands(); ++i) { + Value *U = I->getOperand(i); + Instruction *Instr = dyn_cast<Instruction>(U); + + // Ignore non-instruction values such as arguments, constants, etc. + if (!Instr) continue; + + // If this instruction is outside the loop then record it and continue. + if (!TheLoop->contains(Instr)) { + LoopInvariants.insert(Instr); + continue; + } + + // Overwrite previous end points.
+ EndPoint[Instr] = Index; + Ends.insert(Instr); + } + } + } + + // Saves the list of intervals that end with the index in 'key'. + typedef SmallVector<Instruction*, 2> InstrList; + DenseMap<unsigned, InstrList> TransposeEnds; + + // Transpose the EndPoints to a list of values that end at each index. + for (IntervalMap::iterator it = EndPoint.begin(), e = EndPoint.end(); + it != e; ++it) + TransposeEnds[it->second].push_back(it->first); + + SmallSet<Instruction*, 8> OpenIntervals; + unsigned MaxUsage = 0; + + + DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); + for (unsigned int i = 0; i < Index; ++i) { + Instruction *I = IdxToInstr[i]; + // Ignore instructions that are never used within the loop. + if (!Ends.count(I)) continue; + + // Remove all of the instructions that end at this location. + InstrList &List = TransposeEnds[i]; + for (unsigned int j=0, e = List.size(); j < e; ++j) + OpenIntervals.erase(List[j]); + + // Count the number of live intervals. + MaxUsage = std::max(MaxUsage, OpenIntervals.size()); + + DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " << + OpenIntervals.size() <<"\n"); + + // Add the current instruction to the list of open intervals. + OpenIntervals.insert(I); + } + + unsigned Invariant = LoopInvariants.size(); + DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << " \n"); + DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << " \n"); + DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << " \n"); + + R.LoopInvariantRegs = Invariant; + R.MaxLocalUsers = MaxUsage; + return R; +} + +unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { + unsigned Cost = 0; + + // For each block. + for (Loop::block_iterator bb = TheLoop->block_begin(), + be = TheLoop->block_end(); bb != be; ++bb) { + unsigned BlockCost = 0; + BasicBlock *BB = *bb; + + // For each instruction in the old loop. + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + unsigned C = getInstructionCost(it, VF); + BlockCost += C; + DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF " << + VF << " for instruction: "<< *it << "\n"); + } + + // We assume that if-converted blocks have a 50% chance of being executed. + // When the code is scalar then some of the blocks are avoided due to CF. + // When the code is vectorized we execute all code paths. + if (Legal->blockNeedsPredication(*bb) && VF == 1) + BlockCost /= 2; + + Cost += BlockCost; + } + + return Cost; +} + +unsigned +LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { + // If we know that this instruction will remain uniform, check the cost of + // the scalar version. + if (Legal->isUniformAfterVectorization(I)) + VF = 1; + + Type *RetTy = I->getType(); + Type *VectorTy = ToVectorTy(RetTy, VF); + + // TODO: We need to estimate the cost of intrinsic calls. + switch (I->getOpcode()) { + case Instruction::GetElementPtr: + // We mark this instruction as zero-cost because scalar GEPs are usually + // lowered to the instruction addressing mode. At the moment we don't + // generate vector GEPs. + return 0; + case Instruction::Br: { + return TTI.getCFInstrCost(I->getOpcode()); + } + case Instruction::PHI: + // TODO: IF-converted IFs become selects.
+ return 0; + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy); + case Instruction::Select: { + SelectInst *SI = cast<SelectInst>(I); + const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); + bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); + Type *CondTy = SI->getCondition()->getType(); + if (ScalarCond) + CondTy = VectorType::get(CondTy, VF); + + return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy); + } + case Instruction::ICmp: + case Instruction::FCmp: { + Type *ValTy = I->getOperand(0)->getType(); + VectorTy = ToVectorTy(ValTy, VF); + return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy); + } + case Instruction::Store: { + StoreInst *SI = cast<StoreInst>(I); + Type *ValTy = SI->getValueOperand()->getType(); + VectorTy = ToVectorTy(ValTy, VF); + + if (VF == 1) + return TTI.getMemoryOpCost(I->getOpcode(), VectorTy, + SI->getAlignment(), + SI->getPointerAddressSpace()); + + // Scalarized stores. + int Stride = Legal->isConsecutivePtr(SI->getPointerOperand()); + bool Reverse = Stride < 0; + if (0 == Stride) { + unsigned Cost = 0; + + // The cost of extracting from the value vector and pointer vector. + Type *PtrTy = ToVectorTy(I->getOperand(0)->getType(), VF); + for (unsigned i = 0; i < VF; ++i) { + Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, + i); + Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i); + } + + // The cost of the scalar stores. + Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), + SI->getAlignment(), + SI->getPointerAddressSpace()); + return Cost; + } + + // Wide stores. + unsigned Cost = TTI.getMemoryOpCost(I->getOpcode(), VectorTy, + SI->getAlignment(), + SI->getPointerAddressSpace()); + if (Reverse) + Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, + VectorTy, 0); + return Cost; + } + case Instruction::Load: { + LoadInst *LI = cast<LoadInst>(I); + + if (VF == 1) + return TTI.getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(), + LI->getPointerAddressSpace()); + + // Scalarized loads. + int Stride = Legal->isConsecutivePtr(LI->getPointerOperand()); + bool Reverse = Stride < 0; + if (0 == Stride) { + unsigned Cost = 0; + Type *PtrTy = ToVectorTy(I->getOperand(0)->getType(), VF); + + // The cost of extracting from the pointer vector. + for (unsigned i = 0; i < VF; ++i) + Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i); + + // The cost of inserting data into the result vector. + for (unsigned i = 0; i < VF; ++i) + Cost += TTI.getVectorInstrCost(Instruction::InsertElement, VectorTy, i); + + // The cost of the scalar loads. + Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), RetTy->getScalarType(), + LI->getAlignment(), + LI->getPointerAddressSpace()); + return Cost; + } + + // Wide loads.
+ unsigned Cost = TTI.getMemoryOpCost(I->getOpcode(), VectorTy, + LI->getAlignment(), + LI->getPointerAddressSpace()); + if (Reverse) + Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); + return Cost; + } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + // We optimize the truncation of induction variable. + // The cost of these is the same as the scalar operation. + if (I->getOpcode() == Instruction::Trunc && + Legal->isInductionVariable(I->getOperand(0))) + return TTI.getCastInstrCost(I->getOpcode(), I->getType(), + I->getOperand(0)->getType()); + + Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF); + return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); + } + case Instruction::Call: { + assert(isTriviallyVectorizableIntrinsic(I)); + IntrinsicInst *II = cast<IntrinsicInst>(I); + Type *RetTy = ToVectorTy(II->getType(), VF); + SmallVector<Type*, 4> Tys; + for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i) + Tys.push_back(ToVectorTy(II->getArgOperand(i)->getType(), VF)); + return TTI.getIntrinsicInstrCost(II->getIntrinsicID(), RetTy, Tys); + } + default: { + // We are scalarizing the instruction. Return the cost of the scalar + // instruction, plus the cost of insert and extract into vector + // elements, times the vector width. + unsigned Cost = 0; + + if (!RetTy->isVoidTy() && VF != 1) { + unsigned InsCost = TTI.getVectorInstrCost(Instruction::InsertElement, + VectorTy); + unsigned ExtCost = TTI.getVectorInstrCost(Instruction::ExtractElement, + VectorTy); + + // The cost of inserting the results plus extracting each one of the + // operands. + Cost += VF * (InsCost + ExtCost * I->getNumOperands()); + } + + // The cost of executing VF copies of the scalar instruction. This opcode + // is unknown. Assume that it is the same as 'mul'. + Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy); + return Cost; + } + }// end of switch. 
+} + +Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) { + if (Scalar->isVoidTy() || VF == 1) + return Scalar; + return VectorType::get(Scalar, VF); +} + +char LoopVectorize::ID = 0; +static const char lv_name[] = "Loop Vectorization"; +INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) + +namespace llvm { + Pass *createLoopVectorizePass() { + return new LoopVectorize(); + } +} + + diff --git a/lib/Transforms/Vectorize/Vectorize.cpp b/lib/Transforms/Vectorize/Vectorize.cpp index 1ef6002..19eefd2 100644 --- a/lib/Transforms/Vectorize/Vectorize.cpp +++ b/lib/Transforms/Vectorize/Vectorize.cpp @@ -1,4 +1,4 @@ -//===-- Vectorize.cpp -----------------------------------------------------===// + //===-- Vectorize.cpp -----------------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -7,26 +7,27 @@ // //===----------------------------------------------------------------------===// // -// This file implements common infrastructure for libLLVMVectorizeOpts.a, which +// This file implements common infrastructure for libLLVMVectorizeOpts.a, which // implements several vectorization transformations over the LLVM intermediate // representation, including the C bindings for that library. // //===----------------------------------------------------------------------===// -#include "llvm-c/Transforms/Vectorize.h" +#include "llvm/Transforms/Vectorize.h" #include "llvm-c/Initialization.h" -#include "llvm/InitializePasses.h" -#include "llvm/PassManager.h" +#include "llvm-c/Transforms/Vectorize.h" #include "llvm/Analysis/Passes.h" #include "llvm/Analysis/Verifier.h" -#include "llvm/Transforms/Vectorize.h" +#include "llvm/InitializePasses.h" +#include "llvm/PassManager.h" using namespace llvm; -/// initializeVectorizationPasses - Initialize all passes linked into the +/// initializeVectorizationPasses - Initialize all passes linked into the /// Vectorization library. void llvm::initializeVectorization(PassRegistry &Registry) { initializeBBVectorizePass(Registry); + initializeLoopVectorizePass(Registry); } void LLVMInitializeVectorization(LLVMPassRegistryRef R) { @@ -37,3 +38,6 @@ void LLVMAddBBVectorizePass(LLVMPassManagerRef PM) { unwrap(PM)->add(createBBVectorizePass()); } +void LLVMAddLoopVectorizePass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createLoopVectorizePass()); +} |
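Beyond the pass registration shown above, the patch wires the new loop vectorizer into both the C++ entry point (createLoopVectorizePass) and the C bindings (LLVMAddLoopVectorizePass). The following is a minimal sketch, not part of the patch, of how a client might run the pass with the legacy PassManager; the helper name runLoopVectorizer is hypothetical, and it assumes the pass declaration lives in llvm/Transforms/Vectorize.h alongside createBBVectorizePass, as the include list in Vectorize.cpp suggests.

// Sketch only (not part of this patch): run the new loop vectorizer over a module.
#include "llvm/IR/Module.h"                      // assumed post-reorganization include path
#include "llvm/PassManager.h"
#include "llvm/Analysis/Passes.h"
#include "llvm/Transforms/Vectorize.h"

static void runLoopVectorizer(llvm::Module &M) { // hypothetical helper
  llvm::PassManager PM;
  PM.add(llvm::createBasicAliasAnalysisPass());  // alias analysis used by the legality checks
  PM.add(llvm::createLoopVectorizePass());       // the pass added by this patch
  PM.run(M);                                     // required analyses are scheduled by the pass manager
}

C API clients can get the same behavior through the new LLVMAddLoopVectorizePass binding on an LLVMPassManagerRef, mirroring the existing LLVMAddBBVectorizePass.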
