Diffstat (limited to 'lib/Transforms/Scalar')
-rw-r--r-- | lib/Transforms/Scalar/Android.mk | 2
-rw-r--r-- | lib/Transforms/Scalar/CMakeLists.txt | 2
-rw-r--r-- | lib/Transforms/Scalar/GVN.cpp | 20
-rw-r--r-- | lib/Transforms/Scalar/GlobalMerge.cpp | 313
-rw-r--r-- | lib/Transforms/Scalar/JumpThreading.cpp | 9
-rw-r--r-- | lib/Transforms/Scalar/LICM.cpp | 73
-rw-r--r-- | lib/Transforms/Scalar/LoadCombine.cpp | 268
-rw-r--r-- | lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 4
-rw-r--r-- | lib/Transforms/Scalar/LoopRerollPass.cpp | 6
-rw-r--r-- | lib/Transforms/Scalar/LoopUnrollPass.cpp | 360
-rw-r--r-- | lib/Transforms/Scalar/LowerAtomic.cpp | 5
-rw-r--r-- | lib/Transforms/Scalar/Reassociate.cpp | 31
-rw-r--r-- | lib/Transforms/Scalar/SCCP.cpp | 4
-rw-r--r-- | lib/Transforms/Scalar/SROA.cpp | 18
-rw-r--r-- | lib/Transforms/Scalar/SampleProfile.cpp | 7
-rw-r--r-- | lib/Transforms/Scalar/Scalar.cpp | 1
-rw-r--r-- | lib/Transforms/Scalar/ScalarReplAggregates.cpp | 6
-rw-r--r-- | lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp | 633
-rw-r--r-- | lib/Transforms/Scalar/Sink.cpp | 6
19 files changed, 1100 insertions, 668 deletions
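
For context before the raw diffs, here is a minimal sketch (not part of the patch itself) of the kind of IR the new GVN fold in GVN.cpp below targets: a load whose only reaching definition is a call to calloc can be folded to zero, because calloc zero-initializes the memory it returns. The function and value names are illustrative only, and the syntax follows the typed-pointer IR of this era.

declare noalias i8* @calloc(i64, i64)

define i32 @load_from_calloc() {
entry:
  ; %buf points at 4 bytes of zero-initialized memory.
  %buf = call noalias i8* @calloc(i64 1, i64 4)
  %p = bitcast i8* %buf to i32*
  ; With the GVN change below, this load is replaced by the constant 0
  ; and the load itself is marked for deletion.
  %v = load i32* %p
  ret i32 %v
}
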
diff --git a/lib/Transforms/Scalar/Android.mk b/lib/Transforms/Scalar/Android.mk index 079cc86..5e22de6 100644 --- a/lib/Transforms/Scalar/Android.mk +++ b/lib/Transforms/Scalar/Android.mk @@ -8,11 +8,11 @@ transforms_scalar_SRC_FILES := \ DCE.cpp \ DeadStoreElimination.cpp \ EarlyCSE.cpp \ - GlobalMerge.cpp \ GVN.cpp \ IndVarSimplify.cpp \ JumpThreading.cpp \ LICM.cpp \ + LoadCombine.cpp \ LoopDeletion.cpp \ LoopIdiomRecognize.cpp \ LoopInstSimplify.cpp \ diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index 3ad1488..2dcfa23 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -8,10 +8,10 @@ add_llvm_library(LLVMScalarOpts EarlyCSE.cpp FlattenCFGPass.cpp GVN.cpp - GlobalMerge.cpp IndVarSimplify.cpp JumpThreading.cpp LICM.cpp + LoadCombine.cpp LoopDeletion.cpp LoopIdiomRecognize.cpp LoopInstSimplify.cpp diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 6d07ddd..106eba0 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -1464,6 +1464,13 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, continue; } + // Loading from calloc (which zero initializes memory) -> zero + if (isCallocLikeFn(DepInst, TLI)) { + ValuesPerBlock.push_back(AvailableValueInBlock::get( + DepBB, Constant::getNullValue(LI->getType()))); + continue; + } + if (StoreInst *S = dyn_cast<StoreInst>(DepInst)) { // Reject loads and stores that are to the same address but are of // different types if we have to. @@ -1791,6 +1798,10 @@ static void patchReplacementInstruction(Instruction *I, Value *Repl) { case LLVMContext::MD_fpmath: ReplInst->setMetadata(Kind, MDNode::getMostGenericFPMath(IMD, ReplMD)); break; + case LLVMContext::MD_invariant_load: + // Only set the !invariant.load if it is present in both instructions. + ReplInst->setMetadata(Kind, IMD); + break; } } } @@ -1988,6 +1999,15 @@ bool GVN::processLoad(LoadInst *L) { } } + // If this load follows a calloc (which zero initializes memory), + // then the loaded value is zero + if (isCallocLikeFn(DepInst, TLI)) { + L->replaceAllUsesWith(Constant::getNullValue(L->getType())); + markInstructionForDeletion(L); + ++NumGVNLoad; + return true; + } + return false; } diff --git a/lib/Transforms/Scalar/GlobalMerge.cpp b/lib/Transforms/Scalar/GlobalMerge.cpp deleted file mode 100644 index 990d067..0000000 --- a/lib/Transforms/Scalar/GlobalMerge.cpp +++ /dev/null @@ -1,313 +0,0 @@ -//===-- GlobalMerge.cpp - Internal globals merging -----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// This pass merges globals with internal linkage into one. This way all the -// globals which were merged into a biggest one can be addressed using offsets -// from the same base pointer (no need for separate base pointer for each of the -// global). Such a transformation can significantly reduce the register pressure -// when many globals are involved. 
-// -// For example, consider the code which touches several global variables at -// once: -// -// static int foo[N], bar[N], baz[N]; -// -// for (i = 0; i < N; ++i) { -// foo[i] = bar[i] * baz[i]; -// } -// -// On ARM the addresses of 3 arrays should be kept in the registers, thus -// this code has quite large register pressure (loop body): -// -// ldr r1, [r5], #4 -// ldr r2, [r6], #4 -// mul r1, r2, r1 -// str r1, [r0], #4 -// -// Pass converts the code to something like: -// -// static struct { -// int foo[N]; -// int bar[N]; -// int baz[N]; -// } merged; -// -// for (i = 0; i < N; ++i) { -// merged.foo[i] = merged.bar[i] * merged.baz[i]; -// } -// -// and in ARM code this becomes: -// -// ldr r0, [r5, #40] -// ldr r1, [r5, #80] -// mul r0, r1, r0 -// str r0, [r5], #4 -// -// note that we saved 2 registers here almostly "for free". -// ===---------------------------------------------------------------------===// - -#include "llvm/Transforms/Scalar.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/IR/Attributes.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Target/TargetLowering.h" -#include "llvm/Target/TargetLoweringObjectFile.h" -using namespace llvm; - -#define DEBUG_TYPE "global-merge" - -cl::opt<bool> -EnableGlobalMerge("global-merge", cl::Hidden, - cl::desc("Enable global merge pass"), - cl::init(true)); - -static cl::opt<bool> -EnableGlobalMergeOnConst("global-merge-on-const", cl::Hidden, - cl::desc("Enable global merge pass on constants"), - cl::init(false)); - -STATISTIC(NumMerged , "Number of globals merged"); -namespace { - class GlobalMerge : public FunctionPass { - const TargetMachine *TM; - - bool doMerge(SmallVectorImpl<GlobalVariable*> &Globals, - Module &M, bool isConst, unsigned AddrSpace) const; - - /// \brief Check if the given variable has been identified as must keep - /// \pre setMustKeepGlobalVariables must have been called on the Module that - /// contains GV - bool isMustKeepGlobalVariable(const GlobalVariable *GV) const { - return MustKeepGlobalVariables.count(GV); - } - - /// Collect every variables marked as "used" or used in a landing pad - /// instruction for this Module. - void setMustKeepGlobalVariables(Module &M); - - /// Collect every variables marked as "used" - void collectUsedGlobalVariables(Module &M); - - /// Keep track of the GlobalVariable that must not be merged away - SmallPtrSet<const GlobalVariable *, 16> MustKeepGlobalVariables; - - public: - static char ID; // Pass identification, replacement for typeid. 
- explicit GlobalMerge(const TargetMachine *TM = nullptr) - : FunctionPass(ID), TM(TM) { - initializeGlobalMergePass(*PassRegistry::getPassRegistry()); - } - - bool doInitialization(Module &M) override; - bool runOnFunction(Function &F) override; - bool doFinalization(Module &M) override; - - const char *getPassName() const override { - return "Merge internal globals"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - FunctionPass::getAnalysisUsage(AU); - } - }; -} // end anonymous namespace - -char GlobalMerge::ID = 0; -INITIALIZE_PASS(GlobalMerge, "global-merge", - "Global Merge", false, false) - - -bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, - Module &M, bool isConst, unsigned AddrSpace) const { - const TargetLowering *TLI = TM->getTargetLowering(); - const DataLayout *DL = TLI->getDataLayout(); - - // FIXME: Infer the maximum possible offset depending on the actual users - // (these max offsets are different for the users inside Thumb or ARM - // functions) - unsigned MaxOffset = TLI->getMaximalGlobalOffset(); - - // FIXME: Find better heuristics - std::stable_sort(Globals.begin(), Globals.end(), - [DL](const GlobalVariable *GV1, const GlobalVariable *GV2) { - Type *Ty1 = cast<PointerType>(GV1->getType())->getElementType(); - Type *Ty2 = cast<PointerType>(GV2->getType())->getElementType(); - - return (DL->getTypeAllocSize(Ty1) < DL->getTypeAllocSize(Ty2)); - }); - - Type *Int32Ty = Type::getInt32Ty(M.getContext()); - - for (size_t i = 0, e = Globals.size(); i != e; ) { - size_t j = 0; - uint64_t MergedSize = 0; - std::vector<Type*> Tys; - std::vector<Constant*> Inits; - for (j = i; j != e; ++j) { - Type *Ty = Globals[j]->getType()->getElementType(); - MergedSize += DL->getTypeAllocSize(Ty); - if (MergedSize > MaxOffset) { - break; - } - Tys.push_back(Ty); - Inits.push_back(Globals[j]->getInitializer()); - } - - StructType *MergedTy = StructType::get(M.getContext(), Tys); - Constant *MergedInit = ConstantStruct::get(MergedTy, Inits); - GlobalVariable *MergedGV = new GlobalVariable(M, MergedTy, isConst, - GlobalValue::InternalLinkage, - MergedInit, "_MergedGlobals", - nullptr, - GlobalVariable::NotThreadLocal, - AddrSpace); - for (size_t k = i; k < j; ++k) { - Constant *Idx[2] = { - ConstantInt::get(Int32Ty, 0), - ConstantInt::get(Int32Ty, k-i) - }; - Constant *GEP = ConstantExpr::getInBoundsGetElementPtr(MergedGV, Idx); - Globals[k]->replaceAllUsesWith(GEP); - Globals[k]->eraseFromParent(); - NumMerged++; - } - i = j; - } - - return true; -} - -void GlobalMerge::collectUsedGlobalVariables(Module &M) { - // Extract global variables from llvm.used array - const GlobalVariable *GV = M.getGlobalVariable("llvm.used"); - if (!GV || !GV->hasInitializer()) return; - - // Should be an array of 'i8*'. 
- const ConstantArray *InitList = cast<ConstantArray>(GV->getInitializer()); - - for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i) - if (const GlobalVariable *G = - dyn_cast<GlobalVariable>(InitList->getOperand(i)->stripPointerCasts())) - MustKeepGlobalVariables.insert(G); -} - -void GlobalMerge::setMustKeepGlobalVariables(Module &M) { - collectUsedGlobalVariables(M); - - for (Module::iterator IFn = M.begin(), IEndFn = M.end(); IFn != IEndFn; - ++IFn) { - for (Function::iterator IBB = IFn->begin(), IEndBB = IFn->end(); - IBB != IEndBB; ++IBB) { - // Follow the invoke link to find the landing pad instruction - const InvokeInst *II = dyn_cast<InvokeInst>(IBB->getTerminator()); - if (!II) continue; - - const LandingPadInst *LPInst = II->getUnwindDest()->getLandingPadInst(); - // Look for globals in the clauses of the landing pad instruction - for (unsigned Idx = 0, NumClauses = LPInst->getNumClauses(); - Idx != NumClauses; ++Idx) - if (const GlobalVariable *GV = - dyn_cast<GlobalVariable>(LPInst->getClause(Idx) - ->stripPointerCasts())) - MustKeepGlobalVariables.insert(GV); - } - } -} - -bool GlobalMerge::doInitialization(Module &M) { - if (!EnableGlobalMerge) - return false; - - DenseMap<unsigned, SmallVector<GlobalVariable*, 16> > Globals, ConstGlobals, - BSSGlobals; - const TargetLowering *TLI = TM->getTargetLowering(); - const DataLayout *DL = TLI->getDataLayout(); - unsigned MaxOffset = TLI->getMaximalGlobalOffset(); - bool Changed = false; - setMustKeepGlobalVariables(M); - - // Grab all non-const globals. - for (Module::global_iterator I = M.global_begin(), - E = M.global_end(); I != E; ++I) { - // Merge is safe for "normal" internal globals only - if (!I->hasLocalLinkage() || I->isThreadLocal() || I->hasSection()) - continue; - - PointerType *PT = dyn_cast<PointerType>(I->getType()); - assert(PT && "Global variable is not a pointer!"); - - unsigned AddressSpace = PT->getAddressSpace(); - - // Ignore fancy-aligned globals for now. - unsigned Alignment = DL->getPreferredAlignment(I); - Type *Ty = I->getType()->getElementType(); - if (Alignment > DL->getABITypeAlignment(Ty)) - continue; - - // Ignore all 'special' globals. 
- if (I->getName().startswith("llvm.") || - I->getName().startswith(".llvm.")) - continue; - - // Ignore all "required" globals: - if (isMustKeepGlobalVariable(I)) - continue; - - if (DL->getTypeAllocSize(Ty) < MaxOffset) { - if (TargetLoweringObjectFile::getKindForGlobal(I, TLI->getTargetMachine()) - .isBSSLocal()) - BSSGlobals[AddressSpace].push_back(I); - else if (I->isConstant()) - ConstGlobals[AddressSpace].push_back(I); - else - Globals[AddressSpace].push_back(I); - } - } - - for (DenseMap<unsigned, SmallVector<GlobalVariable*, 16> >::iterator - I = Globals.begin(), E = Globals.end(); I != E; ++I) - if (I->second.size() > 1) - Changed |= doMerge(I->second, M, false, I->first); - - for (DenseMap<unsigned, SmallVector<GlobalVariable*, 16> >::iterator - I = BSSGlobals.begin(), E = BSSGlobals.end(); I != E; ++I) - if (I->second.size() > 1) - Changed |= doMerge(I->second, M, false, I->first); - - if (EnableGlobalMergeOnConst) - for (DenseMap<unsigned, SmallVector<GlobalVariable*, 16> >::iterator - I = ConstGlobals.begin(), E = ConstGlobals.end(); I != E; ++I) - if (I->second.size() > 1) - Changed |= doMerge(I->second, M, true, I->first); - - return Changed; -} - -bool GlobalMerge::runOnFunction(Function &F) { - return false; -} - -bool GlobalMerge::doFinalization(Module &M) { - MustKeepGlobalVariables.clear(); - return false; -} - -Pass *llvm::createGlobalMergePass(const TargetMachine *TM) { - return new GlobalMerge(TM); -} diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index 230a381..6e50d33 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -158,6 +158,15 @@ bool JumpThreading::runOnFunction(Function &F) { TLI = &getAnalysis<TargetLibraryInfo>(); LVI = &getAnalysis<LazyValueInfo>(); + // Remove unreachable blocks from function as they may result in infinite + // loop. We do threading if we found something profitable. Jump threading a + // branch can create other opportunities. If these opportunities form a cycle + // i.e. if any jump treading is undoing previous threading in the path, then + // we will loop forever. We take care of this issue by not jump threading for + // back edges. This works for normal cases but not for unreachable blocks as + // they may have cycle with no back edge. + removeUnreachableBlocks(F); + FindLoopHeaders(F); bool Changed, EverChanged = false; diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index 0a8d16f..abcceb2 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -192,6 +192,14 @@ namespace { SmallVectorImpl<BasicBlock*> &ExitBlocks, SmallVectorImpl<Instruction*> &InsertPts, PredIteratorCache &PIC); + + /// \brief Create a copy of the instruction in the exit block and patch up + /// SSA. + /// PN is a user of I in ExitBlock that can be used to get the number and + /// list of predecessors fast. + Instruction *CloneInstructionInExitBlock(Instruction &I, + BasicBlock &ExitBlock, + PHINode &PN); }; } @@ -531,6 +539,35 @@ bool LICM::isNotUsedInLoop(Instruction &I) { return true; } +Instruction *LICM::CloneInstructionInExitBlock(Instruction &I, + BasicBlock &ExitBlock, + PHINode &PN) { + Instruction *New = I.clone(); + ExitBlock.getInstList().insert(ExitBlock.getFirstInsertionPt(), New); + if (!I.getName().empty()) New->setName(I.getName() + ".le"); + + // Build LCSSA PHI nodes for any in-loop operands. 
Note that this is + // particularly cheap because we can rip off the PHI node that we're + // replacing for the number and blocks of the predecessors. + // OPT: If this shows up in a profile, we can instead finish sinking all + // invariant instructions, and then walk their operands to re-establish + // LCSSA. That will eliminate creating PHI nodes just to nuke them when + // sinking bottom-up. + for (User::op_iterator OI = New->op_begin(), OE = New->op_end(); OI != OE; + ++OI) + if (Instruction *OInst = dyn_cast<Instruction>(*OI)) + if (Loop *OLoop = LI->getLoopFor(OInst->getParent())) + if (!OLoop->contains(&PN)) { + PHINode *OpPN = + PHINode::Create(OInst->getType(), PN.getNumIncomingValues(), + OInst->getName() + ".lcssa", ExitBlock.begin()); + for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) + OpPN->addIncoming(OInst, PN.getIncomingBlock(i)); + *OI = OpPN; + } + return New; +} + /// sink - When an instruction is found to only be used outside of the loop, /// this function moves it to the exit blocks and patches up SSA form as needed. /// This method is guaranteed to remove the original instruction from its @@ -550,6 +587,9 @@ void LICM::sink(Instruction &I) { SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(), ExitBlocks.end()); #endif + // Clones of this instruction. Don't create more than one per exit block! + SmallDenseMap<BasicBlock *, Instruction *, 32> SunkCopies; + // If this instruction is only used outside of the loop, then all users are // PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of // the instruction. @@ -561,30 +601,13 @@ void LICM::sink(Instruction &I) { assert(ExitBlockSet.count(ExitBlock) && "The LCSSA PHI is not in an exit block!"); - Instruction *New = I.clone(); - ExitBlock->getInstList().insert(ExitBlock->getFirstInsertionPt(), New); - if (!I.getName().empty()) - New->setName(I.getName() + ".le"); - - // Build LCSSA PHI nodes for any in-loop operands. Note that this is - // particularly cheap because we can rip off the PHI node that we're - // replacing for the number and blocks of the predecessors. - // OPT: If this shows up in a profile, we can instead finish sinking all - // invariant instructions, and then walk their operands to re-establish - // LCSSA. That will eliminate creating PHI nodes just to nuke them when - // sinking bottom-up. - for (User::op_iterator OI = New->op_begin(), OE = New->op_end(); OI != OE; - ++OI) - if (Instruction *OInst = dyn_cast<Instruction>(*OI)) - if (Loop *OLoop = LI->getLoopFor(OInst->getParent())) - if (!OLoop->contains(PN)) { - PHINode *OpPN = PHINode::Create( - OInst->getType(), PN->getNumIncomingValues(), - OInst->getName() + ".lcssa", ExitBlock->begin()); - for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) - OpPN->addIncoming(OInst, PN->getIncomingBlock(i)); - *OI = OpPN; - } + Instruction *New; + auto It = SunkCopies.find(ExitBlock); + if (It != SunkCopies.end()) + New = It->second; + else + New = SunkCopies[ExitBlock] = + CloneInstructionInExitBlock(I, *ExitBlock, *PN); PN->replaceAllUsesWith(New); PN->eraseFromParent(); @@ -616,7 +639,7 @@ void LICM::hoist(Instruction &I) { /// bool LICM::isSafeToExecuteUnconditionally(Instruction &Inst) { // If it is not a trapping instruction, it is always safe to hoist. 
- if (isSafeToSpeculativelyExecute(&Inst)) + if (isSafeToSpeculativelyExecute(&Inst, DL)) return true; return isGuaranteedToExecute(Inst); diff --git a/lib/Transforms/Scalar/LoadCombine.cpp b/lib/Transforms/Scalar/LoadCombine.cpp new file mode 100644 index 0000000..846aa70 --- /dev/null +++ b/lib/Transforms/Scalar/LoadCombine.cpp @@ -0,0 +1,268 @@ +//===- LoadCombine.cpp - Combine Adjacent Loads ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This transformation combines adjacent loads. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar.h" + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/TargetFolder.h" +#include "llvm/Pass.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "load-combine" + +STATISTIC(NumLoadsAnalyzed, "Number of loads analyzed for combining"); +STATISTIC(NumLoadsCombined, "Number of loads combined"); + +namespace { +struct PointerOffsetPair { + Value *Pointer; + uint64_t Offset; +}; + +struct LoadPOPPair { + LoadPOPPair(LoadInst *L, PointerOffsetPair P, unsigned O) + : Load(L), POP(P), InsertOrder(O) {} + LoadPOPPair() {} + LoadInst *Load; + PointerOffsetPair POP; + /// \brief The new load needs to be created before the first load in IR order. + unsigned InsertOrder; +}; + +class LoadCombine : public BasicBlockPass { + LLVMContext *C; + const DataLayout *DL; + +public: + LoadCombine() + : BasicBlockPass(ID), + C(nullptr), DL(nullptr) { + initializeSROAPass(*PassRegistry::getPassRegistry()); + } + bool doInitialization(Function &) override; + bool runOnBasicBlock(BasicBlock &BB) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; + + const char *getPassName() const override { return "LoadCombine"; } + static char ID; + + typedef IRBuilder<true, TargetFolder> BuilderTy; + +private: + BuilderTy *Builder; + + PointerOffsetPair getPointerOffsetPair(LoadInst &); + bool combineLoads(DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> &); + bool aggregateLoads(SmallVectorImpl<LoadPOPPair> &); + bool combineLoads(SmallVectorImpl<LoadPOPPair> &); +}; +} + +bool LoadCombine::doInitialization(Function &F) { + DEBUG(dbgs() << "LoadCombine function: " << F.getName() << "\n"); + C = &F.getContext(); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + if (!DLP) { + DEBUG(dbgs() << " Skipping LoadCombine -- no target data!\n"); + return false; + } + DL = &DLP->getDataLayout(); + return true; +} + +PointerOffsetPair LoadCombine::getPointerOffsetPair(LoadInst &LI) { + PointerOffsetPair POP; + POP.Pointer = LI.getPointerOperand(); + POP.Offset = 0; + while (isa<BitCastInst>(POP.Pointer) || isa<GetElementPtrInst>(POP.Pointer)) { + if (auto *GEP = dyn_cast<GetElementPtrInst>(POP.Pointer)) { + unsigned BitWidth = DL->getPointerTypeSizeInBits(GEP->getType()); + APInt Offset(BitWidth, 0); + if (GEP->accumulateConstantOffset(*DL, Offset)) + POP.Offset += Offset.getZExtValue(); + else + // Can't handle GEPs with variable indices. 
+ return POP; + POP.Pointer = GEP->getPointerOperand(); + } else if (auto *BC = dyn_cast<BitCastInst>(POP.Pointer)) + POP.Pointer = BC->getOperand(0); + } + return POP; +} + +bool LoadCombine::combineLoads( + DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> &LoadMap) { + bool Combined = false; + for (auto &Loads : LoadMap) { + if (Loads.second.size() < 2) + continue; + std::sort(Loads.second.begin(), Loads.second.end(), + [](const LoadPOPPair &A, const LoadPOPPair &B) { + return A.POP.Offset < B.POP.Offset; + }); + if (aggregateLoads(Loads.second)) + Combined = true; + } + return Combined; +} + +/// \brief Try to aggregate loads from a sorted list of loads to be combined. +/// +/// It is guaranteed that no writes occur between any of the loads. All loads +/// have the same base pointer. There are at least two loads. +bool LoadCombine::aggregateLoads(SmallVectorImpl<LoadPOPPair> &Loads) { + assert(Loads.size() >= 2 && "Insufficient loads!"); + LoadInst *BaseLoad = nullptr; + SmallVector<LoadPOPPair, 8> AggregateLoads; + bool Combined = false; + uint64_t PrevOffset = -1ull; + uint64_t PrevSize = 0; + for (auto &L : Loads) { + if (PrevOffset == -1ull) { + BaseLoad = L.Load; + PrevOffset = L.POP.Offset; + PrevSize = DL->getTypeStoreSize(L.Load->getType()); + AggregateLoads.push_back(L); + continue; + } + if (L.Load->getAlignment() > BaseLoad->getAlignment()) + continue; + if (L.POP.Offset > PrevOffset + PrevSize) { + // No other load will be combinable + if (combineLoads(AggregateLoads)) + Combined = true; + AggregateLoads.clear(); + PrevOffset = -1; + continue; + } + if (L.POP.Offset != PrevOffset + PrevSize) + // This load is offset less than the size of the last load. + // FIXME: We may want to handle this case. + continue; + PrevOffset = L.POP.Offset; + PrevSize = DL->getTypeStoreSize(L.Load->getType()); + AggregateLoads.push_back(L); + } + if (combineLoads(AggregateLoads)) + Combined = true; + return Combined; +} + +/// \brief Given a list of combinable load. Combine the maximum number of them. +bool LoadCombine::combineLoads(SmallVectorImpl<LoadPOPPair> &Loads) { + // Remove loads from the end while the size is not a power of 2. + unsigned TotalSize = 0; + for (const auto &L : Loads) + TotalSize += L.Load->getType()->getPrimitiveSizeInBits(); + while (TotalSize != 0 && !isPowerOf2_32(TotalSize)) + TotalSize -= Loads.pop_back_val().Load->getType()->getPrimitiveSizeInBits(); + if (Loads.size() < 2) + return false; + + DEBUG({ + dbgs() << "***** Combining Loads ******\n"; + for (const auto &L : Loads) { + dbgs() << L.POP.Offset << ": " << *L.Load << "\n"; + } + }); + + // Find first load. This is where we put the new load. 
+ LoadPOPPair FirstLP; + FirstLP.InsertOrder = -1u; + for (const auto &L : Loads) + if (L.InsertOrder < FirstLP.InsertOrder) + FirstLP = L; + + unsigned AddressSpace = + FirstLP.POP.Pointer->getType()->getPointerAddressSpace(); + + Builder->SetInsertPoint(FirstLP.Load); + Value *Ptr = Builder->CreateConstGEP1_64( + Builder->CreatePointerCast(Loads[0].POP.Pointer, + Builder->getInt8PtrTy(AddressSpace)), + Loads[0].POP.Offset); + LoadInst *NewLoad = new LoadInst( + Builder->CreatePointerCast( + Ptr, PointerType::get(IntegerType::get(Ptr->getContext(), TotalSize), + Ptr->getType()->getPointerAddressSpace())), + Twine(Loads[0].Load->getName()) + ".combined", false, + Loads[0].Load->getAlignment(), FirstLP.Load); + + for (const auto &L : Loads) { + Builder->SetInsertPoint(L.Load); + Value *V = Builder->CreateExtractInteger( + *DL, NewLoad, cast<IntegerType>(L.Load->getType()), + L.POP.Offset - Loads[0].POP.Offset, "combine.extract"); + L.Load->replaceAllUsesWith(V); + } + + NumLoadsCombined = NumLoadsCombined + Loads.size(); + return true; +} + +bool LoadCombine::runOnBasicBlock(BasicBlock &BB) { + if (skipOptnoneFunction(BB) || !DL) + return false; + + IRBuilder<true, TargetFolder> + TheBuilder(BB.getContext(), TargetFolder(DL)); + Builder = &TheBuilder; + + DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> LoadMap; + + bool Combined = false; + unsigned Index = 0; + for (auto &I : BB) { + if (I.mayWriteToMemory() || I.mayThrow()) { + if (combineLoads(LoadMap)) + Combined = true; + LoadMap.clear(); + continue; + } + LoadInst *LI = dyn_cast<LoadInst>(&I); + if (!LI) + continue; + ++NumLoadsAnalyzed; + if (!LI->isSimple() || !LI->getType()->isIntegerTy()) + continue; + auto POP = getPointerOffsetPair(*LI); + if (!POP.Pointer) + continue; + LoadMap[POP.Pointer].push_back(LoadPOPPair(LI, POP, Index++)); + } + if (combineLoads(LoadMap)) + Combined = true; + return Combined; +} + +void LoadCombine::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); +} + +char LoadCombine::ID = 0; + +BasicBlockPass *llvm::createLoadCombinePass() { + return new LoadCombine(); +} + +INITIALIZE_PASS(LoadCombine, "load-combine", "Combine Adjacent Loads", false, + false) diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 26a83df..a12f5a7 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -112,7 +112,7 @@ namespace { /// the variable involved in the comparion is returned. This function will /// be called to see if the precondition and postcondition of the loop /// are in desirable form. - Value *matchCondition (BranchInst *Br, BasicBlock *NonZeroTarget) const; + Value *matchCondition(BranchInst *Br, BasicBlock *NonZeroTarget) const; /// Return true iff the idiom is detected in the loop. and 1) \p CntInst /// is set to the instruction counting the population bit. 2) \p CntPhi @@ -122,7 +122,7 @@ namespace { (Instruction *&CntInst, PHINode *&CntPhi, Value *&Var) const; /// Insert ctpop intrinsic function and some obviously dead instructions. - void transform (Instruction *CntInst, PHINode *CntPhi, Value *Var); + void transform(Instruction *CntInst, PHINode *CntPhi, Value *Var); /// Create llvm.ctpop.* intrinsic function. 
CallInst *createPopcntIntrinsic(IRBuilderTy &IRB, Value *Val, DebugLoc DL); diff --git a/lib/Transforms/Scalar/LoopRerollPass.cpp b/lib/Transforms/Scalar/LoopRerollPass.cpp index 8b5e036..b6fbb16 100644 --- a/lib/Transforms/Scalar/LoopRerollPass.cpp +++ b/lib/Transforms/Scalar/LoopRerollPass.cpp @@ -924,8 +924,10 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, // them, and this matching fails. As an exception, we allow the alias // set tracker to handle regular (simple) load/store dependencies. if (FutureSideEffects && - ((!isSimpleLoadStore(J1) && !isSafeToSpeculativelyExecute(J1)) || - (!isSimpleLoadStore(J2) && !isSafeToSpeculativelyExecute(J2)))) { + ((!isSimpleLoadStore(J1) && + !isSafeToSpeculativelyExecute(J1, DL)) || + (!isSimpleLoadStore(J2) && + !isSafeToSpeculativelyExecute(J2, DL)))) { DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << " vs. " << *J2 << " (side effects prevent reordering)\n"); diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index fc28fd2..00c0f88 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -18,8 +18,10 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Metadata.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -36,7 +38,8 @@ UnrollThreshold("unroll-threshold", cl::init(150), cl::Hidden, static cl::opt<unsigned> UnrollCount("unroll-count", cl::init(0), cl::Hidden, - cl::desc("Use this unroll count for all loops, for testing purposes")); + cl::desc("Use this unroll count for all loops including those with " + "unroll_count pragma values, for testing purposes")); static cl::opt<bool> UnrollAllowPartial("unroll-allow-partial", cl::init(false), cl::Hidden, @@ -47,6 +50,11 @@ static cl::opt<bool> UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::init(false), cl::Hidden, cl::desc("Unroll loops with run-time trip counts")); +static cl::opt<unsigned> +PragmaUnrollThreshold("pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden, + cl::desc("Unrolled size limit for loops with an unroll(enable) or " + "unroll_count pragma.")); + namespace { class LoopUnroll : public LoopPass { public: @@ -109,6 +117,66 @@ namespace { // For now, recreate dom info, if loop is unrolled. AU.addPreserved<DominatorTreeWrapperPass>(); } + + // Fill in the UnrollingPreferences parameter with values from the + // TargetTransformationInfo. + void getUnrollingPreferences(Loop *L, const TargetTransformInfo &TTI, + TargetTransformInfo::UnrollingPreferences &UP) { + UP.Threshold = CurrentThreshold; + UP.OptSizeThreshold = OptSizeUnrollThreshold; + UP.PartialThreshold = CurrentThreshold; + UP.PartialOptSizeThreshold = OptSizeUnrollThreshold; + UP.Count = CurrentCount; + UP.MaxCount = UINT_MAX; + UP.Partial = CurrentAllowPartial; + UP.Runtime = CurrentRuntime; + TTI.getUnrollingPreferences(L, UP); + } + + // Select and return an unroll count based on parameters from + // user, unroll preferences, unroll pragmas, or a heuristic. + // SetExplicitly is set to true if the unroll count is is set by + // the user or a pragma rather than selected heuristically. 
+ unsigned + selectUnrollCount(const Loop *L, unsigned TripCount, bool HasEnablePragma, + unsigned PragmaCount, + const TargetTransformInfo::UnrollingPreferences &UP, + bool &SetExplicitly); + + + // Select threshold values used to limit unrolling based on a + // total unrolled size. Parameters Threshold and PartialThreshold + // are set to the maximum unrolled size for fully and partially + // unrolled loops respectively. + void selectThresholds(const Loop *L, bool HasPragma, + const TargetTransformInfo::UnrollingPreferences &UP, + unsigned &Threshold, unsigned &PartialThreshold) { + // Determine the current unrolling threshold. While this is + // normally set from UnrollThreshold, it is overridden to a + // smaller value if the current function is marked as + // optimize-for-size, and the unroll threshold was not user + // specified. + Threshold = UserThreshold ? CurrentThreshold : UP.Threshold; + PartialThreshold = UserThreshold ? CurrentThreshold : UP.PartialThreshold; + if (!UserThreshold && + L->getHeader()->getParent()->getAttributes(). + hasAttribute(AttributeSet::FunctionIndex, + Attribute::OptimizeForSize)) { + Threshold = UP.OptSizeThreshold; + PartialThreshold = UP.PartialOptSizeThreshold; + } + if (HasPragma) { + // If the loop has an unrolling pragma, we want to be more + // aggressive with unrolling limits. Set thresholds to at + // least the PragmaTheshold value which is larger than the + // default limits. + if (Threshold != NoThreshold) + Threshold = std::max<unsigned>(Threshold, PragmaUnrollThreshold); + if (PartialThreshold != NoThreshold) + PartialThreshold = + std::max<unsigned>(PartialThreshold, PragmaUnrollThreshold); + } + } }; } @@ -151,6 +219,103 @@ static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls, return LoopSize; } +// Returns the value associated with the given metadata node name (for +// example, "llvm.loop.unroll.count"). If no such named metadata node +// exists, then nullptr is returned. +static const ConstantInt *GetUnrollMetadataValue(const Loop *L, + StringRef Name) { + MDNode *LoopID = L->getLoopID(); + if (!LoopID) return nullptr; + + // First operand should refer to the loop id itself. + assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); + assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); + + for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) { + const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); + if (!MD) continue; + + const MDString *S = dyn_cast<MDString>(MD->getOperand(0)); + if (!S) continue; + + if (Name.equals(S->getString())) { + assert(MD->getNumOperands() == 2 && + "Unroll hint metadata should have two operands."); + return cast<ConstantInt>(MD->getOperand(1)); + } + } + return nullptr; +} + +// Returns true if the loop has an unroll(enable) pragma. +static bool HasUnrollEnablePragma(const Loop *L) { + const ConstantInt *EnableValue = + GetUnrollMetadataValue(L, "llvm.loop.unroll.enable"); + return (EnableValue && EnableValue->getZExtValue()); +} + +// Returns true if the loop has an unroll(disable) pragma. +static bool HasUnrollDisablePragma(const Loop *L) { + const ConstantInt *EnableValue = + GetUnrollMetadataValue(L, "llvm.loop.unroll.enable"); + return (EnableValue && !EnableValue->getZExtValue()); +} + +// If loop has an unroll_count pragma return the (necessarily +// positive) value from the pragma. Otherwise return 0. 
+static unsigned UnrollCountPragmaValue(const Loop *L) { + const ConstantInt *CountValue = + GetUnrollMetadataValue(L, "llvm.loop.unroll.count"); + if (CountValue) { + unsigned Count = CountValue->getZExtValue(); + assert(Count >= 1 && "Unroll count must be positive."); + return Count; + } + return 0; +} + +unsigned LoopUnroll::selectUnrollCount( + const Loop *L, unsigned TripCount, bool HasEnablePragma, + unsigned PragmaCount, const TargetTransformInfo::UnrollingPreferences &UP, + bool &SetExplicitly) { + SetExplicitly = true; + + // User-specified count (either as a command-line option or + // constructor parameter) has highest precedence. + unsigned Count = UserCount ? CurrentCount : 0; + + // If there is no user-specified count, unroll pragmas have the next + // highest precendence. + if (Count == 0) { + if (PragmaCount) { + Count = PragmaCount; + } else if (HasEnablePragma) { + // unroll(enable) pragma without an unroll_count pragma + // indicates to unroll loop fully. + Count = TripCount; + } + } + + if (Count == 0) + Count = UP.Count; + + if (Count == 0) { + SetExplicitly = false; + if (TripCount == 0) + // Runtime trip count. + Count = UnrollRuntimeCount; + else + // Conservative heuristic: if we know the trip count, see if we can + // completely unroll (subject to the threshold, checked below); otherwise + // try to find greatest modulo of the trip count which is still under + // threshold value. + Count = TripCount; + } + if (TripCount && Count > TripCount) + return TripCount; + return Count; +} + bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { if (skipOptnoneFunction(L)) return false; @@ -162,33 +327,16 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { BasicBlock *Header = L->getHeader(); DEBUG(dbgs() << "Loop Unroll: F[" << Header->getParent()->getName() << "] Loop %" << Header->getName() << "\n"); - (void)Header; - TargetTransformInfo::UnrollingPreferences UP; - UP.Threshold = CurrentThreshold; - UP.OptSizeThreshold = OptSizeUnrollThreshold; - UP.PartialThreshold = CurrentThreshold; - UP.PartialOptSizeThreshold = OptSizeUnrollThreshold; - UP.Count = CurrentCount; - UP.MaxCount = UINT_MAX; - UP.Partial = CurrentAllowPartial; - UP.Runtime = CurrentRuntime; - TTI.getUnrollingPreferences(L, UP); - - // Determine the current unrolling threshold. While this is normally set - // from UnrollThreshold, it is overridden to a smaller value if the current - // function is marked as optimize-for-size, and the unroll threshold was - // not user specified. - unsigned Threshold = UserThreshold ? CurrentThreshold : UP.Threshold; - unsigned PartialThreshold = - UserThreshold ? CurrentThreshold : UP.PartialThreshold; - if (!UserThreshold && - Header->getParent()->getAttributes(). - hasAttribute(AttributeSet::FunctionIndex, - Attribute::OptimizeForSize)) { - Threshold = UP.OptSizeThreshold; - PartialThreshold = UP.PartialOptSizeThreshold; + if (HasUnrollDisablePragma(L)) { + return false; } + bool HasEnablePragma = HasUnrollEnablePragma(L); + unsigned PragmaCount = UnrollCountPragmaValue(L); + bool HasPragma = HasEnablePragma || PragmaCount > 0; + + TargetTransformInfo::UnrollingPreferences UP; + getUnrollingPreferences(L, TTI, UP); // Find trip count and trip multiple if count is not available unsigned TripCount = 0; @@ -202,79 +350,117 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { TripMultiple = SE->getSmallConstantTripMultiple(L, LatchBlock); } - bool Runtime = UserRuntime ? 
CurrentRuntime : UP.Runtime; - - // Use a default unroll-count if the user doesn't specify a value - // and the trip count is a run-time value. The default is different - // for run-time or compile-time trip count loops. - unsigned Count = UserCount ? CurrentCount : UP.Count; - if (Runtime && Count == 0 && TripCount == 0) - Count = UnrollRuntimeCount; + // Select an initial unroll count. This may be reduced later based + // on size thresholds. + bool CountSetExplicitly; + unsigned Count = selectUnrollCount(L, TripCount, HasEnablePragma, PragmaCount, + UP, CountSetExplicitly); + + unsigned NumInlineCandidates; + bool notDuplicatable; + unsigned LoopSize = + ApproximateLoopSize(L, NumInlineCandidates, notDuplicatable, TTI); + DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); + uint64_t UnrolledSize = (uint64_t)LoopSize * Count; + if (notDuplicatable) { + DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable" + << " instructions.\n"); + return false; + } + if (NumInlineCandidates != 0) { + DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n"); + return false; + } - if (Count == 0) { - // Conservative heuristic: if we know the trip count, see if we can - // completely unroll (subject to the threshold, checked below); otherwise - // try to find greatest modulo of the trip count which is still under - // threshold value. - if (TripCount == 0) - return false; - Count = TripCount; + unsigned Threshold, PartialThreshold; + selectThresholds(L, HasPragma, UP, Threshold, PartialThreshold); + + // Given Count, TripCount and thresholds determine the type of + // unrolling which is to be performed. + enum { Full = 0, Partial = 1, Runtime = 2 }; + int Unrolling; + if (TripCount && Count == TripCount) { + if (Threshold != NoThreshold && UnrolledSize > Threshold) { + DEBUG(dbgs() << " Too large to fully unroll with count: " << Count + << " because size: " << UnrolledSize << ">" << Threshold + << "\n"); + Unrolling = Partial; + } else { + Unrolling = Full; + } + } else if (TripCount && Count < TripCount) { + Unrolling = Partial; + } else { + Unrolling = Runtime; } - // Enforce the threshold. - if (Threshold != NoThreshold && PartialThreshold != NoThreshold) { - unsigned NumInlineCandidates; - bool notDuplicatable; - unsigned LoopSize = ApproximateLoopSize(L, NumInlineCandidates, - notDuplicatable, TTI); - DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); - if (notDuplicatable) { - DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable" - << " instructions.\n"); + // Reduce count based on the type of unrolling and the threshold values. + unsigned OriginalCount = Count; + bool AllowRuntime = UserRuntime ? CurrentRuntime : UP.Runtime; + if (Unrolling == Partial) { + bool AllowPartial = UserAllowPartial ? CurrentAllowPartial : UP.Partial; + if (!AllowPartial && !CountSetExplicitly) { + DEBUG(dbgs() << " will not try to unroll partially because " + << "-unroll-allow-partial not given\n"); return false; } - if (NumInlineCandidates != 0) { - DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n"); + if (PartialThreshold != NoThreshold && UnrolledSize > PartialThreshold) { + // Reduce unroll count to be modulo of TripCount for partial unrolling. 
+ Count = PartialThreshold / LoopSize; + while (Count != 0 && TripCount % Count != 0) + Count--; + } + } else if (Unrolling == Runtime) { + if (!AllowRuntime && !CountSetExplicitly) { + DEBUG(dbgs() << " will not try to unroll loop with runtime trip count " + << "-unroll-runtime not given\n"); return false; } - uint64_t Size = (uint64_t)LoopSize*Count; - if (TripCount != 1 && - (Size > Threshold || (Count != TripCount && Size > PartialThreshold))) { - if (Size > Threshold) - DEBUG(dbgs() << " Too large to fully unroll with count: " << Count - << " because size: " << Size << ">" << Threshold << "\n"); - - bool AllowPartial = UserAllowPartial ? CurrentAllowPartial : UP.Partial; - if (!AllowPartial && !(Runtime && TripCount == 0)) { - DEBUG(dbgs() << " will not try to unroll partially because " - << "-unroll-allow-partial not given\n"); - return false; - } - if (TripCount) { - // Reduce unroll count to be modulo of TripCount for partial unrolling - Count = PartialThreshold / LoopSize; - while (Count != 0 && TripCount%Count != 0) - Count--; - } - else if (Runtime) { - // Reduce unroll count to be a lower power-of-two value - while (Count != 0 && Size > PartialThreshold) { - Count >>= 1; - Size = LoopSize*Count; - } - } - if (Count > UP.MaxCount) - Count = UP.MaxCount; - if (Count < 2) { - DEBUG(dbgs() << " could not unroll partially\n"); - return false; + // Reduce unroll count to be the largest power-of-two factor of + // the original count which satisfies the threshold limit. + while (Count != 0 && UnrolledSize > PartialThreshold) { + Count >>= 1; + UnrolledSize = LoopSize * Count; + } + if (Count > UP.MaxCount) + Count = UP.MaxCount; + DEBUG(dbgs() << " partially unrolling with count: " << Count << "\n"); + } + + if (HasPragma) { + // Emit optimization remarks if we are unable to unroll the loop + // as directed by a pragma. + DebugLoc LoopLoc = L->getStartLoc(); + Function *F = Header->getParent(); + LLVMContext &Ctx = F->getContext(); + if (HasEnablePragma && PragmaCount == 0) { + if (TripCount && Count != TripCount) { + emitOptimizationRemarkMissed( + Ctx, DEBUG_TYPE, *F, LoopLoc, + "Unable to fully unroll loop as directed by unroll(enable) pragma " + "because unrolled size is too large."); + } else if (!TripCount) { + emitOptimizationRemarkMissed( + Ctx, DEBUG_TYPE, *F, LoopLoc, + "Unable to fully unroll loop as directed by unroll(enable) pragma " + "because loop has a runtime trip count."); } - DEBUG(dbgs() << " partially unrolling with count: " << Count << "\n"); + } else if (PragmaCount > 0 && Count != OriginalCount) { + emitOptimizationRemarkMissed( + Ctx, DEBUG_TYPE, *F, LoopLoc, + "Unable to unroll loop the number of times directed by " + "unroll_count pragma because unrolled size is too large."); } } + if (Unrolling != Full && Count < 2) { + // Partial unrolling by 1 is a nop. For full unrolling, a factor + // of 1 makes sense because loop control can be eliminated. + return false; + } + // Unroll the loop. 
- if (!UnrollLoop(L, Count, TripCount, Runtime, TripMultiple, LI, this, &LPM)) + if (!UnrollLoop(L, Count, TripCount, AllowRuntime, TripMultiple, LI, this, &LPM)) return false; return true; diff --git a/lib/Transforms/Scalar/LowerAtomic.cpp b/lib/Transforms/Scalar/LowerAtomic.cpp index 4251ac4..3314e1e 100644 --- a/lib/Transforms/Scalar/LowerAtomic.cpp +++ b/lib/Transforms/Scalar/LowerAtomic.cpp @@ -32,7 +32,10 @@ static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) { Value *Res = Builder.CreateSelect(Equal, Val, Orig); Builder.CreateStore(Res, Ptr); - CXI->replaceAllUsesWith(Orig); + Res = Builder.CreateInsertValue(UndefValue::get(CXI->getType()), Orig, 0); + Res = Builder.CreateInsertValue(Res, Equal, 1); + + CXI->replaceAllUsesWith(Res); CXI->eraseFromParent(); return true; } diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp index 986d6a4..ea2cf7c 100644 --- a/lib/Transforms/Scalar/Reassociate.cpp +++ b/lib/Transforms/Scalar/Reassociate.cpp @@ -1368,11 +1368,10 @@ Value *Reassociate::OptimizeXor(Instruction *I, Value *Reassociate::OptimizeAdd(Instruction *I, SmallVectorImpl<ValueEntry> &Ops) { // Scan the operand lists looking for X and -X pairs. If we find any, we - // can simplify the expression. X+-X == 0. While we're at it, scan for any + // can simplify expressions like X+-X == 0 and X+~X ==-1. While we're at it, + // scan for any // duplicates. We want to canonicalize Y+Y+Y+Z -> 3*Y+Z. - // - // TODO: We could handle "X + ~X" -> "-1" if we wanted, since "-X = ~X+1". - // + for (unsigned i = 0, e = Ops.size(); i != e; ++i) { Value *TheOp = Ops[i].Op; // Check to see if we've seen this operand before. If so, we factor all @@ -1412,19 +1411,28 @@ Value *Reassociate::OptimizeAdd(Instruction *I, continue; } - // Check for X and -X in the operand list. - if (!BinaryOperator::isNeg(TheOp)) + // Check for X and -X or X and ~X in the operand list. + if (!BinaryOperator::isNeg(TheOp) && !BinaryOperator::isNot(TheOp)) continue; - Value *X = BinaryOperator::getNegArgument(TheOp); + Value *X = nullptr; + if (BinaryOperator::isNeg(TheOp)) + X = BinaryOperator::getNegArgument(TheOp); + else if (BinaryOperator::isNot(TheOp)) + X = BinaryOperator::getNotArgument(TheOp); + unsigned FoundX = FindInOperandList(Ops, i, X); if (FoundX == i) continue; // Remove X and -X from the operand list. - if (Ops.size() == 2) + if (Ops.size() == 2 && BinaryOperator::isNeg(TheOp)) return Constant::getNullValue(X->getType()); + // Remove X and ~X from the operand list. + if (Ops.size() == 2 && BinaryOperator::isNot(TheOp)) + return Constant::getAllOnesValue(X->getType()); + Ops.erase(Ops.begin()+i); if (i < FoundX) --FoundX; @@ -1434,6 +1442,13 @@ Value *Reassociate::OptimizeAdd(Instruction *I, ++NumAnnihil; --i; // Revisit element. e -= 2; // Removed two elements. + + // if X and ~X we append -1 to the operand list. 
+ if (BinaryOperator::isNot(TheOp)) { + Value *V = Constant::getAllOnesValue(X->getType()); + Ops.insert(Ops.end(), ValueEntry(getRank(V), V)); + e += 1; + } } // Scan the operand list, checking to see if there are any common factors diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index feeb231..90c3520 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -494,7 +494,9 @@ private: void visitResumeInst (TerminatorInst &I) { /*returns void*/ } void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ } void visitFenceInst (FenceInst &I) { /*returns void*/ } - void visitAtomicCmpXchgInst (AtomicCmpXchgInst &I) { markOverdefined(&I); } + void visitAtomicCmpXchgInst(AtomicCmpXchgInst &I) { + markAnythingOverdefined(&I); + } void visitAtomicRMWInst (AtomicRMWInst &I) { markOverdefined(&I); } void visitAllocaInst (Instruction &I) { markOverdefined(&I); } void visitVAArgInst (Instruction &I) { markAnythingOverdefined(&I); } diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index 04bf4f8..8c7f253 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -1032,11 +1032,6 @@ static Type *findCommonType(AllocaSlices::const_iterator B, UserTy = SI->getValueOperand()->getType(); } - if (!UserTy || (Ty && Ty != UserTy)) - TyIsCommon = false; // Give up on anything but an iN type. - else - Ty = UserTy; - if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) { // If the type is larger than the partition, skip it. We only encounter // this for split integer operations where we want to use the type of the @@ -1051,6 +1046,13 @@ static Type *findCommonType(AllocaSlices::const_iterator B, if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth()) ITy = UserITy; } + + // To avoid depending on the order of slices, Ty and TyIsCommon must not + // depend on types skipped above. + if (!UserTy || (Ty && Ty != UserTy)) + TyIsCommon = false; // Give up on anything but an iN type. + else + Ty = UserTy; } return TyIsCommon ? Ty : ITy; @@ -1128,7 +1130,7 @@ static bool isSafePHIToSpeculate(PHINode &PN, // If this pointer is always safe to load, or if we can prove that there // is already a load in the block, then we can move the load to the pred // block. - if (InVal->isDereferenceablePointer() || + if (InVal->isDereferenceablePointer(DL) || isSafeToLoadUnconditionally(InVal, TI, MaxAlign, DL)) continue; @@ -1196,8 +1198,8 @@ static bool isSafeSelectToSpeculate(SelectInst &SI, const DataLayout *DL = nullptr) { Value *TValue = SI.getTrueValue(); Value *FValue = SI.getFalseValue(); - bool TDerefable = TValue->isDereferenceablePointer(); - bool FDerefable = FValue->isDereferenceablePointer(); + bool TDerefable = TValue->isDereferenceablePointer(DL); + bool FDerefable = FValue->isDereferenceablePointer(DL); for (User *U : SI.users()) { LoadInst *LI = dyn_cast<LoadInst>(U); diff --git a/lib/Transforms/Scalar/SampleProfile.cpp b/lib/Transforms/Scalar/SampleProfile.cpp index 8e557aa..73c97ff 100644 --- a/lib/Transforms/Scalar/SampleProfile.cpp +++ b/lib/Transforms/Scalar/SampleProfile.cpp @@ -450,13 +450,14 @@ void SampleModuleProfile::dump() { /// /// \returns true if the file was loaded successfully, false otherwise. 
bool SampleModuleProfile::loadText() { - std::unique_ptr<MemoryBuffer> Buffer; - error_code EC = MemoryBuffer::getFile(Filename, Buffer); - if (EC) { + ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr = + MemoryBuffer::getFile(Filename); + if (std::error_code EC = BufferOrErr.getError()) { std::string Msg(EC.message()); M.getContext().diagnose(DiagnosticInfoSampleProfile(Filename.data(), Msg)); return false; } + std::unique_ptr<MemoryBuffer> Buffer = std::move(BufferOrErr.get()); line_iterator LineIt(*Buffer, '#'); // Read the profile of each function. Since each function may be diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index f8f828c..edf012d 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -65,6 +65,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeSinkingPass(Registry); initializeTailCallElimPass(Registry); initializeSeparateConstOffsetFromGEPPass(Registry); + initializeLoadCombinePass(Registry); } void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) { diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp index 58192fc..e2a24a7 100644 --- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -1142,8 +1142,8 @@ public: /// We can do this to a select if its only uses are loads and if the operand to /// the select can be loaded unconditionally. static bool isSafeSelectToSpeculate(SelectInst *SI, const DataLayout *DL) { - bool TDerefable = SI->getTrueValue()->isDereferenceablePointer(); - bool FDerefable = SI->getFalseValue()->isDereferenceablePointer(); + bool TDerefable = SI->getTrueValue()->isDereferenceablePointer(DL); + bool FDerefable = SI->getFalseValue()->isDereferenceablePointer(DL); for (User *U : SI->users()) { LoadInst *LI = dyn_cast<LoadInst>(U); @@ -1226,7 +1226,7 @@ static bool isSafePHIToSpeculate(PHINode *PN, const DataLayout *DL) { // If this pointer is always safe to load, or if we can prove that there is // already a load in the block, then we can move the load to the pred block. - if (InVal->isDereferenceablePointer() || + if (InVal->isDereferenceablePointer(DL) || isSafeToLoadUnconditionally(InVal, Pred->getTerminator(), MaxAlign, DL)) continue; diff --git a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index b8529e1..62f2026 100644 --- a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -121,41 +121,75 @@ class ConstantOffsetExtractor { /// numeric value of the extracted constant offset (0 if failed), and a /// new index representing the remainder (equal to the original index minus /// the constant offset). - /// \p Idx The given GEP index - /// \p NewIdx The new index to replace - /// \p DL The datalayout of the module - /// \p IP Calculating the new index requires new instructions. IP indicates - /// where to insert them (typically right before the GEP). + /// \p Idx The given GEP index + /// \p NewIdx The new index to replace (output) + /// \p DL The datalayout of the module + /// \p GEP The given GEP static int64_t Extract(Value *Idx, Value *&NewIdx, const DataLayout *DL, - Instruction *IP); + GetElementPtrInst *GEP); /// Looks for a constant offset without extracting it. The meaning of the /// arguments and the return value are the same as Extract. 
- static int64_t Find(Value *Idx, const DataLayout *DL); + static int64_t Find(Value *Idx, const DataLayout *DL, GetElementPtrInst *GEP); private: ConstantOffsetExtractor(const DataLayout *Layout, Instruction *InsertionPt) : DL(Layout), IP(InsertionPt) {} - /// Searches the expression that computes V for a constant offset. If the - /// searching is successful, update UserChain as a path from V to the constant - /// offset. - int64_t find(Value *V); - /// A helper function to look into both operands of a binary operator U. - /// \p IsSub Whether U is a sub operator. If so, we need to negate the - /// constant offset at some point. - int64_t findInEitherOperand(User *U, bool IsSub); - /// After finding the constant offset and how it is reached from the GEP - /// index, we build a new index which is a clone of the old one except the - /// constant offset is removed. For example, given (a + (b + 5)) and knowning - /// the constant offset is 5, this function returns (a + b). + /// Searches the expression that computes V for a non-zero constant C s.t. + /// V can be reassociated into the form V' + C. If the searching is + /// successful, returns C and update UserChain as a def-use chain from C to V; + /// otherwise, UserChain is empty. /// - /// We cannot simply change the constant to zero because the expression that - /// computes the index or its intermediate result may be used by others. - Value *rebuildWithoutConstantOffset(); - // A helper function for rebuildWithoutConstantOffset that rebuilds the direct - // user (U) of the constant offset (C). - Value *rebuildLeafWithoutConstantOffset(User *U, Value *C); - /// Returns a clone of U except the first occurrence of From with To. - Value *cloneAndReplace(User *U, Value *From, Value *To); + /// \p V The given expression + /// \p SignExtended Whether V will be sign-extended in the computation of the + /// GEP index + /// \p ZeroExtended Whether V will be zero-extended in the computation of the + /// GEP index + /// \p NonNegative Whether V is guaranteed to be non-negative. For example, + /// an index of an inbounds GEP is guaranteed to be + /// non-negative. Levaraging this, we can better split + /// inbounds GEPs. + APInt find(Value *V, bool SignExtended, bool ZeroExtended, bool NonNegative); + /// A helper function to look into both operands of a binary operator. + APInt findInEitherOperand(BinaryOperator *BO, bool SignExtended, + bool ZeroExtended); + /// After finding the constant offset C from the GEP index I, we build a new + /// index I' s.t. I' + C = I. This function builds and returns the new + /// index I' according to UserChain produced by function "find". + /// + /// The building conceptually takes two steps: + /// 1) iteratively distribute s/zext towards the leaves of the expression tree + /// that computes I + /// 2) reassociate the expression tree to the form I' + C. + /// + /// For example, to extract the 5 from sext(a + (b + 5)), we first distribute + /// sext to a, b and 5 so that we have + /// sext(a) + (sext(b) + 5). + /// Then, we reassociate it to + /// (sext(a) + sext(b)) + 5. + /// Given this form, we know I' is sext(a) + sext(b). + Value *rebuildWithoutConstOffset(); + /// After the first step of rebuilding the GEP index without the constant + /// offset, distribute s/zext to the operands of all operators in UserChain. + /// e.g., zext(sext(a + (b + 5)) (assuming no overflow) => + /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))). 
+  ///
+  /// The function also updates UserChain to point to new subexpressions after
+  /// distributing s/zext. e.g., the old UserChain of the above example is
+  ///   5 -> b + 5 -> a + (b + 5) -> sext(...) -> zext(sext(...)),
+  /// and the new UserChain is
+  ///   zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) ->
+  ///   zext(sext(a)) + (zext(sext(b)) + zext(sext(5)))
+  ///
+  /// \p ChainIndex The index to UserChain. ChainIndex is initially
+  ///               UserChain.size() - 1, and is decremented during
+  ///               the recursion.
+  Value *distributeExtsAndCloneChain(unsigned ChainIndex);
+  /// Reassociates the GEP index to the form I' + C and returns I'.
+  Value *removeConstOffset(unsigned ChainIndex);
+  /// A helper function to apply ExtInsts, a list of s/zext, to value V.
+  /// e.g., if ExtInsts = [sext i32 to i64, zext i16 to i32], this function
+  /// returns "sext i32 (zext i16 V to i32) to i64".
+  Value *applyExts(Value *V);

   /// Returns true if LHS and RHS have no bits in common, i.e., LHS | RHS == 0.
   bool NoCommonBits(Value *LHS, Value *RHS) const;
@@ -163,20 +197,26 @@ class ConstantOffsetExtractor {
   /// \p KnownOne  Mask of all bits that are known to be one.
   /// \p KnownZero Mask of all bits that are known to be zero.
   void ComputeKnownBits(Value *V, APInt &KnownOne, APInt &KnownZero) const;
-  /// Finds the first use of Used in U. Returns -1 if not found.
-  static unsigned FindFirstUse(User *U, Value *Used);
-  /// Returns whether OPC (sext or zext) can be distributed to the operands of
-  /// BO. e.g., sext can be distributed to the operands of an "add nsw" because
-  /// sext (add nsw a, b) == add nsw (sext a), (sext b).
-  static bool Distributable(unsigned OPC, BinaryOperator *BO);
+  /// A helper function that returns whether we can trace into the operands
+  /// of binary operator BO for a constant offset.
+  ///
+  /// \p SignExtended Whether BO is surrounded by sext
+  /// \p ZeroExtended Whether BO is surrounded by zext
+  /// \p NonNegative  Whether BO is known to be non-negative, e.g., an in-bound
+  ///                 array index.
+  bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO,
+                    bool NonNegative);

   /// The path from the constant offset to the old GEP index. e.g., if the GEP
   /// index is "a * b + (c + 5)". After running function find, UserChain[0] will
   /// be the constant 5, UserChain[1] will be the subexpression "c + 5", and
   /// UserChain[2] will be the entire expression "a * b + (c + 5)".
   ///
-  /// This path helps rebuildWithoutConstantOffset rebuild the new GEP index.
+  /// This path helps to rebuild the new GEP index.
   SmallVector<User *, 8> UserChain;
+  /// A data structure used in rebuildWithoutConstOffset. Contains all
+  /// sext/zext instructions along UserChain.
+  SmallVector<CastInst *, 16> ExtInsts;
   /// The data layout of the module. Used in ComputeKnownBits.
   const DataLayout *DL;
   Instruction *IP;  /// Insertion position of cloned instructions.
@@ -196,6 +236,15 @@ class SeparateConstOffsetFromGEP : public FunctionPass {
     AU.addRequired<DataLayoutPass>();
     AU.addRequired<TargetTransformInfo>();
   }
+
+  bool doInitialization(Module &M) override {
+    DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+    if (DLP == nullptr)
+      report_fatal_error("data layout missing");
+    DL = &DLP->getDataLayout();
+    return false;
+  }
+
   bool runOnFunction(Function &F) override;

 private:
@@ -206,8 +255,42 @@ class SeparateConstOffsetFromGEP : public FunctionPass {
   /// function only inspects the GEP without changing it. The output
   /// NeedsExtraction indicates whether we can extract a non-zero constant
   /// offset from any index.
-  int64_t accumulateByteOffset(GetElementPtrInst *GEP, const DataLayout *DL,
-                               bool &NeedsExtraction);
+  int64_t accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction);
+  /// Canonicalize array indices to pointer-size integers. This helps to
+  /// simplify the logic of splitting a GEP. For example, if a + b is a
+  /// pointer-size integer, we have
+  ///   gep base, a + b = gep (gep base, a), b
+  /// However, this equality may not hold if the size of a + b is smaller than
+  /// the pointer size, because LLVM conceptually sign-extends GEP indices to
+  /// pointer size before computing the address
+  /// (http://llvm.org/docs/LangRef.html#id181).
+  ///
+  /// This canonicalization is very likely already done in clang and
+  /// instcombine. Therefore, the program will probably remain the same.
+  ///
+  /// Returns true if the module changes.
+  ///
+  /// Verified in @i32_add in split-gep.ll
+  bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP);
+  /// For each array index that is in the form of zext(a), convert it to sext(a)
+  /// if we can prove zext(a) <= max signed value of typeof(a). We prefer
+  /// sext(a) to zext(a), because in the special case where x + y >= 0 and
+  /// (x >= 0 or y >= 0), function CanTraceInto can split sext(x + y),
+  /// while no such case exists for zext(x + y).
+  ///
+  /// Note that
+  ///   zext(x + y) = zext(x) + zext(y)
+  /// is wrong, e.g.,
+  ///   zext i32(UINT_MAX + 1) to i64 !=
+  ///   (zext i32 UINT_MAX to i64) + (zext i32 1 to i64)
+  ///
+  /// Returns true if the module changes.
+  ///
+  /// Verified in @inbounds_zext_add in split-gep.ll and @sum_of_array3 in
+  /// split-gep-and-gvn.ll
+  bool convertInBoundsZExtToSExt(GetElementPtrInst *GEP);
+
+  const DataLayout *DL;
 };
 } // anonymous namespace
@@ -227,181 +310,272 @@ FunctionPass *llvm::createSeparateConstOffsetFromGEPPass() {
   return new SeparateConstOffsetFromGEP();
 }

-bool ConstantOffsetExtractor::Distributable(unsigned OPC, BinaryOperator *BO) {
-  assert(OPC == Instruction::SExt || OPC == Instruction::ZExt);
+bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
+                                           bool ZeroExtended,
+                                           BinaryOperator *BO,
+                                           bool NonNegative) {
+  // We only consider ADD, SUB and OR, because a non-zero constant found in
+  // expressions composed of these operations can be easily hoisted as a
+  // constant offset by reassociation.
+  if (BO->getOpcode() != Instruction::Add &&
+      BO->getOpcode() != Instruction::Sub &&
+      BO->getOpcode() != Instruction::Or) {
+    return false;
+  }
+
+  Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1);
+  // Do not trace into "or" unless it is equivalent to "add". If LHS and RHS
+  // don't have common bits, (LHS | RHS) is equivalent to (LHS + RHS).
+  if (BO->getOpcode() == Instruction::Or && !NoCommonBits(LHS, RHS))
+    return false;
+
+  // In addition, tracing into BO requires that its surrounding s/zext (if
+  // any) is distributable to both operands.
+  //
+  // Suppose BO = A op B.
+  //  SignExtended | ZeroExtended | Distributable?
+  // --------------+--------------+----------------------------------
+  //       0       |      0       | true because no s/zext exists
+  //       0       |      1       | zext(BO) == zext(A) op zext(B)
+  //       1       |      0       | sext(BO) == sext(A) op sext(B)
+  //       1       |      1       | zext(sext(BO)) ==
+  //               |              |   zext(sext(A)) op zext(sext(B))
+  if (BO->getOpcode() == Instruction::Add && !ZeroExtended && NonNegative) {
+    // If a + b >= 0 and (a >= 0 or b >= 0), then
+    //   sext(a + b) = sext(a) + sext(b)
+    // even if the addition is not marked nsw.
+    //
+    // Leveraging this invariant, we can trace into an sext'ed inbound GEP
+    // index if the constant offset is non-negative.
+    //
+    // Verified in @sext_add in split-gep.ll.
+    if (ConstantInt *ConstLHS = dyn_cast<ConstantInt>(LHS)) {
+      if (!ConstLHS->isNegative())
+        return true;
+    }
+    if (ConstantInt *ConstRHS = dyn_cast<ConstantInt>(RHS)) {
+      if (!ConstRHS->isNegative())
+        return true;
+    }
+  }

   // sext (add/sub nsw A, B) == add/sub nsw (sext A), (sext B)
   // zext (add/sub nuw A, B) == add/sub nuw (zext A), (zext B)
   if (BO->getOpcode() == Instruction::Add ||
       BO->getOpcode() == Instruction::Sub) {
-    return (OPC == Instruction::SExt && BO->hasNoSignedWrap()) ||
-           (OPC == Instruction::ZExt && BO->hasNoUnsignedWrap());
+    if (SignExtended && !BO->hasNoSignedWrap())
+      return false;
+    if (ZeroExtended && !BO->hasNoUnsignedWrap())
+      return false;
   }

-  // sext/zext (and/or/xor A, B) == and/or/xor (sext/zext A), (sext/zext B)
-  // -instcombine also leverages this invariant to do the reverse
-  // transformation to reduce integer casts.
-  return BO->getOpcode() == Instruction::And ||
-         BO->getOpcode() == Instruction::Or ||
-         BO->getOpcode() == Instruction::Xor;
+  return true;
 }

-int64_t ConstantOffsetExtractor::findInEitherOperand(User *U, bool IsSub) {
-  assert(U->getNumOperands() == 2);
-  int64_t ConstantOffset = find(U->getOperand(0));
+APInt ConstantOffsetExtractor::findInEitherOperand(BinaryOperator *BO,
+                                                   bool SignExtended,
+                                                   bool ZeroExtended) {
+  // BO being non-negative does not shed light on whether its operands are
+  // non-negative. Clear the NonNegative flag here.
+  APInt ConstantOffset = find(BO->getOperand(0), SignExtended, ZeroExtended,
+                              /* NonNegative */ false);
   // If we found a constant offset in the left operand, stop and return that.
   // This shortcut might cause us to miss opportunities of combining the
   // constant offsets in both operands, e.g., (a + 4) + (b + 5) => (a + b) + 9.
   // However, such cases are probably already handled by -instcombine,
   // given this pass runs after the standard optimizations.
   if (ConstantOffset != 0) return ConstantOffset;
-  ConstantOffset = find(U->getOperand(1));
+  ConstantOffset = find(BO->getOperand(1), SignExtended, ZeroExtended,
+                        /* NonNegative */ false);
   // If U is a sub operator, negate the constant offset found in the right
   // operand.
-  return IsSub ? -ConstantOffset : ConstantOffset;
+  if (BO->getOpcode() == Instruction::Sub)
+    ConstantOffset = -ConstantOffset;
+  return ConstantOffset;
 }

-int64_t ConstantOffsetExtractor::find(Value *V) {
-  // TODO(jingyue): We can even trace into integer/pointer casts, such as
+APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended,
+                                    bool ZeroExtended, bool NonNegative) {
+  // TODO(jingyue): We could trace into integer/pointer casts, such as
   // inttoptr, ptrtoint, bitcast, and addrspacecast. We choose to handle only
   // integers because it gives good enough results for our benchmarks.
-  assert(V->getType()->isIntegerTy());
+  unsigned BitWidth = cast<IntegerType>(V->getType())->getBitWidth();

+  // We cannot do much with Values that are not a User, such as an Argument.
   User *U = dyn_cast<User>(V);
-  // We cannot do much with Values that are not a User, such as BasicBlock and
-  // MDNode.
-  if (U == nullptr) return 0;
+  if (U == nullptr) return APInt(BitWidth, 0);

-  int64_t ConstantOffset = 0;
-  if (ConstantInt *CI = dyn_cast<ConstantInt>(U)) {
+  APInt ConstantOffset(BitWidth, 0);
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
     // Hooray, we found it!
-    ConstantOffset = CI->getSExtValue();
-  } else if (Operator *O = dyn_cast<Operator>(U)) {
-    // The GEP index may be more complicated than a simple addition of a
-    // varaible and a constant. Therefore, we trace into subexpressions for more
-    // hoisting opportunities.
-    switch (O->getOpcode()) {
-    case Instruction::Add: {
-      ConstantOffset = findInEitherOperand(U, false);
-      break;
-    }
-    case Instruction::Sub: {
-      ConstantOffset = findInEitherOperand(U, true);
-      break;
-    }
-    case Instruction::Or: {
-      // If LHS and RHS don't have common bits, (LHS | RHS) is equivalent to
-      // (LHS + RHS).
-      if (NoCommonBits(U->getOperand(0), U->getOperand(1)))
-        ConstantOffset = findInEitherOperand(U, false);
-      break;
-    }
-    case Instruction::SExt:
-    case Instruction::ZExt: {
-      // We trace into sext/zext if the operator can be distributed to its
-      // operand. e.g., we can transform into "sext (add nsw a, 5)" and
-      // extract constant 5, because
-      //   sext (add nsw a, 5) == add nsw (sext a), 5
-      if (BinaryOperator *BO = dyn_cast<BinaryOperator>(U->getOperand(0))) {
-        if (Distributable(O->getOpcode(), BO))
-          ConstantOffset = find(U->getOperand(0));
-      }
-      break;
-    }
+    ConstantOffset = CI->getValue();
+  } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V)) {
+    // Trace into subexpressions for more hoisting opportunities.
+    if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative)) {
+      ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended);
     }
+  } else if (isa<SExtInst>(V)) {
+    ConstantOffset = find(U->getOperand(0), /* SignExtended */ true,
+                          ZeroExtended, NonNegative).sext(BitWidth);
+  } else if (isa<ZExtInst>(V)) {
+    // As an optimization, we can clear the SignExtended flag because
+    // sext(zext(a)) = zext(a). Verified in @sext_zext in split-gep.ll.
+    //
+    // Clear the NonNegative flag, because zext(a) >= 0 does not imply a >= 0.
+    ConstantOffset =
+        find(U->getOperand(0), /* SignExtended */ false,
+             /* ZeroExtended */ true, /* NonNegative */ false).zext(BitWidth);
   }
-  // If we found a non-zero constant offset, adds it to the path for future
-  // transformation (rebuildWithoutConstantOffset). Zero is a valid constant
-  // offset, but doesn't help this optimization.
+
+  // If we found a non-zero constant offset, add it to the path for
+  // rebuildWithoutConstOffset. Zero is a valid constant offset, but doesn't
+  // help this optimization.
   if (ConstantOffset != 0) UserChain.push_back(U);
   return ConstantOffset;
 }

-unsigned ConstantOffsetExtractor::FindFirstUse(User *U, Value *Used) {
-  for (unsigned I = 0, E = U->getNumOperands(); I < E; ++I) {
-    if (U->getOperand(I) == Used)
-      return I;
+Value *ConstantOffsetExtractor::applyExts(Value *V) {
+  Value *Current = V;
+  // ExtInsts is built in the use-def order. Therefore, we apply them to V
+  // in the reversed order.
+  for (auto I = ExtInsts.rbegin(), E = ExtInsts.rend(); I != E; ++I) {
+    if (Constant *C = dyn_cast<Constant>(Current)) {
+      // If Current is a constant, apply s/zext using ConstantExpr::getCast.
+      // ConstantExpr::getCast emits a ConstantInt if C is a ConstantInt.
+      Current = ConstantExpr::getCast((*I)->getOpcode(), C, (*I)->getType());
+    } else {
+      Instruction *Ext = (*I)->clone();
+      Ext->setOperand(0, Current);
+      Ext->insertBefore(IP);
+      Current = Ext;
+    }
   }
-  return -1;
+  return Current;
 }

-Value *ConstantOffsetExtractor::cloneAndReplace(User *U, Value *From,
-                                                Value *To) {
-  // Finds in U the first use of From. It is safe to ignore future occurrences
-  // of From, because findInEitherOperand similarly stops searching the right
-  // operand when the first operand has a non-zero constant offset.
-  unsigned OpNo = FindFirstUse(U, From);
-  assert(OpNo != (unsigned)-1 && "UserChain wasn't built correctly");
-
-  // ConstantOffsetExtractor::find only follows Operators (i.e., Instructions
-  // and ConstantExprs). Therefore, U is either an Instruction or a
-  // ConstantExpr.
-  if (Instruction *I = dyn_cast<Instruction>(U)) {
-    Instruction *Clone = I->clone();
-    Clone->setOperand(OpNo, To);
-    Clone->insertBefore(IP);
-    return Clone;
+Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() {
+  distributeExtsAndCloneChain(UserChain.size() - 1);
+  // Remove all nullptrs (used to be s/zext) from UserChain.
+  unsigned NewSize = 0;
+  for (auto I = UserChain.begin(), E = UserChain.end(); I != E; ++I) {
+    if (*I != nullptr) {
+      UserChain[NewSize] = *I;
+      NewSize++;
+    }
   }
-  // cast<Constant>(To) is safe because a ConstantExpr only uses Constants.
-  return cast<ConstantExpr>(U)
-      ->getWithOperandReplaced(OpNo, cast<Constant>(To));
+  UserChain.resize(NewSize);
+  return removeConstOffset(UserChain.size() - 1);
 }

-Value *ConstantOffsetExtractor::rebuildLeafWithoutConstantOffset(User *U,
-                                                                 Value *C) {
-  assert(U->getNumOperands() <= 2 &&
-         "We didn't trace into any operator with more than 2 operands");
-  // If U has only one operand which is the constant offset, removing the
-  // constant offset leaves U as a null value.
-  if (U->getNumOperands() == 1)
-    return Constant::getNullValue(U->getType());
-
-  // U->getNumOperands() == 2
-  unsigned OpNo = FindFirstUse(U, C); // U->getOperand(OpNo) == C
-  assert(OpNo < 2 && "UserChain wasn't built correctly");
-  Value *TheOther = U->getOperand(1 - OpNo); // The other operand of U
-  // If U = C - X, removing C makes U = -X; otherwise U will simply be X.
-  if (!isa<SubOperator>(U) || OpNo == 1)
-    return TheOther;
-  if (isa<ConstantExpr>(U))
-    return ConstantExpr::getNeg(cast<Constant>(TheOther));
-  return BinaryOperator::CreateNeg(TheOther, "", IP);
+Value *
+ConstantOffsetExtractor::distributeExtsAndCloneChain(unsigned ChainIndex) {
+  User *U = UserChain[ChainIndex];
+  if (ChainIndex == 0) {
+    assert(isa<ConstantInt>(U));
+    // If U is a ConstantInt, applyExts will return a ConstantInt as well.
+    return UserChain[ChainIndex] = cast<ConstantInt>(applyExts(U));
+  }
+
+  if (CastInst *Cast = dyn_cast<CastInst>(U)) {
+    assert((isa<SExtInst>(Cast) || isa<ZExtInst>(Cast)) &&
+           "We only traced into two types of CastInst: sext and zext");
+    ExtInsts.push_back(Cast);
+    UserChain[ChainIndex] = nullptr;
+    return distributeExtsAndCloneChain(ChainIndex - 1);
+  }
+
+  // Function find only traces into BinaryOperator and CastInst.
+  BinaryOperator *BO = cast<BinaryOperator>(U);
+  // OpNo = which operand of BO is UserChain[ChainIndex - 1]
+  unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
+  Value *TheOther = applyExts(BO->getOperand(1 - OpNo));
+  Value *NextInChain = distributeExtsAndCloneChain(ChainIndex - 1);
+
+  BinaryOperator *NewBO = nullptr;
+  if (OpNo == 0) {
+    NewBO = BinaryOperator::Create(BO->getOpcode(), NextInChain, TheOther,
+                                   BO->getName(), IP);
+  } else {
+    NewBO = BinaryOperator::Create(BO->getOpcode(), TheOther, NextInChain,
                                   BO->getName(), IP);
+  }
+  return UserChain[ChainIndex] = NewBO;
 }

-Value *ConstantOffsetExtractor::rebuildWithoutConstantOffset() {
-  assert(UserChain.size() > 0 && "you at least found a constant, right?");
-  // Start with the constant and go up through UserChain, each time building a
-  // clone of the subexpression but with the constant removed.
-  // e.g., to build a clone of (a + (b + (c + 5)) but with the 5 removed, we
-  // first c, then (b + c), and finally (a + (b + c)).
-  //
-  // Fast path: if the GEP index is a constant, simply returns 0.
-  if (UserChain.size() == 1)
-    return ConstantInt::get(UserChain[0]->getType(), 0);
-
-  Value *Remainder =
-      rebuildLeafWithoutConstantOffset(UserChain[1], UserChain[0]);
-  for (size_t I = 2; I < UserChain.size(); ++I)
-    Remainder = cloneAndReplace(UserChain[I], UserChain[I - 1], Remainder);
-  return Remainder;
+Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
+  if (ChainIndex == 0) {
+    assert(isa<ConstantInt>(UserChain[ChainIndex]));
+    return ConstantInt::getNullValue(UserChain[ChainIndex]->getType());
+  }
+
+  BinaryOperator *BO = cast<BinaryOperator>(UserChain[ChainIndex]);
+  unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
+  assert(BO->getOperand(OpNo) == UserChain[ChainIndex - 1]);
+  Value *NextInChain = removeConstOffset(ChainIndex - 1);
+  Value *TheOther = BO->getOperand(1 - OpNo);
+
+  // If NextInChain is 0 and not the LHS of a sub, we can simplify the
+  // sub-expression to be just TheOther.
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(NextInChain)) {
+    if (CI->isZero() && !(BO->getOpcode() == Instruction::Sub && OpNo == 0))
+      return TheOther;
+  }
+
+  if (BO->getOpcode() == Instruction::Or) {
+    // Rebuild "or" as "add", because "or" may be invalid for the new
+    // expression.
+    //
+    // For instance, given
+    //   a | (b + 5) where a and b + 5 have no common bits,
+    // we can extract 5 as the constant offset.
+    //
+    // However, reusing the "or" in the new index would give us
+    //   (a | b) + 5
+    // which does not equal a | (b + 5).
+    //
+    // Replacing the "or" with "add" is fine, because
+    //   a | (b + 5) = a + (b + 5) = (a + b) + 5
+    return BinaryOperator::CreateAdd(BO->getOperand(0), BO->getOperand(1),
+                                     BO->getName(), IP);
+  }
+
+  // We can reuse BO in this case, because the new expression shares the same
+  // instruction type and BO is used at most once.
+  assert(BO->getNumUses() <= 1 &&
+         "distributeExtsAndCloneChain clones each BinaryOperator in "
+         "UserChain, so no one should be used more than "
+         "once");
+  BO->setOperand(OpNo, NextInChain);
+  BO->setHasNoSignedWrap(false);
+  BO->setHasNoUnsignedWrap(false);
+  // Make sure it appears after all instructions we've inserted so far.
+  BO->moveBefore(IP);
+  return BO;
 }

 int64_t ConstantOffsetExtractor::Extract(Value *Idx, Value *&NewIdx,
                                          const DataLayout *DL,
-                                         Instruction *IP) {
-  ConstantOffsetExtractor Extractor(DL, IP);
+                                         GetElementPtrInst *GEP) {
+  ConstantOffsetExtractor Extractor(DL, GEP);
   // Find a non-zero constant offset first.
-  int64_t ConstantOffset = Extractor.find(Idx);
-  if (ConstantOffset == 0)
-    return 0;
-  // Then rebuild a new index with the constant removed.
-  NewIdx = Extractor.rebuildWithoutConstantOffset();
-  return ConstantOffset;
+  APInt ConstantOffset =
+      Extractor.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
                     GEP->isInBounds());
+  if (ConstantOffset != 0) {
+    // Separates the constant offset from the GEP index.
+    NewIdx = Extractor.rebuildWithoutConstOffset();
+  }
+  return ConstantOffset.getSExtValue();
 }

-int64_t ConstantOffsetExtractor::Find(Value *Idx, const DataLayout *DL) {
-  return ConstantOffsetExtractor(DL, nullptr).find(Idx);
+int64_t ConstantOffsetExtractor::Find(Value *Idx, const DataLayout *DL,
+                                      GetElementPtrInst *GEP) {
+  // If Idx is an index of an inbound GEP, Idx is guaranteed to be non-negative.
+  return ConstantOffsetExtractor(DL, GEP)
+      .find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
            GEP->isInBounds())
+      .getSExtValue();
 }

 void ConstantOffsetExtractor::ComputeKnownBits(Value *V, APInt &KnownOne,
@@ -421,8 +595,64 @@ bool ConstantOffsetExtractor::NoCommonBits(Value *LHS, Value *RHS) const {
   return (LHSKnownZero | RHSKnownZero).isAllOnesValue();
 }

-int64_t SeparateConstOffsetFromGEP::accumulateByteOffset(
-    GetElementPtrInst *GEP, const DataLayout *DL, bool &NeedsExtraction) {
+bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToPointerSize(
    GetElementPtrInst *GEP) {
+  bool Changed = false;
+  Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
+  gep_type_iterator GTI = gep_type_begin(*GEP);
+  for (User::op_iterator I = GEP->op_begin() + 1, E = GEP->op_end();
       I != E; ++I, ++GTI) {
+    // Skip struct member indices which must be i32.
+    if (isa<SequentialType>(*GTI)) {
+      if ((*I)->getType() != IntPtrTy) {
+        *I = CastInst::CreateIntegerCast(*I, IntPtrTy, true, "idxprom", GEP);
+        Changed = true;
+      }
+    }
+  }
+  return Changed;
+}
+
+bool
+SeparateConstOffsetFromGEP::convertInBoundsZExtToSExt(GetElementPtrInst *GEP) {
+  if (!GEP->isInBounds())
+    return false;
+
+  // TODO: consider alloca
+  GlobalVariable *UnderlyingObject =
+      dyn_cast<GlobalVariable>(GEP->getPointerOperand());
+  if (UnderlyingObject == nullptr)
+    return false;
+
+  uint64_t ObjectSize =
+      DL->getTypeAllocSize(UnderlyingObject->getType()->getElementType());
+  gep_type_iterator GTI = gep_type_begin(*GEP);
+  bool Changed = false;
+  for (User::op_iterator I = GEP->op_begin() + 1, E = GEP->op_end(); I != E;
       ++I, ++GTI) {
+    if (isa<SequentialType>(*GTI)) {
+      if (ZExtInst *Extended = dyn_cast<ZExtInst>(*I)) {
+        unsigned SrcBitWidth =
+            cast<IntegerType>(Extended->getSrcTy())->getBitWidth();
+        // For GEP operand zext(a), if a <= max signed value of typeof(a), then
+        // the sign bit of a is zero and sext(a) = zext(a). Because the GEP is
+        // in bounds, we know a <= ObjectSize, so the condition can be reduced
+        // to ObjectSize <= max signed value of typeof(a).
+        if (ObjectSize <=
+            APInt::getSignedMaxValue(SrcBitWidth).getZExtValue()) {
+          *I = new SExtInst(Extended->getOperand(0), Extended->getType(),
+                            Extended->getName(), GEP);
+          Changed = true;
+        }
+      }
+    }
+  }
+  return Changed;
+}
+
+int64_t
+SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,
                                                 bool &NeedsExtraction) {
   NeedsExtraction = false;
   int64_t AccumulativeByteOffset = 0;
   gep_type_iterator GTI = gep_type_begin(*GEP);
@@ -430,7 +660,7 @@ int64_t SeparateConstOffsetFromGEP::accumulateByteOffset(
     if (isa<SequentialType>(*GTI)) {
       // Tries to extract a constant offset from this GEP index.
       int64_t ConstantOffset =
-          ConstantOffsetExtractor::Find(GEP->getOperand(I), DL);
+          ConstantOffsetExtractor::Find(GEP->getOperand(I), DL, GEP);
       if (ConstantOffset != 0) {
         NeedsExtraction = true;
         // A GEP may have multiple indices. We accumulate the extracted
@@ -455,31 +685,11 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
     return false;

   bool Changed = false;
+  Changed |= canonicalizeArrayIndicesToPointerSize(GEP);
+  Changed |= convertInBoundsZExtToSExt(GEP);

-  // Shortcuts integer casts. Eliminating these explicit casts can make
-  // subsequent optimizations more obvious: ConstantOffsetExtractor needn't
-  // trace into these casts.
-  if (GEP->isInBounds()) {
-    // Doing this to inbounds GEPs is safe because their indices are guaranteed
-    // to be non-negative and in bounds.
-    gep_type_iterator GTI = gep_type_begin(*GEP);
-    for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
-      if (isa<SequentialType>(*GTI)) {
-        if (Operator *O = dyn_cast<Operator>(GEP->getOperand(I))) {
-          if (O->getOpcode() == Instruction::SExt ||
-              O->getOpcode() == Instruction::ZExt) {
-            GEP->setOperand(I, O->getOperand(0));
-            Changed = true;
-          }
-        }
-      }
-    }
-  }
-
-  const DataLayout *DL = &getAnalysis<DataLayoutPass>().getDataLayout();
   bool NeedsExtraction;
-  int64_t AccumulativeByteOffset =
-      accumulateByteOffset(GEP, DL, NeedsExtraction);
+  int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction);

   if (!NeedsExtraction)
     return Changed;
@@ -506,30 +716,29 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
       assert(NewIdx != nullptr && "ConstantOffset != 0 implies NewIdx is set");
       GEP->setOperand(I, NewIdx);
-      // Clear the inbounds attribute because the new index may be off-bound.
-      // e.g.,
-      //
-      // b = add i64 a, 5
-      // addr = gep inbounds float* p, i64 b
-      //
-      // is transformed to:
-      //
-      // addr2 = gep float* p, i64 a
-      // addr = gep float* addr2, i64 5
-      //
-      // If a is -4, although the old index b is in bounds, the new index a is
-      // off-bound. http://llvm.org/docs/LangRef.html#id181 says "if the
-      // inbounds keyword is not present, the offsets are added to the base
-      // address with silently-wrapping two's complement arithmetic".
-      // Therefore, the final code will be a semantically equivalent.
-      //
-      // TODO(jingyue): do some range analysis to keep as many inbounds as
-      // possible. GEPs with inbounds are more friendly to alias analysis.
-      GEP->setIsInBounds(false);
-      Changed = true;
     }
   }
 }
+  // Clear the inbounds attribute because the new index may be off-bound.
+  // e.g.,
+  //
+  // b = add i64 a, 5
+  // addr = gep inbounds float* p, i64 b
+  //
+  // is transformed to:
+  //
+  // addr2 = gep float* p, i64 a
+  // addr = gep float* addr2, i64 5
+  //
+  // If a is -4, although the old index b is in bounds, the new index a is
+  // off-bound. http://llvm.org/docs/LangRef.html#id181 says "if the
+  // inbounds keyword is not present, the offsets are added to the base
+  // address with silently-wrapping two's complement arithmetic".
+  // Therefore, the final code will be semantically equivalent.
+  //
+  // TODO(jingyue): do some range analysis to keep as many inbounds as
+  // possible. GEPs with inbounds are more friendly to alias analysis.
+  GEP->setIsInBounds(false);

   // Offsets the base with the accumulative byte offset.
   //
@@ -562,9 +771,9 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
   Instruction *NewGEP = GEP->clone();
   NewGEP->insertBefore(GEP);

-  Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
   uint64_t ElementTypeSizeOfGEP =
       DL->getTypeAllocSize(GEP->getType()->getElementType());
+  Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
   if (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) {
     // Very likely. As long as %gep is natually aligned, the byte offset we
     // extracted should be a multiple of sizeof(*%gep).
diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp
index 482c33a..7348c45 100644
--- a/lib/Transforms/Scalar/Sink.cpp
+++ b/lib/Transforms/Scalar/Sink.cpp
@@ -18,6 +18,7 @@
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/CFG.h"
+#include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/Support/Debug.h"
@@ -34,6 +35,7 @@ namespace {
     DominatorTree *DT;
     LoopInfo *LI;
     AliasAnalysis *AA;
+    const DataLayout *DL;

   public:
     static char ID; // Pass identification
@@ -98,6 +100,8 @@ bool Sinking::runOnFunction(Function &F) {
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   LI = &getAnalysis<LoopInfo>();
   AA = &getAnalysis<AliasAnalysis>();
+  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+  DL = DLP ? &DLP->getDataLayout() : nullptr;

   bool MadeChange, EverMadeChange = false;

@@ -193,7 +197,7 @@ bool Sinking::IsAcceptableTarget(Instruction *Inst,
   if (SuccToSinkTo->getUniquePredecessor() != Inst->getParent()) {
     // We cannot sink a load across a critical edge - there may be stores in
     // other code paths.
-    if (!isSafeToSpeculativelyExecute(Inst))
+    if (!isSafeToSpeculativelyExecute(Inst, DL))
      return false;

    // We don't want to sink across a critical edge if we don't dominate the
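Editor's note (not part of the patch): the CanTraceInto comment in the SeparateConstOffsetFromGEP diff above states that sext(a + b) == sext(a) + sext(b) not only when the add is nsw, but also whenever the (possibly wrapped) sum is non-negative and one operand is non-negative, which is the situation for an sext'ed index of an inbounds GEP. A minimal standalone C++ check of that second, less obvious claim, using 8-bit values as a stand-in for an arbitrary narrow index type; the cast back to int8_t assumes the usual two's-complement conversion.

// Exhaustive check over i8: if (a +_8 b) >= 0 and (a >= 0 or b >= 0),
// then sign-extending the wrapped 8-bit sum equals the exact sum of the
// sign-extended operands, even without nsw.
#include <cassert>
#include <cstdint>

int main() {
  for (int a = -128; a <= 127; ++a) {
    for (int b = -128; b <= 127; ++b) {
      // a +_8 b: wrapping 8-bit addition (two's-complement conversion assumed).
      int8_t Wrapped =
          static_cast<int8_t>(static_cast<uint8_t>(a) + static_cast<uint8_t>(b));
      if (Wrapped >= 0 && (a >= 0 || b >= 0))
        assert(static_cast<int>(Wrapped) == a + b); // sign extension is exact
    }
  }
  return 0;
}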
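Editor's note (not part of the patch): convertInBoundsZExtToSExt leans on two facts quoted in its doc comment: zext does not distribute over a wrapping add (the UINT_MAX + 1 example), while zext(a) and sext(a) agree whenever a is no larger than the signed maximum of its type. A small C++ sketch that checks both, with uint32_t/int64_t standing in for i32/i64:

// Facts used by convertInBoundsZExtToSExt, checked with host integers.
#include <cassert>
#include <cstdint>
#include <limits>

int main() {
  // zext does not distribute over a wrapping add:
  // zext64(UINT32_MAX +_32 1) == 0, but zext64(UINT32_MAX) + zext64(1) == 2^32.
  uint32_t X = std::numeric_limits<uint32_t>::max(), Y = 1;
  assert(static_cast<uint64_t>(X + Y) !=
         static_cast<uint64_t>(X) + static_cast<uint64_t>(Y));

  // When a value fits in the non-negative range of its signed type,
  // zero-extension and sign-extension agree: zext64(a) == sext64(a).
  for (uint32_t A = 0; A <= static_cast<uint32_t>(INT32_MAX); A += 65537) {
    assert(static_cast<uint64_t>(A) ==                      // zext
           static_cast<uint64_t>(static_cast<int32_t>(A))); // sext
  }
  return 0;
}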
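Editor's note (not part of the patch): removeConstOffset rebuilds a traced "or" as an "add". The comment's argument is that a | (b + 5) == a + (b + 5) when a and b + 5 share no bits, yet (a | b) + 5 can still differ, so the original "or" must not be reused after the constant is pulled out. A quick numeric check with hypothetical values a = 1, b = 1, chosen so that a and b share a bit while a and b + 5 do not:

// Why the traced "or" is rebuilt as an "add" once the constant is extracted.
#include <cassert>
#include <cstdint>

int main() {
  uint64_t A = 1, B = 1;                  // A and (B + 5) share no bits; A and B do.
  assert((A & (B + 5)) == 0);
  assert((A | (B + 5)) == A + (B + 5));   // or == add under the no-common-bits premise
  assert(((A | B) + 5) != (A | (B + 5))); // but reusing the "or" after extraction is wrong
  return 0;
}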
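Editor's note (not part of the patch): canonicalizeArrayIndicesToPointerSize exists because gep base, a + b == gep (gep base, a), b only holds once a + b is computed at pointer width; a narrower index is conceptually sign-extended before the address computation, so splitting it first can change the address when the narrow add wraps. A hedged sketch of that failure mode in plain C++, with int32_t indices, int64_t addresses, and an arbitrary example Base value; the int32_t conversion assumes two's complement.

// Splitting a 32-bit index before widening it can change the address:
// Base + sext64(a +_32 b) vs. (Base + sext64(a)) + sext64(b).
#include <cassert>
#include <cstdint>

int main() {
  int32_t A = INT32_MAX, B = 1;
  // a +_32 b computed with wrapping (unsigned) arithmetic; the conversion
  // back to int32_t assumes two's complement and yields INT32_MIN here.
  int32_t WrappedSum = static_cast<int32_t>(static_cast<uint32_t>(A) +
                                            static_cast<uint32_t>(B));
  int64_t Base = 0x100000000; // arbitrary example base address

  int64_t Unsplit = Base + static_cast<int64_t>(WrappedSum);
  int64_t Split = (Base + static_cast<int64_t>(A)) + static_cast<int64_t>(B);
  assert(Unsplit != Split); // they differ by 2^32
  return 0;
}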