author    | Stephen Hines <srhines@google.com> | 2013-08-07 15:07:10 -0700
committer | Stephen Hines <srhines@google.com> | 2013-08-07 15:07:10 -0700
commit    | fab2daa4a1127ecb217abe2b07c1769122b6fee1 (patch)
tree      | 268ebfd1963fd98ba412e76819afdf95a7d4267b /lib/Transforms/Vectorize
parent    | 8197ac1c1a0a91baa70c4dea8cb488f254ef974c (diff)
parent    | 10251753b6897adcd22cc981c0cc42f348c109de (diff)
Merge commit '10251753b6897adcd22cc981c0cc42f348c109de' into merge-20130807
Conflicts:
lib/Archive/ArchiveReader.cpp
lib/Support/Unix/PathV2.inc
Change-Id: I29d8c1e321a4a380b6013f00bac6a8e4b593cc4e
Diffstat (limited to 'lib/Transforms/Vectorize')
-rw-r--r-- | lib/Transforms/Vectorize/BBVectorize.cpp   |   12
-rw-r--r-- | lib/Transforms/Vectorize/CMakeLists.txt    |    1
-rw-r--r-- | lib/Transforms/Vectorize/LoopVectorize.cpp | 1296
-rw-r--r-- | lib/Transforms/Vectorize/SLPVectorizer.cpp | 1803
-rw-r--r-- | lib/Transforms/Vectorize/VecUtils.cpp      |  852
-rw-r--r-- | lib/Transforms/Vectorize/VecUtils.h        |  184
6 files changed, 2669 insertions, 1479 deletions
diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp index 17900da..cbc1d63 100644 --- a/lib/Transforms/Vectorize/BBVectorize.cpp +++ b/lib/Transforms/Vectorize/BBVectorize.cpp @@ -356,7 +356,7 @@ namespace { Instruction *J, unsigned o, bool IBeforeJ); void getReplacementInputsForPair(LLVMContext& Context, Instruction *I, - Instruction *J, SmallVector<Value *, 3> &ReplacedOperands, + Instruction *J, SmallVectorImpl<Value *> &ReplacedOperands, bool IBeforeJ); void replaceOutputsOfPair(LLVMContext& Context, Instruction *I, @@ -1602,7 +1602,7 @@ namespace { DenseSet<ValuePair> CurrentPairs; bool CanAdd = true; - for (SmallVector<ValuePairWithDepth, 8>::iterator C2 + for (SmallVectorImpl<ValuePairWithDepth>::iterator C2 = BestChildren.begin(), E2 = BestChildren.end(); C2 != E2; ++C2) { if (C2->first.first == C->first.first || @@ -1642,7 +1642,7 @@ namespace { if (!CanAdd) continue; // And check the queue too... - for (SmallVector<ValuePairWithDepth, 32>::iterator C2 = Q.begin(), + for (SmallVectorImpl<ValuePairWithDepth>::iterator C2 = Q.begin(), E2 = Q.end(); C2 != E2; ++C2) { if (C2->first.first == C->first.first || C2->first.first == C->first.second || @@ -1691,7 +1691,7 @@ namespace { // to an already-selected child. Check for this here, and if a // conflict is found, then remove the previously-selected child // before adding this one in its place. - for (SmallVector<ValuePairWithDepth, 8>::iterator C2 + for (SmallVectorImpl<ValuePairWithDepth>::iterator C2 = BestChildren.begin(); C2 != BestChildren.end();) { if (C2->first.first == C->first.first || C2->first.first == C->first.second || @@ -1706,7 +1706,7 @@ namespace { BestChildren.push_back(ValuePairWithDepth(C->first, C->second)); } - for (SmallVector<ValuePairWithDepth, 8>::iterator C + for (SmallVectorImpl<ValuePairWithDepth>::iterator C = BestChildren.begin(), E2 = BestChildren.end(); C != E2; ++C) { size_t DepthF = getDepthFactor(C->first.first); @@ -2687,7 +2687,7 @@ namespace { // to the vector instruction that fuses I with J. void BBVectorize::getReplacementInputsForPair(LLVMContext& Context, Instruction *I, Instruction *J, - SmallVector<Value *, 3> &ReplacedOperands, + SmallVectorImpl<Value *> &ReplacedOperands, bool IBeforeJ) { unsigned NumOperands = I->getNumOperands(); diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt index 7ae082f..07967d8 100644 --- a/lib/Transforms/Vectorize/CMakeLists.txt +++ b/lib/Transforms/Vectorize/CMakeLists.txt @@ -3,7 +3,6 @@ add_llvm_library(LLVMVectorize Vectorize.cpp LoopVectorize.cpp SLPVectorizer.cpp - VecUtils.cpp ) add_dependencies(LLVMVectorize intrinsics_gen) diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 3693f4a..a62fedc 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -47,13 +47,14 @@ #include "llvm/Transforms/Vectorize.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/EquivalenceClasses.h" #include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/AliasSetTracker.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" @@ -174,6 +175,11 @@ private: /// originated from one scalar instruction. 
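The BBVectorize hunks above change out-parameters and iterators from SmallVector<Value *, 3> to SmallVectorImpl<Value *>. A minimal stand-alone sketch of that idiom (not part of this commit; names are illustrative): SmallVectorImpl erases the inline element count from the parameter type, so a single signature accepts a SmallVector of any small size without forcing a copy.

#include "llvm/ADT/SmallVector.h"
using namespace llvm;

// The inline size N is part of SmallVector<T, N>'s type but not of
// SmallVectorImpl<T>, so this function works for any caller-chosen N.
static void collectSquares(unsigned Count, SmallVectorImpl<int> &Out) {
  for (unsigned i = 0; i != Count; ++i)
    Out.push_back(int(i * i));
}

void caller() {
  SmallVector<int, 3> A;   // small inline buffer
  SmallVector<int, 16> B;  // larger inline buffer, same function accepts both
  collectSquares(5, A);
  collectSquares(5, B);
}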
typedef SmallVector<Value*, 2> VectorParts; + // When we if-convert we need create edge masks. We have to cache values so + // that we don't end up with exponential recursion/IR. + typedef DenseMap<std::pair<BasicBlock*, BasicBlock*>, + VectorParts> EdgeMaskCache; + /// Add code that checks at runtime if the accessed arrays overlap. /// Returns the comparator value or NULL if no check is needed. Instruction *addRuntimeCheck(LoopVectorizationLegality *Legal, @@ -317,79 +323,35 @@ private: Value *ExtendedIdx; /// Maps scalars to widened vectors. ValueMap WidenMap; + EdgeMaskCache MaskCache; }; -/// \brief Check if conditionally executed loads are hoistable. -/// -/// This class has two functions: isHoistableLoad and canHoistAllLoads. -/// isHoistableLoad should be called on all load instructions that are executed -/// conditionally. After all conditional loads are processed, the client should -/// call canHoistAllLoads to determine if all of the conditional executed loads -/// have an unconditional memory access to the same memory address in the loop. -class LoadHoisting { - typedef SmallPtrSet<Value *, 8> MemorySet; - - Loop *TheLoop; - DominatorTree *DT; - MemorySet CondLoadAddrSet; - -public: - LoadHoisting(Loop *L, DominatorTree *D) : TheLoop(L), DT(D) {} - - /// \brief Check if the instruction is a load with a identifiable address. - bool isHoistableLoad(Instruction *L); - - /// \brief Check if all of the conditional loads are hoistable because there - /// exists an unconditional memory access to the same address in the loop. - bool canHoistAllLoads(); -}; - -bool LoadHoisting::isHoistableLoad(Instruction *L) { - LoadInst *LI = dyn_cast<LoadInst>(L); - if (!LI) - return false; - - CondLoadAddrSet.insert(LI->getPointerOperand()); - return true; -} - -static void addMemAccesses(BasicBlock *BB, SmallPtrSet<Value *, 8> &Set) { - for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE; ++BI) { - if (LoadInst *LI = dyn_cast<LoadInst>(BI)) // Try a load. - Set.insert(LI->getPointerOperand()); - else if (StoreInst *SI = dyn_cast<StoreInst>(BI)) // Try a store. - Set.insert(SI->getPointerOperand()); - } -} - -bool LoadHoisting::canHoistAllLoads() { - // No conditional loads. - if (CondLoadAddrSet.empty()) - return true; - - MemorySet UncondMemAccesses; - std::vector<BasicBlock*> &LoopBlocks = TheLoop->getBlocksVector(); - BasicBlock *LoopLatch = TheLoop->getLoopLatch(); - - // Iterate over the unconditional blocks and collect memory access addresses. - for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) { - BasicBlock *BB = LoopBlocks[i]; +/// \brief Look for a meaningful debug location on the instruction or it's +/// operands. +static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { + if (!I) + return I; - // Ignore conditional blocks. - if (BB != LoopLatch && !DT->dominates(BB, LoopLatch)) - continue; + DebugLoc Empty; + if (I->getDebugLoc() != Empty) + return I; - addMemAccesses(BB, UncondMemAccesses); + for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) { + if (Instruction *OpInst = dyn_cast<Instruction>(*OI)) + if (OpInst->getDebugLoc() != Empty) + return OpInst; } - // And make sure there is a matching unconditional access for every - // conditional load. - for (MemorySet::iterator MI = CondLoadAddrSet.begin(), - ME = CondLoadAddrSet.end(); MI != ME; ++MI) - if (!UncondMemAccesses.count(*MI)) - return false; + return I; +} - return true; +/// \brief Set the debug location in the builder using the debug location in the +/// instruction. 
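The EdgeMaskCache typedef and the MaskCache member added above (together with the lookup added to createEdgeMask later in the diff) memoize the mask computed for each (Src, Dst) edge, so if-conversion does not recompute, and re-emit, the same mask IR for every path reaching an edge. A simplified sketch of that memoization pattern, using illustrative stand-in types rather than the commit's BasicBlock*/VectorParts/DenseMap:

#include <map>
#include <utility>
#include <vector>

typedef int Block;              // stand-in for BasicBlock *
typedef std::vector<bool> Mask; // stand-in for VectorParts

struct EdgeMaskCacheSketch {
  std::map<std::pair<Block, Block>, Mask> Cache;

  Mask getEdgeMask(Block Src, Block Dst) {
    std::pair<Block, Block> Edge(Src, Dst);
    std::map<std::pair<Block, Block>, Mask>::iterator It = Cache.find(Edge);
    if (It != Cache.end())
      return It->second;            // hit: reuse, no repeated recursion
    Mask M = computeMask(Src, Dst); // miss: compute once and remember it
    Cache[Edge] = M;
    return M;
  }

  Mask computeMask(Block, Block) { return Mask(4, true); } // hypothetical
};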
+static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) { + if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) + B.SetCurrentDebugLocation(Inst->getDebugLoc()); + else + B.SetCurrentDebugLocation(DebugLoc()); } /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and @@ -408,11 +370,10 @@ bool LoadHoisting::canHoistAllLoads() { class LoopVectorizationLegality { public: LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL, - DominatorTree *DT, TargetTransformInfo* TTI, - AliasAnalysis *AA, TargetLibraryInfo *TLI) - : TheLoop(L), SE(SE), DL(DL), DT(DT), TTI(TTI), AA(AA), TLI(TLI), + DominatorTree *DT, TargetLibraryInfo *TLI) + : TheLoop(L), SE(SE), DL(DL), DT(DT), TLI(TLI), Induction(0), WidestIndTy(0), HasFunNoNaNAttr(false), - LoadSpeculation(L, DT) {} + MaxSafeDepDistBytes(-1U) {} /// This enum represents the kinds of reductions that we support. enum ReductionKind { @@ -499,7 +460,8 @@ public: } /// Insert a pointer and calculate the start and end SCEVs. - void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr); + void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr, + unsigned DepSetId); /// This flag indicates if we need to add the runtime check. bool Need; @@ -511,6 +473,9 @@ public: SmallVector<const SCEV*, 2> Ends; /// Holds the information if this pointer is used for writing to memory. SmallVector<bool, 2> IsWritePtr; + /// Holds the id of the set of pointers that could be dependent because of a + /// shared underlying object. + SmallVector<unsigned, 2> DependencySetId; }; /// A POD for saving information about induction variables. @@ -531,11 +496,6 @@ public: /// induction descriptor. typedef MapVector<PHINode*, InductionInfo> InductionList; - /// Alias(Multi)Map stores the values (GEPs or underlying objects and their - /// respective Store/Load instruction(s) to calculate aliasing. - typedef MapVector<Value*, Instruction* > AliasMap; - typedef DenseMap<Value*, std::vector<Instruction*> > AliasMultiMap; - /// Returns true if it is legal to vectorize this loop. /// This does not mean that it is profitable to vectorize this /// loop, only that it is legal to do so. @@ -582,6 +542,9 @@ public: /// This function returns the identity element (or neutral element) for /// the operation K. static Constant *getReductionIdentity(ReductionKind K, Type *Tp); + + unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; } + private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count @@ -602,8 +565,9 @@ private: void collectLoopUniforms(); /// Return true if all of the instructions in the block can be speculatively - /// executed. - bool blockCanBePredicated(BasicBlock *BB); + /// executed. \p SafePtrs is a list of addresses that are known to be legal + /// and we know that we can read from them without segfault. + bool blockCanBePredicated(BasicBlock *BB, SmallPtrSet<Value *, 8>& SafePtrs); /// Returns True, if 'Phi' is the kind of reduction variable for type /// 'Kind'. If this is a reduction variable, it adds it to ReductionList. @@ -622,16 +586,6 @@ private: /// Returns the induction kind of Phi. This function may return NoInduction /// if the PHI is not an induction variable. InductionKind isInductionVariable(PHINode *Phi); - /// Return true if can compute the address bounds of Ptr within the loop. - bool hasComputableBounds(Value *Ptr); - /// Return true if there is the chance of write reorder. 
- bool hasPossibleGlobalWriteReorder(Value *Object, - Instruction *Inst, - AliasMultiMap &WriteObjects, - unsigned MaxByteWidth); - /// Return the AA location for a load or a store. - AliasAnalysis::Location getLoadStoreLocation(Instruction *Inst); - /// The loop that we evaluate. Loop *TheLoop; @@ -641,10 +595,6 @@ private: DataLayout *DL; /// Dominators. DominatorTree *DT; - /// Target Info. - TargetTransformInfo *TTI; - /// Alias Analysis. - AliasAnalysis *AA; /// Target Library Info. TargetLibraryInfo *TLI; @@ -674,8 +624,7 @@ private: /// Can we assume the absence of NaNs. bool HasFunNoNaNAttr; - /// Utility to determine whether loads can be speculated. - LoadHoisting LoadSpeculation; + unsigned MaxSafeDepDistBytes; }; /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -902,7 +851,6 @@ struct LoopVectorize : public LoopPass { LoopInfo *LI; TargetTransformInfo *TTI; DominatorTree *DT; - AliasAnalysis *AA; TargetLibraryInfo *TLI; virtual bool runOnLoop(Loop *L, LPPassManager &LPM) { @@ -915,7 +863,6 @@ struct LoopVectorize : public LoopPass { LI = &getAnalysis<LoopInfo>(); TTI = &getAnalysis<TargetTransformInfo>(); DT = &getAnalysis<DominatorTree>(); - AA = getAnalysisIfAvailable<AliasAnalysis>(); TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); if (DL == NULL) { @@ -934,7 +881,7 @@ struct LoopVectorize : public LoopPass { } // Check if it is legal to vectorize the loop. - LoopVectorizationLegality LVL(L, SE, DL, DT, TTI, AA, TLI); + LoopVectorizationLegality LVL(L, SE, DL, DT, TLI); if (!LVL.canVectorize()) { DEBUG(dbgs() << "LV: Not vectorizing.\n"); return false; @@ -1009,7 +956,8 @@ struct LoopVectorize : public LoopPass { void LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, - bool WritePtr) { + bool WritePtr, + unsigned DepSetId) { const SCEV *Sc = SE->getSCEV(Ptr); const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc); assert(AR && "Invalid addrec expression"); @@ -1019,6 +967,7 @@ LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE, Starts.push_back(AR->getStart()); Ends.push_back(ScEnd); IsWritePtr.push_back(WritePtr); + DependencySetId.push_back(DepSetId); } Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { @@ -1178,7 +1127,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, Type *DataTy = VectorType::get(ScalarDataTy, VF); Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment(); - + unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy); unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF; @@ -1199,6 +1148,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, // Handle consecutive loads/stores. 
GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) { + setDebugLocFromInst(Builder, Gep); Value *PtrOperand = Gep->getPointerOperand(); Value *FirstBasePtr = getVectorValue(PtrOperand)[0]; FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero); @@ -1209,26 +1159,40 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, Gep2->setName("gep.indvar.base"); Ptr = Builder.Insert(Gep2); } else if (Gep) { + setDebugLocFromInst(Builder, Gep); assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()), OrigLoop) && "Base ptr must be invariant"); // The last index does not have to be the induction. It can be // consecutive and be a function of the index. For example A[I+1]; unsigned NumOperands = Gep->getNumOperands(); - - Value *LastGepOperand = Gep->getOperand(NumOperands - 1); - VectorParts &GEPParts = getVectorValue(LastGepOperand); - Value *LastIndex = GEPParts[0]; - LastIndex = Builder.CreateExtractElement(LastIndex, Zero); - + unsigned LastOperand = NumOperands - 1; // Create the new GEP with the new induction variable. GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); - Gep2->setOperand(NumOperands - 1, LastIndex); - Gep2->setName("gep.indvar.idx"); + + for (unsigned i = 0; i < NumOperands; ++i) { + Value *GepOperand = Gep->getOperand(i); + Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand); + + // Update last index or loop invariant instruction anchored in loop. + if (i == LastOperand || + (GepOperandInst && OrigLoop->contains(GepOperandInst))) { + assert((i == LastOperand || + SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) && + "Must be last index or loop invariant"); + + VectorParts &GEPParts = getVectorValue(GepOperand); + Value *Index = GEPParts[0]; + Index = Builder.CreateExtractElement(Index, Zero); + Gep2->setOperand(i, Index); + Gep2->setName("gep.indvar.idx"); + } + } Ptr = Builder.Insert(Gep2); } else { // Use the induction element ptr. assert(isa<PHINode>(Ptr) && "Invalid induction ptr"); + setDebugLocFromInst(Builder, Ptr); VectorParts &PtrVal = getVectorValue(Ptr); Ptr = Builder.CreateExtractElement(PtrVal[0], Zero); } @@ -1237,8 +1201,11 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, if (SI) { assert(!Legal->isUniform(SI->getPointerOperand()) && "We do not allow storing to uniform addresses"); + setDebugLocFromInst(Builder, SI); + // We don't want to update the value in the map as it might be used in + // another expression. So don't use a reference type for "StoredVal". + VectorParts StoredVal = getVectorValue(SI->getValueOperand()); - VectorParts &StoredVal = getVectorValue(SI->getValueOperand()); for (unsigned Part = 0; Part < UF; ++Part) { // Calculate the pointer for the specific unroll-part. Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); @@ -1253,11 +1220,16 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); } - Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo()); + Value *VecPtr = Builder.CreateBitCast(PartPtr, + DataTy->getPointerTo(AddressSpace)); Builder.CreateStore(StoredVal[Part], VecPtr)->setAlignment(Alignment); } + return; } + // Handle loads. + assert(LI && "Must have a load instruction"); + setDebugLocFromInst(Builder, LI); for (unsigned Part = 0; Part < UF; ++Part) { // Calculate the pointer for the specific unroll-part. 
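In the memory-instruction hunks above, the wide pointer is now built with DataTy->getPointerTo(AddressSpace), where AddressSpace comes from the scalar pointer, instead of defaulting to address space 0. A minimal sketch of that pattern (the helper name is illustrative, not from the commit): the pointer-to-vector type must live in the same address space as the original pointer, or the bitcast would be ill-formed.

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Cast a scalar element pointer to a pointer-to-vector in the *same*
// address space, ready for a wide load or store of type DataTy.
static Value *castToWidePtr(IRBuilder<> &Builder, Value *Ptr, Type *DataTy) {
  unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
  return Builder.CreateBitCast(Ptr, DataTy->getPointerTo(AddressSpace));
}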
Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); @@ -1269,7 +1241,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); } - Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo()); + Value *VecPtr = Builder.CreateBitCast(PartPtr, + DataTy->getPointerTo(AddressSpace)); Value *LI = Builder.CreateLoad(VecPtr, "wide.load"); cast<LoadInst>(LI)->setAlignment(Alignment); Entry[Part] = Reverse ? reverseVector(LI) : LI; @@ -1281,6 +1254,8 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) { // Holds vector parameters or scalars, in case of uniform vals. SmallVector<VectorParts, 4> Params; + setDebugLocFromInst(Builder, Instr); + // Find all of the vectorized parameters. for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { Value *SrcOp = Instr->getOperand(op); @@ -1356,10 +1331,9 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, if (!PtrRtCheck->Need) return NULL; - Instruction *MemoryRuntimeCheck = 0; unsigned NumPointers = PtrRtCheck->Pointers.size(); - SmallVector<Value* , 2> Starts; - SmallVector<Value* , 2> Ends; + SmallVector<TrackingVH<Value> , 2> Starts; + SmallVector<TrackingVH<Value> , 2> Ends; SCEVExpander Exp(*SE, "induction"); @@ -1386,13 +1360,18 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, } IRBuilder<> ChkBuilder(Loc); - + // Our instructions might fold to a constant. + Value *MemoryRuntimeCheck = 0; for (unsigned i = 0; i < NumPointers; ++i) { for (unsigned j = i+1; j < NumPointers; ++j) { // No need to check if two readonly pointers intersect. if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j]) continue; + // Only need to check pointers between two different dependency sets. + if (PtrRtCheck->DependencySetId[i] == PtrRtCheck->DependencySetId[j]) + continue; + Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy, "bc"); Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy, "bc"); Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy, "bc"); @@ -1404,12 +1383,18 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, if (MemoryRuntimeCheck) IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, "conflict.rdx"); - - MemoryRuntimeCheck = cast<Instruction>(IsConflict); + MemoryRuntimeCheck = IsConflict; } } - return MemoryRuntimeCheck; + // We have to do this trickery because the IRBuilder might fold the check to a + // constant expression in which case there is no Instruction anchored in a + // the block. + LLVMContext &Ctx = Loc->getContext(); + Instruction * Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck, + ConstantInt::getTrue(Ctx)); + ChkBuilder.Insert(Check, "memcheck.conflict"); + return Check; } void @@ -1493,11 +1478,28 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { BasicBlock *ScalarPH = MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); + // Create and register the new vector loop. + Loop* Lp = new Loop(); + Loop *ParentLoop = OrigLoop->getParentLoop(); + + // Insert the new loop into the loop nest and register the new basic blocks + // before calling any utilities such as SCEV that require valid LoopInfo. 
+ if (ParentLoop) { + ParentLoop->addChildLoop(Lp); + ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase()); + ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); + ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); + } else { + LI->addTopLevelLoop(Lp); + } + Lp->addBasicBlockToLoop(VecBody, LI->getBase()); + // Use this IR builder to create the loop instructions (Phi, Br, Cmp) // inside the loop. Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); // Generate the induction variable. + setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction)); Induction = Builder.CreatePHI(IdxTy, 2, "index"); // The loop step is equal to the vectorization factor (num of SIMD elements) // times the unroll factor (num of SIMD instructions). @@ -1506,6 +1508,8 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // This is the IR builder that we use to add all of the logic for bypassing // the new vector loop. IRBuilder<> BypassBuilder(BypassBlock->getTerminator()); + setDebugLocFromInst(BypassBuilder, + getDebugLocFromInstOrOperands(OldInduction)); // We may need to extend the index in case there is a type mismatch. // We know that the count starts at zero and does not overflow. @@ -1544,6 +1548,8 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // Create a new block containing the memory check. BasicBlock *CheckBlock = BypassBlock->splitBasicBlock(MemRuntimeCheck, "vector.memcheck"); + if (ParentLoop) + ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase()); LoopBypassBlocks.push_back(CheckBlock); // Replace the branch into the memory check block with a conditional branch @@ -1711,24 +1717,6 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // Get ready to start creating new instructions into the vectorized body. Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); - // Create and register the new vector loop. - Loop* Lp = new Loop(); - Loop *ParentLoop = OrigLoop->getParentLoop(); - - // Insert the new loop into the loop nest and register the new basic blocks. - if (ParentLoop) { - ParentLoop->addChildLoop(Lp); - for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) - ParentLoop->addBasicBlockToLoop(LoopBypassBlocks[I], LI->getBase()); - ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase()); - ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); - ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); - } else { - LI->addTopLevelLoop(Lp); - } - - Lp->addBasicBlockToLoop(VecBody, LI->getBase()); - // Save the state. 
LoopVectorPreHeader = VectorPH; LoopScalarPreHeader = ScalarPH; @@ -1787,6 +1775,8 @@ getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) { case Intrinsic::pow: case Intrinsic::fma: case Intrinsic::fmuladd: + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: return II->getIntrinsicID(); default: return Intrinsic::not_intrinsic; @@ -1925,7 +1915,8 @@ Value *createMinMaxOp(IRBuilder<> &Builder, } Value *Cmp; - if (RK == LoopVectorizationLegality::MRK_FloatMin || RK == LoopVectorizationLegality::MRK_FloatMax) + if (RK == LoopVectorizationLegality::MRK_FloatMin || + RK == LoopVectorizationLegality::MRK_FloatMax) Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp"); else Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp"); @@ -1985,6 +1976,8 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { LoopVectorizationLegality::ReductionDescriptor RdxDesc = (*Legal->getReductionVars())[RdxPhi]; + setDebugLocFromInst(Builder, RdxDesc.StartValue); + // We need to generate a reduction vector from the incoming scalar. // To do so, we need to generate the 'identity' vector and overide // one of the elements with the incoming scalar reduction. We need @@ -2042,6 +2035,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); VectorParts RdxParts; + setDebugLocFromInst(Builder, RdxDesc.LoopExitInstr); for (unsigned part = 0; part < UF; ++part) { // This PHINode contains the vectorized reduction variable, or // the initial value vector, if we bypass the vector loop. @@ -2057,6 +2051,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // Reduce all of the unrolled parts into a single vector. Value *ReducedPartRdx = RdxParts[0]; unsigned Op = getReductionBinOp(RdxDesc.Kind); + setDebugLocFromInst(Builder, ReducedPartRdx); for (unsigned part = 1; part < UF; ++part) { if (Op != Instruction::ICmp && Op != Instruction::FCmp) ReducedPartRdx = Builder.CreateBinOp((Instruction::BinaryOps)Op, @@ -2151,6 +2146,12 @@ InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) && "Invalid edge"); + // Look for cached value. + std::pair<BasicBlock*, BasicBlock*> Edge(Src, Dst); + EdgeMaskCache::iterator ECEntryIt = MaskCache.find(Edge); + if (ECEntryIt != MaskCache.end()) + return ECEntryIt->second; + VectorParts SrcMask = createBlockInMask(Src); // The terminator has to be a branch inst! @@ -2166,9 +2167,12 @@ InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { for (unsigned part = 0; part < UF; ++part) EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]); + + MaskCache[Edge] = EdgeMask; return EdgeMask; } + MaskCache[Edge] = SrcMask; return SrcMask; } @@ -2221,6 +2225,7 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, continue; } + setDebugLocFromInst(Builder, P); // Check for PHI nodes that are lowered to vector selects. if (P->getParent() != OrigLoop->getHeader()) { // We know that all PHIs in non header blocks are converted into @@ -2372,6 +2377,7 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, case Instruction::Xor: { // Just widen binops. 
BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it); + setDebugLocFromInst(Builder, BinOp); VectorParts &A = getVectorValue(it->getOperand(0)); VectorParts &B = getVectorValue(it->getOperand(1)); @@ -2398,6 +2404,7 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, // instruction with a scalar condition. Otherwise, use vector-select. bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)), OrigLoop); + setDebugLocFromInst(Builder, it); // The condition can be loop invariant but still defined inside the // loop. This means that we can't just use the original 'cond' value. @@ -2422,6 +2429,7 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, // Widen compares. Generate vector compares. bool FCmp = (it->getOpcode() == Instruction::FCmp); CmpInst *Cmp = dyn_cast<CmpInst>(it); + setDebugLocFromInst(Builder, it); VectorParts &A = getVectorValue(it->getOperand(0)); VectorParts &B = getVectorValue(it->getOperand(1)); for (unsigned Part = 0; Part < UF; ++Part) { @@ -2452,6 +2460,7 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, case Instruction::FPTrunc: case Instruction::BitCast: { CastInst *CI = dyn_cast<CastInst>(it); + setDebugLocFromInst(Builder, it); /// Optimize the special case where the source is the induction /// variable. Notice that we can only optimize the 'trunc' case /// because: a. FP conversions lose precision, b. sext/zext may wrap, @@ -2478,20 +2487,29 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, // Ignore dbg intrinsics. if (isa<DbgInfoIntrinsic>(it)) break; + setDebugLocFromInst(Builder, it); Module *M = BB->getParent()->getParent(); CallInst *CI = cast<CallInst>(it); Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); assert(ID && "Not an intrinsic call!"); - for (unsigned Part = 0; Part < UF; ++Part) { - SmallVector<Value*, 4> Args; - for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { - VectorParts &Arg = getVectorValue(CI->getArgOperand(i)); - Args.push_back(Arg[Part]); + switch (ID) { + case Intrinsic::lifetime_end: + case Intrinsic::lifetime_start: + scalarizeInstruction(it); + break; + default: + for (unsigned Part = 0; Part < UF; ++Part) { + SmallVector<Value *, 4> Args; + for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { + VectorParts &Arg = getVectorValue(CI->getArgOperand(i)); + Args.push_back(Arg[Part]); + } + Type *Tys[] = { VectorType::get(CI->getType()->getScalarType(), VF) }; + Function *F = Intrinsic::getDeclaration(M, ID, Tys); + Entry[Part] = Builder.CreateCall(F, Args); } - Type *Tys[] = { VectorType::get(CI->getType()->getScalarType(), VF) }; - Function *F = Intrinsic::getDeclaration(M, ID, Tys); - Entry[Part] = Builder.CreateCall(F, Args); + break; } break; } @@ -2531,6 +2549,24 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable"); std::vector<BasicBlock*> &LoopBlocks = TheLoop->getBlocksVector(); + // A list of pointers that we can safely read and write to. + SmallPtrSet<Value *, 8> SafePointes; + + // Collect safe addresses. 
+ for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) { + BasicBlock *BB = LoopBlocks[i]; + + if (blockNeedsPredication(BB)) + continue; + + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + if (LoadInst *LI = dyn_cast<LoadInst>(I)) + SafePointes.insert(LI->getPointerOperand()); + else if (StoreInst *SI = dyn_cast<StoreInst>(I)) + SafePointes.insert(SI->getPointerOperand()); + } + } + // Collect the blocks that need predication. for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) { BasicBlock *BB = LoopBlocks[i]; @@ -2540,14 +2576,10 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { return false; // We must be able to predicate all blocks that need to be predicated. - if (blockNeedsPredication(BB) && !blockCanBePredicated(BB)) + if (blockNeedsPredication(BB) && !blockCanBePredicated(BB, SafePointes)) return false; } - // Check that we can actually speculate the hoistable loads. - if (!LoadSpeculation.canHoistAllLoads()) - return false; - // We can if-convert this loop. return true; } @@ -2762,7 +2794,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { continue; } if (AddReductionVar(Phi, RK_FloatMinMax)) { - DEBUG(dbgs() << "LV: Found an float MINMAX reduction PHI."<< *Phi <<"\n"); + DEBUG(dbgs() << "LV: Found an float MINMAX reduction PHI."<< *Phi << + "\n"); continue; } @@ -2837,53 +2870,665 @@ void LoopVectorizationLegality::collectLoopUniforms() { } } -AliasAnalysis::Location -LoopVectorizationLegality::getLoadStoreLocation(Instruction *Inst) { - if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) - return AA->getLocation(Store); - else if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) - return AA->getLocation(Load); +namespace { +/// \brief Analyses memory accesses in a loop. +/// +/// Checks whether run time pointer checks are needed and builds sets for data +/// dependence checking. +class AccessAnalysis { +public: + /// \brief Read or write access location. + typedef PointerIntPair<Value *, 1, bool> MemAccessInfo; + typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet; + + /// \brief Set of potential dependent memory accesses. + typedef EquivalenceClasses<MemAccessInfo> DepCandidates; + + AccessAnalysis(DataLayout *Dl, DepCandidates &DA) : + DL(Dl), DepCands(DA), AreAllWritesIdentified(true), + AreAllReadsIdentified(true), IsRTCheckNeeded(false) {} + + /// \brief Register a load and whether it is only read from. + void addLoad(Value *Ptr, bool IsReadOnly) { + Accesses.insert(MemAccessInfo(Ptr, false)); + if (IsReadOnly) + ReadOnlyPtr.insert(Ptr); + } + + /// \brief Register a store. + void addStore(Value *Ptr) { + Accesses.insert(MemAccessInfo(Ptr, true)); + } + + /// \brief Check whether we can check the pointers at runtime for + /// non-intersection. + bool canCheckPtrAtRT(LoopVectorizationLegality::RuntimePointerCheck &RtCheck, + unsigned &NumComparisons, ScalarEvolution *SE, + Loop *TheLoop); + + /// \brief Goes over all memory accesses, checks whether a RT check is needed + /// and builds sets of dependent accesses. + void buildDependenceSets() { + // Process read-write pointers first. + processMemAccesses(false); + // Next, process read pointers. 
+ processMemAccesses(true); + } + + bool isRTCheckNeeded() { return IsRTCheckNeeded; } + + bool isDependencyCheckNeeded() { return !CheckDeps.empty(); } + + MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; } + +private: + typedef SetVector<MemAccessInfo> PtrAccessSet; + typedef DenseMap<Value*, MemAccessInfo> UnderlyingObjToAccessMap; + + /// \brief Go over all memory access or only the deferred ones if + /// \p UseDeferred is true and check whether runtime pointer checks are needed + /// and build sets of dependency check candidates. + void processMemAccesses(bool UseDeferred); + + /// Set of all accesses. + PtrAccessSet Accesses; + + /// Set of access to check after all writes have been processed. + PtrAccessSet DeferredAccesses; + + /// Map of pointers to last access encountered. + UnderlyingObjToAccessMap ObjToLastAccess; + + /// Set of accesses that need a further dependence check. + MemAccessInfoSet CheckDeps; - llvm_unreachable("Should be either load or store instruction"); + /// Set of pointers that are read only. + SmallPtrSet<Value*, 16> ReadOnlyPtr; + + /// Set of underlying objects already written to. + SmallPtrSet<Value*, 16> WriteObjects; + + DataLayout *DL; + + /// Sets of potentially dependent accesses - members of one set share an + /// underlying pointer. The set "CheckDeps" identfies which sets really need a + /// dependence check. + DepCandidates &DepCands; + + bool AreAllWritesIdentified; + bool AreAllReadsIdentified; + bool IsRTCheckNeeded; +}; + +} // end anonymous namespace + +/// \brief Check whether a pointer can participate in a runtime bounds check. +static bool hasComputableBounds(ScalarEvolution *SE, Value *Ptr) { + const SCEV *PtrScev = SE->getSCEV(Ptr); + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev); + if (!AR) + return false; + + return AR->isAffine(); } -bool -LoopVectorizationLegality::hasPossibleGlobalWriteReorder( - Value *Object, - Instruction *Inst, - AliasMultiMap& WriteObjects, - unsigned MaxByteWidth) { +bool AccessAnalysis::canCheckPtrAtRT( + LoopVectorizationLegality::RuntimePointerCheck &RtCheck, + unsigned &NumComparisons, ScalarEvolution *SE, + Loop *TheLoop) { + // Find pointers with computable bounds. We are going to use this information + // to place a runtime bound check. + unsigned NumReadPtrChecks = 0; + unsigned NumWritePtrChecks = 0; + bool CanDoRT = true; - AliasAnalysis::Location ThisLoc = getLoadStoreLocation(Inst); + bool IsDepCheckNeeded = isDependencyCheckNeeded(); + // We assign consecutive id to access from different dependence sets. + // Accesses within the same set don't need a runtime check. + unsigned RunningDepId = 1; + DenseMap<Value *, unsigned> DepSetId; - std::vector<Instruction*>::iterator - it = WriteObjects[Object].begin(), - end = WriteObjects[Object].end(); + for (PtrAccessSet::iterator AI = Accesses.begin(), AE = Accesses.end(); + AI != AE; ++AI) { + const MemAccessInfo &Access = *AI; + Value *Ptr = Access.getPointer(); + bool IsWrite = Access.getInt(); - for (; it != end; ++it) { - Instruction* I = *it; - if (I == Inst) + // Just add write checks if we have both. + if (!IsWrite && Accesses.count(MemAccessInfo(Ptr, true))) continue; - AliasAnalysis::Location ThatLoc = getLoadStoreLocation(I); - if (AA->alias(ThisLoc.getWithNewSize(MaxByteWidth), - ThatLoc.getWithNewSize(MaxByteWidth))) + if (IsWrite) + ++NumWritePtrChecks; + else + ++NumReadPtrChecks; + + if (hasComputableBounds(SE, Ptr)) { + // The id of the dependence set. 
+ unsigned DepId; + + if (IsDepCheckNeeded) { + Value *Leader = DepCands.getLeaderValue(Access).getPointer(); + unsigned &LeaderId = DepSetId[Leader]; + if (!LeaderId) + LeaderId = RunningDepId++; + DepId = LeaderId; + } else + // Each access has its own dependence set. + DepId = RunningDepId++; + + RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId); + + DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr <<"\n"); + } else { + CanDoRT = false; + } + } + + if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2) + NumComparisons = 0; // Only one dependence set. + else + NumComparisons = (NumWritePtrChecks * (NumReadPtrChecks + + NumWritePtrChecks - 1)); + return CanDoRT; +} + +static bool isFunctionScopeIdentifiedObject(Value *Ptr) { + return isNoAliasArgument(Ptr) || isNoAliasCall(Ptr) || isa<AllocaInst>(Ptr); +} + +void AccessAnalysis::processMemAccesses(bool UseDeferred) { + // We process the set twice: first we process read-write pointers, last we + // process read-only pointers. This allows us to skip dependence tests for + // read-only pointers. + + PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses; + for (PtrAccessSet::iterator AI = S.begin(), AE = S.end(); AI != AE; ++AI) { + const MemAccessInfo &Access = *AI; + Value *Ptr = Access.getPointer(); + bool IsWrite = Access.getInt(); + + DepCands.insert(Access); + + // Memorize read-only pointers for later processing and skip them in the + // first round (they need to be checked after we have seen all write + // pointers). Note: we also mark pointer that are not consecutive as + // "read-only" pointers (so that we check "a[b[i]] +="). Hence, we need the + // second check for "!IsWrite". + bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite; + if (!UseDeferred && IsReadOnlyPtr) { + DeferredAccesses.insert(Access); + continue; + } + + bool NeedDepCheck = false; + // Check whether there is the possiblity of dependency because of underlying + // objects being the same. + typedef SmallVector<Value*, 16> ValueVector; + ValueVector TempObjects; + GetUnderlyingObjects(Ptr, TempObjects, DL); + for (ValueVector::iterator UI = TempObjects.begin(), UE = TempObjects.end(); + UI != UE; ++UI) { + Value *UnderlyingObj = *UI; + + // If this is a write then it needs to be an identified object. If this a + // read and all writes (so far) are identified function scope objects we + // don't need an identified underlying object but only an Argument (the + // next write is going to invalidate this assumption if it is + // unidentified). + // This is a micro-optimization for the case where all writes are + // identified and we have one argument pointer. + // Otherwise, we do need a runtime check. + if ((IsWrite && !isFunctionScopeIdentifiedObject(UnderlyingObj)) || + (!IsWrite && (!AreAllWritesIdentified || + !isa<Argument>(UnderlyingObj)) && + !isIdentifiedObject(UnderlyingObj))) { + DEBUG(dbgs() << "LV: Found an unidentified " << + (IsWrite ? "write" : "read" ) << " ptr:" << *UnderlyingObj << + "\n"); + IsRTCheckNeeded = (IsRTCheckNeeded || + !isIdentifiedObject(UnderlyingObj) || + !AreAllReadsIdentified); + + if (IsWrite) + AreAllWritesIdentified = false; + if (!IsWrite) + AreAllReadsIdentified = false; + } + + // If this is a write - check other reads and writes for conflicts. If + // this is a read only check other writes for conflicts (but only if there + // is no other write to the ptr - this is an optimization to catch "a[i] = + // a[i] + " without having to do a dependence check). 
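For intuition about the comparison count computed in canCheckPtrAtRT above (a worked example, not text from the commit): read/read pairs never need a check, so with 2 write pointers and 3 read pointers that need runtime bounds the estimate is NumComparisons = 2 * (3 + 2 - 1) = 8 pointer-pair comparisons, while a loop whose checked pointers all fall into a single dependence set gets NumComparisons = 0 and the runtime check is dropped entirely.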
+ if ((IsWrite || IsReadOnlyPtr) && WriteObjects.count(UnderlyingObj)) + NeedDepCheck = true; + + if (IsWrite) + WriteObjects.insert(UnderlyingObj); + + // Create sets of pointers connected by shared underlying objects. + UnderlyingObjToAccessMap::iterator Prev = + ObjToLastAccess.find(UnderlyingObj); + if (Prev != ObjToLastAccess.end()) + DepCands.unionSets(Access, Prev->second); + + ObjToLastAccess[UnderlyingObj] = Access; + } + + if (NeedDepCheck) + CheckDeps.insert(Access); + } +} + +namespace { +/// \brief Checks memory dependences among accesses to the same underlying +/// object to determine whether there vectorization is legal or not (and at +/// which vectorization factor). +/// +/// This class works under the assumption that we already checked that memory +/// locations with different underlying pointers are "must-not alias". +/// We use the ScalarEvolution framework to symbolically evalutate access +/// functions pairs. Since we currently don't restructure the loop we can rely +/// on the program order of memory accesses to determine their safety. +/// At the moment we will only deem accesses as safe for: +/// * A negative constant distance assuming program order. +/// +/// Safe: tmp = a[i + 1]; OR a[i + 1] = x; +/// a[i] = tmp; y = a[i]; +/// +/// The latter case is safe because later checks guarantuee that there can't +/// be a cycle through a phi node (that is, we check that "x" and "y" is not +/// the same variable: a header phi can only be an induction or a reduction, a +/// reduction can't have a memory sink, an induction can't have a memory +/// source). This is important and must not be violated (or we have to +/// resort to checking for cycles through memory). +/// +/// * A positive constant distance assuming program order that is bigger +/// than the biggest memory access. +/// +/// tmp = a[i] OR b[i] = x +/// a[i+2] = tmp y = b[i+2]; +/// +/// Safe distance: 2 x sizeof(a[0]), and 2 x sizeof(b[0]), respectively. +/// +/// * Zero distances and all accesses have the same size. +/// +class MemoryDepChecker { +public: + typedef PointerIntPair<Value *, 1, bool> MemAccessInfo; + typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet; + + MemoryDepChecker(ScalarEvolution *Se, DataLayout *Dl, const Loop *L) : + SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0) {} + + /// \brief Register the location (instructions are given increasing numbers) + /// of a write access. + void addAccess(StoreInst *SI) { + Value *Ptr = SI->getPointerOperand(); + Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx); + InstMap.push_back(SI); + ++AccessIdx; + } + + /// \brief Register the location (instructions are given increasing numbers) + /// of a write access. + void addAccess(LoadInst *LI) { + Value *Ptr = LI->getPointerOperand(); + Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx); + InstMap.push_back(LI); + ++AccessIdx; + } + + /// \brief Check whether the dependencies between the accesses are safe. + /// + /// Only checks sets with elements in \p CheckDeps. + bool areDepsSafe(AccessAnalysis::DepCandidates &AccessSets, + MemAccessInfoSet &CheckDeps); + + /// \brief The maximum number of bytes of a vector register we can vectorize + /// the accesses safely with. + unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; } + +private: + ScalarEvolution *SE; + DataLayout *DL; + const Loop *InnermostLoop; + + /// \brief Maps access locations (ptr, read/write) to program order. 
+ DenseMap<MemAccessInfo, std::vector<unsigned> > Accesses; + + /// \brief Memory access instructions in program order. + SmallVector<Instruction *, 16> InstMap; + + /// \brief The program order index to be used for the next instruction. + unsigned AccessIdx; + + // We can access this many bytes in parallel safely. + unsigned MaxSafeDepDistBytes; + + /// \brief Check whether there is a plausible dependence between the two + /// accesses. + /// + /// Access \p A must happen before \p B in program order. The two indices + /// identify the index into the program order map. + /// + /// This function checks whether there is a plausible dependence (or the + /// absence of such can't be proved) between the two accesses. If there is a + /// plausible dependence but the dependence distance is bigger than one + /// element access it records this distance in \p MaxSafeDepDistBytes (if this + /// distance is smaller than any other distance encountered so far). + /// Otherwise, this function returns true signaling a possible dependence. + bool isDependent(const MemAccessInfo &A, unsigned AIdx, + const MemAccessInfo &B, unsigned BIdx); + + /// \brief Check whether the data dependence could prevent store-load + /// forwarding. + bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize); +}; + +} // end anonymous namespace + +static bool isInBoundsGep(Value *Ptr) { + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) + return GEP->isInBounds(); + return false; +} + +/// \brief Check whether the access through \p Ptr has a constant stride. +static int isStridedPtr(ScalarEvolution *SE, DataLayout *DL, Value *Ptr, + const Loop *Lp) { + const Type *Ty = Ptr->getType(); + assert(Ty->isPointerTy() && "Unexpected non ptr"); + + // Make sure that the pointer does not point to aggregate types. + const PointerType *PtrTy = cast<PointerType>(Ty); + if (PtrTy->getElementType()->isAggregateType()) { + DEBUG(dbgs() << "LV: Bad stride - Not a pointer to a scalar type" << *Ptr << + "\n"); + return 0; + } + + const SCEV *PtrScev = SE->getSCEV(Ptr); + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev); + if (!AR) { + DEBUG(dbgs() << "LV: Bad stride - Not an AddRecExpr pointer " + << *Ptr << " SCEV: " << *PtrScev << "\n"); + return 0; + } + + // The accesss function must stride over the innermost loop. + if (Lp != AR->getLoop()) { + DEBUG(dbgs() << "LV: Bad stride - Not striding over innermost loop " << + *Ptr << " SCEV: " << *PtrScev << "\n"); + } + + // The address calculation must not wrap. Otherwise, a dependence could be + // inverted. + // An inbounds getelementptr that is a AddRec with a unit stride + // cannot wrap per definition. The unit stride requirement is checked later. + // An getelementptr without an inbounds attribute and unit stride would have + // to access the pointer value "0" which is undefined behavior in address + // space 0, therefore we can also vectorize this case. + bool IsInBoundsGEP = isInBoundsGep(Ptr); + bool IsNoWrapAddRec = AR->getNoWrapFlags(SCEV::NoWrapMask); + bool IsInAddressSpaceZero = PtrTy->getAddressSpace() == 0; + if (!IsNoWrapAddRec && !IsInBoundsGEP && !IsInAddressSpaceZero) { + DEBUG(dbgs() << "LV: Bad stride - Pointer may wrap in the address space " + << *Ptr << " SCEV: " << *PtrScev << "\n"); + return 0; + } + + // Check the step is constant. + const SCEV *Step = AR->getStepRecurrence(*SE); + + // Calculate the pointer stride and check if it is consecutive. 
+ const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); + if (!C) { + DEBUG(dbgs() << "LV: Bad stride - Not a constant strided " << *Ptr << + " SCEV: " << *PtrScev << "\n"); + return 0; + } + + int64_t Size = DL->getTypeAllocSize(PtrTy->getElementType()); + const APInt &APStepVal = C->getValue()->getValue(); + + // Huge step value - give up. + if (APStepVal.getBitWidth() > 64) + return 0; + + int64_t StepVal = APStepVal.getSExtValue(); + + // Strided access. + int64_t Stride = StepVal / Size; + int64_t Rem = StepVal % Size; + if (Rem) + return 0; + + // If the SCEV could wrap but we have an inbounds gep with a unit stride we + // know we can't "wrap around the address space". In case of address space + // zero we know that this won't happen without triggering undefined behavior. + if (!IsNoWrapAddRec && (IsInBoundsGEP || IsInAddressSpaceZero) && + Stride != 1 && Stride != -1) + return 0; + + return Stride; +} + +bool MemoryDepChecker::couldPreventStoreLoadForward(unsigned Distance, + unsigned TypeByteSize) { + // If loads occur at a distance that is not a multiple of a feasible vector + // factor store-load forwarding does not take place. + // Positive dependences might cause troubles because vectorizing them might + // prevent store-load forwarding making vectorized code run a lot slower. + // a[i] = a[i-3] ^ a[i-8]; + // The stores to a[i:i+1] don't align with the stores to a[i-3:i-2] and + // hence on your typical architecture store-load forwarding does not take + // place. Vectorizing in such cases does not make sense. + // Store-load forwarding distance. + const unsigned NumCyclesForStoreLoadThroughMemory = 8*TypeByteSize; + // Maximum vector factor. + unsigned MaxVFWithoutSLForwardIssues = MaxVectorWidth*TypeByteSize; + if(MaxSafeDepDistBytes < MaxVFWithoutSLForwardIssues) + MaxVFWithoutSLForwardIssues = MaxSafeDepDistBytes; + + for (unsigned vf = 2*TypeByteSize; vf <= MaxVFWithoutSLForwardIssues; + vf *= 2) { + if (Distance % vf && Distance / vf < NumCyclesForStoreLoadThroughMemory) { + MaxVFWithoutSLForwardIssues = (vf >>=1); + break; + } + } + + if (MaxVFWithoutSLForwardIssues< 2*TypeByteSize) { + DEBUG(dbgs() << "LV: Distance " << Distance << + " that could cause a store-load forwarding conflict\n"); + return true; + } + + if (MaxVFWithoutSLForwardIssues < MaxSafeDepDistBytes && + MaxVFWithoutSLForwardIssues != MaxVectorWidth*TypeByteSize) + MaxSafeDepDistBytes = MaxVFWithoutSLForwardIssues; + return false; +} + +bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, + const MemAccessInfo &B, unsigned BIdx) { + assert (AIdx < BIdx && "Must pass arguments in program order"); + + Value *APtr = A.getPointer(); + Value *BPtr = B.getPointer(); + bool AIsWrite = A.getInt(); + bool BIsWrite = B.getInt(); + + // Two reads are independent. + if (!AIsWrite && !BIsWrite) + return false; + + const SCEV *AScev = SE->getSCEV(APtr); + const SCEV *BScev = SE->getSCEV(BPtr); + + int StrideAPtr = isStridedPtr(SE, DL, APtr, InnermostLoop); + int StrideBPtr = isStridedPtr(SE, DL, BPtr, InnermostLoop); + + const SCEV *Src = AScev; + const SCEV *Sink = BScev; + + // If the induction step is negative we have to invert source and sink of the + // dependence. 
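To make the stride computation in isStridedPtr concrete (a worked example, not part of the commit): for an i32 element type the allocation size is 4 bytes, so an AddRec step of +4 bytes per iteration gives Stride = 1 (consecutive forward access), -4 gives Stride = -1 (consecutive reverse), +8 gives Stride = 2, and a step of +6 leaves Rem = 2, so the pointer is rejected as non-strided. Unless the AddRec itself is known not to wrap, strides other than 1 and -1 are also rejected, because the inbounds-GEP and address-space-zero arguments only rule out wrapping for unit strides.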
+ if (StrideAPtr < 0) { + //Src = BScev; + //Sink = AScev; + std::swap(APtr, BPtr); + std::swap(Src, Sink); + std::swap(AIsWrite, BIsWrite); + std::swap(AIdx, BIdx); + std::swap(StrideAPtr, StrideBPtr); + } + + const SCEV *Dist = SE->getMinusSCEV(Sink, Src); + + DEBUG(dbgs() << "LV: Src Scev: " << *Src << "Sink Scev: " << *Sink + << "(Induction step: " << StrideAPtr << ")\n"); + DEBUG(dbgs() << "LV: Distance for " << *InstMap[AIdx] << " to " + << *InstMap[BIdx] << ": " << *Dist << "\n"); + + // Need consecutive accesses. We don't want to vectorize + // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in + // the address space. + if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr){ + DEBUG(dbgs() << "Non-consecutive pointer access\n"); + return true; + } + + const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist); + if (!C) { + DEBUG(dbgs() << "LV: Dependence because of non constant distance\n"); + return true; + } + + Type *ATy = APtr->getType()->getPointerElementType(); + Type *BTy = BPtr->getType()->getPointerElementType(); + unsigned TypeByteSize = DL->getTypeAllocSize(ATy); + + // Negative distances are not plausible dependencies. + const APInt &Val = C->getValue()->getValue(); + if (Val.isNegative()) { + bool IsTrueDataDependence = (AIsWrite && !BIsWrite); + if (IsTrueDataDependence && + (couldPreventStoreLoadForward(Val.abs().getZExtValue(), TypeByteSize) || + ATy != BTy)) return true; + + DEBUG(dbgs() << "LV: Dependence is negative: NoDep\n"); + return false; + } + + // Write to the same location with the same size. + // Could be improved to assert type sizes are the same (i32 == float, etc). + if (Val == 0) { + if (ATy == BTy) + return false; + DEBUG(dbgs() << "LV: Zero dependence difference but different types"); + return true; } + + assert(Val.isStrictlyPositive() && "Expect a positive value"); + + // Positive distance bigger than max vectorization factor. + if (ATy != BTy) { + DEBUG(dbgs() << + "LV: ReadWrite-Write positive dependency with different types"); + return false; + } + + unsigned Distance = (unsigned) Val.getZExtValue(); + + // Bail out early if passed-in parameters make vectorization not feasible. + unsigned ForcedFactor = VectorizationFactor ? VectorizationFactor : 1; + unsigned ForcedUnroll = VectorizationUnroll ? VectorizationUnroll : 1; + + // The distance must be bigger than the size needed for a vectorized version + // of the operation and the size of the vectorized operation must not be + // bigger than the currrent maximum size. + if (Distance < 2*TypeByteSize || + 2*TypeByteSize > MaxSafeDepDistBytes || + Distance < TypeByteSize * ForcedUnroll * ForcedFactor) { + DEBUG(dbgs() << "LV: Failure because of Positive distance " + << Val.getSExtValue() << "\n"); + return true; + } + + MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ? + Distance : MaxSafeDepDistBytes; + + bool IsTrueDataDependence = (!AIsWrite && BIsWrite); + if (IsTrueDataDependence && + couldPreventStoreLoadForward(Distance, TypeByteSize)) + return true; + + DEBUG(dbgs() << "LV: Positive distance " << Val.getSExtValue() << + " with max VF=" << MaxSafeDepDistBytes/TypeByteSize << "\n"); + return false; } +bool +MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets, + MemAccessInfoSet &CheckDeps) { + + MaxSafeDepDistBytes = -1U; + while (!CheckDeps.empty()) { + MemAccessInfo CurAccess = *CheckDeps.begin(); + + // Get the relevant memory access set. 
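A worked instance of the positive-distance check in isDependent (illustrative numbers, not from the commit): for the pattern tmp = a[i]; a[i+2] = tmp with i32 elements, TypeByteSize is 4 and the dependence distance is 2 * 4 = 8 bytes. That satisfies Distance >= 2 * TypeByteSize (8 >= 8), so the pair is accepted and MaxSafeDepDistBytes drops to 8, i.e. at most two i32 lanes per vector, matching the "Safe distance: 2 x sizeof(a[0])" case in the class comment. A one-element distance of 4 bytes would fail the same test and the dependence would be reported as unsafe, and a user-forced vectorization or unroll factor raises the required distance to TypeByteSize * ForcedUnroll * ForcedFactor.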
+ EquivalenceClasses<MemAccessInfo>::iterator I = + AccessSets.findValue(AccessSets.getLeaderValue(CurAccess)); + + // Check accesses within this set. + EquivalenceClasses<MemAccessInfo>::member_iterator AI, AE; + AI = AccessSets.member_begin(I), AE = AccessSets.member_end(); + + // Check every access pair. + while (AI != AE) { + CheckDeps.erase(*AI); + EquivalenceClasses<MemAccessInfo>::member_iterator OI = llvm::next(AI); + while (OI != AE) { + // Check every accessing instruction pair in program order. + for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(), + I1E = Accesses[*AI].end(); I1 != I1E; ++I1) + for (std::vector<unsigned>::iterator I2 = Accesses[*OI].begin(), + I2E = Accesses[*OI].end(); I2 != I2E; ++I2) { + if (*I1 < *I2 && isDependent(*AI, *I1, *OI, *I2)) + return false; + if (*I2 < *I1 && isDependent(*OI, *I2, *AI, *I1)) + return false; + } + ++OI; + } + AI++; + } + } + return true; +} + bool LoopVectorizationLegality::canVectorizeMemory() { typedef SmallVector<Value*, 16> ValueVector; typedef SmallPtrSet<Value*, 16> ValueSet; + // Holds the Load and Store *instructions*. ValueVector Loads; ValueVector Stores; + + // Holds all the different accesses in the loop. + unsigned NumReads = 0; + unsigned NumReadWrites = 0; + PtrRtCheck.Pointers.clear(); PtrRtCheck.Need = false; const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); + MemoryDepChecker DepChecker(SE, DL, TheLoop); // For each block. for (Loop::block_iterator bb = TheLoop->block_begin(), @@ -2897,6 +3542,13 @@ bool LoopVectorizationLegality::canVectorizeMemory() { // but is not a load, then we quit. Notice that we don't handle function // calls that read or write. if (it->mayReadFromMemory()) { + // Many math library functions read the rounding mode. We will only + // vectorize a loop if it contains known function calls that don't set + // the flag. Therefore, it is safe to ignore this read from memory. + CallInst *Call = dyn_cast<CallInst>(it); + if (Call && getIntrinsicIDForCall(Call, TLI)) + continue; + LoadInst *Ld = dyn_cast<LoadInst>(it); if (!Ld) return false; if (!Ld->isSimple() && !IsAnnotatedParallel) { @@ -2904,6 +3556,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { return false; } Loads.push_back(Ld); + DepChecker.addAccess(Ld); continue; } @@ -2916,6 +3569,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { return false; } Stores.push_back(St); + DepChecker.addAccess(St); } } // next instr. } // next block. @@ -2930,10 +3584,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() { return true; } - // Holds the read and read-write *pointers* that we find. These maps hold - // unique values for pointers (so no need for multi-map). - AliasMap Reads; - AliasMap ReadWrites; + AccessAnalysis::DepCandidates DependentAccesses; + AccessAnalysis Accesses(DL, DependentAccesses); // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects // multiple times on the same object. If the ptr is accessed twice, once @@ -2952,10 +3604,12 @@ bool LoopVectorizationLegality::canVectorizeMemory() { return false; } - // If we did *not* see this pointer before, insert it to - // the read-write list. At this phase it is only a 'write' list. - if (Seen.insert(Ptr)) - ReadWrites.insert(std::make_pair(Ptr, ST)); + // If we did *not* see this pointer before, insert it to the read-write + // list. At this phase it is only a 'write' list. 
+ if (Seen.insert(Ptr)) { + ++NumReadWrites; + Accesses.addStore(Ptr); + } } if (IsAnnotatedParallel) { @@ -2965,6 +3619,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { return true; } + SmallPtrSet<Value *, 16> ReadOnlyPtr; for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) { LoadInst *LD = cast<LoadInst>(*I); Value* Ptr = LD->getPointerOperand(); @@ -2976,51 +3631,44 @@ bool LoopVectorizationLegality::canVectorizeMemory() { // If the address of i is unknown (for example A[B[i]]) then we may // read a few words, modify, and write a few words, and some of the // words may be written to the same address. - if (Seen.insert(Ptr) || 0 == isConsecutivePtr(Ptr)) - Reads.insert(std::make_pair(Ptr, LD)); + bool IsReadOnlyPtr = false; + if (Seen.insert(Ptr) || !isStridedPtr(SE, DL, Ptr, TheLoop)) { + ++NumReads; + IsReadOnlyPtr = true; + } + Accesses.addLoad(Ptr, IsReadOnlyPtr); } // If we write (or read-write) to a single destination and there are no // other reads in this loop then is it safe to vectorize. - if (ReadWrites.size() == 1 && Reads.size() == 0) { + if (NumReadWrites == 1 && NumReads == 0) { DEBUG(dbgs() << "LV: Found a write-only loop!\n"); return true; } - unsigned NumReadPtrs = 0; - unsigned NumWritePtrs = 0; + // Build dependence sets and check whether we need a runtime pointer bounds + // check. + Accesses.buildDependenceSets(); + bool NeedRTCheck = Accesses.isRTCheckNeeded(); // Find pointers with computable bounds. We are going to use this information // to place a runtime bound check. - bool CanDoRT = true; - AliasMap::iterator MI, ME; - for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) { - Value *V = (*MI).first; - if (hasComputableBounds(V)) { - PtrRtCheck.insert(SE, TheLoop, V, true); - NumWritePtrs++; - DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n"); - } else { - CanDoRT = false; - break; - } - } - for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) { - Value *V = (*MI).first; - if (hasComputableBounds(V)) { - PtrRtCheck.insert(SE, TheLoop, V, false); - NumReadPtrs++; - DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n"); - } else { - CanDoRT = false; - break; - } - } + unsigned NumComparisons = 0; + bool CanDoRT = false; + if (NeedRTCheck) + CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, TheLoop); - // Check that we did not collect too many pointers or found a - // unsizeable pointer. - unsigned NumComparisons = (NumWritePtrs * (NumReadPtrs + NumWritePtrs - 1)); - DEBUG(dbgs() << "LV: We need to compare " << NumComparisons << " ptrs.\n"); + + DEBUG(dbgs() << "LV: We need to do " << NumComparisons << + " pointer comparisons.\n"); + + // If we only have one set of dependences to check pointers among we don't + // need a runtime check. + if (NumComparisons == 0 && NeedRTCheck) + NeedRTCheck = false; + + // Check that we did not collect too many pointers or found a unsizeable + // pointer. if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) { PtrRtCheck.reset(); CanDoRT = false; @@ -3030,113 +3678,6 @@ bool LoopVectorizationLegality::canVectorizeMemory() { DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n"); } - bool NeedRTCheck = false; - - // Biggest vectorized access possible, vector width * unroll factor. - // TODO: We're being very pessimistic here, find a way to know the - // real access width before getting here. 
- unsigned MaxByteWidth = (TTI->getRegisterBitWidth(true) / 8) * - TTI->getMaximumUnrollFactor(); - // Now that the pointers are in two lists (Reads and ReadWrites), we - // can check that there are no conflicts between each of the writes and - // between the writes to the reads. - // Note that WriteObjects duplicates the stores (indexed now by underlying - // objects) to avoid pointing to elements inside ReadWrites. - // TODO: Maybe create a new type where they can interact without duplication. - AliasMultiMap WriteObjects; - ValueVector TempObjects; - - // Check that the read-writes do not conflict with other read-write - // pointers. - bool AllWritesIdentified = true; - for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) { - Value *Val = (*MI).first; - Instruction *Inst = (*MI).second; - - GetUnderlyingObjects(Val, TempObjects, DL); - for (ValueVector::iterator UI=TempObjects.begin(), UE=TempObjects.end(); - UI != UE; ++UI) { - if (!isIdentifiedObject(*UI)) { - DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **UI <<"\n"); - NeedRTCheck = true; - AllWritesIdentified = false; - } - - // Never seen it before, can't alias. - if (WriteObjects[*UI].empty()) { - DEBUG(dbgs() << "LV: Adding Underlying value:" << **UI <<"\n"); - WriteObjects[*UI].push_back(Inst); - continue; - } - // Direct alias found. - if (!AA || dyn_cast<GlobalValue>(*UI) == NULL) { - DEBUG(dbgs() << "LV: Found a possible write-write reorder:" - << **UI <<"\n"); - return false; - } - DEBUG(dbgs() << "LV: Found a conflicting global value:" - << **UI <<"\n"); - DEBUG(dbgs() << "LV: While examining store:" << *Inst <<"\n"); - DEBUG(dbgs() << "LV: On value:" << *Val <<"\n"); - - // If global alias, make sure they do alias. - if (hasPossibleGlobalWriteReorder(*UI, - Inst, - WriteObjects, - MaxByteWidth)) { - DEBUG(dbgs() << "LV: Found a possible write-write reorder:" << **UI - << "\n"); - return false; - } - - // Didn't alias, insert into map for further reference. - WriteObjects[*UI].push_back(Inst); - } - TempObjects.clear(); - } - - /// Check that the reads don't conflict with the read-writes. - for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) { - Value *Val = (*MI).first; - GetUnderlyingObjects(Val, TempObjects, DL); - for (ValueVector::iterator UI=TempObjects.begin(), UE=TempObjects.end(); - UI != UE; ++UI) { - // If all of the writes are identified then we don't care if the read - // pointer is identified or not. - if (!AllWritesIdentified && !isIdentifiedObject(*UI)) { - DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **UI <<"\n"); - NeedRTCheck = true; - } - - // Never seen it before, can't alias. - if (WriteObjects[*UI].empty()) - continue; - // Direct alias found. - if (!AA || dyn_cast<GlobalValue>(*UI) == NULL) { - DEBUG(dbgs() << "LV: Found a possible write-write reorder:" - << **UI <<"\n"); - return false; - } - DEBUG(dbgs() << "LV: Found a global value: " - << **UI <<"\n"); - Instruction *Inst = (*MI).second; - DEBUG(dbgs() << "LV: While examining load:" << *Inst <<"\n"); - DEBUG(dbgs() << "LV: On value:" << *Val <<"\n"); - - // If global alias, make sure they do alias. 
- if (hasPossibleGlobalWriteReorder(*UI, - Inst, - WriteObjects, - MaxByteWidth)) { - DEBUG(dbgs() << "LV: Found a possible read-write reorder:" << **UI - << "\n"); - return false; - } - } - TempObjects.clear(); - } - - PtrRtCheck.Need = NeedRTCheck; if (NeedRTCheck && !CanDoRT) { DEBUG(dbgs() << "LV: We can't vectorize because we can't find " << "the array bounds.\n"); @@ -3144,9 +3685,20 @@ bool LoopVectorizationLegality::canVectorizeMemory() { return false; } + PtrRtCheck.Need = NeedRTCheck; + + bool CanVecMem = true; + if (Accesses.isDependencyCheckNeeded()) { + DEBUG(dbgs() << "LV: Checking memory dependencies\n"); + CanVecMem = DepChecker.areDepsSafe(DependentAccesses, + Accesses.getDependenciesToCheck()); + MaxSafeDepDistBytes = DepChecker.getMaxSafeDepDistBytes(); + } + DEBUG(dbgs() << "LV: We "<< (NeedRTCheck ? "" : "don't") << " need a runtime memory check.\n"); - return true; + + return CanVecMem; } static bool hasMultipleUsesOf(Instruction *I, @@ -3280,9 +3832,13 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, // Check if we found the exit user. BasicBlock *Parent = Usr->getParent(); if (!TheLoop->contains(Parent)) { - // Exit if you find multiple outside users. - if (ExitInstruction != 0) + // Exit if you find multiple outside users or if the header phi node is + // being used. In this case the user uses the value of the previous + // iteration, in which case we would loose "VF-1" iterations of the + // reduction operation if we vectorize. + if (ExitInstruction != 0 || Cur == Phi) return false; + ExitInstruction = Cur; continue; } @@ -3475,11 +4031,15 @@ bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { return !DT->dominates(BB, Latch); } -bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) { +bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB, + SmallPtrSet<Value *, 8>& SafePtrs) { for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { // We might be able to hoist the load. - if (it->mayReadFromMemory() && !LoadSpeculation.isHoistableLoad(it)) - return false; + if (it->mayReadFromMemory()) { + LoadInst *LI = dyn_cast<LoadInst>(it); + if (!LI || !SafePtrs.count(LI->getPointerOperand())) + return false; + } // We don't predicate stores at the moment. if (it->mayWriteToMemory() || it->mayThrow()) @@ -3499,15 +4059,6 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) { return true; } -bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) { - const SCEV *PhiScev = SE->getSCEV(Ptr); - const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev); - if (!AR) - return false; - - return AR->isAffine(); -} - LoopVectorizationCostModel::VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, unsigned UserVF) { @@ -3524,6 +4075,11 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, unsigned WidestType = getWidestType(); unsigned WidestRegister = TTI.getRegisterBitWidth(true); + unsigned MaxSafeDepDist = -1U; + if (Legal->getMaxSafeDepDistBytes() != -1U) + MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; + WidestRegister = ((WidestRegister < MaxSafeDepDist) ? 
+ WidestRegister : MaxSafeDepDist); unsigned MaxVectorSize = WidestRegister / WidestType; DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n"); DEBUG(dbgs() << "LV: The Widest register is:" << WidestRegister << "bits.\n"); @@ -3657,6 +4213,10 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, if (OptForSize) return 1; + // We used the distance for the unroll factor. + if (Legal->getMaxSafeDepDistBytes() != -1U) + return 1; + // Do not unroll loops with a relatively small trip count. unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch()); @@ -3845,7 +4405,7 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { continue; unsigned C = getInstructionCost(it, VF); - Cost += C; + BlockCost += C; DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF " << VF << " For instruction: "<< *it << "\n"); } @@ -3853,7 +4413,7 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { // We assume that if-converted blocks have a 50% chance of being executed. // When the code is scalar then some of the blocks are avoided due to CF. // When the code is vectorized we execute all code paths. - if (Legal->blockNeedsPredication(*bb) && VF == 1) + if (VF == 1 && Legal->blockNeedsPredication(*bb)) BlockCost /= 2; Cost += BlockCost; @@ -3862,6 +4422,59 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { return Cost; } +/// \brief Check whether the address computation for a non-consecutive memory +/// access looks like an unlikely candidate for being merged into the indexing +/// mode. +/// +/// We look for a GEP which has one index that is an induction variable and all +/// other indices are loop invariant. If the stride of this access is also +/// within a small bound we decide that this address computation can likely be +/// merged into the addressing mode. +/// In all other cases, we identify the address computation as complex. +static bool isLikelyComplexAddressComputation(Value *Ptr, + LoopVectorizationLegality *Legal, + ScalarEvolution *SE, + const Loop *TheLoop) { + GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); + if (!Gep) + return true; + + // We are looking for a gep with all loop invariant indices except for one + // which should be an induction variable. + unsigned NumOperands = Gep->getNumOperands(); + for (unsigned i = 1; i < NumOperands; ++i) { + Value *Opd = Gep->getOperand(i); + if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && + !Legal->isInductionVariable(Opd)) + return true; + } + + // Now we know we have a GEP ptr, %inv, %ind, %inv. Make sure that the step + // can likely be merged into the address computation. + unsigned MaxMergeDistance = 64; + + const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Ptr)); + if (!AddRec) + return true; + + // Check the step is constant. + const SCEV *Step = AddRec->getStepRecurrence(*SE); + // Calculate the pointer stride and check if it is consecutive. + const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); + if (!C) + return true; + + const APInt &APStepVal = C->getValue()->getValue(); + + // Huge step value - give up. 
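Once the GEP shape test above has passed, the rest of the heuristic (continued below) reduces to one comparison on the stride: the address computation is treated as cheap only when the pointer advances by a small, known constant each iteration. A standalone sketch under that assumption; the names and the 64-byte bound echo the hunk, and nothing here is LLVM API:

#include <cstdint>
#include <optional>

constexpr int64_t MaxMergeDistance = 64;

// Returns true when the address computation should be costed as "complex",
// i.e. unlikely to fold into the addressing mode: the per-iteration step is
// unknown, non-constant, or larger than the merge bound.
bool likelyComplexAddressComputation(std::optional<int64_t> ConstantStepBytes) {
  if (!ConstantStepBytes)
    return true;
  return *ConstantStepBytes > MaxMergeDistance;
}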
+ if (APStepVal.getBitWidth() > 64) + return true; + + int64_t StepVal = APStepVal.getSExtValue(); + + return StepVal > MaxMergeDistance; +} + unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { // If we know that this instruction will remain uniform, check the cost of @@ -3957,6 +4570,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ValTy); unsigned VectorElementSize = DL->getTypeStoreSize(VectorTy)/VF; if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) { + bool IsComplexComputation = + isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop); unsigned Cost = 0; // The cost of extracting from the value vector and pointer vector. Type *PtrTy = ToVectorTy(Ptr->getType(), VF); @@ -3972,7 +4587,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { } // The cost of the scalar loads/stores. - Cost += VF * TTI.getAddressComputationCost(ValTy->getScalarType()); + Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation); Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, AS); return Cost; @@ -4053,7 +4668,6 @@ Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) { char LoopVectorize::ID = 0; static const char lv_name[] = "Loop Vectorization"; INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 40e0098..9312b4b 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -16,19 +16,23 @@ // //===----------------------------------------------------------------------===// #define SV_NAME "slp-vectorizer" -#define DEBUG_TYPE SV_NAME +#define DEBUG_TYPE "SLP" -#include "VecUtils.h" #include "llvm/Transforms/Vectorize.h" #include "llvm/ADT/MapVector.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SetVector.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/Verifier.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" @@ -36,19 +40,1515 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include <algorithm> #include <map> using namespace llvm; static cl::opt<int> -SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, - cl::desc("Only vectorize trees if the gain is above this " - "number. (gain = -cost of vectorization)")); + SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, + cl::desc("Only vectorize if you gain more than this " + "number ")); namespace { +static const unsigned MinVecRegSize = 128; + +static const unsigned RecursionMaxDepth = 12; + +/// RAII pattern to save the insertion point of the IR builder. 
+class BuilderLocGuard { +public: + BuilderLocGuard(IRBuilder<> &B) : Builder(B), Loc(B.GetInsertPoint()), + DbgLoc(B.getCurrentDebugLocation()) {} + ~BuilderLocGuard() { + Builder.SetCurrentDebugLocation(DbgLoc); + if (Loc) + Builder.SetInsertPoint(Loc); + } + +private: + // Prevent copying. + BuilderLocGuard(const BuilderLocGuard &); + BuilderLocGuard &operator=(const BuilderLocGuard &); + IRBuilder<> &Builder; + AssertingVH<Instruction> Loc; + DebugLoc DbgLoc; +}; + +/// A helper class for numbering instructions in multible blocks. +/// Numbers starts at zero for each basic block. +struct BlockNumbering { + + BlockNumbering(BasicBlock *Bb) : BB(Bb), Valid(false) {} + + BlockNumbering() : BB(0), Valid(false) {} + + void numberInstructions() { + unsigned Loc = 0; + InstrIdx.clear(); + InstrVec.clear(); + // Number the instructions in the block. + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + InstrIdx[it] = Loc++; + InstrVec.push_back(it); + assert(InstrVec[InstrIdx[it]] == it && "Invalid allocation"); + } + Valid = true; + } + + int getIndex(Instruction *I) { + assert(I->getParent() == BB && "Invalid instruction"); + if (!Valid) + numberInstructions(); + assert(InstrIdx.count(I) && "Unknown instruction"); + return InstrIdx[I]; + } + + Instruction *getInstruction(unsigned loc) { + if (!Valid) + numberInstructions(); + assert(InstrVec.size() > loc && "Invalid Index"); + return InstrVec[loc]; + } + + void forget() { Valid = false; } + +private: + /// The block we are numbering. + BasicBlock *BB; + /// Is the block numbered. + bool Valid; + /// Maps instructions to numbers and back. + SmallDenseMap<Instruction *, int> InstrIdx; + /// Maps integers to Instructions. + SmallVector<Instruction *, 32> InstrVec; +}; + +/// \returns the parent basic block if all of the instructions in \p VL +/// are in the same block or null otherwise. +static BasicBlock *getSameBlock(ArrayRef<Value *> VL) { + Instruction *I0 = dyn_cast<Instruction>(VL[0]); + if (!I0) + return 0; + BasicBlock *BB = I0->getParent(); + for (int i = 1, e = VL.size(); i < e; i++) { + Instruction *I = dyn_cast<Instruction>(VL[i]); + if (!I) + return 0; + + if (BB != I->getParent()) + return 0; + } + return BB; +} + +/// \returns True if all of the values in \p VL are constants. +static bool allConstant(ArrayRef<Value *> VL) { + for (unsigned i = 0, e = VL.size(); i < e; ++i) + if (!isa<Constant>(VL[i])) + return false; + return true; +} + +/// \returns True if all of the values in \p VL are identical. +static bool isSplat(ArrayRef<Value *> VL) { + for (unsigned i = 1, e = VL.size(); i < e; ++i) + if (VL[i] != VL[0]) + return false; + return true; +} + +/// \returns The opcode if all of the Instructions in \p VL have the same +/// opcode, or zero. +static unsigned getSameOpcode(ArrayRef<Value *> VL) { + Instruction *I0 = dyn_cast<Instruction>(VL[0]); + if (!I0) + return 0; + unsigned Opcode = I0->getOpcode(); + for (int i = 1, e = VL.size(); i < e; i++) { + Instruction *I = dyn_cast<Instruction>(VL[i]); + if (!I || Opcode != I->getOpcode()) + return 0; + } + return Opcode; +} + +/// \returns The type that all of the values in \p VL have or null if there +/// are different types. +static Type* getSameType(ArrayRef<Value *> VL) { + Type *Ty = VL[0]->getType(); + for (int i = 1, e = VL.size(); i < e; i++) + if (VL[i]->getType() != Ty) + return 0; + + return Ty; +} + +/// \returns True if the ExtractElement instructions in VL can be vectorized +/// to use the original vector. 
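The function defined next accepts a bundle of extractelement instructions only when they read lanes 0..n-1, in order, from a single source vector of exactly n elements; in that case the original vector can simply be reused. The same check on plain data, where Extract is a hypothetical stand-in for an ExtractElementInst rather than an LLVM type:

#include <cstddef>
#include <vector>

struct Extract { const void *SourceVector; std::size_t Lane; };

// True when the bundle extracts lanes 0..N-1 in order from one vector of
// exactly N elements, so the original vector can be reused unchanged.
bool canReuseExtracts(const std::vector<Extract> &Bundle,
                      std::size_t SourceNumElements) {
  if (Bundle.empty() || SourceNumElements != Bundle.size())
    return false;
  const void *Vec = Bundle.front().SourceVector;
  for (std::size_t i = 0; i < Bundle.size(); ++i)
    if (Bundle[i].SourceVector != Vec || Bundle[i].Lane != i)
      return false;
  return true;
}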
+static bool CanReuseExtract(ArrayRef<Value *> VL) { + assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode"); + // Check if all of the extracts come from the same vector and from the + // correct offset. + Value *VL0 = VL[0]; + ExtractElementInst *E0 = cast<ExtractElementInst>(VL0); + Value *Vec = E0->getOperand(0); + + // We have to extract from the same vector type. + unsigned NElts = Vec->getType()->getVectorNumElements(); + + if (NElts != VL.size()) + return false; + + // Check that all of the indices extract from the correct offset. + ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1)); + if (!CI || CI->getZExtValue()) + return false; + + for (unsigned i = 1, e = VL.size(); i < e; ++i) { + ExtractElementInst *E = cast<ExtractElementInst>(VL[i]); + ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1)); + + if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec) + return false; + } + + return true; +} + +/// Bottom Up SLP Vectorizer. +class BoUpSLP { +public: + typedef SmallVector<Value *, 8> ValueList; + typedef SmallVector<Instruction *, 16> InstrList; + typedef SmallPtrSet<Value *, 16> ValueSet; + typedef SmallVector<StoreInst *, 8> StoreList; + + BoUpSLP(Function *Func, ScalarEvolution *Se, DataLayout *Dl, + TargetTransformInfo *Tti, AliasAnalysis *Aa, LoopInfo *Li, + DominatorTree *Dt) : + F(Func), SE(Se), DL(Dl), TTI(Tti), AA(Aa), LI(Li), DT(Dt), + Builder(Se->getContext()) { + // Setup the block numbering utility for all of the blocks in the + // function. + for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) { + BasicBlock *BB = it; + BlocksNumbers[BB] = BlockNumbering(BB); + } + } + + /// \brief Vectorize the tree that starts with the elements in \p VL. + void vectorizeTree(); + + /// \returns the vectorization cost of the subtree that starts at \p VL. + /// A negative number means that this is profitable. + int getTreeCost(); + + /// Construct a vectorizable tree that starts at \p Roots. + void buildTree(ArrayRef<Value *> Roots); + + /// Clear the internal data structures that are created by 'buildTree'. + void deleteTree() { + VectorizableTree.clear(); + ScalarToTreeEntry.clear(); + MustGather.clear(); + ExternalUses.clear(); + MemBarrierIgnoreList.clear(); + } + + /// \returns true if the memory operations A and B are consecutive. + bool isConsecutiveAccess(Value *A, Value *B); + + /// \brief Perform LICM and CSE on the newly generated gather sequences. + void optimizeGatherSequence(); +private: + struct TreeEntry; + + /// \returns the cost of the vectorizable entry. + int getEntryCost(TreeEntry *E); + + /// This is the recursive part of buildTree. + void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth); + + /// Vectorize a single entry in the tree. + Value *vectorizeTree(TreeEntry *E); + + /// Vectorize a single entry in the tree, starting in \p VL. + Value *vectorizeTree(ArrayRef<Value *> VL); + + /// \returns the pointer to the vectorized value if \p VL is already + /// vectorized, or NULL. They may happen in cycles. + Value *alreadyVectorized(ArrayRef<Value *> VL); + + /// \brief Take the pointer operand from the Load/Store instruction. + /// \returns NULL if this is not a valid Load/Store instruction. + static Value *getPointerOperand(Value *I); + + /// \brief Take the address space operand from the Load/Store instruction. + /// \returns -1 if this is not a valid Load/Store instruction. + static unsigned getAddressSpaceOperand(Value *I); + + /// \returns the scalarization cost for this type. 
Scalarization in this + /// context means the creation of vectors from a group of scalars. + int getGatherCost(Type *Ty); + + /// \returns the scalarization cost for this list of values. Assuming that + /// this subtree gets vectorized, we may need to extract the values from the + /// roots. This method calculates the cost of extracting the values. + int getGatherCost(ArrayRef<Value *> VL); + + /// \returns the AA location that is being access by the instruction. + AliasAnalysis::Location getLocation(Instruction *I); + + /// \brief Checks if it is possible to sink an instruction from + /// \p Src to \p Dst. + /// \returns the pointer to the barrier instruction if we can't sink. + Value *getSinkBarrier(Instruction *Src, Instruction *Dst); + + /// \returns the index of the last instrucion in the BB from \p VL. + int getLastIndex(ArrayRef<Value *> VL); + + /// \returns the Instrucion in the bundle \p VL. + Instruction *getLastInstruction(ArrayRef<Value *> VL); + + /// \returns a vector from a collection of scalars in \p VL. + Value *Gather(ArrayRef<Value *> VL, VectorType *Ty); + + struct TreeEntry { + TreeEntry() : Scalars(), VectorizedValue(0), LastScalarIndex(0), + NeedToGather(0) {} + + /// \returns true if the scalars in VL are equal to this entry. + bool isSame(ArrayRef<Value *> VL) { + assert(VL.size() == Scalars.size() && "Invalid size"); + for (int i = 0, e = VL.size(); i != e; ++i) + if (VL[i] != Scalars[i]) + return false; + return true; + } + + /// A vector of scalars. + ValueList Scalars; + + /// The Scalars are vectorized into this value. It is initialized to Null. + Value *VectorizedValue; + + /// The index in the basic block of the last scalar. + int LastScalarIndex; + + /// Do we need to gather this sequence ? + bool NeedToGather; + }; + + /// Create a new VectorizableTree entry. + TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) { + VectorizableTree.push_back(TreeEntry()); + int idx = VectorizableTree.size() - 1; + TreeEntry *Last = &VectorizableTree[idx]; + Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); + Last->NeedToGather = !Vectorized; + if (Vectorized) { + Last->LastScalarIndex = getLastIndex(VL); + for (int i = 0, e = VL.size(); i != e; ++i) { + assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!"); + ScalarToTreeEntry[VL[i]] = idx; + } + } else { + Last->LastScalarIndex = 0; + MustGather.insert(VL.begin(), VL.end()); + } + return Last; + } + + /// -- Vectorization State -- + /// Holds all of the tree entries. + std::vector<TreeEntry> VectorizableTree; + + /// Maps a specific scalar to its tree entry. + SmallDenseMap<Value*, int> ScalarToTreeEntry; + + /// A list of scalars that we found that we need to keep as scalars. + ValueSet MustGather; + + /// This POD struct describes one external user in the vectorized tree. + struct ExternalUser { + ExternalUser (Value *S, llvm::User *U, int L) : + Scalar(S), User(U), Lane(L){}; + // Which scalar in our function. + Value *Scalar; + // Which user that uses the scalar. + llvm::User *User; + // Which lane does the scalar belong to. + int Lane; + }; + typedef SmallVector<ExternalUser, 16> UserList; + + /// A list of values that need to extracted out of the tree. + /// This list holds pairs of (Internal Scalar : External User). + UserList ExternalUses; + + /// A list of instructions to ignore while sinking + /// memory instructions. This map must be reset between runs of getCost. + ValueSet MemBarrierIgnoreList; + + /// Holds all of the instructions that we gathered. 
+ SetVector<Instruction *> GatherSeq; + + /// Numbers instructions in different blocks. + DenseMap<BasicBlock *, BlockNumbering> BlocksNumbers; + + // Analysis and block reference. + Function *F; + ScalarEvolution *SE; + DataLayout *DL; + TargetTransformInfo *TTI; + AliasAnalysis *AA; + LoopInfo *LI; + DominatorTree *DT; + /// Instruction builder to construct the vectorized tree. + IRBuilder<> Builder; +}; + +void BoUpSLP::buildTree(ArrayRef<Value *> Roots) { + deleteTree(); + if (!getSameType(Roots)) + return; + buildTree_rec(Roots, 0); + + // Collect the values that we need to extract from the tree. + for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) { + TreeEntry *Entry = &VectorizableTree[EIdx]; + + // For each lane: + for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { + Value *Scalar = Entry->Scalars[Lane]; + + // No need to handle users of gathered values. + if (Entry->NeedToGather) + continue; + + for (Value::use_iterator User = Scalar->use_begin(), + UE = Scalar->use_end(); User != UE; ++User) { + DEBUG(dbgs() << "SLP: Checking user:" << **User << ".\n"); + + bool Gathered = MustGather.count(*User); + + // Skip in-tree scalars that become vectors. + if (ScalarToTreeEntry.count(*User) && !Gathered) { + DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << + **User << ".\n"); + int Idx = ScalarToTreeEntry[*User]; (void) Idx; + assert(!VectorizableTree[Idx].NeedToGather && "Bad state"); + continue; + } + + if (!isa<Instruction>(*User)) + continue; + + DEBUG(dbgs() << "SLP: Need to extract:" << **User << " from lane " << + Lane << " from " << *Scalar << ".\n"); + ExternalUses.push_back(ExternalUser(Scalar, *User, Lane)); + } + } + } +} + + +void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { + bool SameTy = getSameType(VL); (void)SameTy; + assert(SameTy && "Invalid types!"); + + if (Depth == RecursionMaxDepth) { + DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); + newTreeEntry(VL, false); + return; + } + + // Don't handle vectors. + if (VL[0]->getType()->isVectorTy()) { + DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); + newTreeEntry(VL, false); + return; + } + + if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) + if (SI->getValueOperand()->getType()->isVectorTy()) { + DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); + newTreeEntry(VL, false); + return; + } + + // If all of the operands are identical or constant we have a simple solution. + if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || + !getSameOpcode(VL)) { + DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); + newTreeEntry(VL, false); + return; + } + + // We now know that this is a vector of instructions of the same type from + // the same block. + + // Check if this is a duplicate of another entry. + if (ScalarToTreeEntry.count(VL[0])) { + int Idx = ScalarToTreeEntry[VL[0]]; + TreeEntry *E = &VectorizableTree[Idx]; + for (unsigned i = 0, e = VL.size(); i != e; ++i) { + DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n"); + if (E->Scalars[i] != VL[i]) { + DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); + newTreeEntry(VL, false); + return; + } + } + DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n"); + return; + } + + // Check that none of the instructions in the bundle are already in the tree. 
+ for (unsigned i = 0, e = VL.size(); i != e; ++i) { + if (ScalarToTreeEntry.count(VL[i])) { + DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << + ") is already in tree.\n"); + newTreeEntry(VL, false); + return; + } + } + + // If any of the scalars appears in the table OR it is marked as a value that + // needs to stat scalar then we need to gather the scalars. + for (unsigned i = 0, e = VL.size(); i != e; ++i) { + if (ScalarToTreeEntry.count(VL[i]) || MustGather.count(VL[i])) { + DEBUG(dbgs() << "SLP: Gathering due to gathered scalar. \n"); + newTreeEntry(VL, false); + return; + } + } + + // Check that all of the users of the scalars that we want to vectorize are + // schedulable. + Instruction *VL0 = cast<Instruction>(VL[0]); + int MyLastIndex = getLastIndex(VL); + BasicBlock *BB = cast<Instruction>(VL0)->getParent(); + + for (unsigned i = 0, e = VL.size(); i != e; ++i) { + Instruction *Scalar = cast<Instruction>(VL[i]); + DEBUG(dbgs() << "SLP: Checking users of " << *Scalar << ". \n"); + for (Value::use_iterator U = Scalar->use_begin(), UE = Scalar->use_end(); + U != UE; ++U) { + DEBUG(dbgs() << "SLP: \tUser " << **U << ". \n"); + Instruction *User = dyn_cast<Instruction>(*U); + if (!User) { + DEBUG(dbgs() << "SLP: Gathering due unknown user. \n"); + newTreeEntry(VL, false); + return; + } + + // We don't care if the user is in a different basic block. + BasicBlock *UserBlock = User->getParent(); + if (UserBlock != BB) { + DEBUG(dbgs() << "SLP: User from a different basic block " + << *User << ". \n"); + continue; + } + + // If this is a PHINode within this basic block then we can place the + // extract wherever we want. + if (isa<PHINode>(*User)) { + DEBUG(dbgs() << "SLP: \tWe can schedule PHIs:" << *User << ". \n"); + continue; + } + + // Check if this is a safe in-tree user. + if (ScalarToTreeEntry.count(User)) { + int Idx = ScalarToTreeEntry[User]; + int VecLocation = VectorizableTree[Idx].LastScalarIndex; + if (VecLocation <= MyLastIndex) { + DEBUG(dbgs() << "SLP: Gathering due to unschedulable vector. \n"); + newTreeEntry(VL, false); + return; + } + DEBUG(dbgs() << "SLP: In-tree user (" << *User << ") at #" << + VecLocation << " vector value (" << *Scalar << ") at #" + << MyLastIndex << ".\n"); + continue; + } + + // Make sure that we can schedule this unknown user. + BlockNumbering &BN = BlocksNumbers[BB]; + int UserIndex = BN.getIndex(User); + if (UserIndex < MyLastIndex) { + + DEBUG(dbgs() << "SLP: Can't schedule extractelement for " + << *User << ". \n"); + newTreeEntry(VL, false); + return; + } + } + } + + // Check that every instructions appears once in this bundle. + for (unsigned i = 0, e = VL.size(); i < e; ++i) + for (unsigned j = i+1; j < e; ++j) + if (VL[i] == VL[j]) { + DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); + newTreeEntry(VL, false); + return; + } + + // Check that instructions in this bundle don't reference other instructions. + // The runtime of this check is O(N * N-1 * uses(N)) and a typical N is 4. + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + for (Value::use_iterator U = VL[i]->use_begin(), UE = VL[i]->use_end(); + U != UE; ++U) { + for (unsigned j = 0; j < e; ++j) { + if (i != j && *U == VL[j]) { + DEBUG(dbgs() << "SLP: Intra-bundle dependencies!" << **U << ". \n"); + newTreeEntry(VL, false); + return; + } + } + } + } + + DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); + + unsigned Opcode = getSameOpcode(VL); + + // Check if it is safe to sink the loads or the stores. 
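For loads and stores, the code below walks every bundle member down to the last one and asks getSinkBarrier whether any instruction in between blocks the move. Stripped of IR, the scan looks roughly like this; MemOp and the integer object identifier are hypothetical, and the ignore list for already-handled stores is omitted:

#include <cstddef>
#include <vector>

// Simplified model of a memory access: what it does and which object it
// touches; -1 means "unknown, may touch anything".
struct MemOp { bool Reads; bool Writes; int Object; };

bool mayAlias(const MemOp &A, const MemOp &B) {
  return A.Object < 0 || B.Object < 0 || A.Object == B.Object;
}

// Returns the index of the first instruction strictly between Src and Dst that
// blocks sinking Src down to Dst, or -1 if the move is safe: a store being
// sunk must not cross any aliasing access, and a load must not cross an
// aliasing store.
int findSinkBarrier(const std::vector<MemOp> &Block, std::size_t Src,
                    std::size_t Dst) {
  for (std::size_t i = Src + 1; i < Dst; ++i) {
    bool Conflicts = Block[Src].Writes ? (Block[i].Reads || Block[i].Writes)
                                       : Block[i].Writes;
    if (Conflicts && mayAlias(Block[Src], Block[i]))
      return static_cast<int>(i);
  }
  return -1;
}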
+ if (Opcode == Instruction::Load || Opcode == Instruction::Store) { + Instruction *Last = getLastInstruction(VL); + + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + if (VL[i] == Last) + continue; + Value *Barrier = getSinkBarrier(cast<Instruction>(VL[i]), Last); + if (Barrier) { + DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " << *Last + << "\n because of " << *Barrier << ". Gathering.\n"); + newTreeEntry(VL, false); + return; + } + } + } + + switch (Opcode) { + case Instruction::PHI: { + PHINode *PH = dyn_cast<PHINode>(VL0); + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); + + for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast<PHINode>(VL[j])->getIncomingValue(i)); + + buildTree_rec(Operands, Depth + 1); + } + return; + } + case Instruction::ExtractElement: { + bool Reuse = CanReuseExtract(VL); + if (Reuse) { + DEBUG(dbgs() << "SLP: Reusing extract sequence.\n"); + } + newTreeEntry(VL, Reuse); + return; + } + case Instruction::Load: { + // Check if the loads are consecutive or of we need to swizzle them. + for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) + if (!isConsecutiveAccess(VL[i], VL[i + 1])) { + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: Need to swizzle loads.\n"); + return; + } + + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of loads.\n"); + return; + } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + Type *SrcTy = VL0->getOperand(0)->getType(); + for (unsigned i = 0; i < VL.size(); ++i) { + Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType(); + if (Ty != SrcTy || Ty->isAggregateType() || Ty->isVectorTy()) { + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); + return; + } + } + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of casts.\n"); + + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); + + buildTree_rec(Operands, Depth+1); + } + return; + } + case Instruction::ICmp: + case Instruction::FCmp: { + // Check that all of the compares have the same predicate. + CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate(); + Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType(); + for (unsigned i = 1, e = VL.size(); i < e; ++i) { + CmpInst *Cmp = cast<CmpInst>(VL[i]); + if (Cmp->getPredicate() != P0 || + Cmp->getOperand(0)->getType() != ComparedTy) { + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); + return; + } + } + + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of compares.\n"); + + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. 
+ for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); + + buildTree_rec(Operands, Depth+1); + } + return; + } + case Instruction::Select: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); + + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); + + buildTree_rec(Operands, Depth+1); + } + return; + } + case Instruction::Store: { + // Check if the stores are consecutive or of we need to swizzle them. + for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) + if (!isConsecutiveAccess(VL[i], VL[i + 1])) { + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: Non consecutive store.\n"); + return; + } + + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of stores.\n"); + + ValueList Operands; + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast<Instruction>(VL[j])->getOperand(0)); + + // We can ignore these values because we are sinking them down. + MemBarrierIgnoreList.insert(VL.begin(), VL.end()); + buildTree_rec(Operands, Depth + 1); + return; + } + default: + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); + return; + } +} + +int BoUpSLP::getEntryCost(TreeEntry *E) { + ArrayRef<Value*> VL = E->Scalars; + + Type *ScalarTy = VL[0]->getType(); + if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) + ScalarTy = SI->getValueOperand()->getType(); + VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + + if (E->NeedToGather) { + if (allConstant(VL)) + return 0; + if (isSplat(VL)) { + return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); + } + return getGatherCost(E->Scalars); + } + + assert(getSameOpcode(VL) && getSameType(VL) && getSameBlock(VL) && + "Invalid VL"); + Instruction *VL0 = cast<Instruction>(VL[0]); + unsigned Opcode = VL0->getOpcode(); + switch (Opcode) { + case Instruction::PHI: { + return 0; + } + case Instruction::ExtractElement: { + if (CanReuseExtract(VL)) + return 0; + return getGatherCost(VecTy); + } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + Type *SrcTy = VL0->getOperand(0)->getType(); + + // Calculate the cost of this instruction. 
+ int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(), + VL0->getType(), SrcTy); + + VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); + int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy); + return VecCost - ScalarCost; + } + case Instruction::FCmp: + case Instruction::ICmp: + case Instruction::Select: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + // Calculate the cost of this instruction. + int ScalarCost = 0; + int VecCost = 0; + if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp || + Opcode == Instruction::Select) { + VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); + ScalarCost = VecTy->getNumElements() * + TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty()); + VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy); + } else { + ScalarCost = VecTy->getNumElements() * + TTI->getArithmeticInstrCost(Opcode, ScalarTy); + VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy); + } + return VecCost - ScalarCost; + } + case Instruction::Load: { + // Cost of wide load - cost of scalar loads. + int ScalarLdCost = VecTy->getNumElements() * + TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); + int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); + return VecLdCost - ScalarLdCost; + } + case Instruction::Store: { + // We know that we can merge the stores. Calculate the cost. + int ScalarStCost = VecTy->getNumElements() * + TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0); + int VecStCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0); + return VecStCost - ScalarStCost; + } + default: + llvm_unreachable("Unknown instruction"); + } +} + +int BoUpSLP::getTreeCost() { + int Cost = 0; + DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << + VectorizableTree.size() << ".\n"); + + // Don't vectorize tiny trees. Small load/store chains or consecutive stores + // of constants will be vectoried in SelectionDAG in MergeConsecutiveStores. + // The SelectionDAG vectorizer can only handle pairs (trees of height = 2). 
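The total reported below is the sum of per-bundle deltas plus the cost of extracting every value that escapes the tree, and trees of fewer than three bundles are skipped on the assumption that SelectionDAG already merges such pairs. A toy version of that accounting, with all names hypothetical:

#include <numeric>
#include <vector>

// Per-bundle costs are "vector minus scalar", so a negative total means the
// vectorized tree is expected to be cheaper than the scalar code.
int treeCost(const std::vector<int> &BundleCosts,
             const std::vector<int> &ExtractCosts) {
  if (BundleCosts.size() < 3)
    return 0; // tiny tree: leave it to the SelectionDAG store merger
  return std::accumulate(BundleCosts.begin(), BundleCosts.end(), 0) +
         std::accumulate(ExtractCosts.begin(), ExtractCosts.end(), 0);
}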
+ if (VectorizableTree.size() < 3) { + if (!VectorizableTree.size()) { + assert(!ExternalUses.size() && "We should not have any external users"); + } + return 0; + } + + unsigned BundleWidth = VectorizableTree[0].Scalars.size(); + + for (unsigned i = 0, e = VectorizableTree.size(); i != e; ++i) { + int C = getEntryCost(&VectorizableTree[i]); + DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with " + << *VectorizableTree[i].Scalars[0] << " .\n"); + Cost += C; + } + + int ExtractCost = 0; + for (UserList::iterator I = ExternalUses.begin(), E = ExternalUses.end(); + I != E; ++I) { + + VectorType *VecTy = VectorType::get(I->Scalar->getType(), BundleWidth); + ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, + I->Lane); + } + + + DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost<< ".\n"); + return Cost + ExtractCost; +} + +int BoUpSLP::getGatherCost(Type *Ty) { + int Cost = 0; + for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i) + Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); + return Cost; +} + +int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) { + // Find the type of the operands in VL. + Type *ScalarTy = VL[0]->getType(); + if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) + ScalarTy = SI->getValueOperand()->getType(); + VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + // Find the cost of inserting/extracting values from the vector. + return getGatherCost(VecTy); +} + +AliasAnalysis::Location BoUpSLP::getLocation(Instruction *I) { + if (StoreInst *SI = dyn_cast<StoreInst>(I)) + return AA->getLocation(SI); + if (LoadInst *LI = dyn_cast<LoadInst>(I)) + return AA->getLocation(LI); + return AliasAnalysis::Location(); +} + +Value *BoUpSLP::getPointerOperand(Value *I) { + if (LoadInst *LI = dyn_cast<LoadInst>(I)) + return LI->getPointerOperand(); + if (StoreInst *SI = dyn_cast<StoreInst>(I)) + return SI->getPointerOperand(); + return 0; +} + +unsigned BoUpSLP::getAddressSpaceOperand(Value *I) { + if (LoadInst *L = dyn_cast<LoadInst>(I)) + return L->getPointerAddressSpace(); + if (StoreInst *S = dyn_cast<StoreInst>(I)) + return S->getPointerAddressSpace(); + return -1; +} + +bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) { + Value *PtrA = getPointerOperand(A); + Value *PtrB = getPointerOperand(B); + unsigned ASA = getAddressSpaceOperand(A); + unsigned ASB = getAddressSpaceOperand(B); + + // Check that the address spaces match and that the pointers are valid. + if (!PtrA || !PtrB || (ASA != ASB)) + return false; + + // Make sure that A and B are different pointers of the same type. + if (PtrA == PtrB || PtrA->getType() != PtrB->getType()) + return false; + + // Calculate a constant offset from the base pointer without using SCEV + // in the supported cases. + // TODO: Add support for the case where one of the pointers is a GEP that + // uses the other pointer. + GetElementPtrInst *GepA = dyn_cast<GetElementPtrInst>(PtrA); + GetElementPtrInst *GepB = dyn_cast<GetElementPtrInst>(PtrB); + + unsigned BW = DL->getPointerSizeInBits(ASA); + Type *Ty = cast<PointerType>(PtrA->getType())->getElementType(); + int64_t Sz = DL->getTypeStoreSize(Ty); + + // Check if PtrA is the base and PtrB is a constant offset. + if (GepB && GepB->getPointerOperand() == PtrA) { + APInt Offset(BW, 0); + if (GepB->accumulateConstantOffset(*DL, Offset)) + return Offset.getSExtValue() == Sz; + return false; + } + + // Check if PtrB is the base and PtrA is a constant offset. 
+ if (GepA && GepA->getPointerOperand() == PtrB) { + APInt Offset(BW, 0); + if (GepA->accumulateConstantOffset(*DL, Offset)) + return Offset.getSExtValue() == -Sz; + return false; + } + + // If both pointers are GEPs: + if (GepA && GepB) { + // Check that they have the same base pointer and number of indices. + if (GepA->getPointerOperand() != GepB->getPointerOperand() || + GepA->getNumIndices() != GepB->getNumIndices()) + return false; + + // Try to strip the geps. This makes SCEV faster. + // Make sure that all of the indices except for the last are identical. + int LastIdx = GepA->getNumIndices(); + for (int i = 0; i < LastIdx - 1; i++) { + if (GepA->getOperand(i+1) != GepB->getOperand(i+1)) + return false; + } + + PtrA = GepA->getOperand(LastIdx); + PtrB = GepB->getOperand(LastIdx); + Sz = 1; + } + + ConstantInt *CA = dyn_cast<ConstantInt>(PtrA); + ConstantInt *CB = dyn_cast<ConstantInt>(PtrB); + if (CA && CB) { + return (CA->getSExtValue() + Sz == CB->getSExtValue()); + } + + // Calculate the distance. + const SCEV *PtrSCEVA = SE->getSCEV(PtrA); + const SCEV *PtrSCEVB = SE->getSCEV(PtrB); + const SCEV *C = SE->getConstant(PtrSCEVA->getType(), Sz); + const SCEV *X = SE->getAddExpr(PtrSCEVA, C); + return X == PtrSCEVB; +} + +Value *BoUpSLP::getSinkBarrier(Instruction *Src, Instruction *Dst) { + assert(Src->getParent() == Dst->getParent() && "Not the same BB"); + BasicBlock::iterator I = Src, E = Dst; + /// Scan all of the instruction from SRC to DST and check if + /// the source may alias. + for (++I; I != E; ++I) { + // Ignore store instructions that are marked as 'ignore'. + if (MemBarrierIgnoreList.count(I)) + continue; + if (Src->mayWriteToMemory()) /* Write */ { + if (!I->mayReadOrWriteMemory()) + continue; + } else /* Read */ { + if (!I->mayWriteToMemory()) + continue; + } + AliasAnalysis::Location A = getLocation(&*I); + AliasAnalysis::Location B = getLocation(Src); + + if (!A.Ptr || !B.Ptr || AA->alias(A, B)) + return I; + } + return 0; +} + +int BoUpSLP::getLastIndex(ArrayRef<Value *> VL) { + BasicBlock *BB = cast<Instruction>(VL[0])->getParent(); + assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block"); + BlockNumbering &BN = BlocksNumbers[BB]; + + int MaxIdx = BN.getIndex(BB->getFirstNonPHI()); + for (unsigned i = 0, e = VL.size(); i < e; ++i) + MaxIdx = std::max(MaxIdx, BN.getIndex(cast<Instruction>(VL[i]))); + return MaxIdx; +} + +Instruction *BoUpSLP::getLastInstruction(ArrayRef<Value *> VL) { + BasicBlock *BB = cast<Instruction>(VL[0])->getParent(); + assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block"); + BlockNumbering &BN = BlocksNumbers[BB]; + + int MaxIdx = BN.getIndex(cast<Instruction>(VL[0])); + for (unsigned i = 1, e = VL.size(); i < e; ++i) + MaxIdx = std::max(MaxIdx, BN.getIndex(cast<Instruction>(VL[i]))); + Instruction *I = BN.getInstruction(MaxIdx); + assert(I && "bad location"); + return I; +} + +Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) { + Value *Vec = UndefValue::get(Ty); + // Generate the 'InsertElement' instruction. + for (unsigned i = 0; i < Ty->getNumElements(); ++i) { + Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i)); + if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) { + GatherSeq.insert(Insrt); + + // Add to our 'need-to-extract' list. + if (ScalarToTreeEntry.count(VL[i])) { + int Idx = ScalarToTreeEntry[VL[i]]; + TreeEntry *E = &VectorizableTree[Idx]; + // Find which lane we need to extract. 
+ int FoundLane = -1; + for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) { + // Is this the lane of the scalar that we are looking for ? + if (E->Scalars[Lane] == VL[i]) { + FoundLane = Lane; + break; + } + } + assert(FoundLane >= 0 && "Could not find the correct lane"); + ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane)); + } + } + } + + return Vec; +} + +Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) { + if (ScalarToTreeEntry.count(VL[0])) { + int Idx = ScalarToTreeEntry[VL[0]]; + TreeEntry *En = &VectorizableTree[Idx]; + if (En->isSame(VL) && En->VectorizedValue) + return En->VectorizedValue; + } + return 0; +} + +Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { + if (ScalarToTreeEntry.count(VL[0])) { + int Idx = ScalarToTreeEntry[VL[0]]; + TreeEntry *E = &VectorizableTree[Idx]; + if (E->isSame(VL)) + return vectorizeTree(E); + } + + Type *ScalarTy = VL[0]->getType(); + if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) + ScalarTy = SI->getValueOperand()->getType(); + VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + + return Gather(VL, VecTy); +} + +Value *BoUpSLP::vectorizeTree(TreeEntry *E) { + BuilderLocGuard Guard(Builder); + + if (E->VectorizedValue) { + DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); + return E->VectorizedValue; + } + + Type *ScalarTy = E->Scalars[0]->getType(); + if (StoreInst *SI = dyn_cast<StoreInst>(E->Scalars[0])) + ScalarTy = SI->getValueOperand()->getType(); + VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size()); + + if (E->NeedToGather) { + return Gather(E->Scalars, VecTy); + } + + Instruction *VL0 = cast<Instruction>(E->Scalars[0]); + unsigned Opcode = VL0->getOpcode(); + assert(Opcode == getSameOpcode(E->Scalars) && "Invalid opcode"); + + switch (Opcode) { + case Instruction::PHI: { + PHINode *PH = dyn_cast<PHINode>(VL0); + Builder.SetInsertPoint(PH->getParent()->getFirstInsertionPt()); + Builder.SetCurrentDebugLocation(PH->getDebugLoc()); + PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); + E->VectorizedValue = NewPhi; + + for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { + ValueList Operands; + BasicBlock *IBB = PH->getIncomingBlock(i); + + // Prepare the operand vector. 
+ for (unsigned j = 0; j < E->Scalars.size(); ++j) + Operands.push_back(cast<PHINode>(E->Scalars[j])-> + getIncomingValueForBlock(IBB)); + + Builder.SetInsertPoint(IBB->getTerminator()); + Builder.SetCurrentDebugLocation(PH->getDebugLoc()); + Value *Vec = vectorizeTree(Operands); + NewPhi->addIncoming(Vec, IBB); + } + + assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() && + "Invalid number of incoming values"); + return NewPhi; + } + + case Instruction::ExtractElement: { + if (CanReuseExtract(E->Scalars)) { + Value *V = VL0->getOperand(0); + E->VectorizedValue = V; + return V; + } + return Gather(E->Scalars, VecTy); + } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + ValueList INVL; + for (int i = 0, e = E->Scalars.size(); i < e; ++i) + INVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); + + Builder.SetInsertPoint(getLastInstruction(E->Scalars)); + Builder.SetCurrentDebugLocation(VL0->getDebugLoc()); + + Value *InVec = vectorizeTree(INVL); + + if (Value *V = alreadyVectorized(E->Scalars)) + return V; + + CastInst *CI = dyn_cast<CastInst>(VL0); + Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); + E->VectorizedValue = V; + return V; + } + case Instruction::FCmp: + case Instruction::ICmp: { + ValueList LHSV, RHSV; + for (int i = 0, e = E->Scalars.size(); i < e; ++i) { + LHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); + RHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); + } + + Builder.SetInsertPoint(getLastInstruction(E->Scalars)); + Builder.SetCurrentDebugLocation(VL0->getDebugLoc()); + + Value *L = vectorizeTree(LHSV); + Value *R = vectorizeTree(RHSV); + + if (Value *V = alreadyVectorized(E->Scalars)) + return V; + + CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate(); + Value *V; + if (Opcode == Instruction::FCmp) + V = Builder.CreateFCmp(P0, L, R); + else + V = Builder.CreateICmp(P0, L, R); + + E->VectorizedValue = V; + return V; + } + case Instruction::Select: { + ValueList TrueVec, FalseVec, CondVec; + for (int i = 0, e = E->Scalars.size(); i < e; ++i) { + CondVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); + TrueVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); + FalseVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(2)); + } + + Builder.SetInsertPoint(getLastInstruction(E->Scalars)); + Builder.SetCurrentDebugLocation(VL0->getDebugLoc()); + + Value *Cond = vectorizeTree(CondVec); + Value *True = vectorizeTree(TrueVec); + Value *False = vectorizeTree(FalseVec); + + if (Value *V = alreadyVectorized(E->Scalars)) + return V; + + Value *V = Builder.CreateSelect(Cond, True, False); + E->VectorizedValue = V; + return V; + } + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + ValueList LHSVL, RHSVL; + for (int i = 0, e = E->Scalars.size(); i < e; ++i) { + 
LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); + RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); + } + + Builder.SetInsertPoint(getLastInstruction(E->Scalars)); + Builder.SetCurrentDebugLocation(VL0->getDebugLoc()); + + Value *LHS = vectorizeTree(LHSVL); + Value *RHS = vectorizeTree(RHSVL); + + if (LHS == RHS && isa<Instruction>(LHS)) { + assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order"); + } + + if (Value *V = alreadyVectorized(E->Scalars)) + return V; + + BinaryOperator *BinOp = cast<BinaryOperator>(VL0); + Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS); + E->VectorizedValue = V; + return V; + } + case Instruction::Load: { + // Loads are inserted at the head of the tree because we don't want to + // sink them all the way down past store instructions. + Builder.SetInsertPoint(getLastInstruction(E->Scalars)); + Builder.SetCurrentDebugLocation(VL0->getDebugLoc()); + + LoadInst *LI = cast<LoadInst>(VL0); + Value *VecPtr = + Builder.CreateBitCast(LI->getPointerOperand(), VecTy->getPointerTo()); + unsigned Alignment = LI->getAlignment(); + LI = Builder.CreateLoad(VecPtr); + LI->setAlignment(Alignment); + E->VectorizedValue = LI; + return LI; + } + case Instruction::Store: { + StoreInst *SI = cast<StoreInst>(VL0); + unsigned Alignment = SI->getAlignment(); + + ValueList ValueOp; + for (int i = 0, e = E->Scalars.size(); i < e; ++i) + ValueOp.push_back(cast<StoreInst>(E->Scalars[i])->getValueOperand()); + + Builder.SetInsertPoint(getLastInstruction(E->Scalars)); + Builder.SetCurrentDebugLocation(VL0->getDebugLoc()); + + Value *VecValue = vectorizeTree(ValueOp); + Value *VecPtr = + Builder.CreateBitCast(SI->getPointerOperand(), VecTy->getPointerTo()); + StoreInst *S = Builder.CreateStore(VecValue, VecPtr); + S->setAlignment(Alignment); + E->VectorizedValue = S; + return S; + } + default: + llvm_unreachable("unknown inst"); + } + return 0; +} + +void BoUpSLP::vectorizeTree() { + Builder.SetInsertPoint(F->getEntryBlock().begin()); + vectorizeTree(&VectorizableTree[0]); + + DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n"); + + // Extract all of the elements with the external uses. + for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end(); + it != e; ++it) { + Value *Scalar = it->Scalar; + llvm::User *User = it->User; + + // Skip users that we already RAUW. This happens when one instruction + // has multiple uses of the same value. + if (std::find(Scalar->use_begin(), Scalar->use_end(), User) == + Scalar->use_end()) + continue; + assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar"); + + int Idx = ScalarToTreeEntry[Scalar]; + TreeEntry *E = &VectorizableTree[Idx]; + assert(!E->NeedToGather && "Extracting from a gather list"); + + Value *Vec = E->VectorizedValue; + assert(Vec && "Can't find vectorizable value"); + + Value *Lane = Builder.getInt32(it->Lane); + // Generate extracts for out-of-tree users. + // Find the insertion point for the extractelement lane. 
+ if (PHINode *PN = dyn_cast<PHINode>(Vec)) { + Builder.SetInsertPoint(PN->getParent()->getFirstInsertionPt()); + Value *Ex = Builder.CreateExtractElement(Vec, Lane); + User->replaceUsesOfWith(Scalar, Ex); + } else if (isa<Instruction>(Vec)){ + if (PHINode *PH = dyn_cast<PHINode>(User)) { + for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) { + if (PH->getIncomingValue(i) == Scalar) { + Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator()); + Value *Ex = Builder.CreateExtractElement(Vec, Lane); + PH->setOperand(i, Ex); + } + } + } else { + Builder.SetInsertPoint(cast<Instruction>(User)); + Value *Ex = Builder.CreateExtractElement(Vec, Lane); + User->replaceUsesOfWith(Scalar, Ex); + } + } else { + Builder.SetInsertPoint(F->getEntryBlock().begin()); + Value *Ex = Builder.CreateExtractElement(Vec, Lane); + User->replaceUsesOfWith(Scalar, Ex); + } + + DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n"); + } + + // For each vectorized value: + for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) { + TreeEntry *Entry = &VectorizableTree[EIdx]; + + // For each lane: + for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { + Value *Scalar = Entry->Scalars[Lane]; + + // No need to handle users of gathered values. + if (Entry->NeedToGather) + continue; + + assert(Entry->VectorizedValue && "Can't find vectorizable value"); + + Type *Ty = Scalar->getType(); + if (!Ty->isVoidTy()) { + for (Value::use_iterator User = Scalar->use_begin(), + UE = Scalar->use_end(); User != UE; ++User) { + DEBUG(dbgs() << "SLP: \tvalidating user:" << **User << ".\n"); + assert(!MustGather.count(*User) && + "Replacing gathered value with undef"); + assert(ScalarToTreeEntry.count(*User) && + "Replacing out-of-tree value with undef"); + } + Value *Undef = UndefValue::get(Ty); + Scalar->replaceAllUsesWith(Undef); + } + DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n"); + cast<Instruction>(Scalar)->eraseFromParent(); + } + } + + for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) { + BlocksNumbers[it].forget(); + } + Builder.ClearInsertionPoint(); +} + +void BoUpSLP::optimizeGatherSequence() { + DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size() + << " gather sequences instructions.\n"); + // LICM InsertElementInst sequences. + for (SetVector<Instruction *>::iterator it = GatherSeq.begin(), + e = GatherSeq.end(); it != e; ++it) { + InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it); + + if (!Insert) + continue; + + // Check if this block is inside a loop. + Loop *L = LI->getLoopFor(Insert->getParent()); + if (!L) + continue; + + // Check if it has a preheader. + BasicBlock *PreHeader = L->getLoopPreheader(); + if (!PreHeader) + continue; + + // If the vector or the element that we insert into it are + // instructions that are defined in this basic block then we can't + // hoist this instruction. + Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0)); + Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1)); + if (CurrVec && L->contains(CurrVec)) + continue; + if (NewElem && L->contains(NewElem)) + continue; + + // We can hoist this instruction. Move it to the pre-header. + Insert->moveBefore(PreHeader->getTerminator()); + } + + // Perform O(N^2) search over the gather sequences and merge identical + // instructions. TODO: We can further optimize this scan if we split the + // instructions into different buckets based on the insert lane. 
+ SmallPtrSet<Instruction*, 16> Visited; + SmallVector<Instruction*, 16> ToRemove; + ReversePostOrderTraversal<Function*> RPOT(F); + for (ReversePostOrderTraversal<Function*>::rpo_iterator I = RPOT.begin(), + E = RPOT.end(); I != E; ++I) { + BasicBlock *BB = *I; + // For all instructions in the function: + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + Instruction *In = it; + if ((!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In)) || + !GatherSeq.count(In)) + continue; + + // Check if we can replace this instruction with any of the + // visited instructions. + for (SmallPtrSet<Instruction*, 16>::iterator v = Visited.begin(), + ve = Visited.end(); v != ve; ++v) { + if (In->isIdenticalTo(*v) && + DT->dominates((*v)->getParent(), In->getParent())) { + In->replaceAllUsesWith(*v); + ToRemove.push_back(In); + In = 0; + break; + } + } + if (In) + Visited.insert(In); + } + } + + // Erase all of the instructions that we RAUWed. + for (SmallVectorImpl<Instruction *>::iterator v = ToRemove.begin(), + ve = ToRemove.end(); v != ve; ++v) { + assert((*v)->getNumUses() == 0 && "Can't remove instructions with uses"); + (*v)->eraseFromParent(); + } +} + /// The SLPVectorizer Pass. struct SLPVectorizer : public FunctionPass { - typedef MapVector<Value*, BoUpSLP::StoreList> StoreListMap; + typedef SmallVector<StoreInst *, 8> StoreList; + typedef MapVector<Value *, StoreList> StoreListMap; /// Pass identification, replacement for typeid static char ID; @@ -62,6 +1562,7 @@ struct SLPVectorizer : public FunctionPass { TargetTransformInfo *TTI; AliasAnalysis *AA; LoopInfo *LI; + DominatorTree *DT; virtual bool runOnFunction(Function &F) { SE = &getAnalysis<ScalarEvolution>(); @@ -69,6 +1570,7 @@ struct SLPVectorizer : public FunctionPass { TTI = &getAnalysis<TargetTransformInfo>(); AA = &getAnalysis<AliasAnalysis>(); LI = &getAnalysis<LoopInfo>(); + DT = &getAnalysis<DominatorTree>(); StoreRefs.clear(); bool Changed = false; @@ -78,34 +1580,36 @@ struct SLPVectorizer : public FunctionPass { if (!DL) return false; - DEBUG(dbgs()<<"SLP: Analyzing blocks in " << F.getName() << ".\n"); + // Don't vectorize when the attribute NoImplicitFloat is used. + if (F.getAttributes().hasAttribute(AttributeSet::FunctionIndex, + Attribute::NoImplicitFloat)) + return false; - for (Function::iterator it = F.begin(), e = F.end(); it != e; ++it) { - BasicBlock *BB = it; - bool BBChanged = false; + DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n"); - // Use the bollom up slp vectorizer to construct chains that start with - // he store instructions. - BoUpSLP R(BB, SE, DL, TTI, AA, LI->getLoopFor(BB)); + // Use the bollom up slp vectorizer to construct chains that start with + // he store instructions. + BoUpSLP R(&F, SE, DL, TTI, AA, LI, DT); - // Vectorize trees that end at reductions. - BBChanged |= vectorizeReductions(BB, R); + // Scan the blocks in the function in post order. + for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()), + e = po_end(&F.getEntryBlock()); it != e; ++it) { + BasicBlock *BB = *it; // Vectorize trees that end at stores. if (unsigned count = collectStores(BB, R)) { (void)count; - DEBUG(dbgs()<<"SLP: Found " << count << " stores to vectorize.\n"); - BBChanged |= vectorizeStoreChains(R); + DEBUG(dbgs() << "SLP: Found " << count << " stores to vectorize.\n"); + Changed |= vectorizeStoreChains(R); } - // Try to hoist some of the scalarization code to the preheader. 
- if (BBChanged) hoistGatherSequence(LI, BB, R); - - Changed |= BBChanged; + // Vectorize trees that end at reductions. + Changed |= vectorizeChainsInBlock(BB, R); } if (Changed) { - DEBUG(dbgs()<<"SLP: vectorized \""<<F.getName()<<"\"\n"); + R.optimizeGatherSequence(); + DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n"); DEBUG(verifyFunction(F)); } return Changed; @@ -117,6 +1621,10 @@ struct SLPVectorizer : public FunctionPass { AU.addRequired<AliasAnalysis>(); AU.addRequired<TargetTransformInfo>(); AU.addRequired<LoopInfo>(); + AU.addRequired<DominatorTree>(); + AU.addPreserved<LoopInfo>(); + AU.addPreserved<DominatorTree>(); + AU.setPreservesCFG(); } private: @@ -128,29 +1636,126 @@ private: unsigned collectStores(BasicBlock *BB, BoUpSLP &R); /// \brief Try to vectorize a chain that starts at two arithmetic instrs. - bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R); + bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R); /// \brief Try to vectorize a list of operands. + /// \returns true if a value was vectorized. bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R); /// \brief Try to vectorize a chain that may start at the operands of \V; - bool tryToVectorize(BinaryOperator *V, BoUpSLP &R); + bool tryToVectorize(BinaryOperator *V, BoUpSLP &R); /// \brief Vectorize the stores that were collected in StoreRefs. bool vectorizeStoreChains(BoUpSLP &R); - /// \brief Try to hoist gather sequences outside of the loop in cases where - /// all of the sources are loop invariant. - void hoistGatherSequence(LoopInfo *LI, BasicBlock *BB, BoUpSLP &R); + /// \brief Scan the basic block and look for patterns that are likely to start + /// a vectorization chain. + bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R); - /// \brief Scan the basic block and look for reductions that may start a - /// vectorization chain. - bool vectorizeReductions(BasicBlock *BB, BoUpSLP &R); + bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold, + BoUpSLP &R); + bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold, + BoUpSLP &R); private: StoreListMap StoreRefs; }; +bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain, + int CostThreshold, BoUpSLP &R) { + unsigned ChainLen = Chain.size(); + DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen + << "\n"); + Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType(); + unsigned Sz = DL->getTypeSizeInBits(StoreTy); + unsigned VF = MinVecRegSize / Sz; + + if (!isPowerOf2_32(Sz) || VF < 2) + return false; + + bool Changed = false; + // Look for profitable vectorizable trees at all offsets, starting at zero. + for (unsigned i = 0, e = ChainLen; i < e; ++i) { + if (i + VF > e) + break; + DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i + << "\n"); + ArrayRef<Value *> Operands = Chain.slice(i, VF); + + R.buildTree(Operands); + + int Cost = R.getTreeCost(); + + DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); + if (Cost < CostThreshold) { + DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); + R.vectorizeTree(); + + // Move to the next bundle. + i += VF - 1; + Changed = true; + } + } + + return Changed; +} + +bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores, + int costThreshold, BoUpSLP &R) { + SetVector<Value *> Heads, Tails; + SmallDenseMap<Value *, Value *> ConsecutiveChain; + + // We may run into multiple chains that merge into a single chain. 
We mark the + // stores that we vectorized so that we don't visit the same store twice. + BoUpSLP::ValueSet VectorizedStores; + bool Changed = false; + + // Do a quadratic search on all of the given stores and find + // all of the pairs of stores that follow each other. + for (unsigned i = 0, e = Stores.size(); i < e; ++i) { + for (unsigned j = 0; j < e; ++j) { + if (i == j) + continue; + + if (R.isConsecutiveAccess(Stores[i], Stores[j])) { + Tails.insert(Stores[j]); + Heads.insert(Stores[i]); + ConsecutiveChain[Stores[i]] = Stores[j]; + } + } + } + + // For stores that start but don't end a link in the chain: + for (SetVector<Value *>::iterator it = Heads.begin(), e = Heads.end(); + it != e; ++it) { + if (Tails.count(*it)) + continue; + + // We found a store instr that starts a chain. Now follow the chain and try + // to vectorize it. + BoUpSLP::ValueList Operands; + Value *I = *it; + // Collect the chain into a list. + while (Tails.count(I) || Heads.count(I)) { + if (VectorizedStores.count(I)) + break; + Operands.push_back(I); + // Move to the next value in the chain. + I = ConsecutiveChain[I]; + } + + bool Vectorized = vectorizeStoreChain(Operands, costThreshold, R); + + // Mark the vectorized stores so that we don't vectorize them again. + if (Vectorized) + VectorizedStores.insert(Operands.begin(), Operands.end()); + Changed |= Vectorized; + } + + return Changed; +} + + unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) { unsigned count = 0; StoreRefs.clear(); @@ -176,34 +1781,50 @@ unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) { return count; } -bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { - if (!A || !B) return false; +bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { + if (!A || !B) + return false; Value *VL[] = { A, B }; return tryToVectorizeList(VL, R); } bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R) { - DEBUG(dbgs()<<"SLP: Vectorizing a list of length = " << VL.size() << ".\n"); + if (VL.size() < 2) + return false; + + DEBUG(dbgs() << "SLP: Vectorizing a list of length = " << VL.size() << ".\n"); + + // Check that all of the parts are scalar instructions of the same type. + Instruction *I0 = dyn_cast<Instruction>(VL[0]); + if (!I0) + return 0; + + unsigned Opcode0 = I0->getOpcode(); - // Check that all of the parts are scalar. for (int i = 0, e = VL.size(); i < e; ++i) { Type *Ty = VL[i]->getType(); if (Ty->isAggregateType() || Ty->isVectorTy()) return 0; + Instruction *Inst = dyn_cast<Instruction>(VL[i]); + if (!Inst || Inst->getOpcode() != Opcode0) + return 0; } - int Cost = R.getTreeCost(VL); - int ExtrCost = R.getScalarizationCost(VL); - DEBUG(dbgs()<<"SLP: Cost of pair:" << Cost << - " Cost of extract:" << ExtrCost << ".\n"); - if ((Cost+ExtrCost) >= -SLPCostThreshold) return false; - DEBUG(dbgs()<<"SLP: Vectorizing pair.\n"); - R.vectorizeArith(VL); + R.buildTree(VL); + int Cost = R.getTreeCost(); + + if (Cost >= -SLPCostThreshold) + return false; + + DEBUG(dbgs() << "SLP: Vectorizing pair at cost:" << Cost << ".\n"); + R.vectorizeTree(); return true; } -bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) { - if (!V) return false; +bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) { + if (!V) + return false; + // Try to vectorize V. 
if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R)) return true; @@ -240,25 +1861,51 @@ bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) { return 0; } -bool SLPVectorizer::vectorizeReductions(BasicBlock *BB, BoUpSLP &R) { +bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { bool Changed = false; + SmallVector<Value *, 4> Incoming; + // Collect the incoming values from the PHIs. + for (BasicBlock::iterator instr = BB->begin(), ie = BB->end(); instr != ie; + ++instr) { + PHINode *P = dyn_cast<PHINode>(instr); + + if (!P) + break; + + // Stop constructing the list when you reach a different type. + if (Incoming.size() && P->getType() != Incoming[0]->getType()) { + Changed |= tryToVectorizeList(Incoming, R); + Incoming.clear(); + } + + Incoming.push_back(P); + } + + if (Incoming.size() > 1) + Changed |= tryToVectorizeList(Incoming, R); + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { - if (isa<DbgInfoIntrinsic>(it)) continue; + if (isa<DbgInfoIntrinsic>(it)) + continue; // Try to vectorize reductions that use PHINodes. if (PHINode *P = dyn_cast<PHINode>(it)) { // Check that the PHI is a reduction PHI. - if (P->getNumIncomingValues() != 2) return Changed; - Value *Rdx = (P->getIncomingBlock(0) == BB ? P->getIncomingValue(0) : - (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) : - 0)); + if (P->getNumIncomingValues() != 2) + return Changed; + Value *Rdx = + (P->getIncomingBlock(0) == BB + ? (P->getIncomingValue(0)) + : (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) : 0)); // Check if this is a Binary Operator. BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx); if (!BI) continue; Value *Inst = BI->getOperand(0); - if (Inst == P) Inst = BI->getOperand(1); + if (Inst == P) + Inst = BI->getOperand(1); + Changed |= tryToVectorize(dyn_cast<BinaryOperator>(Inst), R); continue; } @@ -271,7 +1918,8 @@ bool SLPVectorizer::vectorizeReductions(BasicBlock *BB, BoUpSLP &R) { } for (int i = 0; i < 2; ++i) if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) - Changed |= tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R); + Changed |= + tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R); continue; } } @@ -287,51 +1935,19 @@ bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) { if (it->second.size() < 2) continue; - DEBUG(dbgs()<<"SLP: Analyzing a store chain of length " << - it->second.size() << ".\n"); + DEBUG(dbgs() << "SLP: Analyzing a store chain of length " + << it->second.size() << ".\n"); - Changed |= R.vectorizeStores(it->second, -SLPCostThreshold); + // Process the stores in chunks of 16. + for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) { + unsigned Len = std::min<unsigned>(CE - CI, 16); + ArrayRef<StoreInst *> Chunk(&it->second[CI], Len); + Changed |= vectorizeStores(Chunk, -SLPCostThreshold, R); + } } return Changed; } -void SLPVectorizer::hoistGatherSequence(LoopInfo *LI, BasicBlock *BB, - BoUpSLP &R) { - // Check if this block is inside a loop. - Loop *L = LI->getLoopFor(BB); - if (!L) - return; - - // Check if it has a preheader. - BasicBlock *PreHeader = L->getLoopPreheader(); - if (!PreHeader) - return; - - // Mark the insertion point for the block. 
- Instruction *Location = PreHeader->getTerminator(); - - BoUpSLP::ValueList &Gathers = R.getGatherSeqInstructions(); - for (BoUpSLP::ValueList::iterator it = Gathers.begin(), e = Gathers.end(); - it != e; ++it) { - InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it); - - // The InsertElement sequence can be simplified into a constant. - if (!Insert) - continue; - - // If the vector or the element that we insert into it are - // instructions that are defined in this basic block then we can't - // hoist this instruction. - Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0)); - Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1)); - if (CurrVec && L->contains(CurrVec)) continue; - if (NewElem && L->contains(NewElem)) continue; - - // We can hoist this instruction. Move it to the pre-header. - Insert->moveBefore(Location); - } -} - } // end anonymous namespace char SLPVectorizer::ID = 0; @@ -344,8 +1960,5 @@ INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false) namespace llvm { - Pass *createSLPVectorizerPass() { - return new SLPVectorizer(); - } +Pass *createSLPVectorizerPass() { return new SLPVectorizer(); } } - diff --git a/lib/Transforms/Vectorize/VecUtils.cpp b/lib/Transforms/Vectorize/VecUtils.cpp deleted file mode 100644 index 21e6cdd..0000000 --- a/lib/Transforms/Vectorize/VecUtils.cpp +++ /dev/null @@ -1,852 +0,0 @@ -//===- VecUtils.cpp --- Vectorization Utilities ---------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -#define DEBUG_TYPE "SLP" - -#include "VecUtils.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Analysis/Verifier.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Type.h" -#include "llvm/IR/Value.h" -#include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLibraryInfo.h" -#include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/Local.h" -#include <algorithm> -#include <map> - -using namespace llvm; - -static const unsigned MinVecRegSize = 128; - -static const unsigned RecursionMaxDepth = 6; - -namespace llvm { - -BoUpSLP::BoUpSLP(BasicBlock *Bb, ScalarEvolution *S, DataLayout *Dl, - TargetTransformInfo *Tti, AliasAnalysis *Aa, Loop *Lp) : - Builder(S->getContext()), BB(Bb), SE(S), DL(Dl), TTI(Tti), AA(Aa), L(Lp) { - numberInstructions(); -} - -void BoUpSLP::numberInstructions() { - int Loc = 0; - InstrIdx.clear(); - InstrVec.clear(); - // Number the instructions in the block. 
- for (BasicBlock::iterator it=BB->begin(), e=BB->end(); it != e; ++it) { - InstrIdx[it] = Loc++; - InstrVec.push_back(it); - assert(InstrVec[InstrIdx[it]] == it && "Invalid allocation"); - } -} - -Value *BoUpSLP::getPointerOperand(Value *I) { - if (LoadInst *LI = dyn_cast<LoadInst>(I)) return LI->getPointerOperand(); - if (StoreInst *SI = dyn_cast<StoreInst>(I)) return SI->getPointerOperand(); - return 0; -} - -unsigned BoUpSLP::getAddressSpaceOperand(Value *I) { - if (LoadInst *L=dyn_cast<LoadInst>(I)) return L->getPointerAddressSpace(); - if (StoreInst *S=dyn_cast<StoreInst>(I)) return S->getPointerAddressSpace(); - return -1; -} - -bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) { - Value *PtrA = getPointerOperand(A); - Value *PtrB = getPointerOperand(B); - unsigned ASA = getAddressSpaceOperand(A); - unsigned ASB = getAddressSpaceOperand(B); - - // Check that the address spaces match and that the pointers are valid. - if (!PtrA || !PtrB || (ASA != ASB)) return false; - - // Check that A and B are of the same type. - if (PtrA->getType() != PtrB->getType()) return false; - - // Calculate the distance. - const SCEV *PtrSCEVA = SE->getSCEV(PtrA); - const SCEV *PtrSCEVB = SE->getSCEV(PtrB); - const SCEV *OffsetSCEV = SE->getMinusSCEV(PtrSCEVA, PtrSCEVB); - const SCEVConstant *ConstOffSCEV = dyn_cast<SCEVConstant>(OffsetSCEV); - - // Non constant distance. - if (!ConstOffSCEV) return false; - - int64_t Offset = ConstOffSCEV->getValue()->getSExtValue(); - Type *Ty = cast<PointerType>(PtrA->getType())->getElementType(); - // The Instructions are connsecutive if the size of the first load/store is - // the same as the offset. - int64_t Sz = DL->getTypeStoreSize(Ty); - return ((-Offset) == Sz); -} - -bool BoUpSLP::vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold) { - Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType(); - unsigned Sz = DL->getTypeSizeInBits(StoreTy); - unsigned VF = MinVecRegSize / Sz; - - if (!isPowerOf2_32(Sz) || VF < 2) return false; - - bool Changed = false; - // Look for profitable vectorizable trees at all offsets, starting at zero. - for (unsigned i = 0, e = Chain.size(); i < e; ++i) { - if (i + VF > e) return Changed; - DEBUG(dbgs()<<"SLP: Analyzing " << VF << " stores at offset "<< i << "\n"); - ArrayRef<Value *> Operands = Chain.slice(i, VF); - - int Cost = getTreeCost(Operands); - DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); - if (Cost < CostThreshold) { - DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); - Builder.SetInsertPoint(getInsertionPoint(getLastIndex(Operands,VF))); - vectorizeTree(Operands, VF); - i += VF - 1; - Changed = true; - } - } - - return Changed; -} - -bool BoUpSLP::vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold) { - SetVector<Value*> Heads, Tails; - SmallDenseMap<Value*, Value*> ConsecutiveChain; - - // We may run into multiple chains that merge into a single chain. We mark the - // stores that we vectorized so that we don't visit the same store twice. - ValueSet VectorizedStores; - bool Changed = false; - - // Do a quadratic search on all of the given stores and find - // all of the pairs of loads that follow each other. 
- for (unsigned i = 0, e = Stores.size(); i < e; ++i) - for (unsigned j = 0; j < e; ++j) { - if (i == j) continue; - if (isConsecutiveAccess(Stores[i], Stores[j])) { - Tails.insert(Stores[j]); - Heads.insert(Stores[i]); - ConsecutiveChain[Stores[i]] = Stores[j]; - } - } - - // For stores that start but don't end a link in the chain: - for (SetVector<Value*>::iterator it = Heads.begin(), e = Heads.end(); - it != e; ++it) { - if (Tails.count(*it)) continue; - - // We found a store instr that starts a chain. Now follow the chain and try - // to vectorize it. - ValueList Operands; - Value *I = *it; - // Collect the chain into a list. - while (Tails.count(I) || Heads.count(I)) { - if (VectorizedStores.count(I)) break; - Operands.push_back(I); - // Move to the next value in the chain. - I = ConsecutiveChain[I]; - } - - bool Vectorized = vectorizeStoreChain(Operands, costThreshold); - - // Mark the vectorized stores so that we don't vectorize them again. - if (Vectorized) - VectorizedStores.insert(Operands.begin(), Operands.end()); - Changed |= Vectorized; - } - - return Changed; -} - -int BoUpSLP::getScalarizationCost(ArrayRef<Value *> VL) { - // Find the type of the operands in VL. - Type *ScalarTy = VL[0]->getType(); - if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) - ScalarTy = SI->getValueOperand()->getType(); - VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); - // Find the cost of inserting/extracting values from the vector. - return getScalarizationCost(VecTy); -} - -int BoUpSLP::getScalarizationCost(Type *Ty) { - int Cost = 0; - for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i) - Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); - return Cost; -} - -AliasAnalysis::Location BoUpSLP::getLocation(Instruction *I) { - if (StoreInst *SI = dyn_cast<StoreInst>(I)) return AA->getLocation(SI); - if (LoadInst *LI = dyn_cast<LoadInst>(I)) return AA->getLocation(LI); - return AliasAnalysis::Location(); -} - -Value *BoUpSLP::isUnsafeToSink(Instruction *Src, Instruction *Dst) { - assert(Src->getParent() == Dst->getParent() && "Not the same BB"); - BasicBlock::iterator I = Src, E = Dst; - /// Scan all of the instruction from SRC to DST and check if - /// the source may alias. - for (++I; I != E; ++I) { - // Ignore store instructions that are marked as 'ignore'. - if (MemBarrierIgnoreList.count(I)) continue; - if (Src->mayWriteToMemory()) /* Write */ { - if (!I->mayReadOrWriteMemory()) continue; - } else /* Read */ { - if (!I->mayWriteToMemory()) continue; - } - AliasAnalysis::Location A = getLocation(&*I); - AliasAnalysis::Location B = getLocation(Src); - - if (!A.Ptr || !B.Ptr || AA->alias(A, B)) - return I; - } - return 0; -} - -void BoUpSLP::vectorizeArith(ArrayRef<Value *> Operands) { - int LastIdx = getLastIndex(Operands, Operands.size()); - Instruction *Loc = getInsertionPoint(LastIdx); - Builder.SetInsertPoint(Loc); - - assert(getFirstUserIndex(Operands, Operands.size()) > LastIdx && - "Vectorizing with in-tree users"); - - Value *Vec = vectorizeTree(Operands, Operands.size()); - // After vectorizing the operands we need to generate extractelement - // instructions and replace all of the uses of the scalar values with - // the values that we extracted from the vectorized tree. 
- for (unsigned i = 0, e = Operands.size(); i != e; ++i) { - Value *S = Builder.CreateExtractElement(Vec, Builder.getInt32(i)); - Operands[i]->replaceAllUsesWith(S); - } -} - -int BoUpSLP::getTreeCost(ArrayRef<Value *> VL) { - // Get rid of the list of stores that were removed, and from the - // lists of instructions with multiple users. - MemBarrierIgnoreList.clear(); - LaneMap.clear(); - MultiUserVals.clear(); - MustScalarize.clear(); - MustExtract.clear(); - - // Find the location of the last root. - int LastRootIndex = getLastIndex(VL, VL.size()); - int FirstUserIndex = getFirstUserIndex(VL, VL.size()); - - // Don't vectorize if there are users of the tree roots inside the tree - // itself. - if (LastRootIndex > FirstUserIndex) - return max_cost; - - // Scan the tree and find which value is used by which lane, and which values - // must be scalarized. - getTreeUses_rec(VL, 0); - - // Check that instructions with multiple users can be vectorized. Mark unsafe - // instructions. - for (SetVector<Value*>::iterator it = MultiUserVals.begin(), - e = MultiUserVals.end(); it != e; ++it) { - // Check that all of the users of this instr are within the tree - // and that they are all from the same lane. - int Lane = -1; - for (Value::use_iterator I = (*it)->use_begin(), E = (*it)->use_end(); - I != E; ++I) { - if (LaneMap.find(*I) == LaneMap.end()) { - DEBUG(dbgs()<<"SLP: Instr " << **it << " has multiple users.\n"); - - // We don't have an ordering problem if the user is not in this basic - // block. - Instruction *Inst = cast<Instruction>(*I); - if (Inst->getParent() != BB) { - MustExtract.insert(*it); - continue; - } - - // We don't have an ordering problem if the user is after the last root. - int Idx = InstrIdx[Inst]; - if (Idx < LastRootIndex) { - MustScalarize.insert(*it); - DEBUG(dbgs()<<"SLP: Adding to MustScalarize " - "because of an unsafe out of tree usage.\n"); - break; - } - - - DEBUG(dbgs()<<"SLP: Adding to MustExtract " - "because of a safe out of tree usage.\n"); - MustExtract.insert(*it); - continue; - } - if (Lane == -1) Lane = LaneMap[*I]; - if (Lane != LaneMap[*I]) { - MustScalarize.insert(*it); - DEBUG(dbgs()<<"SLP: Adding " << **it << - " to MustScalarize because multiple lane use it: " - << Lane << " and " << LaneMap[*I] << ".\n"); - break; - } - } - } - - // Now calculate the cost of vectorizing the tree. - return getTreeCost_rec(VL, 0); -} - -void BoUpSLP::getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth) { - if (Depth == RecursionMaxDepth) return; - - // Don't handle vectors. - if (VL[0]->getType()->isVectorTy()) return; - if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) - if (SI->getValueOperand()->getType()->isVectorTy()) return; - - // Check if all of the operands are constants. - bool AllConst = true; - bool AllSameScalar = true; - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - AllConst &= isa<Constant>(VL[i]); - AllSameScalar &= (VL[0] == VL[i]); - Instruction *I = dyn_cast<Instruction>(VL[i]); - // If one of the instructions is out of this BB, we need to scalarize all. - if (I && I->getParent() != BB) return; - } - - // If all of the operands are identical or constant we have a simple solution. - if (AllConst || AllSameScalar) return; - - // Scalarize unknown structures. - Instruction *VL0 = dyn_cast<Instruction>(VL[0]); - if (!VL0) return; - - unsigned Opcode = VL0->getOpcode(); - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - Instruction *I = dyn_cast<Instruction>(VL[i]); - // If not all of the instructions are identical then we have to scalarize. 
- if (!I || Opcode != I->getOpcode()) return; - } - - for (int i = 0, e = VL.size(); i < e; ++i) { - // Check that the instruction is only used within - // one lane. - if (LaneMap.count(VL[i]) && LaneMap[VL[i]] != i) return; - // Make this instruction as 'seen' and remember the lane. - LaneMap[VL[i]] = i; - } - - // Mark instructions with multiple users. - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - Instruction *I = dyn_cast<Instruction>(VL[i]); - // Remember to check if all of the users of this instr are vectorized - // within our tree. At depth zero we have no local users, only external - // users that we don't care about. - if (Depth && I && I->getNumUses() > 1) { - DEBUG(dbgs()<<"SLP: Adding to MultiUserVals " - "because it has multiple users:" << *I << " \n"); - MultiUserVals.insert(I); - } - } - - switch (Opcode) { - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { - for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { - ValueList Operands; - // Prepare the operand vector. - for (unsigned j = 0; j < VL.size(); ++j) - Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); - - getTreeUses_rec(Operands, Depth+1); - } - return; - } - case Instruction::Store: { - ValueList Operands; - for (unsigned j = 0; j < VL.size(); ++j) - Operands.push_back(cast<Instruction>(VL[j])->getOperand(0)); - getTreeUses_rec(Operands, Depth+1); - return; - } - default: - return; - } -} - -int BoUpSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) { - Type *ScalarTy = VL[0]->getType(); - - if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) - ScalarTy = SI->getValueOperand()->getType(); - - /// Don't mess with vectors. - if (ScalarTy->isVectorTy()) return max_cost; - VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); - - if (Depth == RecursionMaxDepth) return getScalarizationCost(VecTy); - - // Check if all of the operands are constants. - bool AllConst = true; - bool AllSameScalar = true; - bool MustScalarizeFlag = false; - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - AllConst &= isa<Constant>(VL[i]); - AllSameScalar &= (VL[0] == VL[i]); - // Must have a single use. - Instruction *I = dyn_cast<Instruction>(VL[i]); - MustScalarizeFlag |= MustScalarize.count(VL[i]); - // This instruction is outside the basic block. - if (I && I->getParent() != BB) - return getScalarizationCost(VecTy); - } - - // Is this a simple vector constant. - if (AllConst) return 0; - - // If all of the operands are identical we can broadcast them. - Instruction *VL0 = dyn_cast<Instruction>(VL[0]); - if (AllSameScalar) { - // If we are in a loop, and this is not an instruction (e.g. constant or - // argument) or the instruction is defined outside the loop then assume - // that the cost is zero. 
- if (L && (!VL0 || !L->contains(VL0))) - return 0; - - // We need to broadcast the scalar. - return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); - } - - // If this is not a constant, or a scalar from outside the loop then we - // need to scalarize it. - if (MustScalarizeFlag) - return getScalarizationCost(VecTy); - - if (!VL0) return getScalarizationCost(VecTy); - assert(VL0->getParent() == BB && "Wrong BB"); - - unsigned Opcode = VL0->getOpcode(); - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - Instruction *I = dyn_cast<Instruction>(VL[i]); - // If not all of the instructions are identical then we have to scalarize. - if (!I || Opcode != I->getOpcode()) return getScalarizationCost(VecTy); - } - - // Check if it is safe to sink the loads or the stores. - if (Opcode == Instruction::Load || Opcode == Instruction::Store) { - int MaxIdx = getLastIndex(VL, VL.size()); - Instruction *Last = InstrVec[MaxIdx]; - - for (unsigned i = 0, e = VL.size(); i < e; ++i ) { - if (VL[i] == Last) continue; - Value *Barrier = isUnsafeToSink(cast<Instruction>(VL[i]), Last); - if (Barrier) { - DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " << - *Last << "\n because of " << *Barrier << "\n"); - return max_cost; - } - } - } - - // Calculate the extract cost. - unsigned ExternalUserExtractCost = 0; - for (unsigned i = 0, e = VL.size(); i < e; ++i) - if (MustExtract.count(VL[i])) - ExternalUserExtractCost += - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i); - - switch (Opcode) { - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: { - int Cost = ExternalUserExtractCost; - ValueList Operands; - Type *SrcTy = VL0->getOperand(0)->getType(); - // Prepare the operand vector. - for (unsigned j = 0; j < VL.size(); ++j) { - Operands.push_back(cast<Instruction>(VL[j])->getOperand(0)); - // Check that the casted type is the same for all users. - if (cast<Instruction>(VL[j])->getOperand(0)->getType() != SrcTy) - return getScalarizationCost(VecTy); - } - - Cost += getTreeCost_rec(Operands, Depth+1); - if (Cost >= max_cost) return max_cost; - - // Calculate the cost of this instruction. - int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(), - VL0->getType(), SrcTy); - - VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); - int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy); - Cost += (VecCost - ScalarCost); - return Cost; - } - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { - int Cost = ExternalUserExtractCost; - // Calculate the cost of all of the operands. - for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { - ValueList Operands; - // Prepare the operand vector. 
- for (unsigned j = 0; j < VL.size(); ++j) - Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); - - Cost += getTreeCost_rec(Operands, Depth+1); - if (Cost >= max_cost) return max_cost; - } - - // Calculate the cost of this instruction. - int ScalarCost = VecTy->getNumElements() * - TTI->getArithmeticInstrCost(Opcode, ScalarTy); - - int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy); - Cost += (VecCost - ScalarCost); - return Cost; - } - case Instruction::Load: { - // If we are scalarize the loads, add the cost of forming the vector. - for (unsigned i = 0, e = VL.size()-1; i < e; ++i) - if (!isConsecutiveAccess(VL[i], VL[i+1])) - return getScalarizationCost(VecTy); - - // Cost of wide load - cost of scalar loads. - int ScalarLdCost = VecTy->getNumElements() * - TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); - int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); - return VecLdCost - ScalarLdCost + ExternalUserExtractCost; - } - case Instruction::Store: { - // We know that we can merge the stores. Calculate the cost. - int ScalarStCost = VecTy->getNumElements() * - TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0); - int VecStCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1,0); - int StoreCost = VecStCost - ScalarStCost; - - ValueList Operands; - for (unsigned j = 0; j < VL.size(); ++j) { - Operands.push_back(cast<Instruction>(VL[j])->getOperand(0)); - MemBarrierIgnoreList.insert(VL[j]); - } - - int TotalCost = StoreCost + getTreeCost_rec(Operands, Depth + 1); - return TotalCost + ExternalUserExtractCost; - } - default: - // Unable to vectorize unknown instructions. - return getScalarizationCost(VecTy); - } -} - -int BoUpSLP::getLastIndex(ArrayRef<Value *> VL, unsigned VF) { - int MaxIdx = InstrIdx[BB->getFirstNonPHI()]; - for (unsigned i = 0; i < VF; ++i ) - MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]); - return MaxIdx; -} - -int BoUpSLP::getFirstUserIndex(ArrayRef<Value *> VL, unsigned VF) { - // Find the first user of the values. - int FirstUser = InstrVec.size(); - for (unsigned i = 0; i < VF; ++i) { - for (Value::use_iterator U = VL[i]->use_begin(), UE = VL[i]->use_end(); - U != UE; ++U) { - Instruction *Instr = dyn_cast<Instruction>(*U); - if (!Instr || Instr->getParent() != BB) - continue; - - FirstUser = std::min(FirstUser, InstrIdx[Instr]); - } - } - return FirstUser; -} - -int BoUpSLP::getLastIndex(Instruction *I, Instruction *J) { - assert(I->getParent() == BB && "Invalid parent for instruction I"); - assert(J->getParent() == BB && "Invalid parent for instruction J"); - return std::max(InstrIdx[I],InstrIdx[J]); -} - -Instruction *BoUpSLP::getInsertionPoint(unsigned Index) { - return InstrVec[Index + 1]; -} - -Value *BoUpSLP::Scalarize(ArrayRef<Value *> VL, VectorType *Ty) { - Value *Vec = UndefValue::get(Ty); - for (unsigned i=0; i < Ty->getNumElements(); ++i) { - // Generate the 'InsertElement' instruction. - Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i)); - // Remember that this instruction is used as part of a 'gather' sequence. - // The caller of the bottom-up slp vectorizer can try to hoist the sequence - // if the users are outside of the basic block. 
- GatherInstructions.push_back(Vec); - } - - for (unsigned i = 0; i < Ty->getNumElements(); ++i) - VectorizedValues[VL[i]] = Vec; - - return Vec; -} - -Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, int VF) { - Value *V = vectorizeTree_rec(VL, VF); - - int LastInstrIdx = getLastIndex(VL, VL.size()); - for (SetVector<Value*>::iterator it = MustExtract.begin(), - e = MustExtract.end(); it != e; ++it) { - Instruction *I = cast<Instruction>(*it); - - // This is a scalarized value, so we can use the original value. - // No need to extract from the vector. - if (!LaneMap.count(I)) - continue; - - Value *Vec = VectorizedValues[I]; - // We decided not to vectorize I because one of its users was not - // vectorizerd. This is okay. - if (!Vec) - continue; - - Value *Idx = Builder.getInt32(LaneMap[I]); - Value *Extract = Builder.CreateExtractElement(Vec, Idx); - bool Replaced = false; - for (Value::use_iterator U = I->use_begin(), UE = I->use_end(); U != UE; - ++U) { - Instruction *UI = cast<Instruction>(*U); - if (UI->getParent() != I->getParent() || InstrIdx[UI] > LastInstrIdx) - UI->replaceUsesOfWith(I ,Extract); - Replaced = true; - } - assert(Replaced && "Must replace at least one outside user"); - (void)Replaced; - } - - // We moved some instructions around. We have to number them again - // before we can do any analysis. - numberInstructions(); - MustScalarize.clear(); - MustExtract.clear(); - VectorizedValues.clear(); - return V; -} - -Value *BoUpSLP::vectorizeTree_rec(ArrayRef<Value *> VL, int VF) { - Type *ScalarTy = VL[0]->getType(); - if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) - ScalarTy = SI->getValueOperand()->getType(); - VectorType *VecTy = VectorType::get(ScalarTy, VF); - - // Check if all of the operands are constants or identical. - bool AllConst = true; - bool AllSameScalar = true; - for (unsigned i = 0, e = VF; i < e; ++i) { - AllConst &= isa<Constant>(VL[i]); - AllSameScalar &= (VL[0] == VL[i]); - // The instruction must be in the same BB, and it must be vectorizable. - Instruction *I = dyn_cast<Instruction>(VL[i]); - if (MustScalarize.count(VL[i]) || (I && I->getParent() != BB)) - return Scalarize(VL, VecTy); - } - - // Check that this is a simple vector constant. - if (AllConst || AllSameScalar) - return Scalarize(VL, VecTy); - - // Scalarize unknown structures. - Instruction *VL0 = dyn_cast<Instruction>(VL[0]); - if (!VL0) - return Scalarize(VL, VecTy); - - if (VectorizedValues.count(VL0)) { - Value * Vec = VectorizedValues[VL0]; - for (int i = 0; i < VF; ++i) - VectorizedValues[VL[i]] = Vec; - return Vec; - } - - unsigned Opcode = VL0->getOpcode(); - for (unsigned i = 0, e = VF; i < e; ++i) { - Instruction *I = dyn_cast<Instruction>(VL[i]); - // If not all of the instructions are identical then we have to scalarize. 
- if (!I || Opcode != I->getOpcode()) - return Scalarize(VL, VecTy); - } - - switch (Opcode) { - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: { - ValueList INVL; - for (int i = 0; i < VF; ++i) - INVL.push_back(cast<Instruction>(VL[i])->getOperand(0)); - Value *InVec = vectorizeTree_rec(INVL, VF); - CastInst *CI = dyn_cast<CastInst>(VL0); - Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); - - for (int i = 0; i < VF; ++i) - VectorizedValues[VL[i]] = V; - - return V; - } - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { - ValueList LHSVL, RHSVL; - for (int i = 0; i < VF; ++i) { - LHSVL.push_back(cast<Instruction>(VL[i])->getOperand(0)); - RHSVL.push_back(cast<Instruction>(VL[i])->getOperand(1)); - } - - Value *LHS = vectorizeTree_rec(LHSVL, VF); - Value *RHS = vectorizeTree_rec(RHSVL, VF); - BinaryOperator *BinOp = cast<BinaryOperator>(VL0); - Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS,RHS); - - for (int i = 0; i < VF; ++i) - VectorizedValues[VL[i]] = V; - - return V; - } - case Instruction::Load: { - LoadInst *LI = cast<LoadInst>(VL0); - unsigned Alignment = LI->getAlignment(); - - // Check if all of the loads are consecutive. - for (unsigned i = 1, e = VF; i < e; ++i) - if (!isConsecutiveAccess(VL[i-1], VL[i])) - return Scalarize(VL, VecTy); - - // Loads are inserted at the head of the tree because we don't want to sink - // them all the way down past store instructions. - Instruction *Loc = getInsertionPoint(getLastIndex(VL, VL.size())); - IRBuilder<> LoadBuilder(Loc); - Value *VecPtr = LoadBuilder.CreateBitCast(LI->getPointerOperand(), - VecTy->getPointerTo()); - LI = LoadBuilder.CreateLoad(VecPtr); - LI->setAlignment(Alignment); - - for (int i = 0; i < VF; ++i) - VectorizedValues[VL[i]] = LI; - - return LI; - } - case Instruction::Store: { - StoreInst *SI = cast<StoreInst>(VL0); - unsigned Alignment = SI->getAlignment(); - - ValueList ValueOp; - for (int i = 0; i < VF; ++i) - ValueOp.push_back(cast<StoreInst>(VL[i])->getValueOperand()); - - Value *VecValue = vectorizeTree_rec(ValueOp, VF); - Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(), - VecTy->getPointerTo()); - Builder.CreateStore(VecValue, VecPtr)->setAlignment(Alignment); - - for (int i = 0; i < VF; ++i) - cast<Instruction>(VL[i])->eraseFromParent(); - return 0; - } - default: - return Scalarize(VL, VecTy); - } -} - -} // end of namespace diff --git a/lib/Transforms/Vectorize/VecUtils.h b/lib/Transforms/Vectorize/VecUtils.h deleted file mode 100644 index d41d2ed..0000000 --- a/lib/Transforms/Vectorize/VecUtils.h +++ /dev/null @@ -1,184 +0,0 @@ -//===- VecUtils.h - Vectorization Utilities -------------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// This family of classes and functions manipulate vectors and chains of -// vectors. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H -#define LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H - -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/IR/IRBuilder.h" -#include <vector> - -namespace llvm { - -class BasicBlock; class Instruction; class Type; -class VectorType; class StoreInst; class Value; -class ScalarEvolution; class DataLayout; -class TargetTransformInfo; class AliasAnalysis; -class Loop; - -/// Bottom Up SLP vectorization utility class. -struct BoUpSLP { - typedef SmallVector<Value*, 8> ValueList; - typedef SmallPtrSet<Value*, 16> ValueSet; - typedef SmallVector<StoreInst*, 8> StoreList; - static const int max_cost = 1<<20; - - // \brief C'tor. - BoUpSLP(BasicBlock *Bb, ScalarEvolution *Se, DataLayout *Dl, - TargetTransformInfo *Tti, AliasAnalysis *Aa, Loop *Lp); - - /// \brief Take the pointer operand from the Load/Store instruction. - /// \returns NULL if this is not a valid Load/Store instruction. - static Value *getPointerOperand(Value *I); - - /// \brief Take the address space operand from the Load/Store instruction. - /// \returns -1 if this is not a valid Load/Store instruction. - static unsigned getAddressSpaceOperand(Value *I); - - /// \returns true if the memory operations A and B are consecutive. - bool isConsecutiveAccess(Value *A, Value *B); - - /// \brief Vectorize the tree that starts with the elements in \p VL. - /// \returns the vectorized value. - Value *vectorizeTree(ArrayRef<Value *> VL, int VF); - - /// \returns the vectorization cost of the subtree that starts at \p VL. - /// A negative number means that this is profitable. - int getTreeCost(ArrayRef<Value *> VL); - - /// \returns the scalarization cost for this list of values. Assuming that - /// this subtree gets vectorized, we may need to extract the values from the - /// roots. This method calculates the cost of extracting the values. - int getScalarizationCost(ArrayRef<Value *> VL); - - /// \brief Attempts to order and vectorize a sequence of stores. This - /// function does a quadratic scan of the given stores. - /// \returns true if the basic block was modified. - bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold); - - /// \brief Vectorize a group of scalars into a vector tree. - void vectorizeArith(ArrayRef<Value *> Operands); - - /// \returns the list of new instructions that were added in order to collect - /// scalars into vectors. This list can be used to further optimize the gather - /// sequences. - ValueList &getGatherSeqInstructions() {return GatherInstructions; } - -private: - /// \brief This method contains the recursive part of getTreeCost. - int getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth); - - /// \brief This recursive method looks for vectorization hazards such as - /// values that are used by multiple users and checks that values are used - /// by only one vector lane. It updates the variables LaneMap, MultiUserVals. - void getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth); - - /// \brief This method contains the recursive part of vectorizeTree. 
- Value *vectorizeTree_rec(ArrayRef<Value *> VL, int VF); - - /// \brief Number all of the instructions in the block. - void numberInstructions(); - - /// \brief Vectorize a sorted sequence of stores. - bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold); - - /// \returns the scalarization cost for this type. Scalarization in this - /// context means the creation of vectors from a group of scalars. - int getScalarizationCost(Type *Ty); - - /// \returns the AA location that is being access by the instruction. - AliasAnalysis::Location getLocation(Instruction *I); - - /// \brief Checks if it is possible to sink an instruction from - /// \p Src to \p Dst. - /// \returns the pointer to the barrier instruction if we can't sink. - Value *isUnsafeToSink(Instruction *Src, Instruction *Dst); - - /// \returns the index of the last instrucion in the BB from \p VL. - /// Only consider the first \p VF elements. - int getLastIndex(ArrayRef<Value *> VL, unsigned VF); - - /// \returns the index of the first User of \p VL. - /// Only consider the first \p VF elements. - int getFirstUserIndex(ArrayRef<Value *> VL, unsigned VF); - - /// \returns the instruction \p I or \p Jt hat appears last in the BB . - int getLastIndex(Instruction *I, Instruction *J); - - /// \returns the insertion point for \p Index. - Instruction *getInsertionPoint(unsigned Index); - - /// \returns a vector from a collection of scalars in \p VL. - Value *Scalarize(ArrayRef<Value *> VL, VectorType *Ty); - -private: - /// Maps instructions to numbers and back. - SmallDenseMap<Value*, int> InstrIdx; - /// Maps integers to Instructions. - std::vector<Instruction*> InstrVec; - - // -- containers that are used during getTreeCost -- // - - /// Contains values that must be scalarized because they are used - /// by multiple lanes, or by users outside the tree. - /// NOTICE: The vectorization methods also use this set. - ValueSet MustScalarize; - - /// Contains values that have users outside of the vectorized graph. - /// We need to generate extract instructions for these values. - /// NOTICE: The vectorization methods also use this set. - SetVector<Value*> MustExtract; - - /// Contains a list of values that are used outside the current tree. This - /// set must be reset between runs. - SetVector<Value*> MultiUserVals; - /// Maps values in the tree to the vector lanes that uses them. This map must - /// be reset between runs of getCost. - std::map<Value*, int> LaneMap; - /// A list of instructions to ignore while sinking - /// memory instructions. This map must be reset between runs of getCost. - ValueSet MemBarrierIgnoreList; - - // -- Containers that are used during vectorizeTree -- // - - /// Maps between the first scalar to the vector. This map must be reset - ///between runs. - DenseMap<Value*, Value*> VectorizedValues; - - // -- Containers that are used after vectorization by the caller -- // - - /// A list of instructions that are used when gathering scalars into vectors. - /// In many cases these instructions can be hoisted outside of the BB. - /// Iterating over this list is faster than calling LICM. - ValueList GatherInstructions; - - /// Instruction builder to construct the vectorized tree. - IRBuilder<> Builder; - - // Analysis and block reference. - BasicBlock *BB; - ScalarEvolution *SE; - DataLayout *DL; - TargetTransformInfo *TTI; - AliasAnalysis *AA; - Loop *L; -}; - -} // end of namespace - -#endif // LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H |
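The new SLPVectorizer::vectorizeStores / vectorizeStoreChain pair in this commit replaces the deleted VecUtils logic: the collected stores are first paired quadratically by consecutiveness, linked into chains via Heads/Tails/ConsecutiveChain, and each chain is then scanned in VF-sized windows that are costed and vectorized independently. The following is a minimal standalone sketch of that control flow, assuming plain integer byte addresses in place of the pass's SCEV-based isConsecutiveAccess; the Store struct, Heads/Tails/Next names and the hard-coded VF are illustrative only, and the tree-cost query is merely hinted at in a comment.

    // Sketch only: models the chain-building and windowing, not the LLVM pass.
    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <set>
    #include <vector>

    struct Store { uint64_t Addr; unsigned Size; };   // byte address + store size

    // Two stores are "consecutive" when the second starts where the first ends.
    static bool isConsecutive(const Store &A, const Store &B) {
      return A.Addr + A.Size == B.Addr;
    }

    int main() {
      std::vector<Store> Stores = {{0, 4}, {4, 4}, {8, 4}, {12, 4}};
      const unsigned VF = 4;                  // e.g. 128-bit register / 32-bit element

      // Quadratic pairing: record which store follows which.
      std::set<unsigned> Heads, Tails;
      std::map<unsigned, unsigned> Next;
      for (unsigned i = 0; i < Stores.size(); ++i)
        for (unsigned j = 0; j < Stores.size(); ++j)
          if (i != j && isConsecutive(Stores[i], Stores[j])) {
            Heads.insert(i); Tails.insert(j); Next[i] = j;
          }

      // Follow each chain from a store that starts but never ends a link,
      // then examine every VF-wide window of that chain.
      for (unsigned H : Heads) {
        if (Tails.count(H)) continue;         // not a chain head
        std::vector<unsigned> Chain;
        for (unsigned I = H; ; I = Next[I]) {
          Chain.push_back(I);
          if (!Next.count(I)) break;
        }
        for (unsigned i = 0; i + VF <= Chain.size(); ++i)
          std::cout << "candidate bundle starting at store " << Chain[i] << "\n";
        // The real pass builds a tree for each bundle, asks getTreeCost(),
        // and vectorizes only when the cost beats -slp-threshold.
      }
    }

In the pass itself the window advance also skips over a bundle once it has been vectorized (i += VF - 1), and vectorizeStoreChains feeds the stores to this search in chunks of 16 per base pointer.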
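optimizeGatherSequence() ends with a quadratic scan that CSEs identical insertelement/extractelement gathers: blocks are visited in reverse post order, every gather instruction seen so far is remembered, and a later instruction that is identical to an earlier one whose block dominates it is RAUWed away and erased. The sketch below models only that decision, assuming a toy immediate-dominator map and a string Key standing in for isIdenticalTo(); it illustrates the ordering/dominance argument and is not LLVM code.

    // Sketch only: dominance-based CSE of duplicate gather instructions.
    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    struct Inst { int Block; std::string Key; };      // Key models isIdenticalTo()

    // Toy dominator tree: Parent[b] is the immediate dominator of block b.
    static bool dominates(const std::map<int, int> &Parent, int A, int B) {
      for (int X = B; ; X = Parent.at(X)) {
        if (X == A) return true;
        if (Parent.at(X) == X) return A == X;         // reached the entry block
      }
    }

    int main() {
      std::map<int, int> Parent = {{0, 0}, {1, 0}, {2, 0}, {3, 1}};   // 0 is entry
      std::vector<Inst> RPO = {{0, "insert %v,%a,0"}, {1, "insert %v,%a,0"},
                               {2, "insert %v,%b,1"}, {3, "insert %v,%a,0"}};
      std::vector<Inst> Visited;
      for (const Inst &In : RPO) {
        bool Replaced = false;
        for (const Inst &V : Visited)
          if (V.Key == In.Key && dominates(Parent, V.Block, In.Block)) {
            std::cout << "replace duplicate in block " << In.Block
                      << " with copy from block " << V.Block << "\n";
            Replaced = true;
            break;
          }
        if (!Replaced) Visited.push_back(In);
      }
    }

Because a block's dominators always precede it in reverse post order, a dominating copy is guaranteed to be in the Visited list before any instruction it can replace, so a single forward pass suffices.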