Diffstat (limited to 'lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r-- | lib/Transforms/Vectorize/SLPVectorizer.cpp | 552
1 file changed, 390 insertions(+), 162 deletions(-)
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 44bfea1..baf9741 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -19,9 +19,10 @@ #include "llvm/ADT/MapVector.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/AssumptionTracker.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolution.h" @@ -74,6 +75,27 @@ static const unsigned MinVecRegSize = 128; static const unsigned RecursionMaxDepth = 12; +// Limit the number of alias checks. The limit is chosen so that +// it has no negative effect on the llvm benchmarks. +static const unsigned AliasedCheckLimit = 10; + +// Another limit for the alias checks: The maximum distance between load/store +// instructions where alias checks are done. +// This limit is useful for very large basic blocks. +static const unsigned MaxMemDepDistance = 160; + +/// \brief Predicate for the element types that the SLP vectorizer supports. +/// +/// The most important thing to filter here are types which are invalid in LLVM +/// vectors. We also filter target specific types which have absolutely no +/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just +/// avoids spending time checking the cost model and realizing that they will +/// be inevitably scalarized. +static bool isValidElementType(Type *Ty) { + return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() && + !Ty->isPPC_FP128Ty(); +} + /// \returns the parent basic block if all of the instructions in \p VL /// are in the same block or null otherwise. static BasicBlock *getSameBlock(ArrayRef<Value *> VL) { @@ -207,6 +229,8 @@ static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) { MD = MDNode::getMostGenericTBAA(MD, IMD); break; case LLVMContext::MD_alias_scope: + MD = MDNode::getMostGenericAliasScope(MD, IMD); + break; case LLVMContext::MD_noalias: MD = MDNode::intersect(MD, IMD); break; @@ -263,104 +287,6 @@ static bool CanReuseExtract(ArrayRef<Value *> VL) { return true; } -static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, - SmallVectorImpl<Value *> &Left, - SmallVectorImpl<Value *> &Right) { - - SmallVector<Value *, 16> OrigLeft, OrigRight; - - bool AllSameOpcodeLeft = true; - bool AllSameOpcodeRight = true; - for (unsigned i = 0, e = VL.size(); i != e; ++i) { - Instruction *I = cast<Instruction>(VL[i]); - Value *V0 = I->getOperand(0); - Value *V1 = I->getOperand(1); - - OrigLeft.push_back(V0); - OrigRight.push_back(V1); - - Instruction *I0 = dyn_cast<Instruction>(V0); - Instruction *I1 = dyn_cast<Instruction>(V1); - - // Check whether all operands on one side have the same opcode. In this case - // we want to preserve the original order and not make things worse by - // reordering. - AllSameOpcodeLeft = I0; - AllSameOpcodeRight = I1; - - if (i && AllSameOpcodeLeft) { - if(Instruction *P0 = dyn_cast<Instruction>(OrigLeft[i-1])) { - if(P0->getOpcode() != I0->getOpcode()) - AllSameOpcodeLeft = false; - } else - AllSameOpcodeLeft = false; - } - if (i && AllSameOpcodeRight) { - if(Instruction *P1 = dyn_cast<Instruction>(OrigRight[i-1])) { - if(P1->getOpcode() != I1->getOpcode()) - AllSameOpcodeRight = false; - } else - AllSameOpcodeRight = false; - } - - // Sort two opcodes. 
In the code below we try to preserve the ability to use - // broadcast of values instead of individual inserts. - // vl1 = load - // vl2 = phi - // vr1 = load - // vr2 = vr2 - // = vl1 x vr1 - // = vl2 x vr2 - // If we just sorted according to opcode we would leave the first line in - // tact but we would swap vl2 with vr2 because opcode(phi) > opcode(load). - // = vl1 x vr1 - // = vr2 x vl2 - // Because vr2 and vr1 are from the same load we loose the opportunity of a - // broadcast for the packed right side in the backend: we have [vr1, vl2] - // instead of [vr1, vr2=vr1]. - if (I0 && I1) { - if(!i && I0->getOpcode() > I1->getOpcode()) { - Left.push_back(I1); - Right.push_back(I0); - } else if (i && I0->getOpcode() > I1->getOpcode() && Right[i-1] != I1) { - // Try not to destroy a broad cast for no apparent benefit. - Left.push_back(I1); - Right.push_back(I0); - } else if (i && I0->getOpcode() == I1->getOpcode() && Right[i-1] == I0) { - // Try preserve broadcasts. - Left.push_back(I1); - Right.push_back(I0); - } else if (i && I0->getOpcode() == I1->getOpcode() && Left[i-1] == I1) { - // Try preserve broadcasts. - Left.push_back(I1); - Right.push_back(I0); - } else { - Left.push_back(I0); - Right.push_back(I1); - } - continue; - } - // One opcode, put the instruction on the right. - if (I0) { - Left.push_back(V1); - Right.push_back(I0); - continue; - } - Left.push_back(V0); - Right.push_back(V1); - } - - bool LeftBroadcast = isSplat(Left); - bool RightBroadcast = isSplat(Right); - - // Don't reorder if the operands where good to begin with. - if (!(LeftBroadcast || RightBroadcast) && - (AllSameOpcodeRight || AllSameOpcodeLeft)) { - Left = OrigLeft; - Right = OrigRight; - } -} - /// \returns True if in-tree use also needs extract. This refers to /// possible scalar operand in vectorized instruction. static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, @@ -388,6 +314,26 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, } } +/// \returns the AA location that is being access by the instruction. +static AliasAnalysis::Location getLocation(Instruction *I, AliasAnalysis *AA) { + if (StoreInst *SI = dyn_cast<StoreInst>(I)) + return AA->getLocation(SI); + if (LoadInst *LI = dyn_cast<LoadInst>(I)) + return AA->getLocation(LI); + return AliasAnalysis::Location(); +} + +/// \returns True if the instruction is not a volatile or atomic load/store. +static bool isSimple(Instruction *I) { + if (LoadInst *LI = dyn_cast<LoadInst>(I)) + return LI->isSimple(); + if (StoreInst *SI = dyn_cast<StoreInst>(I)) + return SI->isSimple(); + if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) + return !MI->isVolatile(); + return true; +} + /// Bottom Up SLP Vectorizer. class BoUpSLP { public: @@ -398,11 +344,11 @@ public: BoUpSLP(Function *Func, ScalarEvolution *Se, const DataLayout *Dl, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa, - LoopInfo *Li, DominatorTree *Dt, AssumptionTracker *AT) - : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), - F(Func), SE(Se), DL(Dl), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), + LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC) + : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func), + SE(Se), DL(Dl), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), Builder(Se->getContext()) { - CodeMetrics::collectEphemeralValues(F, AT, EphValues); + CodeMetrics::collectEphemeralValues(F, AC, EphValues); } /// \brief Vectorize the tree that starts with the elements in \p VL. 
@@ -494,6 +440,16 @@ private: /// be beneficial even the tree height is tiny. bool isFullyVectorizableTinyTree(); + /// \reorder commutative operands in alt shuffle if they result in + /// vectorized code. + void reorderAltShuffleOperands(ArrayRef<Value *> VL, + SmallVectorImpl<Value *> &Left, + SmallVectorImpl<Value *> &Right); + /// \reorder commutative operands to get better probability of + /// generating vectorized code. + void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, + SmallVectorImpl<Value *> &Left, + SmallVectorImpl<Value *> &Right); struct TreeEntry { TreeEntry() : Scalars(), VectorizedValue(nullptr), NeedToGather(0) {} @@ -555,6 +511,52 @@ private: }; typedef SmallVector<ExternalUser, 16> UserList; + /// Checks if two instructions may access the same memory. + /// + /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it + /// is invariant in the calling loop. + bool isAliased(const AliasAnalysis::Location &Loc1, Instruction *Inst1, + Instruction *Inst2) { + + // First check if the result is already in the cache. + AliasCacheKey key = std::make_pair(Inst1, Inst2); + Optional<bool> &result = AliasCache[key]; + if (result.hasValue()) { + return result.getValue(); + } + AliasAnalysis::Location Loc2 = getLocation(Inst2, AA); + bool aliased = true; + if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) { + // Do the alias check. + aliased = AA->alias(Loc1, Loc2); + } + // Store the result in the cache. + result = aliased; + return aliased; + } + + typedef std::pair<Instruction *, Instruction *> AliasCacheKey; + + /// Cache for alias results. + /// TODO: consider moving this to the AliasAnalysis itself. + DenseMap<AliasCacheKey, Optional<bool>> AliasCache; + + /// Removes an instruction from its block and eventually deletes it. + /// It's like Instruction::eraseFromParent() except that the actual deletion + /// is delayed until BoUpSLP is destructed. + /// This is required to ensure that there are no incorrect collisions in the + /// AliasCache, which can happen if a new instruction is allocated at the + /// same address as a previously deleted instruction. + void eraseInstruction(Instruction *I) { + I->removeFromParent(); + I->dropAllReferences(); + DeletedInstructions.push_back(std::unique_ptr<Instruction>(I)); + } + + /// Temporary store for deleted instructions. Instructions will be deleted + /// eventually when the BoUpSLP is destructed. + SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions; + /// A list of values that need to extracted out of the tree. /// This list holds pairs of (Internal Scalar : External User). UserList ExternalUses; @@ -791,7 +793,7 @@ private: /// Checks if a bundle of instructions can be scheduled, i.e. has no /// cyclic dependencies. This is only a dry-run, no instructions are /// actually moved at this stage. - bool tryScheduleBundle(ArrayRef<Value *> VL, AliasAnalysis *AA); + bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP); /// Un-bundles a group of instructions. void cancelScheduling(ArrayRef<Value *> VL); @@ -808,7 +810,7 @@ private: /// Updates the dependency information of a bundle and of all instructions/ /// bundles which depend on the original bundle. void calculateDependencies(ScheduleData *SD, bool InsertInReadyList, - AliasAnalysis *AA); + BoUpSLP *SLP); /// Sets all instruction in the scheduling region to un-scheduled. void resetSchedule(); @@ -857,7 +859,7 @@ private: }; /// Attaches the BlockScheduling structures to basic blocks. 
- DenseMap<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules; + MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules; /// Performs the "real" scheduling. Done before vectorization is actually /// performed in a basic block. @@ -1031,11 +1033,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { } } - // If any of the scalars appears in the table OR it is marked as a value that - // needs to stat scalar then we need to gather the scalars. + // If any of the scalars is marked as a value that needs to stay scalar then + // we need to gather the scalars. for (unsigned i = 0, e = VL.size(); i != e; ++i) { - if (ScalarToTreeEntry.count(VL[i]) || MustGather.count(VL[i])) { - DEBUG(dbgs() << "SLP: Gathering due to gathered scalar. \n"); + if (MustGather.count(VL[i])) { + DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); newTreeEntry(VL, false); return; } @@ -1069,7 +1071,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { } BlockScheduling &BS = *BSRef.get(); - if (!BS.tryScheduleBundle(VL, AA)) { + if (!BS.tryScheduleBundle(VL, this)) { DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); BS.cancelScheduling(VL); newTreeEntry(VL, false); @@ -1158,7 +1160,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { Type *SrcTy = VL0->getOperand(0)->getType(); for (unsigned i = 0; i < VL.size(); ++i) { Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType(); - if (Ty != SrcTy || Ty->isAggregateType() || Ty->isVectorTy()) { + if (Ty != SrcTy || !isValidElementType(Ty)) { BS.cancelScheduling(VL); newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); @@ -1381,6 +1383,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { } newTreeEntry(VL, true); DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); + + // Reorder operands if reordering would enable vectorization. + if (isa<BinaryOperator>(VL0)) { + ValueList Left, Right; + reorderAltShuffleOperands(VL, Left, Right); + buildTree_rec(Left, Depth + 1); + buildTree_rec(Right, Depth + 1); + return; + } + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. @@ -1704,7 +1716,7 @@ int BoUpSLP::getTreeCost() { // We only vectorize tiny trees if it is fully vectorizable. if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) { - if (!VectorizableTree.size()) { + if (VectorizableTree.empty()) { assert(!ExternalUses.size() && "We should not have any external users"); } return INT_MAX; @@ -1818,6 +1830,195 @@ bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) { return X == PtrSCEVB; } +// Reorder commutative operations in alternate shuffle if the resulting vectors +// are consecutive loads. This would allow us to vectorize the tree. +// If we have something like- +// load a[0] - load b[0] +// load b[1] + load a[1] +// load a[2] - load b[2] +// load a[3] + load b[3] +// Reordering the second load b[1] load a[1] would allow us to vectorize this +// code. 
+void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL, + SmallVectorImpl<Value *> &Left, + SmallVectorImpl<Value *> &Right) { + + // Push left and right operands of binary operation into Left and Right + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + Left.push_back(cast<Instruction>(VL[i])->getOperand(0)); + Right.push_back(cast<Instruction>(VL[i])->getOperand(1)); + } + + // Reorder if we have a commutative operation and consecutive access + // are on either side of the alternate instructions. + for (unsigned j = 0; j < VL.size() - 1; ++j) { + if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) { + if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) { + Instruction *VL1 = cast<Instruction>(VL[j]); + Instruction *VL2 = cast<Instruction>(VL[j + 1]); + if (isConsecutiveAccess(L, L1) && VL1->isCommutative()) { + std::swap(Left[j], Right[j]); + continue; + } else if (isConsecutiveAccess(L, L1) && VL2->isCommutative()) { + std::swap(Left[j + 1], Right[j + 1]); + continue; + } + // else unchanged + } + } + if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) { + if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) { + Instruction *VL1 = cast<Instruction>(VL[j]); + Instruction *VL2 = cast<Instruction>(VL[j + 1]); + if (isConsecutiveAccess(L, L1) && VL1->isCommutative()) { + std::swap(Left[j], Right[j]); + continue; + } else if (isConsecutiveAccess(L, L1) && VL2->isCommutative()) { + std::swap(Left[j + 1], Right[j + 1]); + continue; + } + // else unchanged + } + } + } +} + +void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, + SmallVectorImpl<Value *> &Left, + SmallVectorImpl<Value *> &Right) { + + SmallVector<Value *, 16> OrigLeft, OrigRight; + + bool AllSameOpcodeLeft = true; + bool AllSameOpcodeRight = true; + for (unsigned i = 0, e = VL.size(); i != e; ++i) { + Instruction *I = cast<Instruction>(VL[i]); + Value *VLeft = I->getOperand(0); + Value *VRight = I->getOperand(1); + + OrigLeft.push_back(VLeft); + OrigRight.push_back(VRight); + + Instruction *ILeft = dyn_cast<Instruction>(VLeft); + Instruction *IRight = dyn_cast<Instruction>(VRight); + + // Check whether all operands on one side have the same opcode. In this case + // we want to preserve the original order and not make things worse by + // reordering. + if (i && AllSameOpcodeLeft && ILeft) { + if (Instruction *PLeft = dyn_cast<Instruction>(OrigLeft[i - 1])) { + if (PLeft->getOpcode() != ILeft->getOpcode()) + AllSameOpcodeLeft = false; + } else + AllSameOpcodeLeft = false; + } + if (i && AllSameOpcodeRight && IRight) { + if (Instruction *PRight = dyn_cast<Instruction>(OrigRight[i - 1])) { + if (PRight->getOpcode() != IRight->getOpcode()) + AllSameOpcodeRight = false; + } else + AllSameOpcodeRight = false; + } + + // Sort two opcodes. In the code below we try to preserve the ability to use + // broadcast of values instead of individual inserts. + // vl1 = load + // vl2 = phi + // vr1 = load + // vr2 = vr2 + // = vl1 x vr1 + // = vl2 x vr2 + // If we just sorted according to opcode we would leave the first line in + // tact but we would swap vl2 with vr2 because opcode(phi) > opcode(load). + // = vl1 x vr1 + // = vr2 x vl2 + // Because vr2 and vr1 are from the same load we loose the opportunity of a + // broadcast for the packed right side in the backend: we have [vr1, vl2] + // instead of [vr1, vr2=vr1]. 
+ if (ILeft && IRight) { + if (!i && ILeft->getOpcode() > IRight->getOpcode()) { + Left.push_back(IRight); + Right.push_back(ILeft); + } else if (i && ILeft->getOpcode() > IRight->getOpcode() && + Right[i - 1] != IRight) { + // Try not to destroy a broad cast for no apparent benefit. + Left.push_back(IRight); + Right.push_back(ILeft); + } else if (i && ILeft->getOpcode() == IRight->getOpcode() && + Right[i - 1] == ILeft) { + // Try preserve broadcasts. + Left.push_back(IRight); + Right.push_back(ILeft); + } else if (i && ILeft->getOpcode() == IRight->getOpcode() && + Left[i - 1] == IRight) { + // Try preserve broadcasts. + Left.push_back(IRight); + Right.push_back(ILeft); + } else { + Left.push_back(ILeft); + Right.push_back(IRight); + } + continue; + } + // One opcode, put the instruction on the right. + if (ILeft) { + Left.push_back(VRight); + Right.push_back(ILeft); + continue; + } + Left.push_back(VLeft); + Right.push_back(VRight); + } + + bool LeftBroadcast = isSplat(Left); + bool RightBroadcast = isSplat(Right); + + // If operands end up being broadcast return this operand order. + if (LeftBroadcast || RightBroadcast) + return; + + // Don't reorder if the operands where good to begin. + if (AllSameOpcodeRight || AllSameOpcodeLeft) { + Left = OrigLeft; + Right = OrigRight; + } + + // Finally check if we can get longer vectorizable chain by reordering + // without breaking the good operand order detected above. + // E.g. If we have something like- + // load a[0] load b[0] + // load b[1] load a[1] + // load a[2] load b[2] + // load a[3] load b[3] + // Reordering the second load b[1] load a[1] would allow us to vectorize + // this code and we still retain AllSameOpcode property. + // FIXME: This load reordering might break AllSameOpcode in some rare cases + // such as- + // add a[0],c[0] load b[0] + // add a[1],c[2] load b[1] + // b[2] load b[2] + // add a[3],c[3] load b[3] + for (unsigned j = 0; j < VL.size() - 1; ++j) { + if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) { + if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) { + if (isConsecutiveAccess(L, L1)) { + std::swap(Left[j + 1], Right[j + 1]); + continue; + } + } + } + if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) { + if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) { + if (isConsecutiveAccess(L, L1)) { + std::swap(Left[j + 1], Right[j + 1]); + continue; + } + } + } + // else unchanged + } +} + void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) { Instruction *VL0 = cast<Instruction>(VL[0]); BasicBlock::iterator NextInst = VL0; @@ -2214,10 +2415,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } case Instruction::ShuffleVector: { ValueList LHSVL, RHSVL; - for (int i = 0, e = E->Scalars.size(); i < e; ++i) { - LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); - RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); - } + assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand"); + reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL); setInsertPointAfterBundle(E->Scalars); Value *LHS = vectorizeTree(LHSVL); @@ -2360,7 +2559,7 @@ Value *BoUpSLP::vectorizeTree() { Scalar->replaceAllUsesWith(Undef); } DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n"); - cast<Instruction>(Scalar)->eraseFromParent(); + eraseInstruction(cast<Instruction>(Scalar)); } } @@ -2442,7 +2641,7 @@ void BoUpSLP::optimizeGatherSequence() { if (In->isIdenticalTo(*v) && DT->dominates((*v)->getParent(), In->getParent())) { In->replaceAllUsesWith(*v); - In->eraseFromParent(); + eraseInstruction(In); In 
= nullptr; break; } @@ -2460,7 +2659,7 @@ void BoUpSLP::optimizeGatherSequence() { // Groups the instructions to a bundle (which is then a single scheduling entity) // and schedules instructions until the bundle gets ready. bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, - AliasAnalysis *AA) { + BoUpSLP *SLP) { if (isa<PHINode>(VL[0])) return true; @@ -2517,7 +2716,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block " << BB->getName() << "\n"); - calculateDependencies(Bundle, true, AA); + calculateDependencies(Bundle, true, SLP); // Now try to schedule the new bundle. As soon as the bundle is "ready" it // means that there are no cyclic dependencies and we can schedule it. @@ -2648,18 +2847,9 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, } } -/// \returns the AA location that is being access by the instruction. -static AliasAnalysis::Location getLocation(Instruction *I, AliasAnalysis *AA) { - if (StoreInst *SI = dyn_cast<StoreInst>(I)) - return AA->getLocation(SI); - if (LoadInst *LI = dyn_cast<LoadInst>(I)) - return AA->getLocation(LI); - return AliasAnalysis::Location(); -} - void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, bool InsertInReadyList, - AliasAnalysis *AA) { + BoUpSLP *SLP) { assert(SD->isSchedulingEntity()); SmallVector<ScheduleData *, 10> WorkList; @@ -2704,26 +2894,60 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, // Handle the memory dependencies. ScheduleData *DepDest = BundleMember->NextLoadStore; if (DepDest) { - AliasAnalysis::Location SrcLoc = getLocation(BundleMember->Inst, AA); + Instruction *SrcInst = BundleMember->Inst; + AliasAnalysis::Location SrcLoc = getLocation(SrcInst, SLP->AA); bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory(); + unsigned numAliased = 0; + unsigned DistToSrc = 1; while (DepDest) { assert(isInSchedulingRegion(DepDest)); - if (SrcMayWrite || DepDest->Inst->mayWriteToMemory()) { - AliasAnalysis::Location DstLoc = getLocation(DepDest->Inst, AA); - if (!SrcLoc.Ptr || !DstLoc.Ptr || AA->alias(SrcLoc, DstLoc)) { - DepDest->MemoryDependencies.push_back(BundleMember); - BundleMember->Dependencies++; - ScheduleData *DestBundle = DepDest->FirstInBundle; - if (!DestBundle->IsScheduled) { - BundleMember->incrementUnscheduledDeps(1); - } - if (!DestBundle->hasValidDependencies()) { - WorkList.push_back(DestBundle); - } + + // We have two limits to reduce the complexity: + // 1) AliasedCheckLimit: It's a small limit to reduce calls to + // SLP->isAliased (which is the expensive part in this loop). + // 2) MaxMemDepDistance: It's for very large blocks and it aborts + // the whole loop (even if the loop is fast, it's quadratic). + // It's important for the loop break condition (see below) to + // check this limit even between two read-only instructions. + if (DistToSrc >= MaxMemDepDistance || + ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) && + (numAliased >= AliasedCheckLimit || + SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) { + + // We increment the counter only if the locations are aliased + // (instead of counting all alias checks). This gives a better + // balance between reduced runtime and accurate dependencies. 
+ numAliased++; + + DepDest->MemoryDependencies.push_back(BundleMember); + BundleMember->Dependencies++; + ScheduleData *DestBundle = DepDest->FirstInBundle; + if (!DestBundle->IsScheduled) { + BundleMember->incrementUnscheduledDeps(1); + } + if (!DestBundle->hasValidDependencies()) { + WorkList.push_back(DestBundle); } } DepDest = DepDest->NextLoadStore; + + // Example, explaining the loop break condition: Let's assume our + // starting instruction is i0 and MaxMemDepDistance = 3. + // + // +--------v--v--v + // i0,i1,i2,i3,i4,i5,i6,i7,i8 + // +--------^--^--^ + // + // MaxMemDepDistance let us stop alias-checking at i3 and we add + // dependencies from i0 to i3,i4,.. (even if they are not aliased). + // Previously we already added dependencies from i3 to i6,i7,i8 + // (because of MaxMemDepDistance). As we added a dependency from + // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8 + // and we can abort this loop at i6. + if (DistToSrc >= 2 * MaxMemDepDistance) + break; + DistToSrc++; } } } @@ -2779,7 +3003,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { "scheduler and vectorizer have different opinion on what is a bundle"); SD->FirstInBundle->SchedulingPriority = Idx++; if (SD->isSchedulingEntity()) { - BS->calculateDependencies(SD, false, AA); + BS->calculateDependencies(SD, false, this); NumToSchedule++; } } @@ -2833,7 +3057,7 @@ struct SLPVectorizer : public FunctionPass { AliasAnalysis *AA; LoopInfo *LI; DominatorTree *DT; - AssumptionTracker *AT; + AssumptionCache *AC; bool runOnFunction(Function &F) override { if (skipOptnoneFunction(F)) @@ -2842,12 +3066,13 @@ struct SLPVectorizer : public FunctionPass { SE = &getAnalysis<ScalarEvolution>(); DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); DL = DLP ? &DLP->getDataLayout() : nullptr; - TTI = &getAnalysis<TargetTransformInfo>(); - TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); + TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); + TLI = TLIP ? &TLIP->getTLI() : nullptr; AA = &getAnalysis<AliasAnalysis>(); - LI = &getAnalysis<LoopInfo>(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - AT = &getAnalysis<AssumptionTracker>(); + AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); StoreRefs.clear(); bool Changed = false; @@ -2870,7 +3095,10 @@ struct SLPVectorizer : public FunctionPass { // Use the bottom up slp vectorizer to construct chains that start with // store instructions. - BoUpSLP R(&F, SE, DL, TTI, TLI, AA, LI, DT, AT); + BoUpSLP R(&F, SE, DL, TTI, TLI, AA, LI, DT, AC); + + // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to + // delete instructions. // Scan the blocks in the function in post order. 
for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()), @@ -2897,13 +3125,13 @@ struct SLPVectorizer : public FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { FunctionPass::getAnalysisUsage(AU); - AU.addRequired<AssumptionTracker>(); + AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<ScalarEvolution>(); AU.addRequired<AliasAnalysis>(); - AU.addRequired<TargetTransformInfo>(); - AU.addRequired<LoopInfo>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); - AU.addPreserved<LoopInfo>(); + AU.addPreserved<LoopInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); AU.setPreservesCFG(); } @@ -3078,7 +3306,7 @@ unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) { // Check that the pointer points to scalars. Type *Ty = SI->getValueOperand()->getType(); - if (Ty->isAggregateType() || Ty->isVectorTy()) + if (!isValidElementType(Ty)) continue; // Find the base pointer. @@ -3119,7 +3347,7 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, for (int i = 0, e = VL.size(); i < e; ++i) { Type *Ty = VL[i]->getType(); - if (Ty->isAggregateType() || Ty->isVectorTy()) + if (!isValidElementType(Ty)) return false; Instruction *Inst = dyn_cast<Instruction>(VL[i]); if (!Inst || Inst->getOpcode() != Opcode0) @@ -3339,7 +3567,7 @@ public: return false; Type *Ty = B->getType(); - if (Ty->isVectorTy()) + if (!isValidElementType(Ty)) return false; ReductionOpcode = B->getOpcode(); @@ -3502,11 +3730,10 @@ private: /// \brief Emit a horizontal reduction of the vectorized value. Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) { assert(VectorizedValue && "Need to have a vectorized tree node"); - Instruction *ValToReduce = dyn_cast<Instruction>(VectorizedValue); assert(isPowerOf2_32(ReduxWidth) && "We only handle power-of-two reductions for now"); - Value *TmpVec = ValToReduce; + Value *TmpVec = VectorizedValue; for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) { if (IsPairwiseReduction) { Value *LeftMask = @@ -3730,6 +3957,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // and the iterator may become invalid value. it = BB->begin(); e = BB->end(); + break; } } } @@ -3786,8 +4014,8 @@ char SLPVectorizer::ID = 0; static const char lv_name[] = "SLP Vectorizer"; INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) -INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) -INITIALIZE_PASS_DEPENDENCY(AssumptionTracker) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false) |
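
For readers skimming the diff, the two scheduler changes above are easier to see in isolation: the alias-result cache keyed by instruction pairs (BoUpSLP::isAliased), and the AliasedCheckLimit / MaxMemDepDistance cut-offs inside calculateDependencies. The following is a minimal standalone sketch of that scheme, not the patch itself: MockInst, mayAlias, and the driver in main are invented for illustration, and std::map / std::optional stand in for the DenseMap / llvm::Optional used in the real code. It also hints at why the patch introduces eraseInstruction(): the cache keys are raw pointers, so deleted instructions must not have their storage freed (and their addresses reused) while the cache is still live.

// Standalone sketch of the alias-query caching and dependence-limiting scheme
// introduced by this patch. Everything here (MockInst, mayAlias, the driver)
// is illustrative; the real code lives in BoUpSLP / BlockScheduling.
#include <cstdio>
#include <map>
#include <memory>
#include <optional>
#include <utility>
#include <vector>

struct MockInst {       // stand-in for llvm::Instruction
  int Id;
  bool WritesMemory;
  int Object;           // pretend "underlying object" of the memory access
};

// Pretend alias oracle: accesses alias iff they touch the same object.
// In the real patch this is the (expensive) AliasAnalysis::alias() query.
static bool mayAlias(const MockInst &A, const MockInst &B) {
  return A.Object == B.Object;
}

// Mirrors the limits added by the patch (values chosen there empirically).
static const unsigned AliasedCheckLimit = 10;  // max real alias checks per source
static const unsigned MaxMemDepDistance = 160; // max scan distance in the block

class Scheduler {
public:
  using Key = std::pair<const MockInst *, const MockInst *>;

  // Cached wrapper around the alias oracle, analogous to BoUpSLP::isAliased().
  bool isAliased(const MockInst *A, const MockInst *B) {
    std::optional<bool> &Slot = AliasCache[{A, B}];
    if (Slot)
      return *Slot;              // cache hit: skip the expensive query
    bool Result = mayAlias(*A, *B);
    Slot = Result;               // memoize for later bundles
    return Result;
  }

  // Conservatively add memory dependencies from Src to later memory
  // instructions, following the patch's two cut-offs: stop doing real alias
  // checks after AliasedCheckLimit aliased pairs, and add blanket
  // dependencies (and finally abort) once the scan distance grows too large.
  void addMemoryDeps(const MockInst *Src,
                     const std::vector<const MockInst *> &Later,
                     std::vector<Key> &Deps) {
    unsigned NumAliased = 0, DistToSrc = 1;
    for (const MockInst *Dst : Later) {
      if (DistToSrc >= MaxMemDepDistance ||
          ((Src->WritesMemory || Dst->WritesMemory) &&
           (NumAliased >= AliasedCheckLimit || isAliased(Src, Dst)))) {
        ++NumAliased;               // count only pairs treated as aliased
        Deps.push_back({Src, Dst}); // record the dependency edge
      }
      if (DistToSrc >= 2 * MaxMemDepDistance)
        break;                      // transitive dependencies cover the rest
      ++DistToSrc;
    }
  }

private:
  std::map<Key, std::optional<bool>> AliasCache; // memoized alias results
};

int main() {
  // Own the instructions via unique_ptr so addresses stay stable while the
  // cache is alive -- the same reason the patch delays the actual deletion
  // in eraseInstruction() until BoUpSLP is destructed.
  std::vector<std::unique_ptr<MockInst>> Pool;
  std::vector<const MockInst *> Later;
  for (int i = 0; i < 6; ++i) {
    Pool.push_back(std::make_unique<MockInst>(MockInst{i, i % 2 == 0, i % 3}));
    if (i > 0)
      Later.push_back(Pool.back().get());
  }

  Scheduler S;
  std::vector<Scheduler::Key> Deps;
  S.addMemoryDeps(Pool.front().get(), Later, Deps);
  std::printf("added %zu memory dependencies\n", Deps.size());
  return 0;
}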