diff options
Diffstat (limited to 'lib/Transforms/Vectorize')
| -rw-r--r-- | lib/Transforms/Vectorize/Android.mk | 4 | ||||
| -rw-r--r-- | lib/Transforms/Vectorize/CMakeLists.txt | 2 | ||||
| -rw-r--r-- | lib/Transforms/Vectorize/LoopVectorize.cpp | 370 | ||||
| -rw-r--r-- | lib/Transforms/Vectorize/SLPVectorizer.cpp | 348 | ||||
| -rw-r--r-- | lib/Transforms/Vectorize/VecUtils.cpp | 730 | ||||
| -rw-r--r-- | lib/Transforms/Vectorize/VecUtils.h | 164 | ||||
| -rw-r--r-- | lib/Transforms/Vectorize/Vectorize.cpp | 7 |
7 files changed, 1543 insertions, 82 deletions
diff --git a/lib/Transforms/Vectorize/Android.mk b/lib/Transforms/Vectorize/Android.mk index de03793..58698fe 100644 --- a/lib/Transforms/Vectorize/Android.mk +++ b/lib/Transforms/Vectorize/Android.mk @@ -3,7 +3,9 @@ LOCAL_PATH:= $(call my-dir) transforms_vectorize_SRC_FILES := \ BBVectorize.cpp \ LoopVectorize.cpp \ - Vectorize.cpp + SLPVectorizer.cpp \ + Vectorize.cpp \ + VecUtils.cpp # For the host # ===================================================== diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt index e64034a..7ae082f 100644 --- a/lib/Transforms/Vectorize/CMakeLists.txt +++ b/lib/Transforms/Vectorize/CMakeLists.txt @@ -2,6 +2,8 @@ add_llvm_library(LLVMVectorize BBVectorize.cpp Vectorize.cpp LoopVectorize.cpp + SLPVectorizer.cpp + VecUtils.cpp ) add_dependencies(LLVMVectorize intrinsics_gen) diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 930d9c4..9a832f7 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -78,6 +78,7 @@ #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/PatternMatch.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Scalar.h" @@ -87,6 +88,7 @@ #include <map> using namespace llvm; +using namespace llvm::PatternMatch; static cl::opt<unsigned> VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, @@ -112,9 +114,9 @@ TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), /// We don't unroll loops with a known constant trip count below this number. static const unsigned TinyTripCountUnrollThreshold = 128; -/// When performing a runtime memory check, do not check more than this -/// number of pointers. Notice that the check is quadratic! -static const unsigned RuntimeMemoryCheckThreshold = 4; +/// When performing memory disambiguation checks at runtime do not make more +/// than this number of comparisons. +static const unsigned RuntimeMemoryCheckThreshold = 8; /// We use a metadata with this name to indicate that a scalar loop was /// vectorized and that we don't need to re-vectorize it if we run into it @@ -343,6 +345,7 @@ public: RK_IntegerOr, ///< Bitwise or logical OR of numbers. RK_IntegerAnd, ///< Bitwise or logical AND of numbers. RK_IntegerXor, ///< Bitwise or logical XOR of numbers. + RK_IntegerMinMax, ///< Min/max implemented in terms of select(cmp()). RK_FloatAdd, ///< Sum of floats. RK_FloatMult ///< Product of floats. }; @@ -356,13 +359,23 @@ public: IK_ReversePtrInduction ///< Reverse ptr indvar. Step = - sizeof(elem). }; + // This enum represents the kind of minmax reduction. + enum MinMaxReductionKind { + MRK_Invalid, + MRK_UIntMin, + MRK_UIntMax, + MRK_SIntMin, + MRK_SIntMax + }; + /// This POD struct holds information about reduction variables. struct ReductionDescriptor { ReductionDescriptor() : StartValue(0), LoopExitInstr(0), - Kind(RK_NoReduction) {} + Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {} - ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K) - : StartValue(Start), LoopExitInstr(Exit), Kind(K) {} + ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K, + MinMaxReductionKind MK) + : StartValue(Start), LoopExitInstr(Exit), Kind(K), MinMaxKind(MK) {} // The starting value of the reduction. // It does not have to be zero! @@ -371,6 +384,25 @@ public: Instruction *LoopExitInstr; // The kind of the reduction. ReductionKind Kind; + // If this a min/max reduction the kind of reduction. + MinMaxReductionKind MinMaxKind; + }; + + /// This POD struct holds information about a potential reduction operation. + struct ReductionInstDesc { + ReductionInstDesc(bool IsRedux, Instruction *I) : + IsReduction(IsRedux), PatternLastInst(I), MinMaxKind(MRK_Invalid) {} + + ReductionInstDesc(Instruction *I, MinMaxReductionKind K) : + IsReduction(true), PatternLastInst(I), MinMaxKind(K) {} + + // Is this instruction a reduction candidate. + bool IsReduction; + // The last instruction in a min/max pattern (select of the select(icmp()) + // pattern), or the current reduction instruction otherwise. + Instruction *PatternLastInst; + // If this is a min/max pattern the comparison predicate. + MinMaxReductionKind MinMaxKind; }; // This POD struct holds information about the memory runtime legality @@ -387,7 +419,7 @@ public: } /// Insert a pointer and calculate the start and end SCEVs. - void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr); + void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr); /// This flag indicates if we need to add the runtime check. bool Need; @@ -397,6 +429,8 @@ public: SmallVector<const SCEV*, 2> Starts; /// Holds the pointer value at the end of the loop. SmallVector<const SCEV*, 2> Ends; + /// Holds the information if this pointer is used for writing to memory. + SmallVector<bool, 2> IsWritePtr; }; /// A POD for saving information about induction variables. @@ -461,6 +495,11 @@ public: /// Returns the information that we collected about runtime memory check. RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; } + + /// This function returns the identity element (or neutral element) for + /// the operation K. + static Constant *getReductionIdentity(ReductionKind K, Type *Tp, + MinMaxReductionKind MinMaxK); private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count @@ -487,9 +526,17 @@ private: /// Returns True, if 'Phi' is the kind of reduction variable for type /// 'Kind'. If this is a reduction variable, it adds it to ReductionList. bool AddReductionVar(PHINode *Phi, ReductionKind Kind); - /// Returns true if the instruction I can be a reduction variable of type - /// 'Kind'. - bool isReductionInstr(Instruction *I, ReductionKind Kind); + /// Returns a struct describing if the instruction 'I' can be a reduction + /// variable of type 'Kind'. If the reduction is a min/max pattern of + /// select(icmp()) this function advances the instruction pointer 'I' from the + /// compare instruction to the select instruction and stores this pointer in + /// 'PatternLastInst' member of the returned struct. + ReductionInstDesc isReductionInstr(Instruction *I, ReductionKind Kind, + ReductionInstDesc &Desc); + /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction + /// pattern corresponding to a min(X, Y) or max(X, Y). + static ReductionInstDesc isMinMaxSelectCmpPattern(Instruction *I, + ReductionInstDesc &Prev); /// Returns the induction kind of Phi. This function may return NoInduction /// if the PHI is not an induction variable. InductionKind isInductionVariable(PHINode *Phi); @@ -662,6 +709,11 @@ struct LoopVectorize : public LoopPass { AA = getAnalysisIfAvailable<AliasAnalysis>(); TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); + if (DL == NULL) { + DEBUG(dbgs() << "LV: Not vectorizing because of missing data layout"); + return false; + } + DEBUG(dbgs() << "LV: Checking a loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); @@ -737,7 +789,8 @@ struct LoopVectorize : public LoopPass { void LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE, - Loop *Lp, Value *Ptr) { + Loop *Lp, Value *Ptr, + bool WritePtr) { const SCEV *Sc = SE->getSCEV(Ptr); const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc); assert(AR && "Invalid addrec expression"); @@ -746,6 +799,7 @@ LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE, Pointers.push_back(Ptr); Starts.push_back(AR->getStart()); Ends.push_back(ScEnd); + IsWritePtr.push_back(WritePtr); } Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { @@ -906,12 +960,18 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment(); + unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy); + unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF; + + if (ScalarAllocatedSize != VectorElementSize) + return scalarizeInstruction(Instr); + // If the pointer is loop invariant or if it is non consecutive, // scalarize the load. - int Stride = Legal->isConsecutivePtr(Ptr); - bool Reverse = Stride < 0; + int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); + bool Reverse = ConsecutiveStride < 0; bool UniformLoad = LI && Legal->isUniform(Ptr); - if (Stride == 0 || UniformLoad) + if (!ConsecutiveStride || UniformLoad) return scalarizeInstruction(Instr); Constant *Zero = Builder.getInt32(0); @@ -1040,10 +1100,10 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) { // Create a new entry in the WidenMap and initialize it to Undef or Null. VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); - // For each scalar that we create: - for (unsigned Width = 0; Width < VF; ++Width) { - // For each vector unroll 'part': - for (unsigned Part = 0; Part < UF; ++Part) { + // For each vector unroll 'part': + for (unsigned Part = 0; Part < UF; ++Part) { + // For each scalar that we create: + for (unsigned Width = 0; Width < VF; ++Width) { Instruction *Cloned = Instr->clone(); if (!IsVoidRetTy) Cloned->setName(Instr->getName() + ".cloned"); @@ -1110,6 +1170,10 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, for (unsigned i = 0; i < NumPointers; ++i) { for (unsigned j = i+1; j < NumPointers; ++j) { + // No need to check if two readonly pointers intersect. + if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j]) + continue; + Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy, "bc"); Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy, "bc"); Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy, "bc"); @@ -1436,26 +1500,45 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { /// This function returns the identity element (or neutral element) for /// the operation K. -static Constant* -getReductionIdentity(LoopVectorizationLegality::ReductionKind K, Type *Tp) { +Constant* +LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp, + MinMaxReductionKind MinMaxK) { switch (K) { - case LoopVectorizationLegality:: RK_IntegerXor: - case LoopVectorizationLegality:: RK_IntegerAdd: - case LoopVectorizationLegality:: RK_IntegerOr: + case RK_IntegerXor: + case RK_IntegerAdd: + case RK_IntegerOr: // Adding, Xoring, Oring zero to a number does not change it. return ConstantInt::get(Tp, 0); - case LoopVectorizationLegality:: RK_IntegerMult: + case RK_IntegerMult: // Multiplying a number by 1 does not change it. return ConstantInt::get(Tp, 1); - case LoopVectorizationLegality:: RK_IntegerAnd: + case RK_IntegerAnd: // AND-ing a number with an all-1 value does not change it. return ConstantInt::get(Tp, -1, true); - case LoopVectorizationLegality:: RK_FloatMult: + case RK_FloatMult: // Multiplying a number by 1 does not change it. return ConstantFP::get(Tp, 1.0L); - case LoopVectorizationLegality:: RK_FloatAdd: + case RK_FloatAdd: // Adding zero to a number does not change it. return ConstantFP::get(Tp, 0.0L); + case RK_IntegerMinMax: + switch(MinMaxK) { + default: llvm_unreachable("Unknown min/max predicate"); + case MRK_UIntMin: + return ConstantInt::getAllOnesValue(Tp); + case MRK_UIntMax: + return ConstantInt::get(Tp, 0); + case MRK_SIntMin: { + unsigned BitWidth = Tp->getPrimitiveSizeInBits(); + return ConstantInt::get(Tp->getContext(), + APInt::getSignedMaxValue(BitWidth)); + } + case LoopVectorizationLegality::MRK_SIntMax: { + unsigned BitWidth = Tp->getPrimitiveSizeInBits(); + return ConstantInt::get(Tp->getContext(), + APInt::getSignedMinValue(BitWidth)); + } + } default: llvm_unreachable("Unknown reduction kind"); } @@ -1566,7 +1649,7 @@ getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) { } /// This function translates the reduction kind to an LLVM binary operator. -static Instruction::BinaryOps +static unsigned getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) { switch (Kind) { case LoopVectorizationLegality::RK_IntegerAdd: @@ -1583,11 +1666,38 @@ getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) { return Instruction::FMul; case LoopVectorizationLegality::RK_FloatAdd: return Instruction::FAdd; + case LoopVectorizationLegality::RK_IntegerMinMax: + return Instruction::ICmp; default: llvm_unreachable("Unknown reduction operation"); } } +Value *createMinMaxOp(IRBuilder<> &Builder, + LoopVectorizationLegality::MinMaxReductionKind RK, + Value *Left, + Value *Right) { + CmpInst::Predicate P = CmpInst::ICMP_NE; + switch (RK) { + default: + llvm_unreachable("Unknown min/max reduction kind"); + case LoopVectorizationLegality::MRK_UIntMin: + P = CmpInst::ICMP_ULT; + break; + case LoopVectorizationLegality::MRK_UIntMax: + P = CmpInst::ICMP_UGT; + break; + case LoopVectorizationLegality::MRK_SIntMin: + P = CmpInst::ICMP_SLT; + break; + case LoopVectorizationLegality::MRK_SIntMax: + P = CmpInst::ICMP_SGT; + } + Value *Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp"); + Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select"); + return Select; +} + void InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { //===------------------------------------------------===// @@ -1651,7 +1761,10 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // Find the reduction identity variable. Zero for addition, or, xor, // one for multiplication, -1 for And. - Constant *Iden = getReductionIdentity(RdxDesc.Kind, VecTy->getScalarType()); + Constant *Iden = + LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind, + VecTy->getScalarType(), + RdxDesc.MinMaxKind); Constant *Identity = ConstantVector::getSplat(VF, Iden); // This vector is the Identity vector where the first element is the @@ -1699,10 +1812,15 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // Reduce all of the unrolled parts into a single vector. Value *ReducedPartRdx = RdxParts[0]; + unsigned Op = getReductionBinOp(RdxDesc.Kind); for (unsigned part = 1; part < UF; ++part) { - Instruction::BinaryOps Op = getReductionBinOp(RdxDesc.Kind); - ReducedPartRdx = Builder.CreateBinOp(Op, RdxParts[part], ReducedPartRdx, - "bin.rdx"); + if (Op != Instruction::ICmp) + ReducedPartRdx = Builder.CreateBinOp((Instruction::BinaryOps)Op, + RdxParts[part], ReducedPartRdx, + "bin.rdx"); + else + ReducedPartRdx = createMinMaxOp(Builder, RdxDesc.MinMaxKind, + ReducedPartRdx, RdxParts[part]); } // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles @@ -1727,8 +1845,11 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { ConstantVector::get(ShuffleMask), "rdx.shuf"); - Instruction::BinaryOps Op = getReductionBinOp(RdxDesc.Kind); - TmpVec = Builder.CreateBinOp(Op, TmpVec, Shuf, "bin.rdx"); + if (Op != Instruction::ICmp) + TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf, + "bin.rdx"); + else + TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf); } // The result is in the first element of the vector. @@ -2315,6 +2436,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n"); continue; } + if (AddReductionVar(Phi, RK_IntegerMinMax)) { + DEBUG(dbgs() << "LV: Found a MINMAX reduction PHI."<< *Phi <<"\n"); + continue; + } if (AddReductionVar(Phi, RK_FloatMult)) { DEBUG(dbgs() << "LV: Found an FMult reduction PHI."<< *Phi <<"\n"); continue; @@ -2442,13 +2567,6 @@ LoopVectorizationLegality::hasPossibleGlobalWriteReorder( bool LoopVectorizationLegality::canVectorizeMemory() { - if (TheLoop->isAnnotatedParallel()) { - DEBUG(dbgs() - << "LV: A loop annotated parallel, ignore memory dependency " - << "checks.\n"); - return true; - } - typedef SmallVector<Value*, 16> ValueVector; typedef SmallPtrSet<Value*, 16> ValueSet; // Holds the Load and Store *instructions*. @@ -2457,6 +2575,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() { PtrRtCheck.Pointers.clear(); PtrRtCheck.Need = false; + const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); + // For each block. for (Loop::block_iterator bb = TheLoop->block_begin(), be = TheLoop->block_end(); bb != be; ++bb) { @@ -2471,7 +2591,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { if (it->mayReadFromMemory()) { LoadInst *Ld = dyn_cast<LoadInst>(it); if (!Ld) return false; - if (!Ld->isSimple()) { + if (!Ld->isSimple() && !IsAnnotatedParallel) { DEBUG(dbgs() << "LV: Found a non-simple load.\n"); return false; } @@ -2483,7 +2603,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { if (it->mayWriteToMemory()) { StoreInst *St = dyn_cast<StoreInst>(it); if (!St) return false; - if (!St->isSimple()) { + if (!St->isSimple() && !IsAnnotatedParallel) { DEBUG(dbgs() << "LV: Found a non-simple store.\n"); return false; } @@ -2530,6 +2650,13 @@ bool LoopVectorizationLegality::canVectorizeMemory() { ReadWrites.insert(std::make_pair(Ptr, ST)); } + if (IsAnnotatedParallel) { + DEBUG(dbgs() + << "LV: A loop annotated parallel, ignore memory dependency " + << "checks.\n"); + return true; + } + for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) { LoadInst *LD = cast<LoadInst>(*I); Value* Ptr = LD->getPointerOperand(); @@ -2552,6 +2679,9 @@ bool LoopVectorizationLegality::canVectorizeMemory() { return true; } + unsigned NumReadPtrs = 0; + unsigned NumWritePtrs = 0; + // Find pointers with computable bounds. We are going to use this information // to place a runtime bound check. bool CanDoRT = true; @@ -2559,7 +2689,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() { for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) { Value *V = (*MI).first; if (hasComputableBounds(V)) { - PtrRtCheck.insert(SE, TheLoop, V); + PtrRtCheck.insert(SE, TheLoop, V, true); + NumWritePtrs++; DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n"); } else { CanDoRT = false; @@ -2569,7 +2700,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() { for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) { Value *V = (*MI).first; if (hasComputableBounds(V)) { - PtrRtCheck.insert(SE, TheLoop, V); + PtrRtCheck.insert(SE, TheLoop, V, false); + NumReadPtrs++; DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n"); } else { CanDoRT = false; @@ -2579,7 +2711,9 @@ bool LoopVectorizationLegality::canVectorizeMemory() { // Check that we did not collect too many pointers or found a // unsizeable pointer. - if (!CanDoRT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) { + unsigned NumComparisons = (NumWritePtrs * (NumReadPtrs + NumWritePtrs - 1)); + DEBUG(dbgs() << "LV: We need to compare " << NumComparisons << " ptrs.\n"); + if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) { PtrRtCheck.reset(); CanDoRT = false; } @@ -2733,7 +2867,18 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, // used as reduction variables (such as ADD). We may have a single // out-of-block user. The cycle must end with the original PHI. Instruction *Iter = Phi; - while (true) { + + // To recognize min/max patterns formed by a icmp select sequence, we store + // the number of instruction we saw from the recognized min/max pattern, + // such that we don't stop when we see the phi has two uses (one by the select + // and one by the icmp) and to make sure we only see exactly the two + // instructions. + unsigned NumICmpSelectPatternInst = 0; + ReductionInstDesc ReduxDesc(false, 0); + + // Avoid cycles in the chain. + SmallPtrSet<Instruction *, 8> VisitedInsts; + while (VisitedInsts.insert(Iter)) { // If the instruction has no users then this is a broken // chain and can't be a reduction variable. if (Iter->use_empty()) @@ -2747,9 +2892,6 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, // Is this a bin op ? FoundBinOp |= !isa<PHINode>(Iter); - // Remember the current instruction. - Instruction *OldIter = Iter; - // For each of the *users* of iter. for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end(); it != e; ++it) { @@ -2778,25 +2920,33 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, Iter->hasNUsesOrMore(2)) continue; - // We can't have multiple inside users. - if (FoundInBlockUser) + // We can't have multiple inside users except for a combination of + // icmp/select both using the phi. + if (FoundInBlockUser && !NumICmpSelectPatternInst) return false; FoundInBlockUser = true; // Any reduction instr must be of one of the allowed kinds. - if (!isReductionInstr(U, Kind)) + ReduxDesc = isReductionInstr(U, Kind, ReduxDesc); + if (!ReduxDesc.IsReduction) return false; + if (Kind == RK_IntegerMinMax && (isa<ICmpInst>(U) || + isa<SelectInst>(U))) + ++NumICmpSelectPatternInst; + // Reductions of instructions such as Div, and Sub is only // possible if the LHS is the reduction variable. - if (!U->isCommutative() && !isa<PHINode>(U) && U->getOperand(0) != Iter) + if (!U->isCommutative() && !isa<PHINode>(U) && !isa<SelectInst>(U) && + !isa<ICmpInst>(U) && U->getOperand(0) != Iter) return false; - Iter = U; + Iter = ReduxDesc.PatternLastInst; } - // If all uses were skipped this can't be a reduction variable. - if (Iter == OldIter) + // This means we have seen one but not the other instruction of the + // pattern or more than just a select and cmp. + if (Kind == RK_IntegerMinMax && NumICmpSelectPatternInst != 2) return false; // We found a reduction var if we have reached the original @@ -2807,47 +2957,94 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, AllowedExit.insert(ExitInstruction); // Save the description of this reduction variable. - ReductionDescriptor RD(RdxStart, ExitInstruction, Kind); + ReductionDescriptor RD(RdxStart, ExitInstruction, Kind, + ReduxDesc.MinMaxKind); Reductions[Phi] = RD; // We've ended the cycle. This is a reduction variable if we have an // outside user and it has a binary op. return FoundBinOp && ExitInstruction; } } + + return false; } -bool +/// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction +/// pattern corresponding to a min(X, Y) or max(X, Y). +LoopVectorizationLegality::ReductionInstDesc +LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I, ReductionInstDesc &Prev) { + + assert((isa<ICmpInst>(I) || isa<SelectInst>(I)) && + "Expect a select instruction"); + ICmpInst *Cmp = 0; + SelectInst *Select = 0; + + // We must handle the select(cmp()) as a single instruction. Advance to the + // select. + if ((Cmp = dyn_cast<ICmpInst>(I))) { + if (!Cmp->hasOneUse() || !(Select = dyn_cast<SelectInst>(*I->use_begin()))) + return ReductionInstDesc(false, I); + return ReductionInstDesc(Select, Prev.MinMaxKind); + } + + // Only handle single use cases for now. + if (!(Select = dyn_cast<SelectInst>(I))) + return ReductionInstDesc(false, I); + if (!(Cmp = dyn_cast<ICmpInst>(I->getOperand(0)))) + return ReductionInstDesc(false, I); + if (!Cmp->hasOneUse()) + return ReductionInstDesc(false, I); + + Value *CmpLeft = Cmp->getOperand(0); + Value *CmpRight = Cmp->getOperand(1); + + // Look for a min/max pattern. + if (m_UMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, MRK_UIntMin); + else if (m_UMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, MRK_UIntMax); + else if (m_SMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, MRK_SIntMax); + else if (m_SMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, MRK_SIntMin); + + return ReductionInstDesc(false, I); +} + +LoopVectorizationLegality::ReductionInstDesc LoopVectorizationLegality::isReductionInstr(Instruction *I, - ReductionKind Kind) { + ReductionKind Kind, + ReductionInstDesc &Prev) { bool FP = I->getType()->isFloatingPointTy(); bool FastMath = (FP && I->isCommutative() && I->isAssociative()); - switch (I->getOpcode()) { default: - return false; + return ReductionInstDesc(false, I); case Instruction::PHI: if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd)) - return false; - // possibly. - return true; + return ReductionInstDesc(false, I); + return ReductionInstDesc(I, Prev.MinMaxKind); case Instruction::Sub: case Instruction::Add: - return Kind == RK_IntegerAdd; - case Instruction::SDiv: - case Instruction::UDiv: + return ReductionInstDesc(Kind == RK_IntegerAdd, I); case Instruction::Mul: - return Kind == RK_IntegerMult; + return ReductionInstDesc(Kind == RK_IntegerMult, I); case Instruction::And: - return Kind == RK_IntegerAnd; + return ReductionInstDesc(Kind == RK_IntegerAnd, I); case Instruction::Or: - return Kind == RK_IntegerOr; + return ReductionInstDesc(Kind == RK_IntegerOr, I); case Instruction::Xor: - return Kind == RK_IntegerXor; + return ReductionInstDesc(Kind == RK_IntegerXor, I); case Instruction::FMul: - return Kind == RK_FloatMult && FastMath; + return ReductionInstDesc(Kind == RK_FloatMult && FastMath, I); case Instruction::FAdd: - return Kind == RK_FloatAdd && FastMath; - } + return ReductionInstDesc(Kind == RK_FloatAdd && FastMath, I); + case Instruction::ICmp: + case Instruction::Select: + if (Kind != RK_IntegerMinMax) + return ReductionInstDesc(false, I); + return isMinMaxSelectCmpPattern(I, Prev); + } } LoopVectorizationLegality::InductionKind @@ -3331,8 +3528,19 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { case Instruction::AShr: case Instruction::And: case Instruction::Or: - case Instruction::Xor: - return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy); + case Instruction::Xor: { + // Certain instructions can be cheaper to vectorize if they have a constant + // second vector operand. One example of this are shifts on x86. + TargetTransformInfo::OperandValueKind Op1VK = + TargetTransformInfo::OK_AnyValue; + TargetTransformInfo::OperandValueKind Op2VK = + TargetTransformInfo::OK_AnyValue; + + if (isa<ConstantInt>(I->getOperand(1))) + Op2VK = TargetTransformInfo::OK_UniformConstantValue; + + return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK); + } case Instruction::Select: { SelectInst *SI = cast<SelectInst>(I); const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); @@ -3369,9 +3577,11 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); // Scalarized loads/stores. - int Stride = Legal->isConsecutivePtr(Ptr); - bool Reverse = Stride < 0; - if (0 == Stride) { + int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); + bool Reverse = ConsecutiveStride < 0; + unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ValTy); + unsigned VectorElementSize = DL->getTypeStoreSize(VectorTy)/VF; + if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) { unsigned Cost = 0; // The cost of extracting from the value vector and pointer vector. Type *PtrTy = ToVectorTy(Ptr->getType(), VF); diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp new file mode 100644 index 0000000..cc30cc9 --- /dev/null +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -0,0 +1,348 @@ +//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// This pass implements the Bottom Up SLP vectorizer. It detects consecutive +// stores that can be put together into vector-stores. Next, it attempts to +// construct vectorizable tree using the use-def chains. If a profitable tree +// was found, the SLP vectorizer performs vectorization on the tree. +// +// The pass is inspired by the work described in the paper: +// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks. +// +//===----------------------------------------------------------------------===// +#define SV_NAME "slp-vectorizer" +#define DEBUG_TYPE SV_NAME + +#include "VecUtils.h" +#include "llvm/Transforms/Vectorize.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/Verifier.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include <map> + +using namespace llvm; + +static cl::opt<int> +SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, + cl::desc("Only vectorize trees if the gain is above this " + "number. (gain = -cost of vectorization)")); +namespace { + +/// The SLPVectorizer Pass. +struct SLPVectorizer : public FunctionPass { + typedef std::map<Value*, BoUpSLP::StoreList> StoreListMap; + + /// Pass identification, replacement for typeid + static char ID; + + explicit SLPVectorizer() : FunctionPass(ID) { + initializeSLPVectorizerPass(*PassRegistry::getPassRegistry()); + } + + ScalarEvolution *SE; + DataLayout *DL; + TargetTransformInfo *TTI; + AliasAnalysis *AA; + LoopInfo *LI; + + virtual bool runOnFunction(Function &F) { + SE = &getAnalysis<ScalarEvolution>(); + DL = getAnalysisIfAvailable<DataLayout>(); + TTI = &getAnalysis<TargetTransformInfo>(); + AA = &getAnalysis<AliasAnalysis>(); + LI = &getAnalysis<LoopInfo>(); + + StoreRefs.clear(); + bool Changed = false; + + // Must have DataLayout. We can't require it because some tests run w/o + // triple. + if (!DL) + return false; + + for (Function::iterator it = F.begin(), e = F.end(); it != e; ++it) { + BasicBlock *BB = it; + bool BBChanged = false; + + // Use the bollom up slp vectorizer to construct chains that start with + // he store instructions. + BoUpSLP R(BB, SE, DL, TTI, AA, LI->getLoopFor(BB)); + + // Vectorize trees that end at reductions. + BBChanged |= vectorizeReductions(BB, R); + + // Vectorize trees that end at stores. + if (unsigned count = collectStores(BB, R)) { + (void)count; + DEBUG(dbgs()<<"SLP: Found " << count << " stores to vectorize.\n"); + BBChanged |= vectorizeStoreChains(R); + } + + // Try to hoist some of the scalarization code to the preheader. + if (BBChanged) hoistGatherSequence(LI, BB, R); + + Changed |= BBChanged; + } + + if (Changed) { + DEBUG(dbgs()<<"SLP: vectorized \""<<F.getName()<<"\"\n"); + DEBUG(verifyFunction(F)); + } + return Changed; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + FunctionPass::getAnalysisUsage(AU); + AU.addRequired<ScalarEvolution>(); + AU.addRequired<AliasAnalysis>(); + AU.addRequired<TargetTransformInfo>(); + AU.addRequired<LoopInfo>(); + } + +private: + + /// \brief Collect memory references and sort them according to their base + /// object. We sort the stores to their base objects to reduce the cost of the + /// quadratic search on the stores. TODO: We can further reduce this cost + /// if we flush the chain creation every time we run into a memory barrier. + unsigned collectStores(BasicBlock *BB, BoUpSLP &R); + + /// \brief Try to vectorize a chain that starts at two arithmetic instrs. + bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R); + + /// \brief Try to vectorize a list of operands. + bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R); + + /// \brief Try to vectorize a chain that may start at the operands of \V; + bool tryToVectorize(BinaryOperator *V, BoUpSLP &R); + + /// \brief Vectorize the stores that were collected in StoreRefs. + bool vectorizeStoreChains(BoUpSLP &R); + + /// \brief Try to hoist gather sequences outside of the loop in cases where + /// all of the sources are loop invariant. + void hoistGatherSequence(LoopInfo *LI, BasicBlock *BB, BoUpSLP &R); + + /// \brief Scan the basic block and look for reductions that may start a + /// vectorization chain. + bool vectorizeReductions(BasicBlock *BB, BoUpSLP &R); + +private: + StoreListMap StoreRefs; +}; + +unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) { + unsigned count = 0; + StoreRefs.clear(); + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + StoreInst *SI = dyn_cast<StoreInst>(it); + if (!SI) + continue; + + // Check that the pointer points to scalars. + Type *Ty = SI->getValueOperand()->getType(); + if (Ty->isAggregateType() || Ty->isVectorTy()) + return 0; + + // Find the base of the GEP. + Value *Ptr = SI->getPointerOperand(); + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) + Ptr = GEP->getPointerOperand(); + + // Save the store locations. + StoreRefs[Ptr].push_back(SI); + count++; + } + return count; +} + +bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { + if (!A || !B) return false; + Value *VL[] = { A, B }; + return tryToVectorizeList(VL, R); +} + +bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R) { + DEBUG(dbgs()<<"SLP: Vectorizing a list of length = " << VL.size() << ".\n"); + + // Check that all of the parts are scalar. + for (int i = 0, e = VL.size(); i < e; ++i) { + Type *Ty = VL[i]->getType(); + if (Ty->isAggregateType() || Ty->isVectorTy()) + return 0; + } + + int Cost = R.getTreeCost(VL); + int ExtrCost = R.getScalarizationCost(VL); + DEBUG(dbgs()<<"SLP: Cost of pair:" << Cost << + " Cost of extract:" << ExtrCost << ".\n"); + if ((Cost+ExtrCost) >= -SLPCostThreshold) return false; + DEBUG(dbgs()<<"SLP: Vectorizing pair.\n"); + R.vectorizeArith(VL); + return true; +} + +bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) { + if (!V) return false; + // Try to vectorize V. + if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R)) + return true; + + BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0)); + BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1)); + // Try to skip B. + if (B && B->hasOneUse()) { + BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0)); + BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1)); + if (tryToVectorizePair(A, B0, R)) { + B->moveBefore(V); + return true; + } + if (tryToVectorizePair(A, B1, R)) { + B->moveBefore(V); + return true; + } + } + + // Try to skip A. + if (A && A->hasOneUse()) { + BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0)); + BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1)); + if (tryToVectorizePair(A0, B, R)) { + A->moveBefore(V); + return true; + } + if (tryToVectorizePair(A1, B, R)) { + A->moveBefore(V); + return true; + } + } + return 0; +} + +bool SLPVectorizer::vectorizeReductions(BasicBlock *BB, BoUpSLP &R) { + bool Changed = false; + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + if (isa<DbgInfoIntrinsic>(it)) continue; + + // Try to vectorize reductions that use PHINodes. + if (PHINode *P = dyn_cast<PHINode>(it)) { + // Check that the PHI is a reduction PHI. + if (P->getNumIncomingValues() != 2) return Changed; + Value *Rdx = (P->getIncomingBlock(0) == BB ? P->getIncomingValue(0) : + (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) : + 0)); + // Check if this is a Binary Operator. + BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx); + if (!BI) + continue; + + Value *Inst = BI->getOperand(0); + if (Inst == P) Inst = BI->getOperand(1); + Changed |= tryToVectorize(dyn_cast<BinaryOperator>(Inst), R); + continue; + } + + // Try to vectorize trees that start at compare instructions. + if (CmpInst *CI = dyn_cast<CmpInst>(it)) { + if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) { + Changed |= true; + continue; + } + for (int i = 0; i < 2; ++i) + if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) + Changed |= tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R); + continue; + } + } + + return Changed; +} + +bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) { + bool Changed = false; + // Attempt to sort and vectorize each of the store-groups. + for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end(); + it != e; ++it) { + if (it->second.size() < 2) + continue; + + DEBUG(dbgs()<<"SLP: Analyzing a store chain of length " << + it->second.size() << ".\n"); + + Changed |= R.vectorizeStores(it->second, -SLPCostThreshold); + } + return Changed; +} + +void SLPVectorizer::hoistGatherSequence(LoopInfo *LI, BasicBlock *BB, + BoUpSLP &R) { + // Check if this block is inside a loop. + Loop *L = LI->getLoopFor(BB); + if (!L) + return; + + // Check if it has a preheader. + BasicBlock *PreHeader = L->getLoopPreheader(); + if (!PreHeader) + return; + + // Mark the insertion point for the block. + Instruction *Location = PreHeader->getTerminator(); + + BoUpSLP::ValueList &Gathers = R.getGatherSeqInstructions(); + for (BoUpSLP::ValueList::iterator it = Gathers.begin(), e = Gathers.end(); + it != e; ++it) { + InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it); + + // The InsertElement sequence can be simplified into a constant. + if (!Insert) + continue; + + // If the vector or the element that we insert into it are + // instructions that are defined in this basic block then we can't + // hoist this instruction. + Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0)); + Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1)); + if (CurrVec && L->contains(CurrVec)) continue; + if (NewElem && L->contains(NewElem)) continue; + + // We can hoist this instruction. Move it to the pre-header. + Insert->moveBefore(Location); + } +} + +} // end anonymous namespace + +char SLPVectorizer::ID = 0; +static const char lv_name[] = "SLP Vectorizer"; +INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false) + +namespace llvm { + Pass *createSLPVectorizerPass() { + return new SLPVectorizer(); + } +} + diff --git a/lib/Transforms/Vectorize/VecUtils.cpp b/lib/Transforms/Vectorize/VecUtils.cpp new file mode 100644 index 0000000..9b94366 --- /dev/null +++ b/lib/Transforms/Vectorize/VecUtils.cpp @@ -0,0 +1,730 @@ +//===- VecUtils.cpp --- Vectorization Utilities ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "SLP" + +#include "VecUtils.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/Verifier.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" +#include <algorithm> +#include <map> + +using namespace llvm; + +static const unsigned MinVecRegSize = 128; + +static const unsigned RecursionMaxDepth = 6; + +namespace llvm { + +BoUpSLP::BoUpSLP(BasicBlock *Bb, ScalarEvolution *S, DataLayout *Dl, + TargetTransformInfo *Tti, AliasAnalysis *Aa, Loop *Lp) : + BB(Bb), SE(S), DL(Dl), TTI(Tti), AA(Aa), L(Lp) { + numberInstructions(); +} + +void BoUpSLP::numberInstructions() { + int Loc = 0; + InstrIdx.clear(); + InstrVec.clear(); + // Number the instructions in the block. + for (BasicBlock::iterator it=BB->begin(), e=BB->end(); it != e; ++it) { + InstrIdx[it] = Loc++; + InstrVec.push_back(it); + assert(InstrVec[InstrIdx[it]] == it && "Invalid allocation"); + } +} + +Value *BoUpSLP::getPointerOperand(Value *I) { + if (LoadInst *LI = dyn_cast<LoadInst>(I)) return LI->getPointerOperand(); + if (StoreInst *SI = dyn_cast<StoreInst>(I)) return SI->getPointerOperand(); + return 0; +} + +unsigned BoUpSLP::getAddressSpaceOperand(Value *I) { + if (LoadInst *L=dyn_cast<LoadInst>(I)) return L->getPointerAddressSpace(); + if (StoreInst *S=dyn_cast<StoreInst>(I)) return S->getPointerAddressSpace(); + return -1; +} + +bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) { + Value *PtrA = getPointerOperand(A); + Value *PtrB = getPointerOperand(B); + unsigned ASA = getAddressSpaceOperand(A); + unsigned ASB = getAddressSpaceOperand(B); + + // Check that the address spaces match and that the pointers are valid. + if (!PtrA || !PtrB || (ASA != ASB)) return false; + + // Check that A and B are of the same type. + if (PtrA->getType() != PtrB->getType()) return false; + + // Calculate the distance. + const SCEV *PtrSCEVA = SE->getSCEV(PtrA); + const SCEV *PtrSCEVB = SE->getSCEV(PtrB); + const SCEV *OffsetSCEV = SE->getMinusSCEV(PtrSCEVA, PtrSCEVB); + const SCEVConstant *ConstOffSCEV = dyn_cast<SCEVConstant>(OffsetSCEV); + + // Non constant distance. + if (!ConstOffSCEV) return false; + + int64_t Offset = ConstOffSCEV->getValue()->getSExtValue(); + Type *Ty = cast<PointerType>(PtrA->getType())->getElementType(); + // The Instructions are connsecutive if the size of the first load/store is + // the same as the offset. + int64_t Sz = DL->getTypeStoreSize(Ty); + return ((-Offset) == Sz); +} + +bool BoUpSLP::vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold) { + Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType(); + unsigned Sz = DL->getTypeSizeInBits(StoreTy); + unsigned VF = MinVecRegSize / Sz; + + if (!isPowerOf2_32(Sz) || VF < 2) return false; + + bool Changed = false; + // Look for profitable vectorizable trees at all offsets, starting at zero. + for (unsigned i = 0, e = Chain.size(); i < e; ++i) { + if (i + VF > e) return Changed; + DEBUG(dbgs()<<"SLP: Analyzing " << VF << " stores at offset "<< i << "\n"); + ArrayRef<Value *> Operands = Chain.slice(i, VF); + + int Cost = getTreeCost(Operands); + DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); + if (Cost < CostThreshold) { + DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); + vectorizeTree(Operands, VF); + i += VF - 1; + Changed = true; + } + } + + return Changed; +} + +bool BoUpSLP::vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold) { + ValueSet Heads, Tails; + SmallDenseMap<Value*, Value*> ConsecutiveChain; + + // We may run into multiple chains that merge into a single chain. We mark the + // stores that we vectorized so that we don't visit the same store twice. + ValueSet VectorizedStores; + bool Changed = false; + + // Do a quadratic search on all of the given stores and find + // all of the pairs of loads that follow each other. + for (unsigned i = 0, e = Stores.size(); i < e; ++i) + for (unsigned j = 0; j < e; ++j) { + if (i == j) continue; + if (isConsecutiveAccess(Stores[i], Stores[j])) { + Tails.insert(Stores[j]); + Heads.insert(Stores[i]); + ConsecutiveChain[Stores[i]] = Stores[j]; + } + } + + // For stores that start but don't end a link in the chain: + for (ValueSet::iterator it = Heads.begin(), e = Heads.end();it != e; ++it) { + if (Tails.count(*it)) continue; + + // We found a store instr that starts a chain. Now follow the chain and try + // to vectorize it. + ValueList Operands; + Value *I = *it; + // Collect the chain into a list. + while (Tails.count(I) || Heads.count(I)) { + if (VectorizedStores.count(I)) break; + Operands.push_back(I); + // Move to the next value in the chain. + I = ConsecutiveChain[I]; + } + + bool Vectorized = vectorizeStoreChain(Operands, costThreshold); + + // Mark the vectorized stores so that we don't vectorize them again. + if (Vectorized) + VectorizedStores.insert(Operands.begin(), Operands.end()); + Changed |= Vectorized; + } + + return Changed; +} + +int BoUpSLP::getScalarizationCost(ArrayRef<Value *> VL) { + // Find the type of the operands in VL. + Type *ScalarTy = VL[0]->getType(); + if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) + ScalarTy = SI->getValueOperand()->getType(); + VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + // Find the cost of inserting/extracting values from the vector. + return getScalarizationCost(VecTy); +} + +int BoUpSLP::getScalarizationCost(Type *Ty) { + int Cost = 0; + for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i) + Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); + return Cost; +} + +AliasAnalysis::Location BoUpSLP::getLocation(Instruction *I) { + if (StoreInst *SI = dyn_cast<StoreInst>(I)) return AA->getLocation(SI); + if (LoadInst *LI = dyn_cast<LoadInst>(I)) return AA->getLocation(LI); + return AliasAnalysis::Location(); +} + +Value *BoUpSLP::isUnsafeToSink(Instruction *Src, Instruction *Dst) { + assert(Src->getParent() == Dst->getParent() && "Not the same BB"); + BasicBlock::iterator I = Src, E = Dst; + /// Scan all of the instruction from SRC to DST and check if + /// the source may alias. + for (++I; I != E; ++I) { + // Ignore store instructions that are marked as 'ignore'. + if (MemBarrierIgnoreList.count(I)) continue; + if (Src->mayWriteToMemory()) /* Write */ { + if (!I->mayReadOrWriteMemory()) continue; + } else /* Read */ { + if (!I->mayWriteToMemory()) continue; + } + AliasAnalysis::Location A = getLocation(&*I); + AliasAnalysis::Location B = getLocation(Src); + + if (!A.Ptr || !B.Ptr || AA->alias(A, B)) + return I; + } + return 0; +} + +void BoUpSLP::vectorizeArith(ArrayRef<Value *> Operands) { + Value *Vec = vectorizeTree(Operands, Operands.size()); + BasicBlock::iterator Loc = cast<Instruction>(Vec); + IRBuilder<> Builder(++Loc); + // After vectorizing the operands we need to generate extractelement + // instructions and replace all of the uses of the scalar values with + // the values that we extracted from the vectorized tree. + for (unsigned i = 0, e = Operands.size(); i != e; ++i) { + Value *S = Builder.CreateExtractElement(Vec, Builder.getInt32(i)); + Operands[i]->replaceAllUsesWith(S); + } +} + +int BoUpSLP::getTreeCost(ArrayRef<Value *> VL) { + // Get rid of the list of stores that were removed, and from the + // lists of instructions with multiple users. + MemBarrierIgnoreList.clear(); + LaneMap.clear(); + MultiUserVals.clear(); + MustScalarize.clear(); + + // Scan the tree and find which value is used by which lane, and which values + // must be scalarized. + getTreeUses_rec(VL, 0); + + // Check that instructions with multiple users can be vectorized. Mark unsafe + // instructions. + for (ValueSet::iterator it = MultiUserVals.begin(), + e = MultiUserVals.end(); it != e; ++it) { + // Check that all of the users of this instr are within the tree + // and that they are all from the same lane. + int Lane = -1; + for (Value::use_iterator I = (*it)->use_begin(), E = (*it)->use_end(); + I != E; ++I) { + if (LaneMap.find(*I) == LaneMap.end()) { + MustScalarize.insert(*it); + DEBUG(dbgs()<<"SLP: Adding " << **it << + " to MustScalarize because of an out of tree usage.\n"); + break; + } + if (Lane == -1) Lane = LaneMap[*I]; + if (Lane != LaneMap[*I]) { + MustScalarize.insert(*it); + DEBUG(dbgs()<<"Adding " << **it << + " to MustScalarize because multiple lane use it: " + << Lane << " and " << LaneMap[*I] << ".\n"); + break; + } + } + } + + // Now calculate the cost of vectorizing the tree. + return getTreeCost_rec(VL, 0); +} + +void BoUpSLP::getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth) { + if (Depth == RecursionMaxDepth) return; + + // Don't handle vectors. + if (VL[0]->getType()->isVectorTy()) return; + if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) + if (SI->getValueOperand()->getType()->isVectorTy()) return; + + // Check if all of the operands are constants. + bool AllConst = true; + bool AllSameScalar = true; + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + AllConst &= isa<Constant>(VL[i]); + AllSameScalar &= (VL[0] == VL[i]); + Instruction *I = dyn_cast<Instruction>(VL[i]); + // If one of the instructions is out of this BB, we need to scalarize all. + if (I && I->getParent() != BB) return; + } + + // If all of the operands are identical or constant we have a simple solution. + if (AllConst || AllSameScalar) return; + + // Scalarize unknown structures. + Instruction *VL0 = dyn_cast<Instruction>(VL[0]); + if (!VL0) return; + + unsigned Opcode = VL0->getOpcode(); + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + Instruction *I = dyn_cast<Instruction>(VL[i]); + // If not all of the instructions are identical then we have to scalarize. + if (!I || Opcode != I->getOpcode()) return; + } + + // Mark instructions with multiple users. + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + Instruction *I = dyn_cast<Instruction>(VL[i]); + // Remember to check if all of the users of this instr are vectorized + // within our tree. + if (I && I->getNumUses() > 1) MultiUserVals.insert(I); + } + + for (int i = 0, e = VL.size(); i < e; ++i) { + // Check that the instruction is only used within + // one lane. + if (LaneMap.count(VL[i]) && LaneMap[VL[i]] != i) return; + // Make this instruction as 'seen' and remember the lane. + LaneMap[VL[i]] = i; + } + + switch (Opcode) { + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); + + getTreeUses_rec(Operands, Depth+1); + } + return; + } + case Instruction::Store: { + ValueList Operands; + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast<Instruction>(VL[j])->getOperand(0)); + getTreeUses_rec(Operands, Depth+1); + return; + } + default: + return; + } +} + +int BoUpSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) { + Type *ScalarTy = VL[0]->getType(); + + if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) + ScalarTy = SI->getValueOperand()->getType(); + + /// Don't mess with vectors. + if (ScalarTy->isVectorTy()) return max_cost; + VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + + if (Depth == RecursionMaxDepth) return getScalarizationCost(VecTy); + + // Check if all of the operands are constants. + bool AllConst = true; + bool AllSameScalar = true; + bool MustScalarizeFlag = false; + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + AllConst &= isa<Constant>(VL[i]); + AllSameScalar &= (VL[0] == VL[i]); + // Must have a single use. + Instruction *I = dyn_cast<Instruction>(VL[i]); + MustScalarizeFlag |= MustScalarize.count(VL[i]); + // This instruction is outside the basic block. + if (I && I->getParent() != BB) + return getScalarizationCost(VecTy); + } + + // Is this a simple vector constant. + if (AllConst) return 0; + + // If all of the operands are identical we can broadcast them. + Instruction *VL0 = dyn_cast<Instruction>(VL[0]); + if (AllSameScalar) { + // If we are in a loop, and this is not an instruction (e.g. constant or + // argument) or the instruction is defined outside the loop then assume + // that the cost is zero. + if (L && (!VL0 || !L->contains(VL0))) + return 0; + + // We need to broadcast the scalar. + return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); + } + + // If this is not a constant, or a scalar from outside the loop then we + // need to scalarize it. + if (MustScalarizeFlag) + return getScalarizationCost(VecTy); + + if (!VL0) return getScalarizationCost(VecTy); + assert(VL0->getParent() == BB && "Wrong BB"); + + unsigned Opcode = VL0->getOpcode(); + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + Instruction *I = dyn_cast<Instruction>(VL[i]); + // If not all of the instructions are identical then we have to scalarize. + if (!I || Opcode != I->getOpcode()) return getScalarizationCost(VecTy); + } + + // Check if it is safe to sink the loads or the stores. + if (Opcode == Instruction::Load || Opcode == Instruction::Store) { + int MaxIdx = InstrIdx[VL0]; + for (unsigned i = 1, e = VL.size(); i < e; ++i ) + MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]); + + Instruction *Last = InstrVec[MaxIdx]; + for (unsigned i = 0, e = VL.size(); i < e; ++i ) { + if (VL[i] == Last) continue; + Value *Barrier = isUnsafeToSink(cast<Instruction>(VL[i]), Last); + if (Barrier) { + DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " << + *Last << "\n because of " << *Barrier << "\n"); + return max_cost; + } + } + } + + switch (Opcode) { + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + int Cost = 0; + ValueList Operands; + Type *SrcTy = VL0->getOperand(0)->getType(); + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) { + Operands.push_back(cast<Instruction>(VL[j])->getOperand(0)); + // Check that the casted type is the same for all users. + if (cast<Instruction>(VL[j])->getOperand(0)->getType() != SrcTy) + return getScalarizationCost(VecTy); + } + + Cost += getTreeCost_rec(Operands, Depth+1); + if (Cost >= max_cost) return max_cost; + + // Calculate the cost of this instruction. + int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(), + VL0->getType(), SrcTy); + + VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); + int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy); + Cost += (VecCost - ScalarCost); + return Cost; + } + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + int Cost = 0; + // Calculate the cost of all of the operands. + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); + + Cost += getTreeCost_rec(Operands, Depth+1); + if (Cost >= max_cost) return max_cost; + } + + // Calculate the cost of this instruction. + int ScalarCost = VecTy->getNumElements() * + TTI->getArithmeticInstrCost(Opcode, ScalarTy); + + int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy); + Cost += (VecCost - ScalarCost); + return Cost; + } + case Instruction::Load: { + // If we are scalarize the loads, add the cost of forming the vector. + for (unsigned i = 0, e = VL.size()-1; i < e; ++i) + if (!isConsecutiveAccess(VL[i], VL[i+1])) + return getScalarizationCost(VecTy); + + // Cost of wide load - cost of scalar loads. + int ScalarLdCost = VecTy->getNumElements() * + TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); + int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); + return VecLdCost - ScalarLdCost; + } + case Instruction::Store: { + // We know that we can merge the stores. Calculate the cost. + int ScalarStCost = VecTy->getNumElements() * + TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0); + int VecStCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1,0); + int StoreCost = VecStCost - ScalarStCost; + + ValueList Operands; + for (unsigned j = 0; j < VL.size(); ++j) { + Operands.push_back(cast<Instruction>(VL[j])->getOperand(0)); + MemBarrierIgnoreList.insert(VL[j]); + } + + int TotalCost = StoreCost + getTreeCost_rec(Operands, Depth + 1); + return TotalCost; + } + default: + // Unable to vectorize unknown instructions. + return getScalarizationCost(VecTy); + } +} + +Instruction *BoUpSLP::GetLastInstr(ArrayRef<Value *> VL, unsigned VF) { + int MaxIdx = InstrIdx[BB->getFirstNonPHI()]; + for (unsigned i = 0; i < VF; ++i ) + MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]); + return InstrVec[MaxIdx + 1]; +} + +Value *BoUpSLP::Scalarize(ArrayRef<Value *> VL, VectorType *Ty) { + IRBuilder<> Builder(GetLastInstr(VL, Ty->getNumElements())); + Value *Vec = UndefValue::get(Ty); + for (unsigned i=0; i < Ty->getNumElements(); ++i) { + // Generate the 'InsertElement' instruction. + Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i)); + // Remember that this instruction is used as part of a 'gather' sequence. + // The caller of the bottom-up slp vectorizer can try to hoist the sequence + // if the users are outside of the basic block. + GatherInstructions.push_back(Vec); + } + + return Vec; +} + +Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, int VF) { + Value *V = vectorizeTree_rec(VL, VF); + // We moved some instructions around. We have to number them again + // before we can do any analysis. + numberInstructions(); + MustScalarize.clear(); + return V; +} + +Value *BoUpSLP::vectorizeTree_rec(ArrayRef<Value *> VL, int VF) { + Type *ScalarTy = VL[0]->getType(); + if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) + ScalarTy = SI->getValueOperand()->getType(); + VectorType *VecTy = VectorType::get(ScalarTy, VF); + + // Check if all of the operands are constants or identical. + bool AllConst = true; + bool AllSameScalar = true; + for (unsigned i = 0, e = VF; i < e; ++i) { + AllConst &= isa<Constant>(VL[i]); + AllSameScalar &= (VL[0] == VL[i]); + // The instruction must be in the same BB, and it must be vectorizable. + Instruction *I = dyn_cast<Instruction>(VL[i]); + if (MustScalarize.count(VL[i]) || (I && I->getParent() != BB)) + return Scalarize(VL, VecTy); + } + + // Check that this is a simple vector constant. + if (AllConst || AllSameScalar) return Scalarize(VL, VecTy); + + // Scalarize unknown structures. + Instruction *VL0 = dyn_cast<Instruction>(VL[0]); + if (!VL0) return Scalarize(VL, VecTy); + + if (VectorizedValues.count(VL0)) return VectorizedValues[VL0]; + + unsigned Opcode = VL0->getOpcode(); + for (unsigned i = 0, e = VF; i < e; ++i) { + Instruction *I = dyn_cast<Instruction>(VL[i]); + // If not all of the instructions are identical then we have to scalarize. + if (!I || Opcode != I->getOpcode()) return Scalarize(VL, VecTy); + } + + switch (Opcode) { + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + ValueList INVL; + for (int i = 0; i < VF; ++i) + INVL.push_back(cast<Instruction>(VL[i])->getOperand(0)); + Value *InVec = vectorizeTree_rec(INVL, VF); + IRBuilder<> Builder(GetLastInstr(VL, VF)); + CastInst *CI = dyn_cast<CastInst>(VL0); + Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); + VectorizedValues[VL0] = V; + return V; + } + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + ValueList LHSVL, RHSVL; + for (int i = 0; i < VF; ++i) { + RHSVL.push_back(cast<Instruction>(VL[i])->getOperand(0)); + LHSVL.push_back(cast<Instruction>(VL[i])->getOperand(1)); + } + + Value *RHS = vectorizeTree_rec(RHSVL, VF); + Value *LHS = vectorizeTree_rec(LHSVL, VF); + IRBuilder<> Builder(GetLastInstr(VL, VF)); + BinaryOperator *BinOp = cast<BinaryOperator>(VL0); + Value *V = Builder.CreateBinOp(BinOp->getOpcode(), RHS,LHS); + VectorizedValues[VL0] = V; + return V; + } + case Instruction::Load: { + LoadInst *LI = cast<LoadInst>(VL0); + unsigned Alignment = LI->getAlignment(); + + // Check if all of the loads are consecutive. + for (unsigned i = 1, e = VF; i < e; ++i) + if (!isConsecutiveAccess(VL[i-1], VL[i])) + return Scalarize(VL, VecTy); + + IRBuilder<> Builder(GetLastInstr(VL, VF)); + Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(), + VecTy->getPointerTo()); + LI = Builder.CreateLoad(VecPtr); + LI->setAlignment(Alignment); + VectorizedValues[VL0] = LI; + return LI; + } + case Instruction::Store: { + StoreInst *SI = cast<StoreInst>(VL0); + unsigned Alignment = SI->getAlignment(); + + ValueList ValueOp; + for (int i = 0; i < VF; ++i) + ValueOp.push_back(cast<StoreInst>(VL[i])->getValueOperand()); + + Value *VecValue = vectorizeTree_rec(ValueOp, VF); + + IRBuilder<> Builder(GetLastInstr(VL, VF)); + Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(), + VecTy->getPointerTo()); + Builder.CreateStore(VecValue, VecPtr)->setAlignment(Alignment); + + for (int i = 0; i < VF; ++i) + cast<Instruction>(VL[i])->eraseFromParent(); + return 0; + } + default: + Value *S = Scalarize(VL, VecTy); + VectorizedValues[VL0] = S; + return S; + } +} + +} // end of namespace diff --git a/lib/Transforms/Vectorize/VecUtils.h b/lib/Transforms/Vectorize/VecUtils.h new file mode 100644 index 0000000..5456c6c --- /dev/null +++ b/lib/Transforms/Vectorize/VecUtils.h @@ -0,0 +1,164 @@ +//===- VecUtils.h - Vectorization Utilities -------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This family of classes and functions manipulate vectors and chains of +// vectors. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H +#define LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include <vector> + +namespace llvm { + +class BasicBlock; class Instruction; class Type; +class VectorType; class StoreInst; class Value; +class ScalarEvolution; class DataLayout; +class TargetTransformInfo; class AliasAnalysis; +class Loop; + +/// Bottom Up SLP vectorization utility class. +struct BoUpSLP { + typedef SmallVector<Value*, 8> ValueList; + typedef SmallPtrSet<Value*, 16> ValueSet; + typedef SmallVector<StoreInst*, 8> StoreList; + static const int max_cost = 1<<20; + + // \brief C'tor. + BoUpSLP(BasicBlock *Bb, ScalarEvolution *Se, DataLayout *Dl, + TargetTransformInfo *Tti, AliasAnalysis *Aa, Loop *Lp); + + /// \brief Take the pointer operand from the Load/Store instruction. + /// \returns NULL if this is not a valid Load/Store instruction. + static Value *getPointerOperand(Value *I); + + /// \brief Take the address space operand from the Load/Store instruction. + /// \returns -1 if this is not a valid Load/Store instruction. + static unsigned getAddressSpaceOperand(Value *I); + + /// \returns true if the memory operations A and B are consecutive. + bool isConsecutiveAccess(Value *A, Value *B); + + /// \brief Vectorize the tree that starts with the elements in \p VL. + /// \returns the vectorized value. + Value *vectorizeTree(ArrayRef<Value *> VL, int VF); + + /// \returns the vectorization cost of the subtree that starts at \p VL. + /// A negative number means that this is profitable. + int getTreeCost(ArrayRef<Value *> VL); + + /// \returns the scalarization cost for this list of values. Assuming that + /// this subtree gets vectorized, we may need to extract the values from the + /// roots. This method calculates the cost of extracting the values. + int getScalarizationCost(ArrayRef<Value *> VL); + + /// \brief Attempts to order and vectorize a sequence of stores. This + /// function does a quadratic scan of the given stores. + /// \returns true if the basic block was modified. + bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold); + + /// \brief Vectorize a group of scalars into a vector tree. + void vectorizeArith(ArrayRef<Value *> Operands); + + /// \returns the list of new instructions that were added in order to collect + /// scalars into vectors. This list can be used to further optimize the gather + /// sequences. + ValueList &getGatherSeqInstructions() {return GatherInstructions; } + +private: + /// \brief This method contains the recursive part of getTreeCost. + int getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth); + + /// \brief This recursive method looks for vectorization hazards such as + /// values that are used by multiple users and checks that values are used + /// by only one vector lane. It updates the variables LaneMap, MultiUserVals. + void getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth); + + /// \brief This method contains the recursive part of vectorizeTree. + Value *vectorizeTree_rec(ArrayRef<Value *> VL, int VF); + + /// \brief Number all of the instructions in the block. + void numberInstructions(); + + /// \brief Vectorize a sorted sequence of stores. + bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold); + + /// \returns the scalarization cost for this type. Scalarization in this + /// context means the creation of vectors from a group of scalars. + int getScalarizationCost(Type *Ty); + + /// \returns the AA location that is being access by the instruction. + AliasAnalysis::Location getLocation(Instruction *I); + + /// \brief Checks if it is possible to sink an instruction from + /// \p Src to \p Dst. + /// \returns the pointer to the barrier instruction if we can't sink. + Value *isUnsafeToSink(Instruction *Src, Instruction *Dst); + + /// \returns the instruction that appears last in the BB from \p VL. + /// Only consider the first \p VF elements. + Instruction *GetLastInstr(ArrayRef<Value *> VL, unsigned VF); + + /// \returns a vector from a collection of scalars in \p VL. + Value *Scalarize(ArrayRef<Value *> VL, VectorType *Ty); + +private: + /// Maps instructions to numbers and back. + SmallDenseMap<Value*, int> InstrIdx; + /// Maps integers to Instructions. + std::vector<Instruction*> InstrVec; + + // -- containers that are used during getTreeCost -- // + + /// Contains values that must be scalarized because they are used + /// by multiple lanes, or by users outside the tree. + /// NOTICE: The vectorization methods also use this set. + ValueSet MustScalarize; + + /// Contains a list of values that are used outside the current tree. This + /// set must be reset between runs. + ValueSet MultiUserVals; + /// Maps values in the tree to the vector lanes that uses them. This map must + /// be reset between runs of getCost. + std::map<Value*, int> LaneMap; + /// A list of instructions to ignore while sinking + /// memory instructions. This map must be reset between runs of getCost. + SmallPtrSet<Value *, 8> MemBarrierIgnoreList; + + // -- Containers that are used during vectorizeTree -- // + + /// Maps between the first scalar to the vector. This map must be reset + ///between runs. + DenseMap<Value*, Value*> VectorizedValues; + + // -- Containers that are used after vectorization by the caller -- // + + /// A list of instructions that are used when gathering scalars into vectors. + /// In many cases these instructions can be hoisted outside of the BB. + /// Iterating over this list is faster than calling LICM. + ValueList GatherInstructions; + + // Analysis and block reference. + BasicBlock *BB; + ScalarEvolution *SE; + DataLayout *DL; + TargetTransformInfo *TTI; + AliasAnalysis *AA; + Loop *L; +}; + +} // end of namespace + +#endif // LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H diff --git a/lib/Transforms/Vectorize/Vectorize.cpp b/lib/Transforms/Vectorize/Vectorize.cpp index 19eefd2..a927fe1 100644 --- a/lib/Transforms/Vectorize/Vectorize.cpp +++ b/lib/Transforms/Vectorize/Vectorize.cpp @@ -1,4 +1,4 @@ - //===-- Vectorize.cpp -----------------------------------------------------===// +//===-- Vectorize.cpp -----------------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -28,6 +28,7 @@ using namespace llvm; void llvm::initializeVectorization(PassRegistry &Registry) { initializeBBVectorizePass(Registry); initializeLoopVectorizePass(Registry); + initializeSLPVectorizerPass(Registry); } void LLVMInitializeVectorization(LLVMPassRegistryRef R) { @@ -41,3 +42,7 @@ void LLVMAddBBVectorizePass(LLVMPassManagerRef PM) { void LLVMAddLoopVectorizePass(LLVMPassManagerRef PM) { unwrap(PM)->add(createLoopVectorizePass()); } + +void LLVMAddSLPVectorizePass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createSLPVectorizerPass()); +} |
