author     Stephen Hines <srhines@google.com>        2014-02-11 20:01:10 -0800
committer  Stephen Hines <srhines@google.com>        2014-02-11 20:01:10 -0800
commit     ce9904c6ea8fd669978a8eefb854b330eb9828ff (patch)
tree       2418ee2e96ea220977c8fb74959192036ab5b133 /lib/Transforms/Vectorize/LoopVectorize.cpp
parent     c27b10b198c1d9e9b51f2303994313ec2778edd7 (diff)
parent     dbb832b83351cec97b025b61c26536ef50c3181c (diff)
Merge remote-tracking branch 'upstream/release_34' into merge-20140211
Conflicts:
lib/Linker/LinkModules.cpp
lib/Support/Unix/Signals.inc
Change-Id: Ia54f291fa5dc828052d2412736e8495c1282aa64
Diffstat (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r--  lib/Transforms/Vectorize/LoopVectorize.cpp  1119
1 file changed, 792 insertions, 327 deletions
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index a62fedc..5e75871 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -48,6 +48,7 @@ #include "llvm/Transforms/Vectorize.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/EquivalenceClasses.h" +#include "llvm/ADT/Hashing.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" @@ -126,6 +127,9 @@ static const unsigned MaxVectorWidth = 64; /// Maximum vectorization unroll count. static const unsigned MaxUnrollFactor = 16; +/// The cost of a loop that is considered 'small' by the unroller. +static const unsigned SmallLoopCost = 20; + namespace { // Forward declarations. @@ -167,7 +171,9 @@ public: updateAnalysis(); } -private: + virtual ~InnerLoopVectorizer() {} + +protected: /// A small list of PHINodes. typedef SmallVector<PHINode*, 4> PhiVector; /// When we unroll loops we have multiple vector values for each scalar. @@ -187,7 +193,13 @@ private: /// Create an empty loop, based on the loop ranges of the old loop. void createEmptyLoop(LoopVectorizationLegality *Legal); /// Copy and widen the instructions from the old loop. - void vectorizeLoop(LoopVectorizationLegality *Legal); + virtual void vectorizeLoop(LoopVectorizationLegality *Legal); + + /// \brief The Loop exit block may have single value PHI nodes where the + /// incoming value is 'Undef'. While vectorizing we only handled real values + /// that were defined inside the loop. Here we fix the 'undef case'. + /// See PR14725. + void fixLCSSAPHIs(); /// A helper function that computes the predicate of the block BB, assuming /// that the header block of the loop is set to True. It returns the *entry* @@ -201,16 +213,23 @@ private: void vectorizeBlockInLoop(LoopVectorizationLegality *Legal, BasicBlock *BB, PhiVector *PV); + /// Vectorize a single PHINode in a block. This method handles the induction + /// variable canonicalization. It supports both VF = 1 for unrolled loops and + /// arbitrary length vectors. + void widenPHIInstruction(Instruction *PN, VectorParts &Entry, + LoopVectorizationLegality *Legal, + unsigned UF, unsigned VF, PhiVector *PV); + /// Insert the new loop to the loop hierarchy and pass manager /// and update the analysis passes. void updateAnalysis(); /// This instruction is un-vectorizable. Implement it as a sequence /// of scalars. - void scalarizeInstruction(Instruction *Instr); + virtual void scalarizeInstruction(Instruction *Instr); /// Vectorize Load and Store instructions, - void vectorizeMemoryInstruction(Instruction *Instr, + virtual void vectorizeMemoryInstruction(Instruction *Instr, LoopVectorizationLegality *Legal); /// Create a broadcast instruction. This method generates a broadcast @@ -218,12 +237,12 @@ private: /// value. If this is the induction variable then we extend it to N, N+1, ... /// this is needed because each iteration in the loop corresponds to a SIMD /// element. - Value *getBroadcastInstrs(Value *V); + virtual Value *getBroadcastInstrs(Value *V); /// This function adds 0, 1, 2 ... to each vector element, starting at zero. /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...). /// The sequence starts at StartIndex. 
- Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate); + virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate); /// When we go over instructions in the basic block we rely on previous /// values within the current basic block or on loop invariant values. @@ -233,7 +252,7 @@ private: VectorParts &getVectorValue(Value *V); /// Generate a shuffle sequence that will reverse the vector Vec. - Value *reverseVector(Value *Vec); + virtual Value *reverseVector(Value *Vec); /// This is a helper class that holds the vectorizer state. It maps scalar /// instructions to vector instructions. When the code is 'unrolled' then @@ -291,6 +310,8 @@ private: /// The vectorization SIMD factor to use. Each vector will have this many /// vector elements. unsigned VF; + +protected: /// The vectorization unroll factor to use. Each scalar is vectorized to this /// many different vector instructions. unsigned UF; @@ -326,6 +347,22 @@ private: EdgeMaskCache MaskCache; }; +class InnerLoopUnroller : public InnerLoopVectorizer { +public: + InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI, + DominatorTree *DT, DataLayout *DL, + const TargetLibraryInfo *TLI, unsigned UnrollFactor) : + InnerLoopVectorizer(OrigLoop, SE, LI, DT, DL, TLI, 1, UnrollFactor) { } + +private: + virtual void scalarizeInstruction(Instruction *Instr); + virtual void vectorizeMemoryInstruction(Instruction *Instr, + LoopVectorizationLegality *Legal); + virtual Value *getBroadcastInstrs(Value *V); + virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate); + virtual Value *reverseVector(Value *Vec); +}; + /// \brief Look for a meaningful debug location on the instruction or it's /// operands. static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { @@ -409,7 +446,7 @@ public: MRK_FloatMax }; - /// This POD struct holds information about reduction variables. + /// This struct holds information about reduction variables. struct ReductionDescriptor { ReductionDescriptor() : StartValue(0), LoopExitInstr(0), Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {} @@ -446,8 +483,8 @@ public: MinMaxReductionKind MinMaxKind; }; - // This POD struct holds information about the memory runtime legality - // check that a group of pointers do not overlap. + /// This struct holds information about the memory runtime legality + /// check that a group of pointers do not overlap. struct RuntimePointerCheck { RuntimePointerCheck() : Need(false) {} @@ -457,6 +494,8 @@ public: Pointers.clear(); Starts.clear(); Ends.clear(); + IsWritePtr.clear(); + DependencySetId.clear(); } /// Insert a pointer and calculate the start and end SCEVs. @@ -478,7 +517,7 @@ public: SmallVector<unsigned, 2> DependencySetId; }; - /// A POD for saving information about induction variables. + /// A struct for saving information about induction variables. struct InductionInfo { InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {} InductionInfo() : StartValue(0), IK(IK_NoInduction) {} @@ -725,9 +764,9 @@ struct LoopVectorizeHints { /// Vectorization unroll factor. unsigned Unroll; - LoopVectorizeHints(const Loop *L) + LoopVectorizeHints(const Loop *L, bool DisableUnrolling) : Width(VectorizationFactor) - , Unroll(VectorizationUnroll) + , Unroll(DisableUnrolling ? 
1 : VectorizationUnroll) , LoopID(L->getLoopID()) { getHints(L); // The command line options override any loop metadata except for when @@ -736,6 +775,9 @@ struct LoopVectorizeHints { Width = VectorizationFactor; if (VectorizationUnroll.getNumOccurrences() > 0) Unroll = VectorizationUnroll; + + DEBUG(if (DisableUnrolling && Unroll == 1) + dbgs() << "LV: Unrolling disabled by the pass manager\n"); } /// Return the loop vectorizer metadata prefix. @@ -762,6 +804,7 @@ struct LoopVectorizeHints { Vals.push_back(LoopID->getOperand(i)); Vals.push_back(createHint(Context, Twine(Prefix(), "width").str(), Width)); + Vals.push_back(createHint(Context, Twine(Prefix(), "unroll").str(), 1)); MDNode *NewLoopID = MDNode::get(Context, Vals); // Set operand 0 to refer to the loop id itself. @@ -825,15 +868,18 @@ private: unsigned Val = C->getZExtValue(); if (Hint == "width") { - assert(isPowerOf2_32(Val) && Val <= MaxVectorWidth && - "Invalid width metadata"); - Width = Val; + if (isPowerOf2_32(Val) && Val <= MaxVectorWidth) + Width = Val; + else + DEBUG(dbgs() << "LV: ignoring invalid width hint metadata\n"); } else if (Hint == "unroll") { - assert(isPowerOf2_32(Val) && Val <= MaxUnrollFactor && - "Invalid unroll metadata"); - Unroll = Val; - } else - DEBUG(dbgs() << "LV: ignoring unknown hint " << Hint); + if (isPowerOf2_32(Val) && Val <= MaxUnrollFactor) + Unroll = Val; + else + DEBUG(dbgs() << "LV: ignoring invalid unroll hint metadata\n"); + } else { + DEBUG(dbgs() << "LV: ignoring unknown hint " << Hint << '\n'); + } } }; @@ -842,7 +888,8 @@ struct LoopVectorize : public LoopPass { /// Pass identification, replacement for typeid static char ID; - explicit LoopVectorize() : LoopPass(ID) { + explicit LoopVectorize(bool NoUnrolling = false) + : LoopPass(ID), DisableUnrolling(NoUnrolling) { initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); } @@ -852,6 +899,7 @@ struct LoopVectorize : public LoopPass { TargetTransformInfo *TTI; DominatorTree *DT; TargetLibraryInfo *TLI; + bool DisableUnrolling; virtual bool runOnLoop(Loop *L, LPPassManager &LPM) { // We only vectorize innermost loops. @@ -865,17 +913,22 @@ struct LoopVectorize : public LoopPass { DT = &getAnalysis<DominatorTree>(); TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); + // If the target claims to have no vector registers don't attempt + // vectorization. + if (!TTI->getNumberOfRegisters(true)) + return false; + if (DL == NULL) { - DEBUG(dbgs() << "LV: Not vectorizing because of missing data layout"); + DEBUG(dbgs() << "LV: Not vectorizing because of missing data layout\n"); return false; } DEBUG(dbgs() << "LV: Checking a loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); - LoopVectorizeHints Hints(L); + LoopVectorizeHints Hints(L, DisableUnrolling); - if (Hints.Width == 1) { + if (Hints.Width == 1 && Hints.Unroll == 1) { DEBUG(dbgs() << "LV: Not vectorizing.\n"); return false; } @@ -912,19 +965,23 @@ struct LoopVectorize : public LoopPass { unsigned UF = CM.selectUnrollFactor(OptForSize, Hints.Unroll, VF.Width, VF.Cost); + DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF.Width << ") in "<< + F->getParent()->getModuleIdentifier() << '\n'); + DEBUG(dbgs() << "LV: Unroll Factor is " << UF << '\n'); + if (VF.Width == 1) { DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); - return false; + if (UF == 1) + return false; + // We decided not to vectorize, but we may want to unroll. 
+ InnerLoopUnroller Unroller(L, SE, LI, DT, DL, TLI, UF); + Unroller.vectorize(&LVL); + } else { + // If we decided that it is *legal* to vectorize the loop then do it. + InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF); + LB.vectorize(&LVL); } - DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF.Width << ") in "<< - F->getParent()->getModuleIdentifier()<<"\n"); - DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n"); - - // If we decided that it is *legal* to vectorize the loop then do it. - InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF); - LB.vectorize(&LVL); - // Mark the loop as already vectorized to avoid vectorizing again. Hints.setAlreadyVectorized(L); @@ -971,25 +1028,19 @@ LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE, } Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { - // Save the current insertion location. - Instruction *Loc = Builder.GetInsertPoint(); - // We need to place the broadcast of invariant variables outside the loop. Instruction *Instr = dyn_cast<Instruction>(V); bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody); bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr; // Place the code for broadcasting invariant variables in the new preheader. + IRBuilder<>::InsertPointGuard Guard(Builder); if (Invariant) Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); // Broadcast the scalar into all locations in the vector. Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); - // Restore the builder insertion point. - if (Invariant) - Builder.SetInsertPoint(Loc); - return Shuf; } @@ -1016,10 +1067,35 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx, return Builder.CreateAdd(Val, Cv, "induction"); } +/// \brief Find the operand of the GEP that should be checked for consecutive +/// stores. This ignores trailing indices that have no effect on the final +/// pointer. +static unsigned getGEPInductionOperand(DataLayout *DL, + const GetElementPtrInst *Gep) { + unsigned LastOperand = Gep->getNumOperands() - 1; + unsigned GEPAllocSize = DL->getTypeAllocSize( + cast<PointerType>(Gep->getType()->getScalarType())->getElementType()); + + // Walk backwards and try to peel off zeros. + while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) { + // Find the type we're currently indexing into. + gep_type_iterator GEPTI = gep_type_begin(Gep); + std::advance(GEPTI, LastOperand - 1); + + // If it's a type with the same allocation size as the result of the GEP we + // can peel off the zero index. + if (DL->getTypeAllocSize(*GEPTI) != GEPAllocSize) + break; + --LastOperand; + } + + return LastOperand; +} + int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr"); // Make sure that the pointer does not point to structs. - if (cast<PointerType>(Ptr->getType())->getElementType()->isAggregateType()) + if (Ptr->getType()->getPointerElementType()->isAggregateType()) return 0; // If this value is a pointer induction variable we know it is consecutive. @@ -1037,8 +1113,6 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { return 0; unsigned NumOperands = Gep->getNumOperands(); - Value *LastIndex = Gep->getOperand(NumOperands - 1); - Value *GpPtr = Gep->getPointerOperand(); // If this GEP value is a consecutive pointer induction variable and all of // the indices are constant then we know it is consecutive. 
We can @@ -1062,14 +1136,18 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { return -1; } - // Check that all of the gep indices are uniform except for the last. - for (unsigned i = 0; i < NumOperands - 1; ++i) - if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) + unsigned InductionOperand = getGEPInductionOperand(DL, Gep); + + // Check that all of the gep indices are uniform except for our induction + // operand. + for (unsigned i = 0; i != NumOperands; ++i) + if (i != InductionOperand && + !SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) return 0; - // We can emit wide load/stores only if the last index is the induction - // variable. - const SCEV *Last = SE->getSCEV(LastIndex); + // We can emit wide load/stores only if the last non-zero index is the + // induction variable. + const SCEV *Last = SE->getSCEV(Gep->getOperand(InductionOperand)); if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) { const SCEV *Step = AR->getStepRecurrence(*SE); @@ -1127,6 +1205,10 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, Type *DataTy = VectorType::get(ScalarDataTy, VF); Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment(); + // An alignment of 0 means target abi alignment. We need to use the scalar's + // target abi alignment in such a case. + if (!Alignment) + Alignment = DL->getABITypeAlignment(ScalarDataTy); unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy); unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF; @@ -1166,7 +1248,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, // The last index does not have to be the induction. It can be // consecutive and be a function of the index. For example A[I+1]; unsigned NumOperands = Gep->getNumOperands(); - unsigned LastOperand = NumOperands - 1; + unsigned InductionOperand = getGEPInductionOperand(DL, Gep); // Create the new GEP with the new induction variable. GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); @@ -1175,9 +1257,9 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand); // Update last index or loop invariant instruction anchored in loop. - if (i == LastOperand || + if (i == InductionOperand || (GepOperandInst && OrigLoop->contains(GepOperandInst))) { - assert((i == LastOperand || + assert((i == InductionOperand || SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) && "Must be last index or loop invariant"); @@ -1301,7 +1383,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) { Instruction *Cloned = Instr->clone(); if (!IsVoidRetTy) Cloned->setName(Instr->getName() + ".cloned"); - // Replace the operands of the cloned instrucions with extracted scalars. + // Replace the operands of the cloned instructions with extracted scalars. for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { Value *Op = Params[op][Part]; // Param is a vector. Need to extract the right lane. @@ -1335,11 +1417,9 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, SmallVector<TrackingVH<Value> , 2> Starts; SmallVector<TrackingVH<Value> , 2> Ends; + LLVMContext &Ctx = Loc->getContext(); SCEVExpander Exp(*SE, "induction"); - // Use this type for pointer arithmetic. 
- Type* PtrArithTy = Type::getInt8PtrTy(Loc->getContext(), 0); - for (unsigned i = 0; i < NumPointers; ++i) { Value *Ptr = PtrRtCheck->Pointers[i]; const SCEV *Sc = SE->getSCEV(Ptr); @@ -1350,7 +1430,11 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, Starts.push_back(Ptr); Ends.push_back(Ptr); } else { - DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n"); + DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr << '\n'); + unsigned AS = Ptr->getType()->getPointerAddressSpace(); + + // Use this type for pointer arithmetic. + Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS); Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc); Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc); @@ -1372,10 +1456,20 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, if (PtrRtCheck->DependencySetId[i] == PtrRtCheck->DependencySetId[j]) continue; - Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy, "bc"); - Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy, "bc"); - Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy, "bc"); - Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy, "bc"); + unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace(); + unsigned AS1 = Starts[j]->getType()->getPointerAddressSpace(); + + assert((AS0 == Ends[j]->getType()->getPointerAddressSpace()) && + (AS1 == Ends[i]->getType()->getPointerAddressSpace()) && + "Trying to bounds check pointers with different address spaces"); + + Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0); + Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1); + + Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy0, "bc"); + Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy1, "bc"); + Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy1, "bc"); + Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy0, "bc"); Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0"); Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1"); @@ -1390,9 +1484,8 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, // We have to do this trickery because the IRBuilder might fold the check to a // constant expression in which case there is no Instruction anchored in a // the block. - LLVMContext &Ctx = Loc->getContext(); - Instruction * Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck, - ConstantInt::getTrue(Ctx)); + Instruction *Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck, + ConstantInt::getTrue(Ctx)); ChkBuilder.Insert(Check, "memcheck.conflict"); return Check; } @@ -1444,6 +1537,16 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop); assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); + // The exit count might have the type of i64 while the phi is i32. This can + // happen if we have an induction variable that is sign extended before the + // compare. The only way that we get a backedge taken count is that the + // induction variable was signed and as such will not overflow. In such a case + // truncation is legal. + if (ExitCount->getType()->getPrimitiveSizeInBits() > + IdxTy->getPrimitiveSizeInBits()) + ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy); + + ExitCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy); // Get the total trip count from the count by adding 1. 
ExitCount = SE->getAddExpr(ExitCount, SE->getConstant(ExitCount->getType(), 1)); @@ -1496,7 +1599,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // Use this IR builder to create the loop instructions (Phi, Br, Cmp) // inside the loop. - Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); + Builder.SetInsertPoint(VecBody->getFirstNonPHI()); // Generate the induction variable. setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction)); @@ -1724,6 +1827,9 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { LoopExitBlock = ExitBlock; LoopVectorBody = VecBody; LoopScalarBody = OldBasicBlock; + + LoopVectorizeHints Hints(Lp, true); + Hints.setAlreadyVectorized(Lp); } /// This function returns the identity element (or neutral element) for @@ -1753,6 +1859,31 @@ LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp) { } } +static Intrinsic::ID checkUnaryFloatSignature(const CallInst &I, + Intrinsic::ID ValidIntrinsicID) { + if (I.getNumArgOperands() != 1 || + !I.getArgOperand(0)->getType()->isFloatingPointTy() || + I.getType() != I.getArgOperand(0)->getType() || + !I.onlyReadsMemory()) + return Intrinsic::not_intrinsic; + + return ValidIntrinsicID; +} + +static Intrinsic::ID checkBinaryFloatSignature(const CallInst &I, + Intrinsic::ID ValidIntrinsicID) { + if (I.getNumArgOperands() != 2 || + !I.getArgOperand(0)->getType()->isFloatingPointTy() || + !I.getArgOperand(1)->getType()->isFloatingPointTy() || + I.getType() != I.getArgOperand(0)->getType() || + I.getType() != I.getArgOperand(1)->getType() || + !I.onlyReadsMemory()) + return Intrinsic::not_intrinsic; + + return ValidIntrinsicID; +} + + static Intrinsic::ID getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) { // If we have an intrinsic call, check if it is trivially vectorizable. @@ -1767,11 +1898,13 @@ getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) { case Intrinsic::log10: case Intrinsic::log2: case Intrinsic::fabs: + case Intrinsic::copysign: case Intrinsic::floor: case Intrinsic::ceil: case Intrinsic::trunc: case Intrinsic::rint: case Intrinsic::nearbyint: + case Intrinsic::round: case Intrinsic::pow: case Intrinsic::fma: case Intrinsic::fmuladd: @@ -1789,8 +1922,9 @@ getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) { LibFunc::Func Func; Function *F = CI->getCalledFunction(); // We're going to make assumptions on the semantics of the functions, check - // that the target knows that it's available in this environment. - if (!F || !TLI->getLibFunc(F->getName(), Func)) + // that the target knows that it's available in this environment and it does + // not have local linkage. 
+ if (!F || F->hasLocalLinkage() || !TLI->getLibFunc(F->getName(), Func)) return Intrinsic::not_intrinsic; // Otherwise check if we have a call to a function that can be turned into a @@ -1801,59 +1935,67 @@ getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) { case LibFunc::sin: case LibFunc::sinf: case LibFunc::sinl: - return Intrinsic::sin; + return checkUnaryFloatSignature(*CI, Intrinsic::sin); case LibFunc::cos: case LibFunc::cosf: case LibFunc::cosl: - return Intrinsic::cos; + return checkUnaryFloatSignature(*CI, Intrinsic::cos); case LibFunc::exp: case LibFunc::expf: case LibFunc::expl: - return Intrinsic::exp; + return checkUnaryFloatSignature(*CI, Intrinsic::exp); case LibFunc::exp2: case LibFunc::exp2f: case LibFunc::exp2l: - return Intrinsic::exp2; + return checkUnaryFloatSignature(*CI, Intrinsic::exp2); case LibFunc::log: case LibFunc::logf: case LibFunc::logl: - return Intrinsic::log; + return checkUnaryFloatSignature(*CI, Intrinsic::log); case LibFunc::log10: case LibFunc::log10f: case LibFunc::log10l: - return Intrinsic::log10; + return checkUnaryFloatSignature(*CI, Intrinsic::log10); case LibFunc::log2: case LibFunc::log2f: case LibFunc::log2l: - return Intrinsic::log2; + return checkUnaryFloatSignature(*CI, Intrinsic::log2); case LibFunc::fabs: case LibFunc::fabsf: case LibFunc::fabsl: - return Intrinsic::fabs; + return checkUnaryFloatSignature(*CI, Intrinsic::fabs); + case LibFunc::copysign: + case LibFunc::copysignf: + case LibFunc::copysignl: + return checkBinaryFloatSignature(*CI, Intrinsic::copysign); case LibFunc::floor: case LibFunc::floorf: case LibFunc::floorl: - return Intrinsic::floor; + return checkUnaryFloatSignature(*CI, Intrinsic::floor); case LibFunc::ceil: case LibFunc::ceilf: case LibFunc::ceill: - return Intrinsic::ceil; + return checkUnaryFloatSignature(*CI, Intrinsic::ceil); case LibFunc::trunc: case LibFunc::truncf: case LibFunc::truncl: - return Intrinsic::trunc; + return checkUnaryFloatSignature(*CI, Intrinsic::trunc); case LibFunc::rint: case LibFunc::rintf: case LibFunc::rintl: - return Intrinsic::rint; + return checkUnaryFloatSignature(*CI, Intrinsic::rint); case LibFunc::nearbyint: case LibFunc::nearbyintf: case LibFunc::nearbyintl: - return Intrinsic::nearbyint; + return checkUnaryFloatSignature(*CI, Intrinsic::nearbyint); + case LibFunc::round: + case LibFunc::roundf: + case LibFunc::roundl: + return checkUnaryFloatSignature(*CI, Intrinsic::round); case LibFunc::pow: case LibFunc::powf: case LibFunc::powl: - return Intrinsic::pow; + return checkBinaryFloatSignature(*CI, Intrinsic::pow); } return Intrinsic::not_intrinsic; @@ -1925,6 +2067,54 @@ Value *createMinMaxOp(IRBuilder<> &Builder, return Select; } +namespace { +struct CSEDenseMapInfo { + static bool canHandle(Instruction *I) { + return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || + isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); + } + static inline Instruction *getEmptyKey() { + return DenseMapInfo<Instruction *>::getEmptyKey(); + } + static inline Instruction *getTombstoneKey() { + return DenseMapInfo<Instruction *>::getTombstoneKey(); + } + static unsigned getHashValue(Instruction *I) { + assert(canHandle(I) && "Unknown instruction!"); + return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), + I->value_op_end())); + } + static bool isEqual(Instruction *LHS, Instruction *RHS) { + if (LHS == getEmptyKey() || RHS == getEmptyKey() || + LHS == getTombstoneKey() || RHS == getTombstoneKey()) + return LHS == RHS; + return 
LHS->isIdenticalTo(RHS); + } +}; +} + +///\brief Perform cse of induction variable instructions. +static void cse(BasicBlock *BB) { + // Perform simple cse. + SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { + Instruction *In = I++; + + if (!CSEDenseMapInfo::canHandle(In)) + continue; + + // Check if we can replace this instruction with any of the + // visited instructions. + if (Instruction *V = CSEMap.lookup(In)) { + In->replaceAllUsesWith(V); + In->eraseFromParent(); + continue; + } + + CSEMap[In] = In; + } +} + void InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { //===------------------------------------------------===// @@ -1995,18 +2185,31 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerMinMax || RdxDesc.Kind == LoopVectorizationLegality::RK_FloatMinMax) { // MinMax reduction have the start value as their identify. - VectorStart = Identity = Builder.CreateVectorSplat(VF, RdxDesc.StartValue, - "minmax.ident"); + if (VF == 1) { + VectorStart = Identity = RdxDesc.StartValue; + } else { + VectorStart = Identity = Builder.CreateVectorSplat(VF, + RdxDesc.StartValue, + "minmax.ident"); + } } else { + // Handle other reduction kinds: Constant *Iden = - LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind, - VecTy->getScalarType()); - Identity = ConstantVector::getSplat(VF, Iden); - - // This vector is the Identity vector where the first element is the - // incoming scalar reduction. - VectorStart = Builder.CreateInsertElement(Identity, - RdxDesc.StartValue, Zero); + LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind, + VecTy->getScalarType()); + if (VF == 1) { + Identity = Iden; + // This vector is the Identity vector where the first element is the + // incoming scalar reduction. + VectorStart = RdxDesc.StartValue; + } else { + Identity = ConstantVector::getSplat(VF, Iden); + + // This vector is the Identity vector where the first element is the + // incoming scalar reduction. + VectorStart = Builder.CreateInsertElement(Identity, + RdxDesc.StartValue, Zero); + } } // Fix the vector-loop phi. @@ -2062,37 +2265,40 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { ReducedPartRdx, RdxParts[part]); } - // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles - // and vector ops, reducing the set of values being computed by half each - // round. - assert(isPowerOf2_32(VF) && - "Reduction emission only supported for pow2 vectors!"); - Value *TmpVec = ReducedPartRdx; - SmallVector<Constant*, 32> ShuffleMask(VF, 0); - for (unsigned i = VF; i != 1; i >>= 1) { - // Move the upper half of the vector to the lower half. - for (unsigned j = 0; j != i/2; ++j) - ShuffleMask[j] = Builder.getInt32(i/2 + j); - - // Fill the rest of the mask with undef. - std::fill(&ShuffleMask[i/2], ShuffleMask.end(), - UndefValue::get(Builder.getInt32Ty())); - - Value *Shuf = + if (VF > 1) { + // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles + // and vector ops, reducing the set of values being computed by half each + // round. + assert(isPowerOf2_32(VF) && + "Reduction emission only supported for pow2 vectors!"); + Value *TmpVec = ReducedPartRdx; + SmallVector<Constant*, 32> ShuffleMask(VF, 0); + for (unsigned i = VF; i != 1; i >>= 1) { + // Move the upper half of the vector to the lower half. 
+ for (unsigned j = 0; j != i/2; ++j) + ShuffleMask[j] = Builder.getInt32(i/2 + j); + + // Fill the rest of the mask with undef. + std::fill(&ShuffleMask[i/2], ShuffleMask.end(), + UndefValue::get(Builder.getInt32Ty())); + + Value *Shuf = Builder.CreateShuffleVector(TmpVec, UndefValue::get(TmpVec->getType()), ConstantVector::get(ShuffleMask), "rdx.shuf"); - if (Op != Instruction::ICmp && Op != Instruction::FCmp) - TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf, - "bin.rdx"); - else - TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf); - } + if (Op != Instruction::ICmp && Op != Instruction::FCmp) + TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf, + "bin.rdx"); + else + TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf); + } - // The result is in the first element of the vector. - Value *Scalar0 = Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); + // The result is in the first element of the vector. + ReducedPartRdx = Builder.CreateExtractElement(TmpVec, + Builder.getInt32(0)); + } // Now, we need to fix the users of the reduction variable // inside and outside of the scalar remainder loop. @@ -2101,7 +2307,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { for (BasicBlock::iterator LEI = LoopExitBlock->begin(), LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); - if (!LCSSAPhi) continue; + if (!LCSSAPhi) break; // All PHINodes need to have a single entry edge, or two if // we already fixed them. @@ -2111,7 +2317,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // incoming bypass edge. if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) { // Add an edge coming from the bypass. - LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock); + LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); break; } }// end of the LCSSA phi scan. @@ -2123,23 +2329,26 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); // Pick the other block. int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); - (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0); + (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, ReducedPartRdx); (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr); }// end of for each redux variable. - // The Loop exit block may have single value PHI nodes where the incoming - // value is 'undef'. While vectorizing we only handled real values that - // were defined inside the loop. Here we handle the 'undef case'. - // See PR14725. + fixLCSSAPHIs(); + + // Remove redundant induction instructions. + cse(LoopVectorBody); +} + +void InnerLoopVectorizer::fixLCSSAPHIs() { for (BasicBlock::iterator LEI = LoopExitBlock->begin(), LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); - if (!LCSSAPhi) continue; + if (!LCSSAPhi) break; if (LCSSAPhi->getNumIncomingValues() == 1) LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()), LoopMiddleBlock); } -} +} InnerLoopVectorizer::VectorParts InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { @@ -2200,161 +2409,185 @@ InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { return BlockMask; } -void -InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, - BasicBlock *BB, PhiVector *PV) { - // For each instruction in the old loop. 
- for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { - VectorParts &Entry = WidenMap.get(it); - switch (it->getOpcode()) { - case Instruction::Br: - // Nothing to do for PHIs and BR, since we already took care of the - // loop control flow instructions. - continue; - case Instruction::PHI:{ - PHINode* P = cast<PHINode>(it); - // Handle reduction variables: - if (Legal->getReductionVars()->count(P)) { - for (unsigned part = 0; part < UF; ++part) { - // This is phase one of vectorizing PHIs. - Type *VecTy = VectorType::get(it->getType(), VF); - Entry[part] = PHINode::Create(VecTy, 2, "vec.phi", - LoopVectorBody-> getFirstInsertionPt()); - } - PV->push_back(P); - continue; - } +void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, + InnerLoopVectorizer::VectorParts &Entry, + LoopVectorizationLegality *Legal, + unsigned UF, unsigned VF, PhiVector *PV) { + PHINode* P = cast<PHINode>(PN); + // Handle reduction variables: + if (Legal->getReductionVars()->count(P)) { + for (unsigned part = 0; part < UF; ++part) { + // This is phase one of vectorizing PHIs. + Type *VecTy = (VF == 1) ? PN->getType() : + VectorType::get(PN->getType(), VF); + Entry[part] = PHINode::Create(VecTy, 2, "vec.phi", + LoopVectorBody-> getFirstInsertionPt()); + } + PV->push_back(P); + return; + } - setDebugLocFromInst(Builder, P); - // Check for PHI nodes that are lowered to vector selects. - if (P->getParent() != OrigLoop->getHeader()) { - // We know that all PHIs in non header blocks are converted into - // selects, so we don't have to worry about the insertion order and we - // can just use the builder. - // At this point we generate the predication tree. There may be - // duplications since this is a simple recursive scan, but future - // optimizations will clean it up. - - unsigned NumIncoming = P->getNumIncomingValues(); - - // Generate a sequence of selects of the form: - // SELECT(Mask3, In3, - // SELECT(Mask2, In2, - // ( ...))) - for (unsigned In = 0; In < NumIncoming; In++) { - VectorParts Cond = createEdgeMask(P->getIncomingBlock(In), - P->getParent()); - VectorParts &In0 = getVectorValue(P->getIncomingValue(In)); - - for (unsigned part = 0; part < UF; ++part) { - // We might have single edge PHIs (blocks) - use an identity - // 'select' for the first PHI operand. - if (In == 0) - Entry[part] = Builder.CreateSelect(Cond[part], In0[part], - In0[part]); - else - // Select between the current value and the previous incoming edge - // based on the incoming mask. - Entry[part] = Builder.CreateSelect(Cond[part], In0[part], - Entry[part], "predphi"); - } - } - continue; + setDebugLocFromInst(Builder, P); + // Check for PHI nodes that are lowered to vector selects. + if (P->getParent() != OrigLoop->getHeader()) { + // We know that all PHIs in non header blocks are converted into + // selects, so we don't have to worry about the insertion order and we + // can just use the builder. + // At this point we generate the predication tree. There may be + // duplications since this is a simple recursive scan, but future + // optimizations will clean it up. 
+ + unsigned NumIncoming = P->getNumIncomingValues(); + + // Generate a sequence of selects of the form: + // SELECT(Mask3, In3, + // SELECT(Mask2, In2, + // ( ...))) + for (unsigned In = 0; In < NumIncoming; In++) { + VectorParts Cond = createEdgeMask(P->getIncomingBlock(In), + P->getParent()); + VectorParts &In0 = getVectorValue(P->getIncomingValue(In)); + + for (unsigned part = 0; part < UF; ++part) { + // We might have single edge PHIs (blocks) - use an identity + // 'select' for the first PHI operand. + if (In == 0) + Entry[part] = Builder.CreateSelect(Cond[part], In0[part], + In0[part]); + else + // Select between the current value and the previous incoming edge + // based on the incoming mask. + Entry[part] = Builder.CreateSelect(Cond[part], In0[part], + Entry[part], "predphi"); } + } + return; + } - // This PHINode must be an induction variable. - // Make sure that we know about it. - assert(Legal->getInductionVars()->count(P) && - "Not an induction variable"); - - LoopVectorizationLegality::InductionInfo II = - Legal->getInductionVars()->lookup(P); - - switch (II.IK) { - case LoopVectorizationLegality::IK_NoInduction: - llvm_unreachable("Unknown induction"); - case LoopVectorizationLegality::IK_IntInduction: { - assert(P->getType() == II.StartValue->getType() && "Types must match"); - Type *PhiTy = P->getType(); - Value *Broadcasted; - if (P == OldInduction) { - // Handle the canonical induction variable. We might have had to - // extend the type. - Broadcasted = Builder.CreateTrunc(Induction, PhiTy); - } else { - // Handle other induction variables that are now based on the - // canonical one. - Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx, - "normalized.idx"); - NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy); - Broadcasted = Builder.CreateAdd(II.StartValue, NormalizedIdx, - "offset.idx"); - } - Broadcasted = getBroadcastInstrs(Broadcasted); - // After broadcasting the induction variable we need to make the vector - // consecutive by adding 0, 1, 2, etc. + // This PHINode must be an induction variable. + // Make sure that we know about it. + assert(Legal->getInductionVars()->count(P) && + "Not an induction variable"); + + LoopVectorizationLegality::InductionInfo II = + Legal->getInductionVars()->lookup(P); + + switch (II.IK) { + case LoopVectorizationLegality::IK_NoInduction: + llvm_unreachable("Unknown induction"); + case LoopVectorizationLegality::IK_IntInduction: { + assert(P->getType() == II.StartValue->getType() && "Types must match"); + Type *PhiTy = P->getType(); + Value *Broadcasted; + if (P == OldInduction) { + // Handle the canonical induction variable. We might have had to + // extend the type. + Broadcasted = Builder.CreateTrunc(Induction, PhiTy); + } else { + // Handle other induction variables that are now based on the + // canonical one. + Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx, + "normalized.idx"); + NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy); + Broadcasted = Builder.CreateAdd(II.StartValue, NormalizedIdx, + "offset.idx"); + } + Broadcasted = getBroadcastInstrs(Broadcasted); + // After broadcasting the induction variable we need to make the vector + // consecutive by adding 0, 1, 2, etc. 
+ for (unsigned part = 0; part < UF; ++part) + Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false); + return; + } + case LoopVectorizationLegality::IK_ReverseIntInduction: + case LoopVectorizationLegality::IK_PtrInduction: + case LoopVectorizationLegality::IK_ReversePtrInduction: + // Handle reverse integer and pointer inductions. + Value *StartIdx = ExtendedIdx; + // This is the normalized GEP that starts counting at zero. + Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx, + "normalized.idx"); + + // Handle the reverse integer induction variable case. + if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) { + IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType()); + Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy, + "resize.norm.idx"); + Value *ReverseInd = Builder.CreateSub(II.StartValue, CNI, + "reverse.idx"); + + // This is a new value so do not hoist it out. + Value *Broadcasted = getBroadcastInstrs(ReverseInd); + // After broadcasting the induction variable we need to make the + // vector consecutive by adding ... -3, -2, -1, 0. for (unsigned part = 0; part < UF; ++part) - Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false); - continue; + Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part, + true); + return; } - case LoopVectorizationLegality::IK_ReverseIntInduction: - case LoopVectorizationLegality::IK_PtrInduction: - case LoopVectorizationLegality::IK_ReversePtrInduction: - // Handle reverse integer and pointer inductions. - Value *StartIdx = ExtendedIdx; - // This is the normalized GEP that starts counting at zero. - Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx, - "normalized.idx"); - // Handle the reverse integer induction variable case. - if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) { - IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType()); - Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy, - "resize.norm.idx"); - Value *ReverseInd = Builder.CreateSub(II.StartValue, CNI, - "reverse.idx"); - - // This is a new value so do not hoist it out. - Value *Broadcasted = getBroadcastInstrs(ReverseInd); - // After broadcasting the induction variable we need to make the - // vector consecutive by adding ... -3, -2, -1, 0. - for (unsigned part = 0; part < UF; ++part) - Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part, - true); + // Handle the pointer induction variable case. + assert(P->getType()->isPointerTy() && "Unexpected type."); + + // Is this a reverse induction ptr or a consecutive induction ptr. + bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction == + II.IK); + + // This is the vector of results. Notice that we don't generate + // vector geps because scalar geps result in better code. + for (unsigned part = 0; part < UF; ++part) { + if (VF == 1) { + int EltIndex = (part) * (Reverse ? -1 : 1); + Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex); + Value *GlobalIdx; + if (Reverse) + GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx"); + else + GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx"); + + Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, + "next.gep"); + Entry[part] = SclrGep; continue; } - // Handle the pointer induction variable case. - assert(P->getType()->isPointerTy() && "Unexpected type."); - - // Is this a reverse induction ptr or a consecutive induction ptr. 
- bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction == - II.IK); - - // This is the vector of results. Notice that we don't generate - // vector geps because scalar geps result in better code. - for (unsigned part = 0; part < UF; ++part) { - Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); - for (unsigned int i = 0; i < VF; ++i) { - int EltIndex = (i + part * VF) * (Reverse ? -1 : 1); - Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex); - Value *GlobalIdx; - if (!Reverse) - GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx"); - else - GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx"); - - Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, - "next.gep"); - VecVal = Builder.CreateInsertElement(VecVal, SclrGep, - Builder.getInt32(i), - "insert.gep"); - } - Entry[part] = VecVal; + Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); + for (unsigned int i = 0; i < VF; ++i) { + int EltIndex = (i + part * VF) * (Reverse ? -1 : 1); + Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex); + Value *GlobalIdx; + if (!Reverse) + GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx"); + else + GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx"); + + Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, + "next.gep"); + VecVal = Builder.CreateInsertElement(VecVal, SclrGep, + Builder.getInt32(i), + "insert.gep"); } - continue; + Entry[part] = VecVal; } + return; + } +} +void +InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, + BasicBlock *BB, PhiVector *PV) { + // For each instruction in the old loop. + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + VectorParts &Entry = WidenMap.get(it); + switch (it->getOpcode()) { + case Instruction::Br: + // Nothing to do for PHIs and BR, since we already took care of the + // loop control flow instructions. + continue; + case Instruction::PHI:{ + // Vectorize PHINodes. + widenPHIInstruction(it, Entry, Legal, UF, VF, PV); + continue; }// End of PHI. case Instruction::Add: @@ -2413,8 +2646,10 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, VectorParts &Cond = getVectorValue(it->getOperand(0)); VectorParts &Op0 = getVectorValue(it->getOperand(1)); VectorParts &Op1 = getVectorValue(it->getOperand(2)); - Value *ScalarCond = Builder.CreateExtractElement(Cond[0], - Builder.getInt32(0)); + + Value *ScalarCond = (VF == 1) ? Cond[0] : + Builder.CreateExtractElement(Cond[0], Builder.getInt32(0)); + for (unsigned Part = 0; Part < UF; ++Part) { Entry[Part] = Builder.CreateSelect( InvariantCond ? ScalarCond : Cond[Part], @@ -2475,7 +2710,8 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, break; } /// Vectorize casts. - Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF); + Type *DestTy = (VF == 1) ? 
CI->getType() : + VectorType::get(CI->getType(), VF); VectorParts &A = getVectorValue(it->getOperand(0)); for (unsigned Part = 0; Part < UF; ++Part) @@ -2505,7 +2741,10 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, VectorParts &Arg = getVectorValue(CI->getArgOperand(i)); Args.push_back(Arg[Part]); } - Type *Tys[] = { VectorType::get(CI->getType()->getScalarType(), VF) }; + Type *Tys[] = {CI->getType()}; + if (VF > 1) + Tys[0] = VectorType::get(CI->getType()->getScalarType(), VF); + Function *F = Intrinsic::getDeclaration(M, ID, Tys); Entry[Part] = Builder.CreateCall(F, Args); } @@ -2542,19 +2781,36 @@ void InnerLoopVectorizer::updateAnalysis() { DEBUG(DT->verifyAnalysis()); } +/// \brief Check whether it is safe to if-convert this phi node. +/// +/// Phi nodes with constant expressions that can trap are not safe to if +/// convert. +static bool canIfConvertPHINodes(BasicBlock *BB) { + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + PHINode *Phi = dyn_cast<PHINode>(I); + if (!Phi) + return true; + for (unsigned p = 0, e = Phi->getNumIncomingValues(); p != e; ++p) + if (Constant *C = dyn_cast<Constant>(Phi->getIncomingValue(p))) + if (C->canTrap()) + return false; + } + return true; +} + bool LoopVectorizationLegality::canVectorizeWithIfConvert() { if (!EnableIfConversion) return false; assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable"); - std::vector<BasicBlock*> &LoopBlocks = TheLoop->getBlocksVector(); // A list of pointers that we can safely read and write to. SmallPtrSet<Value *, 8> SafePointes; // Collect safe addresses. - for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) { - BasicBlock *BB = LoopBlocks[i]; + for (Loop::block_iterator BI = TheLoop->block_begin(), + BE = TheLoop->block_end(); BI != BE; ++BI) { + BasicBlock *BB = *BI; if (blockNeedsPredication(BB)) continue; @@ -2568,16 +2824,22 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { } // Collect the blocks that need predication. - for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) { - BasicBlock *BB = LoopBlocks[i]; + BasicBlock *Header = TheLoop->getHeader(); + for (Loop::block_iterator BI = TheLoop->block_begin(), + BE = TheLoop->block_end(); BI != BE; ++BI) { + BasicBlock *BB = *BI; // We don't support switch statements inside loops. if (!isa<BranchInst>(BB->getTerminator())) return false; // We must be able to predicate all blocks that need to be predicated. - if (blockNeedsPredication(BB) && !blockCanBePredicated(BB, SafePointes)) + if (blockNeedsPredication(BB)) { + if (!blockCanBePredicated(BB, SafePointes)) + return false; + } else if (BB != Header && !canIfConvertPHINodes(BB)) return false; + } // We can if-convert this loop. @@ -2602,19 +2864,17 @@ bool LoopVectorizationLegality::canVectorize() { if (!TheLoop->getExitingBlock()) return false; - unsigned NumBlocks = TheLoop->getNumBlocks(); + // We need to have a loop header. + DEBUG(dbgs() << "LV: Found a loop: " << + TheLoop->getHeader()->getName() << '\n'); // Check if we can if-convert non single-bb loops. + unsigned NumBlocks = TheLoop->getNumBlocks(); if (NumBlocks != 1 && !canVectorizeWithIfConvert()) { DEBUG(dbgs() << "LV: Can't if-convert the loop.\n"); return false; } - // We need to have a loop header. - BasicBlock *Latch = TheLoop->getLoopLatch(); - DEBUG(dbgs() << "LV: Found a loop: " << - TheLoop->getHeader()->getName() << "\n"); - // ScalarEvolution needs to be able to find the exit count. 
const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop); if (ExitCount == SE->getCouldNotCompute()) { @@ -2623,6 +2883,7 @@ bool LoopVectorizationLegality::canVectorize() { } // Do not loop-vectorize loops with a tiny trip count. + BasicBlock *Latch = TheLoop->getLoopLatch(); unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch); if (TC > 0u && TC < TinyTripCountVectorThreshold) { DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " << @@ -2657,7 +2918,13 @@ bool LoopVectorizationLegality::canVectorize() { static Type *convertPointerToIntegerType(DataLayout &DL, Type *Ty) { if (Ty->isPointerTy()) - return DL.getIntPtrType(Ty->getContext()); + return DL.getIntPtrType(Ty); + + // It is possible that char's or short's overflow when we ask for the loop's + // trip count, work around this by changing the type size. + if (Ty->getScalarSizeInBits() < 32) + return Type::getInt32Ty(Ty->getContext()); + return Ty; } @@ -2682,7 +2949,7 @@ static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, Instruction *U = cast<Instruction>(*I); // This user may be a reduction exit value. if (!TheLoop->contains(U)) { - DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n"); + DEBUG(dbgs() << "LV: Found an outside user for : " << *U << '\n'); return true; } } @@ -2758,6 +3025,12 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { DEBUG(dbgs() << "LV: Found an induction variable.\n"); Inductions[Phi] = InductionInfo(StartValue, IK); + + // Until we explicitly handle the case of an induction variable with + // an outside loop user we have to give up vectorizing this loop. + if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) + return false; + continue; } @@ -2812,9 +3085,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { } // Check that the instruction return type is vectorizable. - if (!VectorType::isValidElementType(it->getType()) && - !it->getType()->isVoidTy()) { - DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n"); + // Also, we can't vectorize extractelement instructions. + if ((!VectorType::isValidElementType(it->getType()) && + !it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) { + DEBUG(dbgs() << "LV: Found unvectorizable type.\n"); return false; } @@ -2904,7 +3178,7 @@ public: /// non-intersection. bool canCheckPtrAtRT(LoopVectorizationLegality::RuntimePointerCheck &RtCheck, unsigned &NumComparisons, ScalarEvolution *SE, - Loop *TheLoop); + Loop *TheLoop, bool ShouldCheckStride = false); /// \brief Goes over all memory accesses, checks whether a RT check is needed /// and builds sets of dependent accesses. @@ -2918,6 +3192,7 @@ public: bool isRTCheckNeeded() { return IsRTCheckNeeded; } bool isDependencyCheckNeeded() { return !CheckDeps.empty(); } + void resetDepChecks() { CheckDeps.clear(); } MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; } @@ -2972,10 +3247,15 @@ static bool hasComputableBounds(ScalarEvolution *SE, Value *Ptr) { return AR->isAffine(); } +/// \brief Check the stride of the pointer and ensure that it does not wrap in +/// the address space. +static int isStridedPtr(ScalarEvolution *SE, DataLayout *DL, Value *Ptr, + const Loop *Lp); + bool AccessAnalysis::canCheckPtrAtRT( LoopVectorizationLegality::RuntimePointerCheck &RtCheck, unsigned &NumComparisons, ScalarEvolution *SE, - Loop *TheLoop) { + Loop *TheLoop, bool ShouldCheckStride) { // Find pointers with computable bounds. We are going to use this information // to place a runtime bound check. 
unsigned NumReadPtrChecks = 0; @@ -3003,7 +3283,10 @@ bool AccessAnalysis::canCheckPtrAtRT( else ++NumReadPtrChecks; - if (hasComputableBounds(SE, Ptr)) { + if (hasComputableBounds(SE, Ptr) && + // When we run after a failing dependency check we have to make sure we + // don't have wrapping pointers. + (!ShouldCheckStride || isStridedPtr(SE, DL, Ptr, TheLoop) == 1)) { // The id of the dependence set. unsigned DepId; @@ -3019,7 +3302,7 @@ bool AccessAnalysis::canCheckPtrAtRT( RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId); - DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr <<"\n"); + DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr << '\n'); } else { CanDoRT = false; } @@ -3027,9 +3310,36 @@ bool AccessAnalysis::canCheckPtrAtRT( if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2) NumComparisons = 0; // Only one dependence set. - else + else { NumComparisons = (NumWritePtrChecks * (NumReadPtrChecks + NumWritePtrChecks - 1)); + } + + // If the pointers that we would use for the bounds comparison have different + // address spaces, assume the values aren't directly comparable, so we can't + // use them for the runtime check. We also have to assume they could + // overlap. In the future there should be metadata for whether address spaces + // are disjoint. + unsigned NumPointers = RtCheck.Pointers.size(); + for (unsigned i = 0; i < NumPointers; ++i) { + for (unsigned j = i + 1; j < NumPointers; ++j) { + // Only need to check pointers between two different dependency sets. + if (RtCheck.DependencySetId[i] == RtCheck.DependencySetId[j]) + continue; + + Value *PtrI = RtCheck.Pointers[i]; + Value *PtrJ = RtCheck.Pointers[j]; + + unsigned ASi = PtrI->getType()->getPointerAddressSpace(); + unsigned ASj = PtrJ->getType()->getPointerAddressSpace(); + if (ASi != ASj) { + DEBUG(dbgs() << "LV: Runtime check would require comparison between" + " different address spaces\n"); + return false; + } + } + } + return CanDoRT; } @@ -3084,7 +3394,7 @@ void AccessAnalysis::processMemAccesses(bool UseDeferred) { !isa<Argument>(UnderlyingObj)) && !isIdentifiedObject(UnderlyingObj))) { DEBUG(dbgs() << "LV: Found an unidentified " << - (IsWrite ? "write" : "read" ) << " ptr:" << *UnderlyingObj << + (IsWrite ? "write" : "read" ) << " ptr: " << *UnderlyingObj << "\n"); IsRTCheckNeeded = (IsRTCheckNeeded || !isIdentifiedObject(UnderlyingObj) || @@ -3158,8 +3468,9 @@ public: typedef PointerIntPair<Value *, 1, bool> MemAccessInfo; typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet; - MemoryDepChecker(ScalarEvolution *Se, DataLayout *Dl, const Loop *L) : - SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0) {} + MemoryDepChecker(ScalarEvolution *Se, DataLayout *Dl, const Loop *L) + : SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0), + ShouldRetryWithRuntimeCheck(false) {} /// \brief Register the location (instructions are given increasing numbers) /// of a write access. @@ -3189,6 +3500,10 @@ public: /// the accesses safely with. unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; } + /// \brief In same cases when the dependency check fails we can still + /// vectorize the loop with a dynamic array access check. + bool shouldRetryWithRuntimeCheck() { return ShouldRetryWithRuntimeCheck; } + private: ScalarEvolution *SE; DataLayout *DL; @@ -3206,6 +3521,10 @@ private: // We can access this many bytes in parallel safely. unsigned MaxSafeDepDistBytes; + /// \brief If we see a non constant dependence distance we can still try to + /// vectorize this loop with runtime checks. 
+  bool ShouldRetryWithRuntimeCheck;
+
   /// \brief Check whether there is a plausible dependence between the two
   /// accesses.
   ///
@@ -3403,6 +3722,7 @@ bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
   const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
   if (!C) {
     DEBUG(dbgs() << "LV: Dependence because of non constant distance\n");
+    ShouldRetryWithRuntimeCheck = true;
     return true;
   }
@@ -3428,7 +3748,7 @@ bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
   if (Val == 0) {
     if (ATy == BTy)
       return false;
-    DEBUG(dbgs() << "LV: Zero dependence difference but different types");
+    DEBUG(dbgs() << "LV: Zero dependence difference but different types\n");
     return true;
   }
@@ -3437,7 +3757,7 @@ bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
   // Positive distance bigger than max vectorization factor.
   if (ATy != BTy) {
     DEBUG(dbgs() <<
-          "LV: ReadWrite-Write positive dependency with different types");
+          "LV: ReadWrite-Write positive dependency with different types\n");
     return false;
   }
@@ -3454,7 +3774,7 @@ bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
       2*TypeByteSize > MaxSafeDepDistBytes ||
       Distance < TypeByteSize * ForcedUnroll * ForcedFactor) {
     DEBUG(dbgs() << "LV: Failure because of Positive distance "
-          << Val.getSExtValue() << "\n");
+          << Val.getSExtValue() << '\n');
     return true;
   }
@@ -3467,7 +3787,7 @@ bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
     return true;
 
   DEBUG(dbgs() << "LV: Positive distance " << Val.getSExtValue() <<
-        " with max VF=" << MaxSafeDepDistBytes/TypeByteSize << "\n");
+        " with max VF = " << MaxSafeDepDistBytes / TypeByteSize << '\n');
 
   return false;
 }
@@ -3571,8 +3891,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
         Stores.push_back(St);
         DepChecker.addAccess(St);
       }
-    } // next instr.
-  } // next block.
+    } // Next instr.
+  } // Next block.
 
   // Now we have two lists that hold the loads and the stores.
   // Next, we find the pointers that they use.
@@ -3619,7 +3939,6 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     return true;
   }
 
-  SmallPtrSet<Value *, 16> ReadOnlyPtr;
   for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
     LoadInst *LD = cast<LoadInst>(*I);
     Value* Ptr = LD->getPointerOperand();
@@ -3667,7 +3986,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
   if (NumComparisons == 0 && NeedRTCheck)
     NeedRTCheck = false;
 
-  // Check that we did not collect too many pointers or found a unsizeable
+  // Check that we did not collect too many pointers or found an unsizeable
   // pointer.
   if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
     PtrRtCheck.reset();
@@ -3693,9 +4012,32 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     CanVecMem = DepChecker.areDepsSafe(DependentAccesses,
                                        Accesses.getDependenciesToCheck());
     MaxSafeDepDistBytes = DepChecker.getMaxSafeDepDistBytes();
+
+    if (!CanVecMem && DepChecker.shouldRetryWithRuntimeCheck()) {
+      DEBUG(dbgs() << "LV: Retrying with memory checks\n");
+      NeedRTCheck = true;
+
+      // Clear the dependency checks. We assume they are not needed.
+      Accesses.resetDepChecks();
+
+      PtrRtCheck.reset();
+      PtrRtCheck.Need = true;
+
+      CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE,
+                                         TheLoop, true);
+      // Check that we did not collect too many pointers or found an unsizeable
+      // pointer.
+      if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
+        DEBUG(dbgs() << "LV: Can't vectorize with memory checks\n");
+        PtrRtCheck.reset();
+        return false;
+      }
+
+      CanVecMem = true;
+    }
   }
 
-  DEBUG(dbgs() << "LV: We "<< (NeedRTCheck ? "" : "don't") <<
+  DEBUG(dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") <<
         " need a runtime memory check.\n");
 
   return CanVecMem;
@@ -3839,6 +4181,12 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
       if (ExitInstruction != 0 || Cur == Phi)
         return false;
 
+      // The instruction used by an outside user must be the last instruction
+      // before we feed back to the reduction phi. Otherwise, we loose VF-1
+      // operations on the value.
+      if (std::find(Phi->op_begin(), Phi->op_end(), Cur) == Phi->op_end())
+        return false;
+
       ExitInstruction = Cur;
       continue;
     }
@@ -4045,6 +4393,14 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
     if (it->mayWriteToMemory() || it->mayThrow())
       return false;
 
+    // Check that we don't have a constant expression that can trap as operand.
+    for (Instruction::op_iterator OI = it->op_begin(), OE = it->op_end();
+         OI != OE; ++OI) {
+      if (Constant *C = dyn_cast<Constant>(*OI))
+        if (C->canTrap())
+          return false;
+    }
+
     // The instructions below can trap.
     switch (it->getOpcode()) {
     default: continue;
@@ -4071,7 +4427,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
 
   // Find the trip count.
   unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch());
-  DEBUG(dbgs() << "LV: Found trip count:"<<TC<<"\n");
+  DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
 
   unsigned WidestType = getWidestType();
   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
@@ -4082,7 +4438,8 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
                             WidestRegister : MaxSafeDepDist);
   unsigned MaxVectorSize = WidestRegister / WidestType;
   DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n");
-  DEBUG(dbgs() << "LV: The Widest register is:" << WidestRegister << "bits.\n");
+  DEBUG(dbgs() << "LV: The Widest register is: "
+               << WidestRegister << " bits.\n");
 
   if (MaxVectorSize == 0) {
     DEBUG(dbgs() << "LV: The target has no vector registers.\n");
@@ -4118,7 +4475,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
 
   if (UserVF != 0) {
     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
-    DEBUG(dbgs() << "LV: Using user VF "<<UserVF<<".\n");
+    DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
 
     Factor.Width = UserVF;
     return Factor;
@@ -4126,13 +4483,13 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
 
   float Cost = expectedCost(1);
   unsigned Width = 1;
-  DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n");
+  DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)Cost << ".\n");
   for (unsigned i=2; i <= VF; i*=2) {
     // Notice that the vector loop needs to be executed less times, so
     // we need to divide the cost of the vector loops by the width of
    // the vector elements.
     float VectorCost = expectedCost(i) / (float)i;
-    DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " <<
+    DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " <<
           (int)VectorCost << ".\n");
     if (VectorCost < Cost) {
       Cost = VectorCost;
@@ -4256,8 +4613,20 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
   else if (UF < 1)
     UF = 1;
 
-  if (Legal->getReductionVars()->size()) {
-    DEBUG(dbgs() << "LV: Unrolling because of reductions. \n");
\n"); + bool HasReductions = Legal->getReductionVars()->size(); + + // Decide if we want to unroll if we decided that it is legal to vectorize + // but not profitable. + if (VF == 1) { + if (TheLoop->getNumBlocks() > 1 || !HasReductions || + LoopCost > SmallLoopCost) + return 1; + + return UF; + } + + if (HasReductions) { + DEBUG(dbgs() << "LV: Unrolling because of reductions.\n"); return UF; } @@ -4265,14 +4634,14 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, // We assume that the cost overhead is 1 and we use the cost model // to estimate the cost of the loop and unroll until the cost of the // loop overhead is about 5% of the cost of the loop. - DEBUG(dbgs() << "LV: Loop cost is "<< LoopCost <<" \n"); - if (LoopCost < 20) { - DEBUG(dbgs() << "LV: Unrolling to reduce branch cost. \n"); - unsigned NewUF = 20/LoopCost + 1; + DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); + if (LoopCost < SmallLoopCost) { + DEBUG(dbgs() << "LV: Unrolling to reduce branch cost.\n"); + unsigned NewUF = SmallLoopCost / (LoopCost + 1); return std::min(NewUF, UF); } - DEBUG(dbgs() << "LV: Not Unrolling. \n"); + DEBUG(dbgs() << "LV: Not Unrolling.\n"); return 1; } @@ -4373,16 +4742,16 @@ LoopVectorizationCostModel::calculateRegisterUsage() { MaxUsage = std::max(MaxUsage, OpenIntervals.size()); DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " << - OpenIntervals.size() <<"\n"); + OpenIntervals.size() << '\n'); // Add the current instruction to the list of open intervals. OpenIntervals.insert(I); } unsigned Invariant = LoopInvariants.size(); - DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << " \n"); - DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << " \n"); - DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << " \n"); + DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n'); + DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n'); + DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n'); R.LoopInvariantRegs = Invariant; R.MaxLocalUsers = MaxUsage; @@ -4406,8 +4775,8 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { unsigned C = getInstructionCost(it, VF); BlockCost += C; - DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF " << - VF << " For instruction: "<< *it << "\n"); + DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " << + VF << " For instruction: " << *it << '\n'); } // We assume that if-converted blocks have a 50% chance of being executed. @@ -4669,13 +5038,16 @@ char LoopVectorize::ID = 0; static const char lv_name[] = "Loop Vectorization"; INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) namespace llvm { - Pass *createLoopVectorizePass() { - return new LoopVectorize(); + Pass *createLoopVectorizePass(bool NoUnrolling) { + return new LoopVectorize(NoUnrolling); } } @@ -4690,3 +5062,96 @@ bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { return false; } + + +void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr) { + assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); + // Holds vector parameters or scalars, in case of uniform vals. 
+  SmallVector<VectorParts, 4> Params;
+
+  setDebugLocFromInst(Builder, Instr);
+
+  // Find all of the vectorized parameters.
+  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
+    Value *SrcOp = Instr->getOperand(op);
+
+    // If we are accessing the old induction variable, use the new one.
+    if (SrcOp == OldInduction) {
+      Params.push_back(getVectorValue(SrcOp));
+      continue;
+    }
+
+    // Try using previously calculated values.
+    Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
+
+    // If the src is an instruction that appeared earlier in the basic block
+    // then it should already be vectorized.
+    if (SrcInst && OrigLoop->contains(SrcInst)) {
+      assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
+      // The parameter is a vector value from earlier.
+      Params.push_back(WidenMap.get(SrcInst));
+    } else {
+      // The parameter is a scalar from outside the loop. Maybe even a constant.
+      VectorParts Scalars;
+      Scalars.append(UF, SrcOp);
+      Params.push_back(Scalars);
+    }
+  }
+
+  assert(Params.size() == Instr->getNumOperands() &&
+         "Invalid number of operands");
+
+  // Does this instruction return a value ?
+  bool IsVoidRetTy = Instr->getType()->isVoidTy();
+
+  Value *UndefVec = IsVoidRetTy ? 0 :
+    UndefValue::get(Instr->getType());
+  // Create a new entry in the WidenMap and initialize it to Undef or Null.
+  VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
+
+  // For each vector unroll 'part':
+  for (unsigned Part = 0; Part < UF; ++Part) {
+    // For each scalar that we create:
+
+    Instruction *Cloned = Instr->clone();
+    if (!IsVoidRetTy)
+      Cloned->setName(Instr->getName() + ".cloned");
+    // Replace the operands of the cloned instructions with extracted scalars.
+    for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
+      Value *Op = Params[op][Part];
+      Cloned->setOperand(op, Op);
+    }
+
+    // Place the cloned scalar in the new loop.
+    Builder.Insert(Cloned);
+
+    // If the original scalar returns a value we need to place it in a vector
+    // so that future users will be able to use it.
+    if (!IsVoidRetTy)
+      VecResults[Part] = Cloned;
+  }
+}
+
+void
+InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr,
+                                              LoopVectorizationLegality*) {
+  return scalarizeInstruction(Instr);
+}
+
+Value *InnerLoopUnroller::reverseVector(Value *Vec) {
+  return Vec;
+}
+
+Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) {
+  return V;
+}
+
+Value *InnerLoopUnroller::getConsecutiveVector(Value* Val, int StartIdx,
+                                               bool Negate) {
+  // When unrolling and the VF is 1, we only need to add a simple scalar.
+  Type *ITy = Val->getType();
+  assert(!ITy->isVectorTy() && "Val must be a scalar");
+  Constant *C = ConstantInt::get(ITy, StartIdx, Negate);
+  return Builder.CreateAdd(Val, C, "induction");
+}