author     Stephen Hines <srhines@google.com>   2014-07-21 00:45:20 -0700
committer  Stephen Hines <srhines@google.com>   2014-07-21 00:45:20 -0700
commit     c6a4f5e819217e1e12c458aed8e7b122e23a3a58 (patch)
tree       81b7dd2bb4370a392f31d332a566c903b5744764 /lib/Transforms/Vectorize
parent     19c6fbb3e8aaf74093afa08013134b61fa08f245 (diff)
Update LLVM for rebase to r212749.
Includes a cherry-pick of:
r212948 - fixes a small issue with atomic calls
Change-Id: Ib97bd980b59f18142a69506400911a6009d9df18
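
Note: the LoopVectorize.cpp changes below thread a small Report ostream wrapper
through the legality checks so that every bail-out point emits an optimization
remark via emitOptimizationRemarkAnalysis/Missed. A minimal sketch of how these
remarks reach a user, assuming clang's -Rpass-analysis flag of this period; the
source file and the exact diagnostic wording are illustrative:

    // test.cpp -- a loop the legality checks reject, because a call to an
    // arbitrary external function cannot be vectorized.
    extern float opaque(float); // hypothetical external function

    void scale(float *a, int n) {
      for (int i = 0; i < n; ++i)
        a[i] = opaque(a[i]); // expected remark: "loop not vectorized:
                             //   call instruction cannot be vectorized"
    }

    // Assumed invocation; "loop-vectorize" is the pass's DEBUG_TYPE:
    //   clang++ -O2 -Rpass-analysis=loop-vectorize -c test.cpp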
Diffstat (limited to 'lib/Transforms/Vectorize')
-rw-r--r--  lib/Transforms/Vectorize/LoopVectorize.cpp  373
-rw-r--r--  lib/Transforms/Vectorize/SLPVectorizer.cpp  297
2 files changed, 587 insertions, 83 deletions
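
Note: a second group of LoopVectorize.cpp hunks below hardens the trip-count
computation: the vector trip count is the backedge-taken count plus one, and
that addition wraps when the count already equals the maximum of its type, so
the patch materializes a backedge.overflow compare and branches directly to the
scalar remainder loop on overflow. An 8-bit analogue of the wrap, with
illustrative names:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint8_t BackedgeTakenCount = 255;           // loop body runs 256 times
      uint8_t TripCount = BackedgeTakenCount + 1; // wraps to 0
      assert(TripCount == 0); // a wrapped count would mis-size the vector
                              // loop, hence the new branch to the scalar loop
      return 0;
    }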
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 34d8a10..cb8a41d 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -209,6 +209,29 @@ namespace {
 class LoopVectorizationLegality;
 class LoopVectorizationCostModel;
 
+/// Optimization analysis message produced during vectorization. Messages inform
+/// the user why vectorization did not occur.
+class Report {
+  std::string Message;
+  raw_string_ostream Out;
+  Instruction *Instr;
+
+public:
+  Report(Instruction *I = nullptr) : Out(Message), Instr(I) {
+    Out << "loop not vectorized: ";
+  }
+
+  template <typename A> Report &operator<<(const A &Value) {
+    Out << Value;
+    return *this;
+  }
+
+  Instruction *getInstr() { return Instr; }
+
+  std::string &str() { return Out.str(); }
+  operator Twine() { return Out.str(); }
+};
+
 /// InnerLoopVectorizer vectorizes loops which contain only one basic
 /// block to a specified vectorization factor (VF).
 /// This class performs the widening of scalars into vectors, or multiple
@@ -515,10 +538,12 @@ public:
   unsigned NumPredStores;
 
   LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, const DataLayout *DL,
-                            DominatorTree *DT, TargetLibraryInfo *TLI)
+                            DominatorTree *DT, TargetLibraryInfo *TLI,
+                            Function *F)
       : NumLoads(0), NumStores(0), NumPredStores(0), TheLoop(L), SE(SE), DL(DL),
-        DT(DT), TLI(TLI), Induction(nullptr), WidestIndTy(nullptr),
-        HasFunNoNaNAttr(false), MaxSafeDepDistBytes(-1U) {}
+        DT(DT), TLI(TLI), TheFunction(F), Induction(nullptr),
+        WidestIndTy(nullptr), HasFunNoNaNAttr(false), MaxSafeDepDistBytes(-1U) {
+  }
 
   /// This enum represents the kinds of reductions that we support.
   enum ReductionKind {
@@ -747,6 +772,16 @@ private:
   /// invariant.
   void collectStridedAcccess(Value *LoadOrStoreInst);
 
+  /// Report an analysis message to assist the user in diagnosing loops that are
+  /// not vectorized.
+  void emitAnalysis(Report &Message) {
+    DebugLoc DL = TheLoop->getStartLoc();
+    if (Instruction *I = Message.getInstr())
+      DL = I->getDebugLoc();
+    emitOptimizationRemarkAnalysis(TheFunction->getContext(), DEBUG_TYPE,
+                                   *TheFunction, DL, Message.str());
+  }
+
   /// The loop that we evaluate.
   Loop *TheLoop;
   /// Scev analysis.
@@ -757,6 +792,8 @@ private:
   DominatorTree *DT;
   /// Target Library Info.
   TargetLibraryInfo *TLI;
+  /// Parent function
+  Function *TheFunction;
 
   // ---  vectorization state --- //
 
@@ -906,7 +943,7 @@ public:
   }
 
   /// Return the loop vectorizer metadata prefix.
-  static StringRef Prefix() { return "llvm.vectorizer."; }
+  static StringRef Prefix() { return "llvm.loop.vectorize."; }
 
   MDNode *createHint(LLVMContext &Context, StringRef Name, unsigned V) const {
     SmallVector<Value*, 2> Vals;
@@ -942,6 +979,29 @@ public:
     LoopID = NewLoopID;
   }
 
+  std::string emitRemark() const {
+    Report R;
+    R << "vectorization ";
+    switch (Force) {
+    case LoopVectorizeHints::FK_Disabled:
+      R << "is explicitly disabled";
+      break;
+    case LoopVectorizeHints::FK_Enabled:
+      R << "is explicitly enabled";
+      if (Width != 0 && Unroll != 0)
+        R << " with width " << Width << " and interleave count " << Unroll;
+      else if (Width != 0)
+        R << " with width " << Width;
+      else if (Unroll != 0)
+        R << " with interleave count " << Unroll;
+      break;
+    case LoopVectorizeHints::FK_Undefined:
+      R << "was not specified";
+      break;
+    }
+    return R.str();
+  }
+
   unsigned getWidth() const { return Width; }
   unsigned getUnroll() const { return Unroll; }
   enum ForceKind getForce() const { return Force; }
@@ -1125,18 +1185,37 @@ struct LoopVectorize : public FunctionPass {
                      : "?")) << " width=" << Hints.getWidth()
                  << " unroll=" << Hints.getUnroll() << "\n");
 
+    // Function containing loop
+    Function *F = L->getHeader()->getParent();
+
+    // Looking at the diagnostic output is the only way to determine if a loop
+    // was vectorized (other than looking at the IR or machine code), so it
+    // is important to generate an optimization remark for each loop. Most of
+    // these messages are generated by emitOptimizationRemarkAnalysis. Remarks
+    // generated by emitOptimizationRemark and emitOptimizationRemarkMissed are
+    // less verbose reporting vectorized loops and unvectorized loops that may
+    // benefit from vectorization, respectively.
+
     if (Hints.getForce() == LoopVectorizeHints::FK_Disabled) {
       DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
+      emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F,
+                                     L->getStartLoc(), Hints.emitRemark());
       return false;
     }
 
     if (!AlwaysVectorize &&
        Hints.getForce() != LoopVectorizeHints::FK_Enabled) {
      DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
+      emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F,
+                                     L->getStartLoc(), Hints.emitRemark());
       return false;
     }
 
     if (Hints.getWidth() == 1 && Hints.getUnroll() == 1) {
       DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
+      emitOptimizationRemarkAnalysis(
+          F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
+          "loop not vectorized: vector width and interleave count are "
+          "explicitly set to 1");
       return false;
     }
 
@@ -1151,14 +1230,19 @@ struct LoopVectorize : public FunctionPass {
         DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
       else {
         DEBUG(dbgs() << "\n");
+        emitOptimizationRemarkAnalysis(
+            F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
+            "vectorization is not beneficial and is not explicitly forced");
         return false;
       }
     }
 
     // Check if it is legal to vectorize the loop.
-    LoopVectorizationLegality LVL(L, SE, DL, DT, TLI);
+    LoopVectorizationLegality LVL(L, SE, DL, DT, TLI, F);
     if (!LVL.canVectorize()) {
       DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
+      emitOptimizationRemarkMissed(F->getContext(), DEBUG_TYPE, *F,
+                                   L->getStartLoc(), Hints.emitRemark());
       return false;
     }
 
@@ -1167,7 +1251,6 @@ struct LoopVectorize : public FunctionPass {
 
     // Check the function attributes to find out if this function should be
     // optimized for size.
-    Function *F = L->getHeader()->getParent();
     bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
                       F->hasFnAttribute(Attribute::OptimizeForSize);
 
@@ -1190,6 +1273,11 @@ struct LoopVectorize : public FunctionPass {
     if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
       DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
             "attribute is used.\n");
+      emitOptimizationRemarkAnalysis(
+          F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
+          "loop not vectorized due to NoImplicitFloat attribute");
+      emitOptimizationRemarkMissed(F->getContext(), DEBUG_TYPE, *F,
+                                   L->getStartLoc(), Hints.emitRemark());
       return false;
     }
 
@@ -1208,9 +1296,14 @@ struct LoopVectorize : public FunctionPass {
     DEBUG(dbgs() << "LV: Unroll Factor is " << UF << '\n');
 
     if (VF.Width == 1) {
-      DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
-      if (UF == 1)
+      DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial\n");
+
+      if (UF == 1) {
+        emitOptimizationRemarkAnalysis(
+            F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
+            "not beneficial to vectorize and user disabled interleaving");
        return false;
+      }
       DEBUG(dbgs() << "LV: Trying to at least unroll the loops.\n");
 
       // Report the unrolling decision.
@@ -1220,6 +1313,7 @@ struct LoopVectorize : public FunctionPass {
                    " (vectorization not beneficial)"));
 
       // We decided not to vectorize, but we may want to unroll.
+
       InnerLoopUnroller Unroller(L, SE, LI, DT, DL, TLI, UF);
       Unroller.vectorize(&LVL);
     } else {
@@ -1909,20 +2003,23 @@ void InnerLoopVectorizer::createEmptyLoop() {
  the vectorized instructions while the old loop will continue to run the
  scalar remainder.
 
-       [ ] <-- vector loop bypass (may consist of multiple blocks).
-     /  |
-    /   v
-   |   [ ]     <-- vector pre header.
-   |    |
-   |    v
-   |   [ ] \
-   |   [ ]_|   <-- vector loop.
-   |    |
-    \   v
-      >[ ]   <--- middle-block.
-     /  |
-    /   v
-   |   [ ]     <--- new preheader.
+       [ ] <-- Back-edge taken count overflow check.
+     /   |
+    /    v
+   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
+   |  /  |
+   | /   v
+   ||   [ ]     <-- vector pre header.
+   ||    |
+   ||    v
+   ||   [ ] \
+   ||   [ ]_|   <-- vector loop.
+   ||    |
+   | \   v
+   |   >[ ]   <--- middle-block.
+   |  /  |
+   | /   v
+  -|- >[ ]     <--- new preheader.
    |    |
    |    v
    |   [ ] \
@@ -1936,6 +2033,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
   BasicBlock *OldBasicBlock = OrigLoop->getHeader();
   BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
   BasicBlock *ExitBlock = OrigLoop->getExitBlock();
+  assert(BypassBlock && "Invalid loop structure");
   assert(ExitBlock && "Must have an exit block");
 
   // Some loops have a single integer induction variable, while other loops
@@ -1958,18 +2056,30 @@ void InnerLoopVectorizer::createEmptyLoop() {
       IdxTy->getPrimitiveSizeInBits())
     ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy);
 
-  ExitCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy);
+  const SCEV *BackedgeTakeCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy);
   // Get the total trip count from the count by adding 1.
-  ExitCount = SE->getAddExpr(ExitCount,
-                             SE->getConstant(ExitCount->getType(), 1));
+  ExitCount = SE->getAddExpr(BackedgeTakeCount,
+                             SE->getConstant(BackedgeTakeCount->getType(), 1));
 
   // Expand the trip count and place the new instructions in the preheader.
   // Notice that the pre-header does not change, only the loop body.
   SCEVExpander Exp(*SE, "induction");
 
-  // Count holds the overall loop count (N).
-  Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
-                                   BypassBlock->getTerminator());
+  // We need to test whether the backedge-taken count is uint##_max. Adding one
+  // to it will cause overflow and an incorrect loop trip count in the vector
+  // body. In case of overflow we want to directly jump to the scalar remainder
+  // loop.
+  Value *BackedgeCount =
+      Exp.expandCodeFor(BackedgeTakeCount, BackedgeTakeCount->getType(),
+                        BypassBlock->getTerminator());
+  if (BackedgeCount->getType()->isPointerTy())
+    BackedgeCount = CastInst::CreatePointerCast(BackedgeCount, IdxTy,
+                                                "backedge.ptrcnt.to.int",
+                                                BypassBlock->getTerminator());
+  Instruction *CheckBCOverflow =
+      CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, BackedgeCount,
+                      Constant::getAllOnesValue(BackedgeCount->getType()),
+                      "backedge.overflow", BypassBlock->getTerminator());
 
   // The loop index does not have to start at Zero. Find the original start
   // value from the induction PHI node. If we don't have an induction variable
@@ -1980,7 +2090,18 @@ void InnerLoopVectorizer::createEmptyLoop() {
                                        IdxTy):
     ConstantInt::get(IdxTy, 0);
 
-  assert(BypassBlock && "Invalid loop structure");
+  // We need an instruction to anchor the overflow check on. StartIdx needs to
+  // be defined before the overflow check branch. Because the scalar preheader
+  // is going to merge the start index and so the overflow branch block needs to
+  // contain a definition of the start index.
+  Instruction *OverflowCheckAnchor = BinaryOperator::CreateAdd(
+      StartIdx, ConstantInt::get(IdxTy, 0), "overflow.check.anchor",
+      BypassBlock->getTerminator());
+
+  // Count holds the overall loop count (N).
+  Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
+                                   BypassBlock->getTerminator());
+
   LoopBypassBlocks.push_back(BypassBlock);
 
   // Split the single block loop into the two loop structure described above.
@@ -2049,29 +2170,45 @@ void InnerLoopVectorizer::createEmptyLoop() {
 
   // Now, compare the new count to zero. If it is zero skip the vector loop and
   // jump to the scalar loop.
-  Value *Cmp = BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx,
-                                          "cmp.zero");
+  Value *Cmp =
+      BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, "cmp.zero");
 
   BasicBlock *LastBypassBlock = BypassBlock;
 
+  // Generate code to check that the loops trip count that we computed by adding
+  // one to the backedge-taken count will not overflow.
+  {
+    auto PastOverflowCheck =
+        std::next(BasicBlock::iterator(OverflowCheckAnchor));
+    BasicBlock *CheckBlock =
+        LastBypassBlock->splitBasicBlock(PastOverflowCheck, "overflow.checked");
+    if (ParentLoop)
+      ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase());
+    LoopBypassBlocks.push_back(CheckBlock);
+    Instruction *OldTerm = LastBypassBlock->getTerminator();
+    BranchInst::Create(ScalarPH, CheckBlock, CheckBCOverflow, OldTerm);
+    OldTerm->eraseFromParent();
+    LastBypassBlock = CheckBlock;
+  }
+
   // Generate the code to check that the strides we assumed to be one are really
   // one. We want the new basic block to start at the first instruction in a
   // sequence of instructions that form a check.
   Instruction *StrideCheck;
   Instruction *FirstCheckInst;
   std::tie(FirstCheckInst, StrideCheck) =
-      addStrideCheck(BypassBlock->getTerminator());
+      addStrideCheck(LastBypassBlock->getTerminator());
   if (StrideCheck) {
     // Create a new block containing the stride check.
     BasicBlock *CheckBlock =
-        BypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck");
+        LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck");
     if (ParentLoop)
       ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase());
     LoopBypassBlocks.push_back(CheckBlock);
 
     // Replace the branch into the memory check block with a conditional branch
     // for the "few elements case".
-    Instruction *OldTerm = BypassBlock->getTerminator();
+    Instruction *OldTerm = LastBypassBlock->getTerminator();
     BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm);
     OldTerm->eraseFromParent();
 
@@ -2134,6 +2271,19 @@ void InnerLoopVectorizer::createEmptyLoop() {
                        PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val",
                                        MiddleBlock->getTerminator()) : nullptr;
+    // Create phi nodes to merge from the backedge-taken check block.
+    PHINode *BCResumeVal = PHINode::Create(ResumeValTy, 3, "bc.resume.val",
+                                           ScalarPH->getTerminator());
+    BCResumeVal->addIncoming(ResumeVal, MiddleBlock);
+
+    PHINode *BCTruncResumeVal = nullptr;
+    if (OrigPhi == OldInduction) {
+      BCTruncResumeVal =
+          PHINode::Create(OrigPhi->getType(), 2, "bc.trunc.resume.val",
+                          ScalarPH->getTerminator());
+      BCTruncResumeVal->addIncoming(TruncResumeVal, MiddleBlock);
+    }
+
     Value *EndValue = nullptr;
     switch (II.IK) {
     case LoopVectorizationLegality::IK_NoInduction:
@@ -2150,10 +2300,12 @@ void InnerLoopVectorizer::createEmptyLoop() {
           BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType());
       // The new PHI merges the original incoming value, in case of a bypass,
       // or the value at the end of the vectorized loop.
-      for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+      for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
         TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
       TruncResumeVal->addIncoming(EndValue, VecBody);
 
+      BCTruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]);
+
       // We know what the end value is.
       EndValue = IdxEndRoundDown;
       // We also know which PHI node holds it.
@@ -2199,7 +2351,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
 
     // The new PHI merges the original incoming value, in case of a bypass,
     // or the value at the end of the vectorized loop.
-    for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) {
+    for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) {
       if (OrigPhi == OldInduction)
         ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]);
       else
@@ -2209,11 +2361,16 @@ void InnerLoopVectorizer::createEmptyLoop() {
 
     // Fix the scalar body counter (PHI node).
     unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
-    // The old inductions phi node in the scalar body needs the truncated value.
-    if (OrigPhi == OldInduction)
-      OrigPhi->setIncomingValue(BlockIdx, TruncResumeVal);
-    else
-      OrigPhi->setIncomingValue(BlockIdx, ResumeVal);
+
+    // The old induction's phi node in the scalar body needs the truncated
+    // value.
+    if (OrigPhi == OldInduction) {
+      BCResumeVal->addIncoming(StartIdx, LoopBypassBlocks[0]);
+      OrigPhi->setIncomingValue(BlockIdx, BCTruncResumeVal);
+    } else {
+      BCResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]);
+      OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
+    }
   }
 
   // If we are generating a new induction variable then we also need to
@@ -2224,7 +2381,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
     assert(!ResumeIndex && "Unexpected resume value found");
     ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
                                   MiddleBlock->getTerminator());
-    for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+    for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
       ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]);
     ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
   }
@@ -2494,7 +2651,7 @@ void InnerLoopVectorizer::vectorizeLoop() {
       // To do so, we need to generate the 'identity' vector and override
       // one of the elements with the incoming scalar reduction. We need
       // to do it in the vector-loop preheader.
-      Builder.SetInsertPoint(LoopBypassBlocks.front()->getTerminator());
+      Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator());
 
       // This is the vector-clone of the value that leaves the loop.
       VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
@@ -2568,7 +2725,7 @@ void InnerLoopVectorizer::vectorizeLoop() {
       VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr);
       PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
       Value *StartVal = (part == 0) ? VectorStart : Identity;
-      for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+      for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
         NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
       NewPhi->addIncoming(RdxExitVal[part], LoopVectorBody.back());
@@ -2626,6 +2783,13 @@ void InnerLoopVectorizer::vectorizeLoop() {
                                     Builder.getInt32(0));
     }
 
+    // Create a phi node that merges control-flow from the backedge-taken check
+    // block and the middle block.
+    PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx",
+                                          LoopScalarPreHeader->getTerminator());
+    BCBlockPhi->addIncoming(RdxDesc.StartValue, LoopBypassBlocks[0]);
+    BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
+
     // Now, we need to fix the users of the reduction variable
     // inside and outside of the scalar remainder loop.
     // We know that the loop is in LCSSA form. We need to update the
@@ -2655,7 +2819,7 @@ void InnerLoopVectorizer::vectorizeLoop() {
     assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
     // Pick the other block.
     int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
-    (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, ReducedPartRdx);
+    (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
     (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
   }// end of for each redux variable.
@@ -3062,9 +3226,14 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
         scalarizeInstruction(it);
         break;
       default:
+        bool HasScalarOpd = hasVectorInstrinsicScalarOpd(ID, 1);
         for (unsigned Part = 0; Part < UF; ++Part) {
           SmallVector<Value *, 4> Args;
           for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
+            if (HasScalarOpd && i == 1) {
+              Args.push_back(CI->getArgOperand(i));
+              continue;
+            }
             VectorParts &Arg = getVectorValue(CI->getArgOperand(i));
             Args.push_back(Arg[Part]);
           }
@@ -3112,8 +3281,8 @@ void InnerLoopVectorizer::updateAnalysis() {
     }
   }
 
-  DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks.front());
-  DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock);
+  DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks[1]);
+  DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
   DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
 
@@ -3138,8 +3307,10 @@ static bool canIfConvertPHINodes(BasicBlock *BB) {
 }
 
 bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
-  if (!EnableIfConversion)
+  if (!EnableIfConversion) {
+    emitAnalysis(Report() << "if-conversion is disabled");
     return false;
+  }
 
   assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
 
@@ -3169,16 +3340,24 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
     BasicBlock *BB = *BI;
 
     // We don't support switch statements inside loops.
-    if (!isa<BranchInst>(BB->getTerminator()))
+    if (!isa<BranchInst>(BB->getTerminator())) {
+      emitAnalysis(Report(BB->getTerminator())
+                   << "loop contains a switch statement");
       return false;
+    }
 
     // We must be able to predicate all blocks that need to be predicated.
     if (blockNeedsPredication(BB)) {
-      if (!blockCanBePredicated(BB, SafePointes))
+      if (!blockCanBePredicated(BB, SafePointes)) {
+        emitAnalysis(Report(BB->getTerminator())
+                     << "control flow cannot be substituted for a select");
         return false;
-    } else if (BB != Header && !canIfConvertPHINodes(BB))
+      }
+    } else if (BB != Header && !canIfConvertPHINodes(BB)) {
+      emitAnalysis(Report(BB->getTerminator())
+                   << "control flow cannot be substituted for a select");
       return false;
-
+    }
   }
 
   // We can if-convert this loop.
@@ -3188,20 +3367,31 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
 
 bool LoopVectorizationLegality::canVectorize() {
   // We must have a loop in canonical form. Loops with indirectbr in them cannot
   // be canonicalized.
-  if (!TheLoop->getLoopPreheader())
+  if (!TheLoop->getLoopPreheader()) {
+    emitAnalysis(
+        Report() << "loop control flow is not understood by vectorizer");
     return false;
+  }
 
   // We can only vectorize innermost loops.
-  if (TheLoop->getSubLoopsVector().size())
+  if (TheLoop->getSubLoopsVector().size()) {
+    emitAnalysis(Report() << "loop is not the innermost loop");
     return false;
+  }
 
   // We must have a single backedge.
-  if (TheLoop->getNumBackEdges() != 1)
+  if (TheLoop->getNumBackEdges() != 1) {
+    emitAnalysis(
+        Report() << "loop control flow is not understood by vectorizer");
    return false;
+  }
 
   // We must have a single exiting block.
-  if (!TheLoop->getExitingBlock())
+  if (!TheLoop->getExitingBlock()) {
+    emitAnalysis(
+        Report() << "loop control flow is not understood by vectorizer");
     return false;
+  }
 
   // We need to have a loop header.
   DEBUG(dbgs() << "LV: Found a loop: " <<
@@ -3217,6 +3407,7 @@ bool LoopVectorizationLegality::canVectorize() {
   // ScalarEvolution needs to be able to find the exit count.
   const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop);
   if (ExitCount == SE->getCouldNotCompute()) {
+    emitAnalysis(Report() << "could not determine number of loop iterations");
     DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
     return false;
   }
@@ -3310,6 +3501,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
         if (!PhiTy->isIntegerTy() &&
             !PhiTy->isFloatingPointTy() &&
             !PhiTy->isPointerTy()) {
+          emitAnalysis(Report(it)
+                       << "loop control flow is not understood by vectorizer");
           DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
           return false;
         }
@@ -3320,13 +3513,17 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
         if (*bb != Header) {
           // Check that this instruction has no outside users or is an
           // identified reduction value with an outside user.
-          if(!hasOutsideLoopUser(TheLoop, it, AllowedExit))
+          if (!hasOutsideLoopUser(TheLoop, it, AllowedExit))
             continue;
+          emitAnalysis(Report(it) << "value that could not be identified as "
+                                     "reduction is used outside the loop");
           return false;
         }
 
         // We only allow if-converted PHIs with more than two incoming values.
         if (Phi->getNumIncomingValues() != 2) {
+          emitAnalysis(Report(it)
+                       << "control flow not understood by vectorizer");
           DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
           return false;
         }
@@ -3357,8 +3554,11 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
 
           // Until we explicitly handle the case of an induction variable with
           // an outside loop user we have to give up vectorizing this loop.
-          if (hasOutsideLoopUser(TheLoop, it, AllowedExit))
+          if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) {
+            emitAnalysis(Report(it) << "use of induction value outside of the "
+                                       "loop is not handled by vectorizer");
             return false;
+          }
 
           continue;
         }
@@ -3401,6 +3601,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
           continue;
         }
 
+        emitAnalysis(Report(it) << "unvectorizable operation");
         DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
         return false;
       }// end of PHI handling
@@ -3409,14 +3610,29 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
       // calls and we do handle certain intrinsic and libm functions.
       CallInst *CI = dyn_cast<CallInst>(it);
       if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI)) {
+        emitAnalysis(Report(it) << "call instruction cannot be vectorized");
         DEBUG(dbgs() << "LV: Found a call site.\n");
         return false;
       }
 
+      // Intrinsics such as powi,cttz and ctlz are legal to vectorize if the
+      // second argument is the same (i.e. loop invariant)
+      if (CI &&
+          hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) {
+        if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) {
+          emitAnalysis(Report(it)
+                       << "intrinsic instruction cannot be vectorized");
+          DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n");
+          return false;
+        }
+      }
+
      // Check that the instruction return type is vectorizable.
      // Also, we can't vectorize extractelement instructions.
      if ((!VectorType::isValidElementType(it->getType()) &&
           !it->getType()->isVoidTy()) ||
          isa<ExtractElementInst>(it)) {
+        emitAnalysis(Report(it)
+                     << "instruction return type cannot be vectorized");
        DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
        return false;
      }
@@ -3424,8 +3640,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
       // Check that the stored type is vectorizable.
       if (StoreInst *ST = dyn_cast<StoreInst>(it)) {
         Type *T = ST->getValueOperand()->getType();
-        if (!VectorType::isValidElementType(T))
+        if (!VectorType::isValidElementType(T)) {
+          emitAnalysis(Report(ST) << "store instruction cannot be vectorized");
           return false;
+        }
         if (EnableMemAccessVersioning)
           collectStridedAcccess(ST);
       }
@@ -3436,8 +3654,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
 
       // Reduction instructions are allowed to have exit users.
       // All other instructions must not have external users.
-      if (hasOutsideLoopUser(TheLoop, it, AllowedExit))
+      if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) {
+        emitAnalysis(Report(it) << "value cannot be used outside the loop");
         return false;
+      }
 
     } // next instr.
 
@@ -3445,8 +3665,11 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
 
   if (!Induction) {
     DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
-    if (Inductions.empty())
+    if (Inductions.empty()) {
+      emitAnalysis(Report()
+                   << "loop induction variable could not be identified");
       return false;
+    }
   }
 
   return true;
@@ -4353,8 +4576,9 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
         continue;
 
       LoadInst *Ld = dyn_cast<LoadInst>(it);
-      if (!Ld) return false;
-      if (!Ld->isSimple() && !IsAnnotatedParallel) {
+      if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) {
+        emitAnalysis(Report(Ld)
+                     << "read with atomic ordering or volatile read");
         DEBUG(dbgs() << "LV: Found a non-simple load.\n");
         return false;
       }
@@ -4367,8 +4591,13 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     // Save 'store' instructions. Abort if other instructions write to memory.
     if (it->mayWriteToMemory()) {
       StoreInst *St = dyn_cast<StoreInst>(it);
-      if (!St) return false;
+      if (!St) {
+        emitAnalysis(Report(it) << "instruction cannot be vectorized");
+        return false;
+      }
       if (!St->isSimple() && !IsAnnotatedParallel) {
+        emitAnalysis(Report(St)
+                     << "write with atomic ordering or volatile write");
         DEBUG(dbgs() << "LV: Found a non-simple store.\n");
         return false;
       }
@@ -4405,6 +4634,9 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
 
     Value* Ptr = ST->getPointerOperand();
     if (isUniform(Ptr)) {
+      emitAnalysis(
+          Report(ST)
+          << "write to a loop invariant address could not be vectorized");
       DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
       return false;
     }
@@ -4483,6 +4715,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
   }
 
   if (NeedRTCheck && !CanDoRT) {
+    emitAnalysis(Report() << "cannot identify array bounds");
     DEBUG(dbgs() << "LV: We can't vectorize because we can't find " <<
           "the array bounds.\n");
     PtrRtCheck.reset();
@@ -4513,6 +4746,14 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     // Check that we did not collect too many pointers or found an unsizeable
     // pointer.
     if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
+      if (!CanDoRT && NumComparisons > 0)
+        emitAnalysis(Report()
+                     << "cannot check memory dependencies at runtime");
+      else
+        emitAnalysis(Report()
+                     << NumComparisons << " exceeds limit of "
+                     << RuntimeMemoryCheckThreshold
+                     << " dependent memory operations checked at runtime");
       DEBUG(dbgs() << "LV: Can't vectorize with memory checks\n");
       PtrRtCheck.reset();
       return false;
@@ -4522,6 +4763,9 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     }
   }
 
+  if (!CanVecMem)
+    emitAnalysis(Report() << "unsafe dependent memory operations in loop");
+
   DEBUG(dbgs() << "LV: We" << (NeedRTCheck ?
         "" : " don't") <<
        " need a runtime memory check.\n");
@@ -5774,4 +6018,3 @@ Value *InnerLoopUnroller::getConsecutiveVector(Value* Val, int StartIdx,
   Constant *C = ConstantInt::get(ITy, StartIdx, Negate);
   return Builder.CreateAdd(Val, C, "induction");
 }
-
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e13ba95..53a43d9 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -149,6 +149,48 @@ static bool isSplat(ArrayRef<Value *> VL) {
   return true;
 }
 
+///\returns Opcode that can be clubbed with \p Op to create an alternate
+/// sequence which can later be merged as a ShuffleVector instruction.
+static unsigned getAltOpcode(unsigned Op) {
+  switch (Op) {
+  case Instruction::FAdd:
+    return Instruction::FSub;
+  case Instruction::FSub:
+    return Instruction::FAdd;
+  case Instruction::Add:
+    return Instruction::Sub;
+  case Instruction::Sub:
+    return Instruction::Add;
+  default:
+    return 0;
+  }
+}
+
+///\returns bool representing if Opcode \p Op can be part
+/// of an alternate sequence which can later be merged as
+/// a ShuffleVector instruction.
+static bool canCombineAsAltInst(unsigned Op) {
+  if (Op == Instruction::FAdd || Op == Instruction::FSub ||
+      Op == Instruction::Sub || Op == Instruction::Add)
+    return true;
+  return false;
+}
+
+/// \returns ShuffleVector instruction if intructions in \p VL have
+/// alternate fadd,fsub / fsub,fadd/add,sub/sub,add sequence.
+/// (i.e. e.g. opcodes of fadd,fsub,fadd,fsub...)
+static unsigned isAltInst(ArrayRef<Value *> VL) {
+  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+  unsigned Opcode = I0->getOpcode();
+  unsigned AltOpcode = getAltOpcode(Opcode);
+  for (int i = 1, e = VL.size(); i < e; i++) {
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
+      return 0;
+  }
+  return Instruction::ShuffleVector;
+}
+
 /// \returns The opcode if all of the Instructions in \p VL have the same
 /// opcode, or zero.
 static unsigned getSameOpcode(ArrayRef<Value *> VL) {
@@ -158,8 +200,11 @@ static unsigned getSameOpcode(ArrayRef<Value *> VL) {
   unsigned Opcode = I0->getOpcode();
   for (int i = 1, e = VL.size(); i < e; i++) {
     Instruction *I = dyn_cast<Instruction>(VL[i]);
-    if (!I || Opcode != I->getOpcode())
+    if (!I || Opcode != I->getOpcode()) {
+      if (canCombineAsAltInst(Opcode) && i == 1)
+        return isAltInst(VL);
       return 0;
+    }
   }
   return Opcode;
 }
@@ -377,6 +422,7 @@ public:
 
   /// \brief Perform LICM and CSE on the newly generated gather sequences.
   void optimizeGatherSequence();
+
 private:
   struct TreeEntry;
 
@@ -594,6 +640,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
 
 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
   bool SameTy = getSameType(VL); (void)SameTy;
+  bool isAltShuffle = false;
   assert(SameTy && "Invalid types!");
 
   if (Depth == RecursionMaxDepth) {
@@ -615,10 +662,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
     newTreeEntry(VL, false);
     return;
   }
+  unsigned Opcode = getSameOpcode(VL);
+
+  // Check that this shuffle vector refers to the alternate
+  // sequence of opcodes.
+  if (Opcode == Instruction::ShuffleVector) {
+    Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+    unsigned Op = I0->getOpcode();
+    if (Op != Instruction::ShuffleVector)
+      isAltShuffle = true;
+  }
 
   // If all of the operands are identical or constant we have a simple solution.
-  if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) ||
-      !getSameOpcode(VL)) {
+  if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) {
     DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
     newTreeEntry(VL, false);
     return;
@@ -754,8 +810,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
 
   DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
 
-  unsigned Opcode = getSameOpcode(VL);
-
   // Check if it is safe to sink the loads or the stores.
   if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
     Instruction *Last = getLastInstruction(VL);
@@ -914,8 +968,20 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
       if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
         ValueList Left, Right;
         reorderInputsAccordingToOpcode(VL, Left, Right);
-        buildTree_rec(Left, Depth + 1);
-        buildTree_rec(Right, Depth + 1);
+        BasicBlock *LeftBB = getSameBlock(Left);
+        BasicBlock *RightBB = getSameBlock(Right);
+        // If we have common uses on separate paths in the tree make sure we
+        // process the one with greater common depth first.
+        // We can use block numbering to determine the subtree traversal as
+        // earler user has to come in between the common use and the later user.
+        if (LeftBB && RightBB && LeftBB == RightBB &&
+            getLastIndex(Right) > getLastIndex(Left)) {
+          buildTree_rec(Right, Depth + 1);
+          buildTree_rec(Left, Depth + 1);
+        } else {
+          buildTree_rec(Left, Depth + 1);
+          buildTree_rec(Right, Depth + 1);
+        }
         return;
       }
 
@@ -929,6 +995,51 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
       }
       return;
     }
+    case Instruction::GetElementPtr: {
+      // We don't combine GEPs with complicated (nested) indexing.
+      for (unsigned j = 0; j < VL.size(); ++j) {
+        if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
+          DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
+          newTreeEntry(VL, false);
+          return;
+        }
+      }
+
+      // We can't combine several GEPs into one vector if they operate on
+      // different types.
+      Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
+      for (unsigned j = 0; j < VL.size(); ++j) {
+        Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
+        if (Ty0 != CurTy) {
+          DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
+          newTreeEntry(VL, false);
+          return;
+        }
+      }
+
+      // We don't combine GEPs with non-constant indexes.
+      for (unsigned j = 0; j < VL.size(); ++j) {
+        auto Op = cast<Instruction>(VL[j])->getOperand(1);
+        if (!isa<ConstantInt>(Op)) {
+          DEBUG(
+              dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
+          newTreeEntry(VL, false);
+          return;
+        }
+      }
+
+      newTreeEntry(VL, true);
+      DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
+      for (unsigned i = 0, e = 2; i < e; ++i) {
+        ValueList Operands;
+        // Prepare the operand vector.
+        for (unsigned j = 0; j < VL.size(); ++j)
+          Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+        buildTree_rec(Operands, Depth + 1);
+      }
+      return;
+    }
     case Instruction::Store: {
       // Check if the stores are consecutive or of we need to swizzle them.
       for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
@@ -961,9 +1072,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
         DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
         return;
       }
-
       Function *Int = CI->getCalledFunction();
-
+      Value *A1I = nullptr;
+      if (hasVectorInstrinsicScalarOpd(ID, 1))
+        A1I = CI->getArgOperand(1);
       for (unsigned i = 1, e = VL.size(); i != e; ++i) {
         CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
         if (!CI2 || CI2->getCalledFunction() != Int ||
@@ -973,6 +1085,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
                        << "\n");
           return;
         }
+        // ctlz,cttz and powi are special intrinsics whose second argument
+        // should be same in order for them to be vectorized.
+        if (hasVectorInstrinsicScalarOpd(ID, 1)) {
+          Value *A1J = CI2->getArgOperand(1);
+          if (A1I != A1J) {
+            newTreeEntry(VL, false);
+            DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
+                         << " argument "<< A1I<<"!=" << A1J
+                         << "\n");
+            return;
+          }
+        }
       }
 
       newTreeEntry(VL, true);
@@ -987,6 +1111,26 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
       }
       return;
     }
+    case Instruction::ShuffleVector: {
+      // If this is not an alternate sequence of opcode like add-sub
+      // then do not vectorize this instruction.
+      if (!isAltShuffle) {
+        newTreeEntry(VL, false);
+        DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
+        return;
+      }
+      newTreeEntry(VL, true);
+      DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
+      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+        ValueList Operands;
+        // Prepare the operand vector.
+        for (unsigned j = 0; j < VL.size(); ++j)
+          Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+        buildTree_rec(Operands, Depth + 1);
+      }
+      return;
+    }
     default:
       newTreeEntry(VL, false);
      DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
@@ -1010,11 +1154,9 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
     }
     return getGatherCost(E->Scalars);
   }
-
-  assert(getSameOpcode(VL) && getSameType(VL) && getSameBlock(VL) &&
-         "Invalid VL");
+  unsigned Opcode = getSameOpcode(VL);
+  assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL");
   Instruction *VL0 = cast<Instruction>(VL[0]);
-  unsigned Opcode = VL0->getOpcode();
   switch (Opcode) {
     case Instruction::PHI: {
       return 0;
@@ -1121,6 +1263,20 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
       }
       return VecCost - ScalarCost;
     }
+    case Instruction::GetElementPtr: {
+      TargetTransformInfo::OperandValueKind Op1VK =
+          TargetTransformInfo::OK_AnyValue;
+      TargetTransformInfo::OperandValueKind Op2VK =
+          TargetTransformInfo::OK_UniformConstantValue;
+
+      int ScalarCost =
+          VecTy->getNumElements() *
+          TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
+      int VecCost =
+          TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
+
+      return VecCost - ScalarCost;
+    }
    case Instruction::Load: {
      // Cost of wide load - cost of scalar loads.
      int ScalarLdCost = VecTy->getNumElements() *
@@ -1158,6 +1314,32 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
 
       return VecCallCost - ScalarCallCost;
     }
+    case Instruction::ShuffleVector: {
+      TargetTransformInfo::OperandValueKind Op1VK =
+          TargetTransformInfo::OK_AnyValue;
+      TargetTransformInfo::OperandValueKind Op2VK =
+          TargetTransformInfo::OK_AnyValue;
+      int ScalarCost = 0;
+      int VecCost = 0;
+      for (unsigned i = 0; i < VL.size(); ++i) {
+        Instruction *I = cast<Instruction>(VL[i]);
+        if (!I)
+          break;
+        ScalarCost +=
+            TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
+      }
+      // VecCost is equal to sum of the cost of creating 2 vectors
+      // and the cost of creating shuffle.
+      Instruction *I0 = cast<Instruction>(VL[0]);
+      VecCost =
+          TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
+      Instruction *I1 = cast<Instruction>(VL[1]);
+      VecCost +=
+          TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
+      VecCost +=
+          TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
+      return VecCost - ScalarCost;
+    }
     default:
       llvm_unreachable("Unknown instruction");
   }
@@ -1438,9 +1620,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
     setInsertPointAfterBundle(E->Scalars);
     return Gather(E->Scalars, VecTy);
   }
-
-  unsigned Opcode = VL0->getOpcode();
-  assert(Opcode == getSameOpcode(E->Scalars) && "Invalid opcode");
+  unsigned Opcode = getSameOpcode(E->Scalars);
 
   switch (Opcode) {
     case Instruction::PHI: {
@@ -1649,12 +1829,52 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       E->VectorizedValue = S;
       return propagateMetadata(S, E->Scalars);
     }
+    case Instruction::GetElementPtr: {
+      setInsertPointAfterBundle(E->Scalars);
+
+      ValueList Op0VL;
+      for (int i = 0, e = E->Scalars.size(); i < e; ++i)
+        Op0VL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(0));
+
+      Value *Op0 = vectorizeTree(Op0VL);
+
+      std::vector<Value *> OpVecs;
+      for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
+           ++j) {
+        ValueList OpVL;
+        for (int i = 0, e = E->Scalars.size(); i < e; ++i)
+          OpVL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(j));
+
+        Value *OpVec = vectorizeTree(OpVL);
+        OpVecs.push_back(OpVec);
+      }
+
+      Value *V = Builder.CreateGEP(Op0, OpVecs);
+      E->VectorizedValue = V;
+
+      if (Instruction *I = dyn_cast<Instruction>(V))
+        return propagateMetadata(I, E->Scalars);
+
+      return V;
+    }
     case Instruction::Call: {
       CallInst *CI = cast<CallInst>(VL0);
       setInsertPointAfterBundle(E->Scalars);
+      Function *FI;
+      Intrinsic::ID IID = Intrinsic::not_intrinsic;
+      if (CI && (FI = CI->getCalledFunction())) {
+        IID = (Intrinsic::ID) FI->getIntrinsicID();
+      }
       std::vector<Value *> OpVecs;
       for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
         ValueList OpVL;
+        // ctlz,cttz and powi are special intrinsics whose second argument is
+        // a scalar. This argument should not be vectorized.
+        if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
+          CallInst *CEI = cast<CallInst>(E->Scalars[0]);
+          OpVecs.push_back(CEI->getArgOperand(j));
+          continue;
+        }
         for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
           CallInst *CEI = cast<CallInst>(E->Scalars[i]);
           OpVL.push_back(CEI->getArgOperand(j));
@@ -1673,6 +1893,49 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       E->VectorizedValue = V;
       return V;
     }
+    case Instruction::ShuffleVector: {
+      ValueList LHSVL, RHSVL;
+      for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
+        LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
+        RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
+      }
+      setInsertPointAfterBundle(E->Scalars);
+
+      Value *LHS = vectorizeTree(LHSVL);
+      Value *RHS = vectorizeTree(RHSVL);
+
+      if (Value *V = alreadyVectorized(E->Scalars))
+        return V;
+
+      // Create a vector of LHS op1 RHS
+      BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
+      Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);
+
+      // Create a vector of LHS op2 RHS
+      Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
+      BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
+      Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
+
+      // Create appropriate shuffle to take alternative operations from
+      // the vector.
+      std::vector<Constant *> Mask(E->Scalars.size());
+      unsigned e = E->Scalars.size();
+      for (unsigned i = 0; i < e; ++i) {
+        if (i & 1)
+          Mask[i] = Builder.getInt32(e + i);
+        else
+          Mask[i] = Builder.getInt32(i);
+      }
+
+      Value *ShuffleMask = ConstantVector::get(Mask);
+
+      Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
+      E->VectorizedValue = V;
+      if (Instruction *I = dyn_cast<Instruction>(V))
+        return propagateMetadata(I, E->Scalars);
+
+      return V;
+    }
     default:
       llvm_unreachable("unknown inst");
   }
@@ -1741,7 +2004,6 @@ Value *BoUpSLP::vectorizeTree() {
     // For each lane:
     for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
       Value *Scalar = Entry->Scalars[Lane];
-
       // No need to handle users of gathered values.
       if (Entry->NeedToGather)
         continue;
@@ -1925,7 +2187,6 @@ struct SLPVectorizer : public FunctionPass {
     for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()),
          e = po_end(&F.getEntryBlock()); it != e; ++it) {
       BasicBlock *BB = *it;
-
       // Vectorize trees that end at stores.
       if (unsigned count = collectStores(BB, R)) {
         (void)count;
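
Note: the SLPVectorizer.cpp side teaches getSameOpcode to classify a bundle of
alternating add/sub (or fadd/fsub) instructions as Instruction::ShuffleVector;
vectorizeTree then emits one vector op per opcode and blends the two results
with an alternating mask, costed through TargetTransformInfo::SK_Alternate.
A sketch of source that yields such a bundle (the function itself is
illustrative, not from the patch):

    void addsub(float *r, const float *a, const float *b) {
      r[0] = a[0] + b[0]; // even lanes taken from the vector fadd
      r[1] = a[1] - b[1]; // odd lanes taken from the vector fsub
      r[2] = a[2] + b[2];
      r[3] = a[3] - b[3];
    }

For four lanes the mask built in the ShuffleVector case is <0, 5, 2, 7>,
selecting even elements from the add result and odd elements from the sub
result, which is the shape that x86 targets can lower to addsubps.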