aboutsummaryrefslogtreecommitdiffstats
path: root/lib/Transforms/Vectorize
diff options
context:
space:
mode:
authorStephen Hines <srhines@google.com>2014-07-21 00:45:20 -0700
committerStephen Hines <srhines@google.com>2014-07-21 00:45:20 -0700
commitc6a4f5e819217e1e12c458aed8e7b122e23a3a58 (patch)
tree81b7dd2bb4370a392f31d332a566c903b5744764 /lib/Transforms/Vectorize
parent19c6fbb3e8aaf74093afa08013134b61fa08f245 (diff)
downloadexternal_llvm-c6a4f5e819217e1e12c458aed8e7b122e23a3a58.zip
external_llvm-c6a4f5e819217e1e12c458aed8e7b122e23a3a58.tar.gz
external_llvm-c6a4f5e819217e1e12c458aed8e7b122e23a3a58.tar.bz2
Update LLVM for rebase to r212749.
Includes a cherry-pick of: r212948 - fixes a small issue with atomic calls Change-Id: Ib97bd980b59f18142a69506400911a6009d9df18
Diffstat (limited to 'lib/Transforms/Vectorize')
-rw-r--r--lib/Transforms/Vectorize/LoopVectorize.cpp373
-rw-r--r--lib/Transforms/Vectorize/SLPVectorizer.cpp297
2 files changed, 587 insertions, 83 deletions
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 34d8a10..cb8a41d 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -209,6 +209,29 @@ namespace {
class LoopVectorizationLegality;
class LoopVectorizationCostModel;
+/// Optimization analysis message produced during vectorization. Messages inform
+/// the user why vectorization did not occur.
+class Report {
+ std::string Message;
+ raw_string_ostream Out;
+ Instruction *Instr;
+
+public:
+ Report(Instruction *I = nullptr) : Out(Message), Instr(I) {
+ Out << "loop not vectorized: ";
+ }
+
+ template <typename A> Report &operator<<(const A &Value) {
+ Out << Value;
+ return *this;
+ }
+
+ Instruction *getInstr() { return Instr; }
+
+ std::string &str() { return Out.str(); }
+ operator Twine() { return Out.str(); }
+};
+
/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
@@ -515,10 +538,12 @@ public:
unsigned NumPredStores;
LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, const DataLayout *DL,
- DominatorTree *DT, TargetLibraryInfo *TLI)
+ DominatorTree *DT, TargetLibraryInfo *TLI,
+ Function *F)
: NumLoads(0), NumStores(0), NumPredStores(0), TheLoop(L), SE(SE), DL(DL),
- DT(DT), TLI(TLI), Induction(nullptr), WidestIndTy(nullptr),
- HasFunNoNaNAttr(false), MaxSafeDepDistBytes(-1U) {}
+ DT(DT), TLI(TLI), TheFunction(F), Induction(nullptr),
+ WidestIndTy(nullptr), HasFunNoNaNAttr(false), MaxSafeDepDistBytes(-1U) {
+ }
/// This enum represents the kinds of reductions that we support.
enum ReductionKind {
@@ -747,6 +772,16 @@ private:
/// invariant.
void collectStridedAcccess(Value *LoadOrStoreInst);
+ /// Report an analysis message to assist the user in diagnosing loops that are
+ /// not vectorized.
+ void emitAnalysis(Report &Message) {
+ DebugLoc DL = TheLoop->getStartLoc();
+ if (Instruction *I = Message.getInstr())
+ DL = I->getDebugLoc();
+ emitOptimizationRemarkAnalysis(TheFunction->getContext(), DEBUG_TYPE,
+ *TheFunction, DL, Message.str());
+ }
+
/// The loop that we evaluate.
Loop *TheLoop;
/// Scev analysis.
@@ -757,6 +792,8 @@ private:
DominatorTree *DT;
/// Target Library Info.
TargetLibraryInfo *TLI;
+ /// Parent function
+ Function *TheFunction;
// --- vectorization state --- //
@@ -906,7 +943,7 @@ public:
}
/// Return the loop vectorizer metadata prefix.
- static StringRef Prefix() { return "llvm.vectorizer."; }
+ static StringRef Prefix() { return "llvm.loop.vectorize."; }
MDNode *createHint(LLVMContext &Context, StringRef Name, unsigned V) const {
SmallVector<Value*, 2> Vals;
@@ -942,6 +979,29 @@ public:
LoopID = NewLoopID;
}
+ std::string emitRemark() const {
+ Report R;
+ R << "vectorization ";
+ switch (Force) {
+ case LoopVectorizeHints::FK_Disabled:
+ R << "is explicitly disabled";
+ break;
+ case LoopVectorizeHints::FK_Enabled:
+ R << "is explicitly enabled";
+ if (Width != 0 && Unroll != 0)
+ R << " with width " << Width << " and interleave count " << Unroll;
+ else if (Width != 0)
+ R << " with width " << Width;
+ else if (Unroll != 0)
+ R << " with interleave count " << Unroll;
+ break;
+ case LoopVectorizeHints::FK_Undefined:
+ R << "was not specified";
+ break;
+ }
+ return R.str();
+ }
+
unsigned getWidth() const { return Width; }
unsigned getUnroll() const { return Unroll; }
enum ForceKind getForce() const { return Force; }
@@ -1125,18 +1185,37 @@ struct LoopVectorize : public FunctionPass {
: "?")) << " width=" << Hints.getWidth()
<< " unroll=" << Hints.getUnroll() << "\n");
+ // Function containing loop
+ Function *F = L->getHeader()->getParent();
+
+ // Looking at the diagnostic output is the only way to determine if a loop
+ // was vectorized (other than looking at the IR or machine code), so it
+ // is important to generate an optimization remark for each loop. Most of
+ // these messages are generated by emitOptimizationRemarkAnalysis. Remarks
+ // generated by emitOptimizationRemark and emitOptimizationRemarkMissed are
+ // less verbose reporting vectorized loops and unvectorized loops that may
+ // benefit from vectorization, respectively.
+
if (Hints.getForce() == LoopVectorizeHints::FK_Disabled) {
DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
+ emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F,
+ L->getStartLoc(), Hints.emitRemark());
return false;
}
if (!AlwaysVectorize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) {
DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
+ emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F,
+ L->getStartLoc(), Hints.emitRemark());
return false;
}
if (Hints.getWidth() == 1 && Hints.getUnroll() == 1) {
DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
+ emitOptimizationRemarkAnalysis(
+ F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
+ "loop not vectorized: vector width and interleave count are "
+ "explicitly set to 1");
return false;
}
@@ -1151,14 +1230,19 @@ struct LoopVectorize : public FunctionPass {
DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
else {
DEBUG(dbgs() << "\n");
+ emitOptimizationRemarkAnalysis(
+ F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
+ "vectorization is not beneficial and is not explicitly forced");
return false;
}
}
// Check if it is legal to vectorize the loop.
- LoopVectorizationLegality LVL(L, SE, DL, DT, TLI);
+ LoopVectorizationLegality LVL(L, SE, DL, DT, TLI, F);
if (!LVL.canVectorize()) {
DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
+ emitOptimizationRemarkMissed(F->getContext(), DEBUG_TYPE, *F,
+ L->getStartLoc(), Hints.emitRemark());
return false;
}
@@ -1167,7 +1251,6 @@ struct LoopVectorize : public FunctionPass {
// Check the function attributes to find out if this function should be
// optimized for size.
- Function *F = L->getHeader()->getParent();
bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
F->hasFnAttribute(Attribute::OptimizeForSize);
@@ -1190,6 +1273,11 @@ struct LoopVectorize : public FunctionPass {
if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
"attribute is used.\n");
+ emitOptimizationRemarkAnalysis(
+ F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
+ "loop not vectorized due to NoImplicitFloat attribute");
+ emitOptimizationRemarkMissed(F->getContext(), DEBUG_TYPE, *F,
+ L->getStartLoc(), Hints.emitRemark());
return false;
}
@@ -1208,9 +1296,14 @@ struct LoopVectorize : public FunctionPass {
DEBUG(dbgs() << "LV: Unroll Factor is " << UF << '\n');
if (VF.Width == 1) {
- DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
- if (UF == 1)
+ DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial\n");
+
+ if (UF == 1) {
+ emitOptimizationRemarkAnalysis(
+ F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
+ "not beneficial to vectorize and user disabled interleaving");
return false;
+ }
DEBUG(dbgs() << "LV: Trying to at least unroll the loops.\n");
// Report the unrolling decision.
@@ -1220,6 +1313,7 @@ struct LoopVectorize : public FunctionPass {
" (vectorization not beneficial)"));
// We decided not to vectorize, but we may want to unroll.
+
InnerLoopUnroller Unroller(L, SE, LI, DT, DL, TLI, UF);
Unroller.vectorize(&LVL);
} else {
@@ -1909,20 +2003,23 @@ void InnerLoopVectorizer::createEmptyLoop() {
the vectorized instructions while the old loop will continue to run the
scalar remainder.
- [ ] <-- vector loop bypass (may consist of multiple blocks).
- / |
- / v
- | [ ] <-- vector pre header.
- | |
- | v
- | [ ] \
- | [ ]_| <-- vector loop.
- | |
- \ v
- >[ ] <--- middle-block.
- / |
- / v
- | [ ] <--- new preheader.
+ [ ] <-- Back-edge taken count overflow check.
+ / |
+ / v
+ | [ ] <-- vector loop bypass (may consist of multiple blocks).
+ | / |
+ | / v
+ || [ ] <-- vector pre header.
+ || |
+ || v
+ || [ ] \
+ || [ ]_| <-- vector loop.
+ || |
+ | \ v
+ | >[ ] <--- middle-block.
+ | / |
+ | / v
+ -|- >[ ] <--- new preheader.
| |
| v
| [ ] \
@@ -1936,6 +2033,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
BasicBlock *OldBasicBlock = OrigLoop->getHeader();
BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
BasicBlock *ExitBlock = OrigLoop->getExitBlock();
+ assert(BypassBlock && "Invalid loop structure");
assert(ExitBlock && "Must have an exit block");
// Some loops have a single integer induction variable, while other loops
@@ -1958,18 +2056,30 @@ void InnerLoopVectorizer::createEmptyLoop() {
IdxTy->getPrimitiveSizeInBits())
ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy);
- ExitCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy);
+ const SCEV *BackedgeTakeCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy);
// Get the total trip count from the count by adding 1.
- ExitCount = SE->getAddExpr(ExitCount,
- SE->getConstant(ExitCount->getType(), 1));
+ ExitCount = SE->getAddExpr(BackedgeTakeCount,
+ SE->getConstant(BackedgeTakeCount->getType(), 1));
// Expand the trip count and place the new instructions in the preheader.
// Notice that the pre-header does not change, only the loop body.
SCEVExpander Exp(*SE, "induction");
- // Count holds the overall loop count (N).
- Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
- BypassBlock->getTerminator());
+ // We need to test whether the backedge-taken count is uint##_max. Adding one
+ // to it will cause overflow and an incorrect loop trip count in the vector
+ // body. In case of overflow we want to directly jump to the scalar remainder
+ // loop.
+ Value *BackedgeCount =
+ Exp.expandCodeFor(BackedgeTakeCount, BackedgeTakeCount->getType(),
+ BypassBlock->getTerminator());
+ if (BackedgeCount->getType()->isPointerTy())
+ BackedgeCount = CastInst::CreatePointerCast(BackedgeCount, IdxTy,
+ "backedge.ptrcnt.to.int",
+ BypassBlock->getTerminator());
+ Instruction *CheckBCOverflow =
+ CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, BackedgeCount,
+ Constant::getAllOnesValue(BackedgeCount->getType()),
+ "backedge.overflow", BypassBlock->getTerminator());
// The loop index does not have to start at Zero. Find the original start
// value from the induction PHI node. If we don't have an induction variable
@@ -1980,7 +2090,18 @@ void InnerLoopVectorizer::createEmptyLoop() {
IdxTy):
ConstantInt::get(IdxTy, 0);
- assert(BypassBlock && "Invalid loop structure");
+ // We need an instruction to anchor the overflow check on. StartIdx needs to
+ // be defined before the overflow check branch. Because the scalar preheader
+ // is going to merge the start index and so the overflow branch block needs to
+ // contain a definition of the start index.
+ Instruction *OverflowCheckAnchor = BinaryOperator::CreateAdd(
+ StartIdx, ConstantInt::get(IdxTy, 0), "overflow.check.anchor",
+ BypassBlock->getTerminator());
+
+ // Count holds the overall loop count (N).
+ Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
+ BypassBlock->getTerminator());
+
LoopBypassBlocks.push_back(BypassBlock);
// Split the single block loop into the two loop structure described above.
@@ -2049,29 +2170,45 @@ void InnerLoopVectorizer::createEmptyLoop() {
// Now, compare the new count to zero. If it is zero skip the vector loop and
// jump to the scalar loop.
- Value *Cmp = BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx,
- "cmp.zero");
+ Value *Cmp =
+ BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, "cmp.zero");
BasicBlock *LastBypassBlock = BypassBlock;
+ // Generate code to check that the loops trip count that we computed by adding
+ // one to the backedge-taken count will not overflow.
+ {
+ auto PastOverflowCheck =
+ std::next(BasicBlock::iterator(OverflowCheckAnchor));
+ BasicBlock *CheckBlock =
+ LastBypassBlock->splitBasicBlock(PastOverflowCheck, "overflow.checked");
+ if (ParentLoop)
+ ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase());
+ LoopBypassBlocks.push_back(CheckBlock);
+ Instruction *OldTerm = LastBypassBlock->getTerminator();
+ BranchInst::Create(ScalarPH, CheckBlock, CheckBCOverflow, OldTerm);
+ OldTerm->eraseFromParent();
+ LastBypassBlock = CheckBlock;
+ }
+
// Generate the code to check that the strides we assumed to be one are really
// one. We want the new basic block to start at the first instruction in a
// sequence of instructions that form a check.
Instruction *StrideCheck;
Instruction *FirstCheckInst;
std::tie(FirstCheckInst, StrideCheck) =
- addStrideCheck(BypassBlock->getTerminator());
+ addStrideCheck(LastBypassBlock->getTerminator());
if (StrideCheck) {
// Create a new block containing the stride check.
BasicBlock *CheckBlock =
- BypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck");
+ LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck");
if (ParentLoop)
ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase());
LoopBypassBlocks.push_back(CheckBlock);
// Replace the branch into the memory check block with a conditional branch
// for the "few elements case".
- Instruction *OldTerm = BypassBlock->getTerminator();
+ Instruction *OldTerm = LastBypassBlock->getTerminator();
BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm);
OldTerm->eraseFromParent();
@@ -2134,6 +2271,19 @@ void InnerLoopVectorizer::createEmptyLoop() {
PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val",
MiddleBlock->getTerminator()) : nullptr;
+ // Create phi nodes to merge from the backedge-taken check block.
+ PHINode *BCResumeVal = PHINode::Create(ResumeValTy, 3, "bc.resume.val",
+ ScalarPH->getTerminator());
+ BCResumeVal->addIncoming(ResumeVal, MiddleBlock);
+
+ PHINode *BCTruncResumeVal = nullptr;
+ if (OrigPhi == OldInduction) {
+ BCTruncResumeVal =
+ PHINode::Create(OrigPhi->getType(), 2, "bc.trunc.resume.val",
+ ScalarPH->getTerminator());
+ BCTruncResumeVal->addIncoming(TruncResumeVal, MiddleBlock);
+ }
+
Value *EndValue = nullptr;
switch (II.IK) {
case LoopVectorizationLegality::IK_NoInduction:
@@ -2150,10 +2300,12 @@ void InnerLoopVectorizer::createEmptyLoop() {
BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType());
// The new PHI merges the original incoming value, in case of a bypass,
// or the value at the end of the vectorized loop.
- for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+ for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
TruncResumeVal->addIncoming(EndValue, VecBody);
+ BCTruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]);
+
// We know what the end value is.
EndValue = IdxEndRoundDown;
// We also know which PHI node holds it.
@@ -2199,7 +2351,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
// The new PHI merges the original incoming value, in case of a bypass,
// or the value at the end of the vectorized loop.
- for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) {
+ for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) {
if (OrigPhi == OldInduction)
ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]);
else
@@ -2209,11 +2361,16 @@ void InnerLoopVectorizer::createEmptyLoop() {
// Fix the scalar body counter (PHI node).
unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
- // The old inductions phi node in the scalar body needs the truncated value.
- if (OrigPhi == OldInduction)
- OrigPhi->setIncomingValue(BlockIdx, TruncResumeVal);
- else
- OrigPhi->setIncomingValue(BlockIdx, ResumeVal);
+
+ // The old induction's phi node in the scalar body needs the truncated
+ // value.
+ if (OrigPhi == OldInduction) {
+ BCResumeVal->addIncoming(StartIdx, LoopBypassBlocks[0]);
+ OrigPhi->setIncomingValue(BlockIdx, BCTruncResumeVal);
+ } else {
+ BCResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]);
+ OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
+ }
}
// If we are generating a new induction variable then we also need to
@@ -2224,7 +2381,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
assert(!ResumeIndex && "Unexpected resume value found");
ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
MiddleBlock->getTerminator());
- for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+ for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]);
ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
}
@@ -2494,7 +2651,7 @@ void InnerLoopVectorizer::vectorizeLoop() {
// To do so, we need to generate the 'identity' vector and override
// one of the elements with the incoming scalar reduction. We need
// to do it in the vector-loop preheader.
- Builder.SetInsertPoint(LoopBypassBlocks.front()->getTerminator());
+ Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator());
// This is the vector-clone of the value that leaves the loop.
VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
@@ -2568,7 +2725,7 @@ void InnerLoopVectorizer::vectorizeLoop() {
VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr);
PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
Value *StartVal = (part == 0) ? VectorStart : Identity;
- for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+ for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
NewPhi->addIncoming(RdxExitVal[part],
LoopVectorBody.back());
@@ -2626,6 +2783,13 @@ void InnerLoopVectorizer::vectorizeLoop() {
Builder.getInt32(0));
}
+ // Create a phi node that merges control-flow from the backedge-taken check
+ // block and the middle block.
+ PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx",
+ LoopScalarPreHeader->getTerminator());
+ BCBlockPhi->addIncoming(RdxDesc.StartValue, LoopBypassBlocks[0]);
+ BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
+
// Now, we need to fix the users of the reduction variable
// inside and outside of the scalar remainder loop.
// We know that the loop is in LCSSA form. We need to update the
@@ -2655,7 +2819,7 @@ void InnerLoopVectorizer::vectorizeLoop() {
assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
// Pick the other block.
int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
- (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, ReducedPartRdx);
+ (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
(RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
}// end of for each redux variable.
@@ -3062,9 +3226,14 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
scalarizeInstruction(it);
break;
default:
+ bool HasScalarOpd = hasVectorInstrinsicScalarOpd(ID, 1);
for (unsigned Part = 0; Part < UF; ++Part) {
SmallVector<Value *, 4> Args;
for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
+ if (HasScalarOpd && i == 1) {
+ Args.push_back(CI->getArgOperand(i));
+ continue;
+ }
VectorParts &Arg = getVectorValue(CI->getArgOperand(i));
Args.push_back(Arg[Part]);
}
@@ -3112,8 +3281,8 @@ void InnerLoopVectorizer::updateAnalysis() {
}
}
- DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks.front());
- DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock);
+ DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks[1]);
+ DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
@@ -3138,8 +3307,10 @@ static bool canIfConvertPHINodes(BasicBlock *BB) {
}
bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
- if (!EnableIfConversion)
+ if (!EnableIfConversion) {
+ emitAnalysis(Report() << "if-conversion is disabled");
return false;
+ }
assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
@@ -3169,16 +3340,24 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
BasicBlock *BB = *BI;
// We don't support switch statements inside loops.
- if (!isa<BranchInst>(BB->getTerminator()))
+ if (!isa<BranchInst>(BB->getTerminator())) {
+ emitAnalysis(Report(BB->getTerminator())
+ << "loop contains a switch statement");
return false;
+ }
// We must be able to predicate all blocks that need to be predicated.
if (blockNeedsPredication(BB)) {
- if (!blockCanBePredicated(BB, SafePointes))
+ if (!blockCanBePredicated(BB, SafePointes)) {
+ emitAnalysis(Report(BB->getTerminator())
+ << "control flow cannot be substituted for a select");
return false;
- } else if (BB != Header && !canIfConvertPHINodes(BB))
+ }
+ } else if (BB != Header && !canIfConvertPHINodes(BB)) {
+ emitAnalysis(Report(BB->getTerminator())
+ << "control flow cannot be substituted for a select");
return false;
-
+ }
}
// We can if-convert this loop.
@@ -3188,20 +3367,31 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
bool LoopVectorizationLegality::canVectorize() {
// We must have a loop in canonical form. Loops with indirectbr in them cannot
// be canonicalized.
- if (!TheLoop->getLoopPreheader())
+ if (!TheLoop->getLoopPreheader()) {
+ emitAnalysis(
+ Report() << "loop control flow is not understood by vectorizer");
return false;
+ }
// We can only vectorize innermost loops.
- if (TheLoop->getSubLoopsVector().size())
+ if (TheLoop->getSubLoopsVector().size()) {
+ emitAnalysis(Report() << "loop is not the innermost loop");
return false;
+ }
// We must have a single backedge.
- if (TheLoop->getNumBackEdges() != 1)
+ if (TheLoop->getNumBackEdges() != 1) {
+ emitAnalysis(
+ Report() << "loop control flow is not understood by vectorizer");
return false;
+ }
// We must have a single exiting block.
- if (!TheLoop->getExitingBlock())
+ if (!TheLoop->getExitingBlock()) {
+ emitAnalysis(
+ Report() << "loop control flow is not understood by vectorizer");
return false;
+ }
// We need to have a loop header.
DEBUG(dbgs() << "LV: Found a loop: " <<
@@ -3217,6 +3407,7 @@ bool LoopVectorizationLegality::canVectorize() {
// ScalarEvolution needs to be able to find the exit count.
const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop);
if (ExitCount == SE->getCouldNotCompute()) {
+ emitAnalysis(Report() << "could not determine number of loop iterations");
DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
return false;
}
@@ -3310,6 +3501,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
if (!PhiTy->isIntegerTy() &&
!PhiTy->isFloatingPointTy() &&
!PhiTy->isPointerTy()) {
+ emitAnalysis(Report(it)
+ << "loop control flow is not understood by vectorizer");
DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
return false;
}
@@ -3320,13 +3513,17 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
if (*bb != Header) {
// Check that this instruction has no outside users or is an
// identified reduction value with an outside user.
- if(!hasOutsideLoopUser(TheLoop, it, AllowedExit))
+ if (!hasOutsideLoopUser(TheLoop, it, AllowedExit))
continue;
+ emitAnalysis(Report(it) << "value that could not be identified as "
+ "reduction is used outside the loop");
return false;
}
// We only allow if-converted PHIs with more than two incoming values.
if (Phi->getNumIncomingValues() != 2) {
+ emitAnalysis(Report(it)
+ << "control flow not understood by vectorizer");
DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
return false;
}
@@ -3357,8 +3554,11 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// Until we explicitly handle the case of an induction variable with
// an outside loop user we have to give up vectorizing this loop.
- if (hasOutsideLoopUser(TheLoop, it, AllowedExit))
+ if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) {
+ emitAnalysis(Report(it) << "use of induction value outside of the "
+ "loop is not handled by vectorizer");
return false;
+ }
continue;
}
@@ -3401,6 +3601,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
continue;
}
+ emitAnalysis(Report(it) << "unvectorizable operation");
DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
return false;
}// end of PHI handling
@@ -3409,14 +3610,29 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// calls and we do handle certain intrinsic and libm functions.
CallInst *CI = dyn_cast<CallInst>(it);
if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI)) {
+ emitAnalysis(Report(it) << "call instruction cannot be vectorized");
DEBUG(dbgs() << "LV: Found a call site.\n");
return false;
}
+ // Intrinsics such as powi,cttz and ctlz are legal to vectorize if the
+ // second argument is the same (i.e. loop invariant)
+ if (CI &&
+ hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) {
+ if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) {
+ emitAnalysis(Report(it)
+ << "intrinsic instruction cannot be vectorized");
+ DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n");
+ return false;
+ }
+ }
+
// Check that the instruction return type is vectorizable.
// Also, we can't vectorize extractelement instructions.
if ((!VectorType::isValidElementType(it->getType()) &&
!it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) {
+ emitAnalysis(Report(it)
+ << "instruction return type cannot be vectorized");
DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
return false;
}
@@ -3424,8 +3640,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// Check that the stored type is vectorizable.
if (StoreInst *ST = dyn_cast<StoreInst>(it)) {
Type *T = ST->getValueOperand()->getType();
- if (!VectorType::isValidElementType(T))
+ if (!VectorType::isValidElementType(T)) {
+ emitAnalysis(Report(ST) << "store instruction cannot be vectorized");
return false;
+ }
if (EnableMemAccessVersioning)
collectStridedAcccess(ST);
}
@@ -3436,8 +3654,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// Reduction instructions are allowed to have exit users.
// All other instructions must not have external users.
- if (hasOutsideLoopUser(TheLoop, it, AllowedExit))
+ if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) {
+ emitAnalysis(Report(it) << "value cannot be used outside the loop");
return false;
+ }
} // next instr.
@@ -3445,8 +3665,11 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
if (!Induction) {
DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
- if (Inductions.empty())
+ if (Inductions.empty()) {
+ emitAnalysis(Report()
+ << "loop induction variable could not be identified");
return false;
+ }
}
return true;
@@ -4353,8 +4576,9 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
continue;
LoadInst *Ld = dyn_cast<LoadInst>(it);
- if (!Ld) return false;
- if (!Ld->isSimple() && !IsAnnotatedParallel) {
+ if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) {
+ emitAnalysis(Report(Ld)
+ << "read with atomic ordering or volatile read");
DEBUG(dbgs() << "LV: Found a non-simple load.\n");
return false;
}
@@ -4367,8 +4591,13 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
// Save 'store' instructions. Abort if other instructions write to memory.
if (it->mayWriteToMemory()) {
StoreInst *St = dyn_cast<StoreInst>(it);
- if (!St) return false;
+ if (!St) {
+ emitAnalysis(Report(it) << "instruction cannot be vectorized");
+ return false;
+ }
if (!St->isSimple() && !IsAnnotatedParallel) {
+ emitAnalysis(Report(St)
+ << "write with atomic ordering or volatile write");
DEBUG(dbgs() << "LV: Found a non-simple store.\n");
return false;
}
@@ -4405,6 +4634,9 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
Value* Ptr = ST->getPointerOperand();
if (isUniform(Ptr)) {
+ emitAnalysis(
+ Report(ST)
+ << "write to a loop invariant address could not be vectorized");
DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
return false;
}
@@ -4483,6 +4715,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
}
if (NeedRTCheck && !CanDoRT) {
+ emitAnalysis(Report() << "cannot identify array bounds");
DEBUG(dbgs() << "LV: We can't vectorize because we can't find " <<
"the array bounds.\n");
PtrRtCheck.reset();
@@ -4513,6 +4746,14 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
// Check that we did not collect too many pointers or found an unsizeable
// pointer.
if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
+ if (!CanDoRT && NumComparisons > 0)
+ emitAnalysis(Report()
+ << "cannot check memory dependencies at runtime");
+ else
+ emitAnalysis(Report()
+ << NumComparisons << " exceeds limit of "
+ << RuntimeMemoryCheckThreshold
+ << " dependent memory operations checked at runtime");
DEBUG(dbgs() << "LV: Can't vectorize with memory checks\n");
PtrRtCheck.reset();
return false;
@@ -4522,6 +4763,9 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
}
}
+ if (!CanVecMem)
+ emitAnalysis(Report() << "unsafe dependent memory operations in loop");
+
DEBUG(dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") <<
" need a runtime memory check.\n");
@@ -5774,4 +6018,3 @@ Value *InnerLoopUnroller::getConsecutiveVector(Value* Val, int StartIdx,
Constant *C = ConstantInt::get(ITy, StartIdx, Negate);
return Builder.CreateAdd(Val, C, "induction");
}
-
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e13ba95..53a43d9 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -149,6 +149,48 @@ static bool isSplat(ArrayRef<Value *> VL) {
return true;
}
+///\returns Opcode that can be clubbed with \p Op to create an alternate
+/// sequence which can later be merged as a ShuffleVector instruction.
+static unsigned getAltOpcode(unsigned Op) {
+ switch (Op) {
+ case Instruction::FAdd:
+ return Instruction::FSub;
+ case Instruction::FSub:
+ return Instruction::FAdd;
+ case Instruction::Add:
+ return Instruction::Sub;
+ case Instruction::Sub:
+ return Instruction::Add;
+ default:
+ return 0;
+ }
+}
+
+///\returns bool representing if Opcode \p Op can be part
+/// of an alternate sequence which can later be merged as
+/// a ShuffleVector instruction.
+static bool canCombineAsAltInst(unsigned Op) {
+ if (Op == Instruction::FAdd || Op == Instruction::FSub ||
+ Op == Instruction::Sub || Op == Instruction::Add)
+ return true;
+ return false;
+}
+
+/// \returns ShuffleVector instruction if intructions in \p VL have
+/// alternate fadd,fsub / fsub,fadd/add,sub/sub,add sequence.
+/// (i.e. e.g. opcodes of fadd,fsub,fadd,fsub...)
+static unsigned isAltInst(ArrayRef<Value *> VL) {
+ Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+ unsigned Opcode = I0->getOpcode();
+ unsigned AltOpcode = getAltOpcode(Opcode);
+ for (int i = 1, e = VL.size(); i < e; i++) {
+ Instruction *I = dyn_cast<Instruction>(VL[i]);
+ if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
+ return 0;
+ }
+ return Instruction::ShuffleVector;
+}
+
/// \returns The opcode if all of the Instructions in \p VL have the same
/// opcode, or zero.
static unsigned getSameOpcode(ArrayRef<Value *> VL) {
@@ -158,8 +200,11 @@ static unsigned getSameOpcode(ArrayRef<Value *> VL) {
unsigned Opcode = I0->getOpcode();
for (int i = 1, e = VL.size(); i < e; i++) {
Instruction *I = dyn_cast<Instruction>(VL[i]);
- if (!I || Opcode != I->getOpcode())
+ if (!I || Opcode != I->getOpcode()) {
+ if (canCombineAsAltInst(Opcode) && i == 1)
+ return isAltInst(VL);
return 0;
+ }
}
return Opcode;
}
@@ -377,6 +422,7 @@ public:
/// \brief Perform LICM and CSE on the newly generated gather sequences.
void optimizeGatherSequence();
+
private:
struct TreeEntry;
@@ -594,6 +640,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
bool SameTy = getSameType(VL); (void)SameTy;
+ bool isAltShuffle = false;
assert(SameTy && "Invalid types!");
if (Depth == RecursionMaxDepth) {
@@ -615,10 +662,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
newTreeEntry(VL, false);
return;
}
+ unsigned Opcode = getSameOpcode(VL);
+
+ // Check that this shuffle vector refers to the alternate
+ // sequence of opcodes.
+ if (Opcode == Instruction::ShuffleVector) {
+ Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+ unsigned Op = I0->getOpcode();
+ if (Op != Instruction::ShuffleVector)
+ isAltShuffle = true;
+ }
// If all of the operands are identical or constant we have a simple solution.
- if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) ||
- !getSameOpcode(VL)) {
+ if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) {
DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
newTreeEntry(VL, false);
return;
@@ -754,8 +810,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
- unsigned Opcode = getSameOpcode(VL);
-
// Check if it is safe to sink the loads or the stores.
if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
Instruction *Last = getLastInstruction(VL);
@@ -914,8 +968,20 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
ValueList Left, Right;
reorderInputsAccordingToOpcode(VL, Left, Right);
- buildTree_rec(Left, Depth + 1);
- buildTree_rec(Right, Depth + 1);
+ BasicBlock *LeftBB = getSameBlock(Left);
+ BasicBlock *RightBB = getSameBlock(Right);
+ // If we have common uses on separate paths in the tree make sure we
+ // process the one with greater common depth first.
+ // We can use block numbering to determine the subtree traversal as
+ // earler user has to come in between the common use and the later user.
+ if (LeftBB && RightBB && LeftBB == RightBB &&
+ getLastIndex(Right) > getLastIndex(Left)) {
+ buildTree_rec(Right, Depth + 1);
+ buildTree_rec(Left, Depth + 1);
+ } else {
+ buildTree_rec(Left, Depth + 1);
+ buildTree_rec(Right, Depth + 1);
+ }
return;
}
@@ -929,6 +995,51 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
}
return;
}
+ case Instruction::GetElementPtr: {
+ // We don't combine GEPs with complicated (nested) indexing.
+ for (unsigned j = 0; j < VL.size(); ++j) {
+ if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
+ DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
+ newTreeEntry(VL, false);
+ return;
+ }
+ }
+
+ // We can't combine several GEPs into one vector if they operate on
+ // different types.
+ Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
+ for (unsigned j = 0; j < VL.size(); ++j) {
+ Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
+ if (Ty0 != CurTy) {
+ DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
+ newTreeEntry(VL, false);
+ return;
+ }
+ }
+
+ // We don't combine GEPs with non-constant indexes.
+ for (unsigned j = 0; j < VL.size(); ++j) {
+ auto Op = cast<Instruction>(VL[j])->getOperand(1);
+ if (!isa<ConstantInt>(Op)) {
+ DEBUG(
+ dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
+ newTreeEntry(VL, false);
+ return;
+ }
+ }
+
+ newTreeEntry(VL, true);
+ DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
+ for (unsigned i = 0, e = 2; i < e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (unsigned j = 0; j < VL.size(); ++j)
+ Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+ buildTree_rec(Operands, Depth + 1);
+ }
+ return;
+ }
case Instruction::Store: {
// Check if the stores are consecutive or of we need to swizzle them.
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
@@ -961,9 +1072,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
return;
}
-
Function *Int = CI->getCalledFunction();
-
+ Value *A1I = nullptr;
+ if (hasVectorInstrinsicScalarOpd(ID, 1))
+ A1I = CI->getArgOperand(1);
for (unsigned i = 1, e = VL.size(); i != e; ++i) {
CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
if (!CI2 || CI2->getCalledFunction() != Int ||
@@ -973,6 +1085,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
<< "\n");
return;
}
+ // ctlz,cttz and powi are special intrinsics whose second argument
+ // should be same in order for them to be vectorized.
+ if (hasVectorInstrinsicScalarOpd(ID, 1)) {
+ Value *A1J = CI2->getArgOperand(1);
+ if (A1I != A1J) {
+ newTreeEntry(VL, false);
+ DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
+ << " argument "<< A1I<<"!=" << A1J
+ << "\n");
+ return;
+ }
+ }
}
newTreeEntry(VL, true);
@@ -987,6 +1111,26 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
}
return;
}
+ case Instruction::ShuffleVector: {
+ // If this is not an alternate sequence of opcode like add-sub
+ // then do not vectorize this instruction.
+ if (!isAltShuffle) {
+ newTreeEntry(VL, false);
+ DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
+ return;
+ }
+ newTreeEntry(VL, true);
+ DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
+ for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (unsigned j = 0; j < VL.size(); ++j)
+ Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+ buildTree_rec(Operands, Depth + 1);
+ }
+ return;
+ }
default:
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
@@ -1010,11 +1154,9 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
}
return getGatherCost(E->Scalars);
}
-
- assert(getSameOpcode(VL) && getSameType(VL) && getSameBlock(VL) &&
- "Invalid VL");
+ unsigned Opcode = getSameOpcode(VL);
+ assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL");
Instruction *VL0 = cast<Instruction>(VL[0]);
- unsigned Opcode = VL0->getOpcode();
switch (Opcode) {
case Instruction::PHI: {
return 0;
@@ -1121,6 +1263,20 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
}
return VecCost - ScalarCost;
}
+ case Instruction::GetElementPtr: {
+ TargetTransformInfo::OperandValueKind Op1VK =
+ TargetTransformInfo::OK_AnyValue;
+ TargetTransformInfo::OperandValueKind Op2VK =
+ TargetTransformInfo::OK_UniformConstantValue;
+
+ int ScalarCost =
+ VecTy->getNumElements() *
+ TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
+ int VecCost =
+ TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
+
+ return VecCost - ScalarCost;
+ }
case Instruction::Load: {
// Cost of wide load - cost of scalar loads.
int ScalarLdCost = VecTy->getNumElements() *
@@ -1158,6 +1314,32 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
return VecCallCost - ScalarCallCost;
}
+ case Instruction::ShuffleVector: {
+ TargetTransformInfo::OperandValueKind Op1VK =
+ TargetTransformInfo::OK_AnyValue;
+ TargetTransformInfo::OperandValueKind Op2VK =
+ TargetTransformInfo::OK_AnyValue;
+ int ScalarCost = 0;
+ int VecCost = 0;
+ for (unsigned i = 0; i < VL.size(); ++i) {
+ Instruction *I = cast<Instruction>(VL[i]);
+ if (!I)
+ break;
+ ScalarCost +=
+ TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
+ }
+ // VecCost is equal to sum of the cost of creating 2 vectors
+ // and the cost of creating shuffle.
+ Instruction *I0 = cast<Instruction>(VL[0]);
+ VecCost =
+ TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
+ Instruction *I1 = cast<Instruction>(VL[1]);
+ VecCost +=
+ TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
+ VecCost +=
+ TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
+ return VecCost - ScalarCost;
+ }
default:
llvm_unreachable("Unknown instruction");
}
@@ -1438,9 +1620,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
setInsertPointAfterBundle(E->Scalars);
return Gather(E->Scalars, VecTy);
}
-
- unsigned Opcode = VL0->getOpcode();
- assert(Opcode == getSameOpcode(E->Scalars) && "Invalid opcode");
+ unsigned Opcode = getSameOpcode(E->Scalars);
switch (Opcode) {
case Instruction::PHI: {
@@ -1649,12 +1829,52 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
E->VectorizedValue = S;
return propagateMetadata(S, E->Scalars);
}
+ case Instruction::GetElementPtr: {
+ setInsertPointAfterBundle(E->Scalars);
+
+ ValueList Op0VL;
+ for (int i = 0, e = E->Scalars.size(); i < e; ++i)
+ Op0VL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(0));
+
+ Value *Op0 = vectorizeTree(Op0VL);
+
+ std::vector<Value *> OpVecs;
+ for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
+ ++j) {
+ ValueList OpVL;
+ for (int i = 0, e = E->Scalars.size(); i < e; ++i)
+ OpVL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(j));
+
+ Value *OpVec = vectorizeTree(OpVL);
+ OpVecs.push_back(OpVec);
+ }
+
+ Value *V = Builder.CreateGEP(Op0, OpVecs);
+ E->VectorizedValue = V;
+
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ return propagateMetadata(I, E->Scalars);
+
+ return V;
+ }
case Instruction::Call: {
CallInst *CI = cast<CallInst>(VL0);
setInsertPointAfterBundle(E->Scalars);
+ Function *FI;
+ Intrinsic::ID IID = Intrinsic::not_intrinsic;
+ if (CI && (FI = CI->getCalledFunction())) {
+ IID = (Intrinsic::ID) FI->getIntrinsicID();
+ }
std::vector<Value *> OpVecs;
for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
ValueList OpVL;
+ // ctlz,cttz and powi are special intrinsics whose second argument is
+ // a scalar. This argument should not be vectorized.
+ if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
+ CallInst *CEI = cast<CallInst>(E->Scalars[0]);
+ OpVecs.push_back(CEI->getArgOperand(j));
+ continue;
+ }
for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
CallInst *CEI = cast<CallInst>(E->Scalars[i]);
OpVL.push_back(CEI->getArgOperand(j));
@@ -1673,6 +1893,49 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
E->VectorizedValue = V;
return V;
}
+ case Instruction::ShuffleVector: {
+ ValueList LHSVL, RHSVL;
+ for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
+ LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
+ RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
+ }
+ setInsertPointAfterBundle(E->Scalars);
+
+ Value *LHS = vectorizeTree(LHSVL);
+ Value *RHS = vectorizeTree(RHSVL);
+
+ if (Value *V = alreadyVectorized(E->Scalars))
+ return V;
+
+ // Create a vector of LHS op1 RHS
+ BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
+ Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);
+
+ // Create a vector of LHS op2 RHS
+ Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
+ BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
+ Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
+
+ // Create appropriate shuffle to take alternative operations from
+ // the vector.
+ std::vector<Constant *> Mask(E->Scalars.size());
+ unsigned e = E->Scalars.size();
+ for (unsigned i = 0; i < e; ++i) {
+ if (i & 1)
+ Mask[i] = Builder.getInt32(e + i);
+ else
+ Mask[i] = Builder.getInt32(i);
+ }
+
+ Value *ShuffleMask = ConstantVector::get(Mask);
+
+ Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
+ E->VectorizedValue = V;
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ return propagateMetadata(I, E->Scalars);
+
+ return V;
+ }
default:
llvm_unreachable("unknown inst");
}
@@ -1741,7 +2004,6 @@ Value *BoUpSLP::vectorizeTree() {
// For each lane:
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
Value *Scalar = Entry->Scalars[Lane];
-
// No need to handle users of gathered values.
if (Entry->NeedToGather)
continue;
@@ -1925,7 +2187,6 @@ struct SLPVectorizer : public FunctionPass {
for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()),
e = po_end(&F.getEntryBlock()); it != e; ++it) {
BasicBlock *BB = *it;
-
// Vectorize trees that end at stores.
if (unsigned count = collectStores(BB, R)) {
(void)count;