aboutsummaryrefslogtreecommitdiffstats
path: root/lib/Transforms/Vectorize
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Transforms/Vectorize')
-rw-r--r--lib/Transforms/Vectorize/BBVectorize.cpp4
-rw-r--r--lib/Transforms/Vectorize/LoopVectorize.cpp231
2 files changed, 178 insertions, 57 deletions
diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp
index 7636541..17900da 100644
--- a/lib/Transforms/Vectorize/BBVectorize.cpp
+++ b/lib/Transforms/Vectorize/BBVectorize.cpp
@@ -1771,7 +1771,7 @@ namespace {
size_t MaxDepth = DAG.lookup(IJ);
DEBUG(if (DebugPairSelection) dbgs() << "BBV: found DAG for pair {"
- << IJ.first << " <-> " << IJ.second << "} of depth " <<
+ << *IJ.first << " <-> " << *IJ.second << "} of depth " <<
MaxDepth << " and size " << DAG.size() << "\n");
// At this point the DAG has been constructed, but, may contain
@@ -2086,7 +2086,7 @@ namespace {
DEBUG(if (DebugPairSelection)
dbgs() << "BBV: found pruned DAG for pair {"
- << IJ.first << " <-> " << IJ.second << "} of depth " <<
+ << *IJ.first << " <-> " << *IJ.second << "} of depth " <<
MaxDepth << " and size " << PrunedDAG.size() <<
" (effective size: " << EffSize << ")\n");
if (((TTI && !UseChainDepthWithTI) ||
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index f489393..930d9c4 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -79,6 +79,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -115,6 +116,12 @@ static const unsigned TinyTripCountUnrollThreshold = 128;
/// number of pointers. Notice that the check is quadratic!
static const unsigned RuntimeMemoryCheckThreshold = 4;
+/// We use a metadata with this name to indicate that a scalar loop was
+/// vectorized and that we don't need to re-vectorize it if we run into it
+/// again.
+static const char*
+AlreadyVectorizedMDName = "llvm.vectorizer.already_vectorized";
+
namespace {
// Forward declarations.
@@ -138,10 +145,11 @@ class LoopVectorizationCostModel;
class InnerLoopVectorizer {
public:
InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
- DominatorTree *DT, DataLayout *DL, unsigned VecWidth,
+ DominatorTree *DT, DataLayout *DL,
+ const TargetLibraryInfo *TLI, unsigned VecWidth,
unsigned UnrollFactor)
- : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), VF(VecWidth),
- UF(UnrollFactor), Builder(SE->getContext()), Induction(0),
+ : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), TLI(TLI),
+ VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), Induction(0),
OldInduction(0), WidenMap(UnrollFactor) {}
// Perform the actual loop widening (vectorization).
@@ -268,6 +276,9 @@ private:
DominatorTree *DT;
/// Data Layout.
DataLayout *DL;
+ /// Target Library Info.
+ const TargetLibraryInfo *TLI;
+
/// The vectorization SIMD factor to use. Each vector will have this many
/// vector elements.
unsigned VF;
@@ -320,8 +331,9 @@ class LoopVectorizationLegality {
public:
LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL,
DominatorTree *DT, TargetTransformInfo* TTI,
- AliasAnalysis* AA)
- : TheLoop(L), SE(SE), DL(DL), DT(DT), TTI(TTI), AA(AA), Induction(0) {}
+ AliasAnalysis *AA, TargetLibraryInfo *TLI)
+ : TheLoop(L), SE(SE), DL(DL), DT(DT), TTI(TTI), AA(AA), TLI(TLI),
+ Induction(0) {}
/// This enum represents the kinds of reductions that we support.
enum ReductionKind {
@@ -407,7 +419,7 @@ public:
/// Alias(Multi)Map stores the values (GEPs or underlying objects and their
/// respective Store/Load instruction(s) to calculate aliasing.
- typedef DenseMap<Value*, Instruction* > AliasMap;
+ typedef MapVector<Value*, Instruction* > AliasMap;
typedef DenseMap<Value*, std::vector<Instruction*> > AliasMultiMap;
/// Returns true if it is legal to vectorize this loop.
@@ -504,6 +516,8 @@ private:
TargetTransformInfo *TTI;
/// Alias Analysis.
AliasAnalysis *AA;
+ /// Target Library Info.
+ TargetLibraryInfo *TLI;
// --- vectorization state --- //
@@ -540,8 +554,8 @@ public:
LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
LoopVectorizationLegality *Legal,
const TargetTransformInfo &TTI,
- DataLayout *DL)
- : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL) {}
+ DataLayout *DL, const TargetLibraryInfo *TLI)
+ : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI) {}
/// Information about vectorization costs
struct VectorizationFactor {
@@ -614,6 +628,8 @@ private:
const TargetTransformInfo &TTI;
/// Target data layout information.
DataLayout *DL;
+ /// Target Library Info.
+ const TargetLibraryInfo *TLI;
};
/// The LoopVectorize Pass.
@@ -631,6 +647,7 @@ struct LoopVectorize : public LoopPass {
TargetTransformInfo *TTI;
DominatorTree *DT;
AliasAnalysis *AA;
+ TargetLibraryInfo *TLI;
virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
// We only vectorize innermost loops.
@@ -643,19 +660,20 @@ struct LoopVectorize : public LoopPass {
TTI = &getAnalysis<TargetTransformInfo>();
DT = &getAnalysis<DominatorTree>();
AA = getAnalysisIfAvailable<AliasAnalysis>();
+ TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
DEBUG(dbgs() << "LV: Checking a loop in \"" <<
L->getHeader()->getParent()->getName() << "\"\n");
// Check if it is legal to vectorize the loop.
- LoopVectorizationLegality LVL(L, SE, DL, DT, TTI, AA);
+ LoopVectorizationLegality LVL(L, SE, DL, DT, TTI, AA, TLI);
if (!LVL.canVectorize()) {
DEBUG(dbgs() << "LV: Not vectorizing.\n");
return false;
}
// Use the cost model.
- LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL);
+ LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI);
// Check the function attributes to find out if this function should be
// optimized for size.
@@ -689,7 +707,7 @@ struct LoopVectorize : public LoopPass {
DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n");
// If we decided that it is *legal* to vectorize the loop then do it.
- InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF.Width, UF);
+ InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF);
LB.vectorize(&LVL);
DEBUG(verifyFunction(*L->getHeader()->getParent()));
@@ -1147,6 +1165,11 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
BasicBlock *ExitBlock = OrigLoop->getExitBlock();
assert(ExitBlock && "Must have an exit block");
+ // Mark the old scalar loop with metadata that tells us not to vectorize this
+ // loop again if we run into it.
+ MDNode *MD = MDNode::get(OldBasicBlock->getContext(), ArrayRef<Value*>());
+ OldBasicBlock->getTerminator()->setMetadata(AlreadyVectorizedMDName, MD);
+
// Some loops have a single integer induction variable, while other loops
// don't. One example is c++ iterators that often have multiple pointer
// induction variables. In the code below we also support a case where we
@@ -1438,34 +1461,108 @@ getReductionIdentity(LoopVectorizationLegality::ReductionKind K, Type *Tp) {
}
}
-static bool
-isTriviallyVectorizableIntrinsic(Instruction *Inst) {
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst);
- if (!II)
- return false;
- switch (II->getIntrinsicID()) {
- case Intrinsic::sqrt:
- case Intrinsic::sin:
- case Intrinsic::cos:
- case Intrinsic::exp:
- case Intrinsic::exp2:
- case Intrinsic::log:
- case Intrinsic::log10:
- case Intrinsic::log2:
- case Intrinsic::fabs:
- case Intrinsic::floor:
- case Intrinsic::ceil:
- case Intrinsic::trunc:
- case Intrinsic::rint:
- case Intrinsic::nearbyint:
- case Intrinsic::pow:
- case Intrinsic::fma:
- case Intrinsic::fmuladd:
- return true;
- default:
- return false;
+static Intrinsic::ID
+getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) {
+ // If we have an intrinsic call, check if it is trivially vectorizable.
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::sqrt:
+ case Intrinsic::sin:
+ case Intrinsic::cos:
+ case Intrinsic::exp:
+ case Intrinsic::exp2:
+ case Intrinsic::log:
+ case Intrinsic::log10:
+ case Intrinsic::log2:
+ case Intrinsic::fabs:
+ case Intrinsic::floor:
+ case Intrinsic::ceil:
+ case Intrinsic::trunc:
+ case Intrinsic::rint:
+ case Intrinsic::nearbyint:
+ case Intrinsic::pow:
+ case Intrinsic::fma:
+ case Intrinsic::fmuladd:
+ return II->getIntrinsicID();
+ default:
+ return Intrinsic::not_intrinsic;
+ }
}
- return false;
+
+ if (!TLI)
+ return Intrinsic::not_intrinsic;
+
+ LibFunc::Func Func;
+ Function *F = CI->getCalledFunction();
+ // We're going to make assumptions on the semantics of the functions, check
+ // that the target knows that it's available in this environment.
+ if (!F || !TLI->getLibFunc(F->getName(), Func))
+ return Intrinsic::not_intrinsic;
+
+ // Otherwise check if we have a call to a function that can be turned into a
+ // vector intrinsic.
+ switch (Func) {
+ default:
+ break;
+ case LibFunc::sin:
+ case LibFunc::sinf:
+ case LibFunc::sinl:
+ return Intrinsic::sin;
+ case LibFunc::cos:
+ case LibFunc::cosf:
+ case LibFunc::cosl:
+ return Intrinsic::cos;
+ case LibFunc::exp:
+ case LibFunc::expf:
+ case LibFunc::expl:
+ return Intrinsic::exp;
+ case LibFunc::exp2:
+ case LibFunc::exp2f:
+ case LibFunc::exp2l:
+ return Intrinsic::exp2;
+ case LibFunc::log:
+ case LibFunc::logf:
+ case LibFunc::logl:
+ return Intrinsic::log;
+ case LibFunc::log10:
+ case LibFunc::log10f:
+ case LibFunc::log10l:
+ return Intrinsic::log10;
+ case LibFunc::log2:
+ case LibFunc::log2f:
+ case LibFunc::log2l:
+ return Intrinsic::log2;
+ case LibFunc::fabs:
+ case LibFunc::fabsf:
+ case LibFunc::fabsl:
+ return Intrinsic::fabs;
+ case LibFunc::floor:
+ case LibFunc::floorf:
+ case LibFunc::floorl:
+ return Intrinsic::floor;
+ case LibFunc::ceil:
+ case LibFunc::ceilf:
+ case LibFunc::ceill:
+ return Intrinsic::ceil;
+ case LibFunc::trunc:
+ case LibFunc::truncf:
+ case LibFunc::truncl:
+ return Intrinsic::trunc;
+ case LibFunc::rint:
+ case LibFunc::rintf:
+ case LibFunc::rintl:
+ return Intrinsic::rint;
+ case LibFunc::nearbyint:
+ case LibFunc::nearbyintf:
+ case LibFunc::nearbyintl:
+ return Intrinsic::nearbyint;
+ case LibFunc::pow:
+ case LibFunc::powf:
+ case LibFunc::powl:
+ return Intrinsic::pow;
+ }
+
+ return Intrinsic::not_intrinsic;
}
/// This function translates the reduction kind to an LLVM binary operator.
@@ -1546,7 +1643,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
// To do so, we need to generate the 'identity' vector and overide
// one of the elements with the incoming scalar reduction. We need
// to do it in the vector-loop preheader.
- Builder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator());
+ Builder.SetInsertPoint(LoopBypassBlocks.front()->getTerminator());
// This is the vector-clone of the value that leaves the loop.
VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
@@ -1991,17 +2088,21 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
}
case Instruction::Call: {
- assert(isTriviallyVectorizableIntrinsic(it));
+ // Ignore dbg intrinsics.
+ if (isa<DbgInfoIntrinsic>(it))
+ break;
+
Module *M = BB->getParent()->getParent();
- IntrinsicInst *II = cast<IntrinsicInst>(it);
- Intrinsic::ID ID = II->getIntrinsicID();
+ CallInst *CI = cast<CallInst>(it);
+ Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
+ assert(ID && "Not an intrinsic call!");
for (unsigned Part = 0; Part < UF; ++Part) {
SmallVector<Value*, 4> Args;
- for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i) {
- VectorParts &Arg = getVectorValue(II->getArgOperand(i));
+ for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
+ VectorParts &Arg = getVectorValue(CI->getArgOperand(i));
Args.push_back(Arg[Part]);
}
- Type *Tys[] = { VectorType::get(II->getType()->getScalarType(), VF) };
+ Type *Tys[] = { VectorType::get(CI->getType()->getScalarType(), VF) };
Function *F = Intrinsic::getDeclaration(M, ID, Tys);
Entry[Part] = Builder.CreateCall(F, Args);
}
@@ -2138,6 +2239,13 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
BasicBlock *PreHeader = TheLoop->getLoopPreheader();
BasicBlock *Header = TheLoop->getHeader();
+ // If we marked the scalar loop as "already vectorized" then no need
+ // to vectorize it again.
+ if (Header->getTerminator()->getMetadata(AlreadyVectorizedMDName)) {
+ DEBUG(dbgs() << "LV: This loop was vectorized before\n");
+ return false;
+ }
+
// For each block in the loop.
for (Loop::block_iterator bb = TheLoop->block_begin(),
be = TheLoop->block_end(); bb != be; ++bb) {
@@ -2220,9 +2328,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
return false;
}// end of PHI handling
- // We still don't handle functions.
+ // We still don't handle functions. However, we can ignore dbg intrinsic
+ // calls and we do handle certain intrinsic and libm functions.
CallInst *CI = dyn_cast<CallInst>(it);
- if (CI && !isTriviallyVectorizableIntrinsic(it)) {
+ if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI)) {
DEBUG(dbgs() << "LV: Found a call site.\n");
return false;
}
@@ -2638,6 +2747,9 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
// Is this a bin op ?
FoundBinOp |= !isa<PHINode>(Iter);
+ // Remember the current instruction.
+ Instruction *OldIter = Iter;
+
// For each of the *users* of iter.
for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end();
it != e; ++it) {
@@ -2663,7 +2775,7 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
if (isa<PHINode>(Iter) && isa<PHINode>(U) &&
U->getParent() != TheLoop->getHeader() &&
TheLoop->contains(U) &&
- Iter->getNumUses() > 1)
+ Iter->hasNUsesOrMore(2))
continue;
// We can't have multiple inside users.
@@ -2683,6 +2795,10 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
Iter = U;
}
+ // If all uses were skipped this can't be a reduction variable.
+ if (Iter == OldIter)
+ return false;
+
// We found a reduction var if we have reached the original
// phi node and we only have a single instruction with out-of-loop
// users.
@@ -3152,6 +3268,10 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
// For each instruction in the old loop.
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+ // Skip dbg intrinsics.
+ if (isa<DbgInfoIntrinsic>(it))
+ continue;
+
unsigned C = getInstructionCost(it, VF);
Cost += C;
DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF " <<
@@ -3218,7 +3338,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
Type *CondTy = SI->getCondition()->getType();
- if (ScalarCond)
+ if (!ScalarCond)
CondTy = VectorType::get(CondTy, VF);
return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
@@ -3305,13 +3425,14 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
}
case Instruction::Call: {
- assert(isTriviallyVectorizableIntrinsic(I));
- IntrinsicInst *II = cast<IntrinsicInst>(I);
- Type *RetTy = ToVectorTy(II->getType(), VF);
+ CallInst *CI = cast<CallInst>(I);
+ Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
+ assert(ID && "Not an intrinsic call!");
+ Type *RetTy = ToVectorTy(CI->getType(), VF);
SmallVector<Type*, 4> Tys;
- for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i)
- Tys.push_back(ToVectorTy(II->getArgOperand(i)->getType(), VF));
- return TTI.getIntrinsicInstrCost(II->getIntrinsicID(), RetTy, Tys);
+ for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
+ Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
+ return TTI.getIntrinsicInstrCost(ID, RetTy, Tys);
}
default: {
// We are scalarizing the instruction. Return the cost of the scalar