aboutsummaryrefslogtreecommitdiffstats
path: root/lib/Transforms/Vectorize
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Transforms/Vectorize')
-rw-r--r--lib/Transforms/Vectorize/BBVectorize.cpp35
-rw-r--r--lib/Transforms/Vectorize/LoopVectorize.cpp412
-rw-r--r--lib/Transforms/Vectorize/SLPVectorizer.cpp114
3 files changed, 366 insertions, 195 deletions
diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp
index 525c050..29fb01f 100644
--- a/lib/Transforms/Vectorize/BBVectorize.cpp
+++ b/lib/Transforms/Vectorize/BBVectorize.cpp
@@ -39,6 +39,7 @@
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Pass.h"
@@ -206,8 +207,6 @@ namespace {
AA = &P->getAnalysis<AliasAnalysis>();
DT = &P->getAnalysis<DominatorTreeWrapperPass>().getDomTree();
SE = &P->getAnalysis<ScalarEvolution>();
- DataLayoutPass *DLP = P->getAnalysisIfAvailable<DataLayoutPass>();
- DL = DLP ? &DLP->getDataLayout() : nullptr;
TTI = IgnoreTargetInfo
? nullptr
: &P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
@@ -222,7 +221,6 @@ namespace {
AliasAnalysis *AA;
DominatorTree *DT;
ScalarEvolution *SE;
- const DataLayout *DL;
const TargetTransformInfo *TTI;
// FIXME: const correct?
@@ -442,8 +440,6 @@ namespace {
AA = &getAnalysis<AliasAnalysis>();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
SE = &getAnalysis<ScalarEvolution>();
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- DL = DLP ? &DLP->getDataLayout() : nullptr;
TTI = IgnoreTargetInfo
? nullptr
: &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
@@ -642,19 +638,19 @@ namespace {
dyn_cast<SCEVConstant>(OffsetSCEV)) {
ConstantInt *IntOff = ConstOffSCEV->getValue();
int64_t Offset = IntOff->getSExtValue();
-
+ const DataLayout &DL = I->getModule()->getDataLayout();
Type *VTy = IPtr->getType()->getPointerElementType();
- int64_t VTyTSS = (int64_t) DL->getTypeStoreSize(VTy);
+ int64_t VTyTSS = (int64_t)DL.getTypeStoreSize(VTy);
Type *VTy2 = JPtr->getType()->getPointerElementType();
if (VTy != VTy2 && Offset < 0) {
- int64_t VTy2TSS = (int64_t) DL->getTypeStoreSize(VTy2);
+ int64_t VTy2TSS = (int64_t)DL.getTypeStoreSize(VTy2);
OffsetInElmts = Offset/VTy2TSS;
- return (abs64(Offset) % VTy2TSS) == 0;
+ return (std::abs(Offset) % VTy2TSS) == 0;
}
OffsetInElmts = Offset/VTyTSS;
- return (abs64(Offset) % VTyTSS) == 0;
+ return (std::abs(Offset) % VTyTSS) == 0;
}
return false;
@@ -846,7 +842,7 @@ namespace {
// It is important to cleanup here so that future iterations of this
// function have less work to do.
- (void) SimplifyInstructionsInBlock(&BB, DL, AA->getTargetLibraryInfo());
+ (void)SimplifyInstructionsInBlock(&BB, AA->getTargetLibraryInfo());
return true;
}
@@ -900,10 +896,6 @@ namespace {
return false;
}
- // We can't vectorize memory operations without target data
- if (!DL && IsSimpleLoadStore)
- return false;
-
Type *T1, *T2;
getInstructionTypes(I, T1, T2);
@@ -938,9 +930,8 @@ namespace {
if (T2->isX86_FP80Ty() || T2->isPPC_FP128Ty() || T2->isX86_MMXTy())
return false;
- if ((!Config.VectorizePointers || !DL) &&
- (T1->getScalarType()->isPointerTy() ||
- T2->getScalarType()->isPointerTy()))
+ if (!Config.VectorizePointers && (T1->getScalarType()->isPointerTy() ||
+ T2->getScalarType()->isPointerTy()))
return false;
if (!TTI && (T1->getPrimitiveSizeInBits() >= Config.VectorBits ||
@@ -985,8 +976,8 @@ namespace {
unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
int64_t OffsetInElmts = 0;
if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
- IAddressSpace, JAddressSpace,
- OffsetInElmts) && abs64(OffsetInElmts) == 1) {
+ IAddressSpace, JAddressSpace, OffsetInElmts) &&
+ std::abs(OffsetInElmts) == 1) {
FixedOrder = (int) OffsetInElmts;
unsigned BottomAlignment = IAlignment;
if (OffsetInElmts < 0) BottomAlignment = JAlignment;
@@ -1001,8 +992,8 @@ namespace {
// An aligned load or store is possible only if the instruction
// with the lower offset has an alignment suitable for the
// vector type.
-
- unsigned VecAlignment = DL->getPrefTypeAlignment(VType);
+ const DataLayout &DL = I->getModule()->getDataLayout();
+ unsigned VecAlignment = DL.getPrefTypeAlignment(VType);
if (BottomAlignment < VecAlignment)
return false;
}
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6142306..b7d0ae4 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -218,6 +218,15 @@ public:
R.getInstr()) {}
};
+/// A helper function for converting Scalar types to vector types.
+/// If the incoming type is void, we return void. If the VF is 1, we return
+/// the scalar type.
+static Type* ToVectorTy(Type *Scalar, unsigned VF) {
+ if (Scalar->isVoidTy() || VF == 1)
+ return Scalar;
+ return VectorType::get(Scalar, VF);
+}
+
/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
@@ -235,13 +244,13 @@ public:
class InnerLoopVectorizer {
public:
InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
- DominatorTree *DT, const DataLayout *DL,
- const TargetLibraryInfo *TLI, unsigned VecWidth,
+ DominatorTree *DT, const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI, unsigned VecWidth,
unsigned UnrollFactor)
- : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), TLI(TLI),
+ : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()),
Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor),
- Legal(nullptr) {}
+ Legal(nullptr), AddedSafetyChecks(false) {}
// Perform the actual loop widening (vectorization).
void vectorize(LoopVectorizationLegality *L) {
@@ -255,6 +264,11 @@ public:
updateAnalysis();
}
+ // Return true if any runtime check is added.
+ bool IsSafetyChecksAdded() {
+ return AddedSafetyChecks;
+ }
+
virtual ~InnerLoopVectorizer() {}
protected:
@@ -389,10 +403,10 @@ protected:
DominatorTree *DT;
/// Alias Analysis.
AliasAnalysis *AA;
- /// Data Layout.
- const DataLayout *DL;
/// Target Library Info.
const TargetLibraryInfo *TLI;
+ /// Target Transform Info.
+ const TargetTransformInfo *TTI;
/// The vectorization SIMD factor to use. Each vector will have this many
/// vector elements.
@@ -434,14 +448,17 @@ protected:
EdgeMaskCache MaskCache;
LoopVectorizationLegality *Legal;
+
+ // Record whether runtime check is added.
+ bool AddedSafetyChecks;
};
class InnerLoopUnroller : public InnerLoopVectorizer {
public:
InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
- DominatorTree *DT, const DataLayout *DL,
- const TargetLibraryInfo *TLI, unsigned UnrollFactor) :
- InnerLoopVectorizer(OrigLoop, SE, LI, DT, DL, TLI, 1, UnrollFactor) { }
+ DominatorTree *DT, const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI, unsigned UnrollFactor)
+ : InnerLoopVectorizer(OrigLoop, SE, LI, DT, TLI, TTI, 1, UnrollFactor) {}
private:
void scalarizeInstruction(Instruction *Instr,
@@ -488,7 +505,7 @@ static std::string getDebugLocString(const Loop *L) {
raw_string_ostream OS(Result);
const DebugLoc LoopDbgLoc = L->getStartLoc();
if (!LoopDbgLoc.isUnknown())
- LoopDbgLoc.print(L->getHeader()->getContext(), OS);
+ LoopDbgLoc.print(OS);
else
// Just print the module name.
OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
@@ -543,14 +560,13 @@ static void propagateMetadata(SmallVectorImpl<Value *> &To, const Instruction *F
/// induction variable and the different reduction variables.
class LoopVectorizationLegality {
public:
- LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, const DataLayout *DL,
- DominatorTree *DT, TargetLibraryInfo *TLI,
- AliasAnalysis *AA, Function *F,
- const TargetTransformInfo *TTI,
+ LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DominatorTree *DT,
+ TargetLibraryInfo *TLI, AliasAnalysis *AA,
+ Function *F, const TargetTransformInfo *TTI,
LoopAccessAnalysis *LAA)
- : NumPredStores(0), TheLoop(L), SE(SE), DL(DL),
- TLI(TLI), TheFunction(F), TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr),
- Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false) {}
+ : NumPredStores(0), TheLoop(L), SE(SE), TLI(TLI), TheFunction(F),
+ TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), Induction(nullptr),
+ WidestIndTy(nullptr), HasFunNoNaNAttr(false) {}
/// This enum represents the kinds of reductions that we support.
enum ReductionKind {
@@ -842,8 +858,6 @@ private:
Loop *TheLoop;
/// Scev analysis.
ScalarEvolution *SE;
- /// DataLayout analysis.
- const DataLayout *DL;
/// Target Library Info.
TargetLibraryInfo *TLI;
/// Parent function
@@ -884,7 +898,7 @@ private:
ValueToValueMap Strides;
SmallPtrSet<Value *, 8> StrideSet;
-
+
/// While vectorizing these instructions we have to generate a
/// call to the appropriate masked intrinsic
SmallPtrSet<const Instruction*, 8> MaskedOp;
@@ -902,10 +916,9 @@ public:
LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
LoopVectorizationLegality *Legal,
const TargetTransformInfo &TTI,
- const DataLayout *DL, const TargetLibraryInfo *TLI,
- AssumptionCache *AC, const Function *F,
- const LoopVectorizeHints *Hints)
- : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI),
+ const TargetLibraryInfo *TLI, AssumptionCache *AC,
+ const Function *F, const LoopVectorizeHints *Hints)
+ : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI),
TheFunction(F), Hints(Hints) {
CodeMetrics::collectEphemeralValues(L, AC, EphValues);
}
@@ -958,11 +971,6 @@ private:
/// width. Vector width of one means scalar.
unsigned getInstructionCost(Instruction *I, unsigned VF);
- /// A helper function for converting Scalar types to vector types.
- /// If the incoming type is void, we return void. If the VF is 1, we return
- /// the scalar type.
- static Type* ToVectorTy(Type *Scalar, unsigned VF);
-
/// Returns whether the instruction is a load or store and will be a emitted
/// as a vector operation.
bool isConsecutiveLoadOrStore(Instruction *I);
@@ -988,8 +996,6 @@ private:
LoopVectorizationLegality *Legal;
/// Vector target information.
const TargetTransformInfo &TTI;
- /// Target data layout information.
- const DataLayout *DL;
/// Target Library Info.
const TargetLibraryInfo *TLI;
const Function *TheFunction;
@@ -1254,7 +1260,6 @@ struct LoopVectorize : public FunctionPass {
}
ScalarEvolution *SE;
- const DataLayout *DL;
LoopInfo *LI;
TargetTransformInfo *TTI;
DominatorTree *DT;
@@ -1270,8 +1275,6 @@ struct LoopVectorize : public FunctionPass {
bool runOnFunction(Function &F) override {
SE = &getAnalysis<ScalarEvolution>();
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- DL = DLP ? &DLP->getDataLayout() : nullptr;
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
@@ -1292,12 +1295,6 @@ struct LoopVectorize : public FunctionPass {
if (!TTI->getNumberOfRegisters(true))
return false;
- if (!DL) {
- DEBUG(dbgs() << "\nLV: Not vectorizing " << F.getName()
- << ": Missing data layout\n");
- return false;
- }
-
// Build up a worklist of inner-loops to vectorize. This is necessary as
// the act of vectorizing or partially unrolling a loop creates new loops
// and can invalidate iterators across the loops.
@@ -1317,6 +1314,40 @@ struct LoopVectorize : public FunctionPass {
return Changed;
}
+ static void AddRuntimeUnrollDisableMetaData(Loop *L) {
+ SmallVector<Metadata *, 4> MDs;
+ // Reserve first location for self reference to the LoopID metadata node.
+ MDs.push_back(nullptr);
+ bool IsUnrollMetadata = false;
+ MDNode *LoopID = L->getLoopID();
+ if (LoopID) {
+ // First find existing loop unrolling disable metadata.
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+ MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+ if (MD) {
+ const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
+ IsUnrollMetadata =
+ S && S->getString().startswith("llvm.loop.unroll.disable");
+ }
+ MDs.push_back(LoopID->getOperand(i));
+ }
+ }
+
+ if (!IsUnrollMetadata) {
+ // Add runtime unroll disable metadata.
+ LLVMContext &Context = L->getHeader()->getContext();
+ SmallVector<Metadata *, 1> DisableOperands;
+ DisableOperands.push_back(
+ MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
+ MDNode *DisableNode = MDNode::get(Context, DisableOperands);
+ MDs.push_back(DisableNode);
+ MDNode *NewLoopID = MDNode::get(Context, MDs);
+ // Set operand 0 to refer to the loop id itself.
+ NewLoopID->replaceOperandWith(0, NewLoopID);
+ L->setLoopID(NewLoopID);
+ }
+ }
+
bool processLoop(Loop *L) {
assert(L->empty() && "Only process inner loops.");
@@ -1391,7 +1422,7 @@ struct LoopVectorize : public FunctionPass {
}
// Check if it is legal to vectorize the loop.
- LoopVectorizationLegality LVL(L, SE, DL, DT, TLI, AA, F, TTI, LAA);
+ LoopVectorizationLegality LVL(L, SE, DT, TLI, AA, F, TTI, LAA);
if (!LVL.canVectorize()) {
DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
emitMissedWarning(F, L, Hints);
@@ -1399,8 +1430,7 @@ struct LoopVectorize : public FunctionPass {
}
// Use the cost model.
- LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI, AC, F,
- &Hints);
+ LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, TLI, AC, F, &Hints);
// Check the function attributes to find out if this function should be
// optimized for size.
@@ -1464,14 +1494,20 @@ struct LoopVectorize : public FunctionPass {
// We decided not to vectorize, but we may want to unroll.
- InnerLoopUnroller Unroller(L, SE, LI, DT, DL, TLI, UF);
+ InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, UF);
Unroller.vectorize(&LVL);
} else {
// If we decided that it is *legal* to vectorize the loop then do it.
- InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF);
+ InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, UF);
LB.vectorize(&LVL);
++LoopsVectorized;
+ // Add metadata to disable runtime unrolling scalar loop when there's no
+ // runtime check about strides and memory. Because at this situation,
+ // scalar loop is rarely used not worthy to be unrolled.
+ if (!LB.IsSafetyChecksAdded())
+ AddRuntimeUnrollDisableMetaData(L);
+
// Report the vectorization decision.
emitOptimizationRemark(
F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
@@ -1561,10 +1597,10 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx,
/// \brief Find the operand of the GEP that should be checked for consecutive
/// stores. This ignores trailing indices that have no effect on the final
/// pointer.
-static unsigned getGEPInductionOperand(const DataLayout *DL,
- const GetElementPtrInst *Gep) {
+static unsigned getGEPInductionOperand(const GetElementPtrInst *Gep) {
+ const DataLayout &DL = Gep->getModule()->getDataLayout();
unsigned LastOperand = Gep->getNumOperands() - 1;
- unsigned GEPAllocSize = DL->getTypeAllocSize(
+ unsigned GEPAllocSize = DL.getTypeAllocSize(
cast<PointerType>(Gep->getType()->getScalarType())->getElementType());
// Walk backwards and try to peel off zeros.
@@ -1575,7 +1611,7 @@ static unsigned getGEPInductionOperand(const DataLayout *DL,
// If it's a type with the same allocation size as the result of the GEP we
// can peel off the zero index.
- if (DL->getTypeAllocSize(*GEPTI) != GEPAllocSize)
+ if (DL.getTypeAllocSize(*GEPTI) != GEPAllocSize)
break;
--LastOperand;
}
@@ -1621,7 +1657,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
return II.getConsecutiveDirection();
}
- unsigned InductionOperand = getGEPInductionOperand(DL, Gep);
+ unsigned InductionOperand = getGEPInductionOperand(Gep);
// Check that all of the gep indices are uniform except for our induction
// operand.
@@ -1714,11 +1750,12 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
// An alignment of 0 means target abi alignment. We need to use the scalar's
// target abi alignment in such a case.
+ const DataLayout &DL = Instr->getModule()->getDataLayout();
if (!Alignment)
- Alignment = DL->getABITypeAlignment(ScalarDataTy);
+ Alignment = DL.getABITypeAlignment(ScalarDataTy);
unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
- unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy);
- unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF;
+ unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ScalarDataTy);
+ unsigned VectorElementSize = DL.getTypeStoreSize(DataTy) / VF;
if (SI && Legal->blockNeedsPredication(SI->getParent()) &&
!Legal->isMaskRequired(SI))
@@ -1759,7 +1796,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
// The last index does not have to be the induction. It can be
// consecutive and be a function of the index. For example A[I+1];
unsigned NumOperands = Gep->getNumOperands();
- unsigned InductionOperand = getGEPInductionOperand(DL, Gep);
+ unsigned InductionOperand = getGEPInductionOperand(Gep);
// Create the new GEP with the new induction variable.
GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
@@ -2080,9 +2117,11 @@ void InnerLoopVectorizer::createEmptyLoop() {
ExitCount = SE->getAddExpr(BackedgeTakeCount,
SE->getConstant(BackedgeTakeCount->getType(), 1));
+ const DataLayout &DL = OldBasicBlock->getModule()->getDataLayout();
+
// Expand the trip count and place the new instructions in the preheader.
// Notice that the pre-header does not change, only the loop body.
- SCEVExpander Exp(*SE, "induction");
+ SCEVExpander Exp(*SE, DL, "induction");
// We need to test whether the backedge-taken count is uint##_max. Adding one
// to it will cause overflow and an incorrect loop trip count in the vector
@@ -2218,6 +2257,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
std::tie(FirstCheckInst, StrideCheck) =
addStrideCheck(LastBypassBlock->getTerminator());
if (StrideCheck) {
+ AddedSafetyChecks = true;
// Create a new block containing the stride check.
BasicBlock *CheckBlock =
LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck");
@@ -2242,6 +2282,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
std::tie(FirstCheckInst, MemRuntimeCheck) =
Legal->getLAI()->addRuntimeCheck(LastBypassBlock->getTerminator());
if (MemRuntimeCheck) {
+ AddedSafetyChecks = true;
// Create a new block containing the memory check.
BasicBlock *CheckBlock =
LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.memcheck");
@@ -2480,10 +2521,9 @@ getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) {
}
}
-Value *createMinMaxOp(IRBuilder<> &Builder,
- LoopVectorizationLegality::MinMaxReductionKind RK,
- Value *Left,
- Value *Right) {
+static Value *createMinMaxOp(IRBuilder<> &Builder,
+ LoopVectorizationLegality::MinMaxReductionKind RK,
+ Value *Left, Value *Right) {
CmpInst::Predicate P = CmpInst::ICMP_NE;
switch (RK) {
default:
@@ -2594,6 +2634,95 @@ static Value *addFastMathFlag(Value *V) {
return V;
}
+/// Estimate the overhead of scalarizing a value. Insert and Extract are set if
+/// the result needs to be inserted and/or extracted from vectors.
+static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract,
+ const TargetTransformInfo &TTI) {
+ if (Ty->isVoidTy())
+ return 0;
+
+ assert(Ty->isVectorTy() && "Can only scalarize vectors");
+ unsigned Cost = 0;
+
+ for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
+ if (Insert)
+ Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, i);
+ if (Extract)
+ Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, Ty, i);
+ }
+
+ return Cost;
+}
+
+// Estimate cost of a call instruction CI if it were vectorized with factor VF.
+// Return the cost of the instruction, including scalarization overhead if it's
+// needed. The flag NeedToScalarize shows if the call needs to be scalarized -
+// i.e. either vector version isn't available, or is too expensive.
+static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
+ const TargetTransformInfo &TTI,
+ const TargetLibraryInfo *TLI,
+ bool &NeedToScalarize) {
+ Function *F = CI->getCalledFunction();
+ StringRef FnName = CI->getCalledFunction()->getName();
+ Type *ScalarRetTy = CI->getType();
+ SmallVector<Type *, 4> Tys, ScalarTys;
+ for (auto &ArgOp : CI->arg_operands())
+ ScalarTys.push_back(ArgOp->getType());
+
+ // Estimate cost of scalarized vector call. The source operands are assumed
+ // to be vectors, so we need to extract individual elements from there,
+ // execute VF scalar calls, and then gather the result into the vector return
+ // value.
+ unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
+ if (VF == 1)
+ return ScalarCallCost;
+
+ // Compute corresponding vector type for return value and arguments.
+ Type *RetTy = ToVectorTy(ScalarRetTy, VF);
+ for (unsigned i = 0, ie = ScalarTys.size(); i != ie; ++i)
+ Tys.push_back(ToVectorTy(ScalarTys[i], VF));
+
+ // Compute costs of unpacking argument values for the scalar calls and
+ // packing the return values to a vector.
+ unsigned ScalarizationCost =
+ getScalarizationOverhead(RetTy, true, false, TTI);
+ for (unsigned i = 0, ie = Tys.size(); i != ie; ++i)
+ ScalarizationCost += getScalarizationOverhead(Tys[i], false, true, TTI);
+
+ unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
+
+ // If we can't emit a vector call for this function, then the currently found
+ // cost is the cost we need to return.
+ NeedToScalarize = true;
+ if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
+ return Cost;
+
+ // If the corresponding vector cost is cheaper, return its cost.
+ unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
+ if (VectorCallCost < Cost) {
+ NeedToScalarize = false;
+ return VectorCallCost;
+ }
+ return Cost;
+}
+
+// Estimate cost of an intrinsic call instruction CI if it were vectorized with
+// factor VF. Return the cost of the instruction, including scalarization
+// overhead if it's needed.
+static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
+ const TargetTransformInfo &TTI,
+ const TargetLibraryInfo *TLI) {
+ Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
+ assert(ID && "Expected intrinsic call!");
+
+ Type *RetTy = ToVectorTy(CI->getType(), VF);
+ SmallVector<Type *, 4> Tys;
+ for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
+ Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
+
+ return TTI.getIntrinsicInstrCost(ID, RetTy, Tys);
+}
+
void InnerLoopVectorizer::vectorizeLoop() {
//===------------------------------------------------===//
//
@@ -3181,37 +3310,71 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
Module *M = BB->getParent()->getParent();
CallInst *CI = cast<CallInst>(it);
+
+ StringRef FnName = CI->getCalledFunction()->getName();
+ Function *F = CI->getCalledFunction();
+ Type *RetTy = ToVectorTy(CI->getType(), VF);
+ SmallVector<Type *, 4> Tys;
+ for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
+ Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
+
Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
- assert(ID && "Not an intrinsic call!");
- switch (ID) {
- case Intrinsic::assume:
- case Intrinsic::lifetime_end:
- case Intrinsic::lifetime_start:
+ if (ID &&
+ (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
+ ID == Intrinsic::lifetime_start)) {
scalarizeInstruction(it);
break;
- default:
- bool HasScalarOpd = hasVectorInstrinsicScalarOpd(ID, 1);
- for (unsigned Part = 0; Part < UF; ++Part) {
- SmallVector<Value *, 4> Args;
- for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
- if (HasScalarOpd && i == 1) {
- Args.push_back(CI->getArgOperand(i));
- continue;
- }
- VectorParts &Arg = getVectorValue(CI->getArgOperand(i));
- Args.push_back(Arg[Part]);
- }
- Type *Tys[] = {CI->getType()};
- if (VF > 1)
- Tys[0] = VectorType::get(CI->getType()->getScalarType(), VF);
+ }
+ // The flag shows whether we use Intrinsic or a usual Call for vectorized
+ // version of the instruction.
+ // Is it beneficial to perform intrinsic call compared to lib call?
+ bool NeedToScalarize;
+ unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
+ bool UseVectorIntrinsic =
+ ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
+ if (!UseVectorIntrinsic && NeedToScalarize) {
+ scalarizeInstruction(it);
+ break;
+ }
- Function *F = Intrinsic::getDeclaration(M, ID, Tys);
- Entry[Part] = Builder.CreateCall(F, Args);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ SmallVector<Value *, 4> Args;
+ for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
+ Value *Arg = CI->getArgOperand(i);
+ // Some intrinsics have a scalar argument - don't replace it with a
+ // vector.
+ if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) {
+ VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i));
+ Arg = VectorArg[Part];
+ }
+ Args.push_back(Arg);
}
- propagateMetadata(Entry, it);
- break;
+ Function *VectorF;
+ if (UseVectorIntrinsic) {
+ // Use vector version of the intrinsic.
+ Type *TysForDecl[] = {CI->getType()};
+ if (VF > 1)
+ TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
+ VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
+ } else {
+ // Use vector version of the library call.
+ StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
+ assert(!VFnName.empty() && "Vector function name is empty.");
+ VectorF = M->getFunction(VFnName);
+ if (!VectorF) {
+ // Generate a declaration
+ FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
+ VectorF =
+ Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
+ VectorF->copyAttributesFrom(F);
+ }
+ }
+ assert(VectorF && "Can't create vector function.");
+ Entry[Part] = Builder.CreateCall(VectorF, Args);
}
+
+ propagateMetadata(Entry, it);
break;
}
@@ -3463,6 +3626,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// Look for the attribute signaling the absence of NaNs.
Function &F = *Header->getParent();
+ const DataLayout &DL = F.getParent()->getDataLayout();
if (F.hasFnAttribute("no-nans-fp-math"))
HasFunNoNaNAttr =
F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
@@ -3518,9 +3682,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
if (IK_NoInduction != IK) {
// Get the widest type.
if (!WidestIndTy)
- WidestIndTy = convertPointerToIntegerType(*DL, PhiTy);
+ WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
else
- WidestIndTy = getWiderType(*DL, PhiTy, WidestIndTy);
+ WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
// Int inductions are special because we only allow one IV.
if (IK == IK_IntInduction && StepValue->isOne()) {
@@ -3591,13 +3755,17 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
return false;
}// end of PHI handling
- // We still don't handle functions. However, we can ignore dbg intrinsic
- // calls and we do handle certain intrinsic and libm functions.
+ // We handle calls that:
+ // * Are debug info intrinsics.
+ // * Have a mapping to an IR intrinsic.
+ // * Have a vector version available.
CallInst *CI = dyn_cast<CallInst>(it);
- if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI)) {
+ if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI) &&
+ !(CI->getCalledFunction() && TLI &&
+ TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {
emitAnalysis(VectorizationReport(it) <<
"call instruction cannot be vectorized");
- DEBUG(dbgs() << "LV: Found a call site.\n");
+ DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n");
return false;
}
@@ -3665,13 +3833,12 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
///\brief Remove GEPs whose indices but the last one are loop invariant and
/// return the induction operand of the gep pointer.
-static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE,
- const DataLayout *DL, Loop *Lp) {
+static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
if (!GEP)
return Ptr;
- unsigned InductionOperand = getGEPInductionOperand(DL, GEP);
+ unsigned InductionOperand = getGEPInductionOperand(GEP);
// Check that all of the gep indices are uniform except for our induction
// operand.
@@ -3700,8 +3867,7 @@ static Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) {
///\brief Get the stride of a pointer access in a loop.
/// Looks for symbolic strides "a[i*stride]". Returns the symbolic stride as a
/// pointer to the Value, or null otherwise.
-static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE,
- const DataLayout *DL, Loop *Lp) {
+static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
const PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
if (!PtrTy || PtrTy->isAggregateType())
return nullptr;
@@ -3714,7 +3880,7 @@ static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE,
// The size of the pointer access.
int64_t PtrAccessSize = 1;
- Ptr = stripGetElementPtr(Ptr, SE, DL, Lp);
+ Ptr = stripGetElementPtr(Ptr, SE, Lp);
const SCEV *V = SE->getSCEV(Ptr);
if (Ptr != OrigPtr)
@@ -3733,7 +3899,8 @@ static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE,
// Strip off the size of access multiplication if we are still analyzing the
// pointer.
if (OrigPtr == Ptr) {
- DL->getTypeAllocSize(PtrTy->getElementType());
+ const DataLayout &DL = Lp->getHeader()->getModule()->getDataLayout();
+ DL.getTypeAllocSize(PtrTy->getElementType());
if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(V)) {
if (M->getOperand(0)->getSCEVType() != scConstant)
return nullptr;
@@ -3785,7 +3952,7 @@ void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) {
else
return;
- Value *Stride = getStrideFromPointer(Ptr, SE, DL, TheLoop);
+ Value *Stride = getStrideFromPointer(Ptr, SE, TheLoop);
if (!Stride)
return;
@@ -3837,7 +4004,19 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
auto &OptionalReport = LAI->getReport();
if (OptionalReport)
emitAnalysis(VectorizationReport(*OptionalReport));
- return LAI->canVectorizeMemory();
+ if (!LAI->canVectorizeMemory())
+ return false;
+
+ if (LAI->getNumRuntimePointerChecks() >
+ VectorizerParams::RuntimeMemoryCheckThreshold) {
+ emitAnalysis(VectorizationReport()
+ << LAI->getNumRuntimePointerChecks() << " exceeds limit of "
+ << VectorizerParams::RuntimeMemoryCheckThreshold
+ << " dependent memory operations checked at runtime");
+ DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
+ return false;
+ }
+ return true;
}
static bool hasMultipleUsesOf(Instruction *I,
@@ -4163,7 +4342,8 @@ LoopVectorizationLegality::isInductionVariable(PHINode *Phi,
if (!PointerElementType->isSized())
return IK_NoInduction;
- int64_t Size = static_cast<int64_t>(DL->getTypeAllocSize(PointerElementType));
+ const DataLayout &DL = Phi->getModule()->getDataLayout();
+ int64_t Size = static_cast<int64_t>(DL.getTypeAllocSize(PointerElementType));
int64_t CVSize = CV->getSExtValue();
if (CVSize % Size)
return IK_NoInduction;
@@ -4375,6 +4555,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
unsigned LoopVectorizationCostModel::getWidestType() {
unsigned MaxWidth = 8;
+ const DataLayout &DL = TheFunction->getParent()->getDataLayout();
// For each block.
for (Loop::block_iterator bb = TheLoop->block_begin(),
@@ -4409,7 +4590,7 @@ unsigned LoopVectorizationCostModel::getWidestType() {
continue;
MaxWidth = std::max(MaxWidth,
- (unsigned)DL->getTypeSizeInBits(T->getScalarType()));
+ (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
}
}
@@ -4561,6 +4742,14 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
return SmallUF;
}
+ // Unroll if this is a large loop (small loops are already dealt with by this
+ // point) that could benefit from interleaved unrolling.
+ bool HasReductions = (Legal->getReductionVars()->size() > 0);
+ if (TTI.enableAggressiveInterleaving(HasReductions)) {
+ DEBUG(dbgs() << "LV: Unrolling to expose ILP.\n");
+ return UF;
+ }
+
DEBUG(dbgs() << "LV: Not Unrolling.\n");
return 1;
}
@@ -4898,8 +5087,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
// Scalarized loads/stores.
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
bool Reverse = ConsecutiveStride < 0;
- unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ValTy);
- unsigned VectorElementSize = DL->getTypeStoreSize(VectorTy)/VF;
+ const DataLayout &DL = I->getModule()->getDataLayout();
+ unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ValTy);
+ unsigned VectorElementSize = DL.getTypeStoreSize(VectorTy) / VF;
if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) {
bool IsComplexComputation =
isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop);
@@ -4960,14 +5150,12 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
}
case Instruction::Call: {
+ bool NeedToScalarize;
CallInst *CI = cast<CallInst>(I);
- Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
- assert(ID && "Not an intrinsic call!");
- Type *RetTy = ToVectorTy(CI->getType(), VF);
- SmallVector<Type*, 4> Tys;
- for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
- Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
- return TTI.getIntrinsicInstrCost(ID, RetTy, Tys);
+ unsigned CallCost = getVectorCallCost(CI, VF, TTI, TLI, NeedToScalarize);
+ if (getIntrinsicIDForCall(CI, TLI))
+ return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI));
+ return CallCost;
}
default: {
// We are scalarizing the instruction. Return the cost of the scalar
@@ -4994,12 +5182,6 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
}// end of switch.
}
-Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) {
- if (Scalar->isVoidTy() || VF == 1)
- return Scalar;
- return VectorType::get(Scalar, VF);
-}
-
char LoopVectorize::ID = 0;
static const char lv_name[] = "Loop Vectorization";
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index baf9741..8fc4cc1 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -17,9 +17,9 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize.h"
#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/Optional.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
@@ -342,11 +342,11 @@ public:
typedef SmallPtrSet<Value *, 16> ValueSet;
typedef SmallVector<StoreInst *, 8> StoreList;
- BoUpSLP(Function *Func, ScalarEvolution *Se, const DataLayout *Dl,
- TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa,
- LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC)
+ BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
+ TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
+ DominatorTree *Dt, AssumptionCache *AC)
: NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
- SE(Se), DL(Dl), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
+ SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
Builder(Se->getContext()) {
CodeMetrics::collectEphemeralValues(F, AC, EphValues);
}
@@ -383,7 +383,7 @@ public:
}
/// \returns true if the memory operations A and B are consecutive.
- bool isConsecutiveAccess(Value *A, Value *B);
+ bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL);
/// \brief Perform LICM and CSE on the newly generated gather sequences.
void optimizeGatherSequence();
@@ -877,7 +877,6 @@ private:
// Analysis and block reference.
Function *F;
ScalarEvolution *SE;
- const DataLayout *DL;
TargetTransformInfo *TTI;
TargetLibraryInfo *TLI;
AliasAnalysis *AA;
@@ -1130,8 +1129,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
return;
}
- if (!isConsecutiveAccess(VL[i], VL[i + 1])) {
- if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0])) {
+ const DataLayout &DL = F->getParent()->getDataLayout();
+ if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) {
+ if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], DL)) {
++NumLoadsWantToChangeOrder;
}
BS.cancelScheduling(VL);
@@ -1300,9 +1300,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
return;
}
case Instruction::Store: {
+ const DataLayout &DL = F->getParent()->getDataLayout();
// Check if the stores are consecutive or of we need to swizzle them.
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
- if (!isConsecutiveAccess(VL[i], VL[i + 1])) {
+ if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
@@ -1789,7 +1790,7 @@ unsigned BoUpSLP::getAddressSpaceOperand(Value *I) {
return -1;
}
-bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
+bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL) {
Value *PtrA = getPointerOperand(A);
Value *PtrB = getPointerOperand(B);
unsigned ASA = getAddressSpaceOperand(A);
@@ -1803,13 +1804,13 @@ bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
if (PtrA == PtrB || PtrA->getType() != PtrB->getType())
return false;
- unsigned PtrBitWidth = DL->getPointerSizeInBits(ASA);
+ unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
- APInt Size(PtrBitWidth, DL->getTypeStoreSize(Ty));
+ APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty));
APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
- PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetA);
- PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetB);
+ PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
+ PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
APInt OffsetDelta = OffsetB - OffsetA;
@@ -1842,6 +1843,7 @@ bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right) {
+ const DataLayout &DL = F->getParent()->getDataLayout();
// Push left and right operands of binary operation into Left and Right
for (unsigned i = 0, e = VL.size(); i < e; ++i) {
@@ -1856,10 +1858,10 @@ void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
Instruction *VL1 = cast<Instruction>(VL[j]);
Instruction *VL2 = cast<Instruction>(VL[j + 1]);
- if (isConsecutiveAccess(L, L1) && VL1->isCommutative()) {
+ if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) {
std::swap(Left[j], Right[j]);
continue;
- } else if (isConsecutiveAccess(L, L1) && VL2->isCommutative()) {
+ } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) {
std::swap(Left[j + 1], Right[j + 1]);
continue;
}
@@ -1870,10 +1872,10 @@ void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
Instruction *VL1 = cast<Instruction>(VL[j]);
Instruction *VL2 = cast<Instruction>(VL[j + 1]);
- if (isConsecutiveAccess(L, L1) && VL1->isCommutative()) {
+ if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) {
std::swap(Left[j], Right[j]);
continue;
- } else if (isConsecutiveAccess(L, L1) && VL2->isCommutative()) {
+ } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) {
std::swap(Left[j + 1], Right[j + 1]);
continue;
}
@@ -1983,6 +1985,8 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
Right = OrigRight;
}
+ const DataLayout &DL = F->getParent()->getDataLayout();
+
// Finally check if we can get longer vectorizable chain by reordering
// without breaking the good operand order detected above.
// E.g. If we have something like-
@@ -2001,7 +2005,7 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
for (unsigned j = 0; j < VL.size() - 1; ++j) {
if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
- if (isConsecutiveAccess(L, L1)) {
+ if (isConsecutiveAccess(L, L1, DL)) {
std::swap(Left[j + 1], Right[j + 1]);
continue;
}
@@ -2009,7 +2013,7 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
}
if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
- if (isConsecutiveAccess(L, L1)) {
+ if (isConsecutiveAccess(L, L1, DL)) {
std::swap(Left[j + 1], Right[j + 1]);
continue;
}
@@ -2105,6 +2109,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return Gather(E->Scalars, VecTy);
}
+ const DataLayout &DL = F->getParent()->getDataLayout();
unsigned Opcode = getSameOpcode(E->Scalars);
switch (Opcode) {
@@ -2301,8 +2306,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
unsigned Alignment = LI->getAlignment();
LI = Builder.CreateLoad(VecPtr);
- if (!Alignment)
- Alignment = DL->getABITypeAlignment(ScalarLoadTy);
+ if (!Alignment) {
+ Alignment = DL.getABITypeAlignment(ScalarLoadTy);
+ }
LI->setAlignment(Alignment);
E->VectorizedValue = LI;
++NumVectorInstructions;
@@ -2331,8 +2337,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
ExternalUses.push_back(
ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0));
- if (!Alignment)
- Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
+ if (!Alignment) {
+ Alignment = DL.getABITypeAlignment(SI->getValueOperand()->getType());
+ }
S->setAlignment(Alignment);
E->VectorizedValue = S;
++NumVectorInstructions;
@@ -2358,7 +2365,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
OpVecs.push_back(OpVec);
}
- Value *V = Builder.CreateGEP(Op0, OpVecs);
+ Value *V = Builder.CreateGEP(
+ cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -3051,7 +3059,6 @@ struct SLPVectorizer : public FunctionPass {
}
ScalarEvolution *SE;
- const DataLayout *DL;
TargetTransformInfo *TTI;
TargetLibraryInfo *TLI;
AliasAnalysis *AA;
@@ -3064,8 +3071,6 @@ struct SLPVectorizer : public FunctionPass {
return false;
SE = &getAnalysis<ScalarEvolution>();
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- DL = DLP ? &DLP->getDataLayout() : nullptr;
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
TLI = TLIP ? &TLIP->getTLI() : nullptr;
@@ -3082,11 +3087,6 @@ struct SLPVectorizer : public FunctionPass {
if (!TTI->getNumberOfRegisters(true))
return false;
- // Must have DataLayout. We can't require it because some tests run w/o
- // triple.
- if (!DL)
- return false;
-
// Don't vectorize when the attribute NoImplicitFloat is used.
if (F.hasFnAttribute(Attribute::NoImplicitFloat))
return false;
@@ -3095,7 +3095,7 @@ struct SLPVectorizer : public FunctionPass {
// Use the bottom up slp vectorizer to construct chains that start with
// store instructions.
- BoUpSLP R(&F, SE, DL, TTI, TLI, AA, LI, DT, AC);
+ BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC);
// A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
// delete instructions.
@@ -3178,15 +3178,11 @@ private:
/// the WeakVH array.
/// Vectorization of part of the VL array may cause later values in the VL array
/// to become invalid. We track when this has happened in the WeakVH array.
-static bool hasValueBeenRAUWed(ArrayRef<Value *> &VL,
- SmallVectorImpl<WeakVH> &VH,
- unsigned SliceBegin,
- unsigned SliceSize) {
- for (unsigned i = SliceBegin; i < SliceBegin + SliceSize; ++i)
- if (VH[i] != VL[i])
- return true;
-
- return false;
+static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH,
+ unsigned SliceBegin, unsigned SliceSize) {
+ VL = VL.slice(SliceBegin, SliceSize);
+ VH = VH.slice(SliceBegin, SliceSize);
+ return !std::equal(VL.begin(), VL.end(), VH.begin());
}
bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
@@ -3195,7 +3191,8 @@ bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
<< "\n");
Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
- unsigned Sz = DL->getTypeSizeInBits(StoreTy);
+ auto &DL = cast<StoreInst>(Chain[0])->getModule()->getDataLayout();
+ unsigned Sz = DL.getTypeSizeInBits(StoreTy);
unsigned VF = MinVecRegSize / Sz;
if (!isPowerOf2_32(Sz) || VF < 2)
@@ -3238,8 +3235,8 @@ bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
int costThreshold, BoUpSLP &R) {
- SetVector<Value *> Heads, Tails;
- SmallDenseMap<Value *, Value *> ConsecutiveChain;
+ SetVector<StoreInst *> Heads, Tails;
+ SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
// We may run into multiple chains that merge into a single chain. We mark the
// stores that we vectorized so that we don't visit the same store twice.
@@ -3252,8 +3249,8 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
for (unsigned j = 0; j < e; ++j) {
if (i == j)
continue;
-
- if (R.isConsecutiveAccess(Stores[i], Stores[j])) {
+ const DataLayout &DL = Stores[i]->getModule()->getDataLayout();
+ if (R.isConsecutiveAccess(Stores[i], Stores[j], DL)) {
Tails.insert(Stores[j]);
Heads.insert(Stores[i]);
ConsecutiveChain[Stores[i]] = Stores[j];
@@ -3262,7 +3259,7 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
}
// For stores that start but don't end a link in the chain:
- for (SetVector<Value *>::iterator it = Heads.begin(), e = Heads.end();
+ for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
it != e; ++it) {
if (Tails.count(*it))
continue;
@@ -3270,7 +3267,7 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
// We found a store instr that starts a chain. Now follow the chain and try
// to vectorize it.
BoUpSLP::ValueList Operands;
- Value *I = *it;
+ StoreInst *I = *it;
// Collect the chain into a list.
while (Tails.count(I) || Heads.count(I)) {
if (VectorizedStores.count(I))
@@ -3295,6 +3292,7 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
unsigned count = 0;
StoreRefs.clear();
+ const DataLayout &DL = BB->getModule()->getDataLayout();
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
StoreInst *SI = dyn_cast<StoreInst>(it);
if (!SI)
@@ -3340,9 +3338,10 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
return false;
unsigned Opcode0 = I0->getOpcode();
+ const DataLayout &DL = I0->getModule()->getDataLayout();
Type *Ty0 = I0->getType();
- unsigned Sz = DL->getTypeSizeInBits(Ty0);
+ unsigned Sz = DL.getTypeSizeInBits(Ty0);
unsigned VF = MinVecRegSize / Sz;
for (int i = 0, e = VL.size(); i < e; ++i) {
@@ -3544,8 +3543,7 @@ public:
ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {}
/// \brief Try to find a reduction tree.
- bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B,
- const DataLayout *DL) {
+ bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
assert((!Phi ||
std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
"Thi phi needs to use the binary operator");
@@ -3570,9 +3568,10 @@ public:
if (!isValidElementType(Ty))
return false;
+ const DataLayout &DL = B->getModule()->getDataLayout();
ReductionOpcode = B->getOpcode();
ReducedValueOpcode = 0;
- ReduxWidth = MinVecRegSize / DL->getTypeSizeInBits(Ty);
+ ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);
ReductionRoot = B;
ReductionPHI = Phi;
@@ -3882,8 +3881,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// Try to match and vectorize a horizontal reduction.
HorizontalReduction HorRdx;
- if (ShouldVectorizeHor &&
- HorRdx.matchAssociativeReduction(P, BI, DL) &&
+ if (ShouldVectorizeHor && HorRdx.matchAssociativeReduction(P, BI) &&
HorRdx.tryToReduce(R, TTI)) {
Changed = true;
it = BB->begin();
@@ -3913,7 +3911,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
if (BinaryOperator *BinOp =
dyn_cast<BinaryOperator>(SI->getValueOperand())) {
HorizontalReduction HorRdx;
- if (((HorRdx.matchAssociativeReduction(nullptr, BinOp, DL) &&
+ if (((HorRdx.matchAssociativeReduction(nullptr, BinOp) &&
HorRdx.tryToReduce(R, TTI)) ||
tryToVectorize(BinOp, R))) {
Changed = true;