author    Stephen Hines <srhines@google.com>  2013-03-05 23:27:24 -0800
committer Stephen Hines <srhines@google.com>  2013-03-05 23:27:24 -0800
commit    5adb136be579e8fff3734461580cb34d1d2983b8 (patch)
tree      bff1a422e9c9789df563aaf9a7e91e63e8ec0384 /lib/Transforms/Vectorize/LoopVectorize.cpp
parent    227a4a4ade38716ba9eb3205f48b52910f3b955e (diff)
parent    b3201c5cf1e183d840f7c99ff779d57f1549d8e5 (diff)
Merge commit 'b3201c5cf1e183d840f7c99ff779d57f1549d8e5' into merge_20130226

Conflicts:
	include/llvm/Support/ELF.h
	lib/Support/DeltaAlgorithm.cpp

Change-Id: I24a4fbce62eb39d924efee3c687b55e1e17b30cd
Diffstat (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r--  lib/Transforms/Vectorize/LoopVectorize.cpp | 980
1 file changed, 636 insertions(+), 344 deletions(-)
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 9c82cb8..f489393 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9,10 +9,10 @@
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR. Legalization of the IR is done
-// in the codegen. However, the vectorizes uses (will use) the codegen
+// in the codegen. However, the vectorizer uses (will use) the codegen
// interfaces to generate IR that is likely to result in an optimal binary.
//
-// The loop vectorizer combines consecutive loop iteration into a single
+// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
@@ -32,7 +32,7 @@
// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
-// Karrenberg, R. and Hack, S. Whole Function Vectorization.
+// Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// Other ideas/concepts are from:
// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
@@ -101,24 +101,20 @@ EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
cl::desc("Enable if-conversion during vectorization."));
/// We don't vectorize loops with a known constant trip count below this number.
-static const unsigned TinyTripCountVectorThreshold = 16;
+static cl::opt<unsigned>
+TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),
+ cl::Hidden,
+ cl::desc("Don't vectorize loops with a constant "
+ "trip count that is smaller than this "
+ "value."));
/// We don't unroll loops with a known constant trip count below this number.
static const unsigned TinyTripCountUnrollThreshold = 128;
-/// We don't unroll loops that are larget than this threshold.
-static const unsigned MaxLoopSizeThreshold = 32;
-
/// When performing a runtime memory check, do not check more than this
/// number of pointers. Notice that the check is quadratic!
static const unsigned RuntimeMemoryCheckThreshold = 4;
-/// This is the highest vector width that we try to generate.
-static const unsigned MaxVectorSize = 8;
-
-/// This is the highest Unroll Factor.
-static const unsigned MaxUnrollSize = 4;
-
namespace {
// Forward declarations.
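
A usage note, not part of the patch: with the threshold now a cl::opt, it can be overridden when running the pass through opt. The flag name comes from the hunk above; the surrounding invocation is an assumption about a typical command line:

opt -loop-vectorize -vectorizer-min-trip-count=4 -S input.ll -o output.ll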
@@ -169,8 +165,8 @@ private:
/// Add code that checks at runtime if the accessed arrays overlap.
/// Returns the comparator value or NULL if no check is needed.
- Value *addRuntimeCheck(LoopVectorizationLegality *Legal,
- Instruction *Loc);
+ Instruction *addRuntimeCheck(LoopVectorizationLegality *Legal,
+ Instruction *Loc);
/// Create an empty loop, based on the loop ranges of the old loop.
void createEmptyLoop(LoopVectorizationLegality *Legal);
/// Copy and widen the instructions from the old loop.
@@ -196,6 +192,10 @@ private:
/// of scalars.
void scalarizeInstruction(Instruction *Instr);
+ /// Vectorize Load and Store instructions.
+ void vectorizeMemoryInstruction(Instruction *Instr,
+ LoopVectorizationLegality *Legal);
+
/// Create a broadcast instruction. This method generates a broadcast
/// instruction (shuffle) for loop invariant values and for the induction
/// value. If this is the induction variable then we extend it to N, N+1, ...
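
As an illustration of the two cases (a standalone sketch, not code from the patch): for VF = 4 a loop-invariant value v becomes the splat <v, v, v, v>, while the induction variable N becomes the consecutive vector <N, N+1, N+2, N+3>.

#include <array>
#include <cstdint>

// Broadcast a loop-invariant value: <v, v, v, v>.
std::array<int64_t, 4> broadcast(int64_t v) {
  return {v, v, v, v};
}

// Widen the induction variable: <N, N+1, N+2, N+3>.
std::array<int64_t, 4> widenInduction(int64_t n) {
  return {n, n + 1, n + 2, n + 3};
}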
@@ -228,31 +228,34 @@ private:
ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {}
/// \return True if 'Key' is saved in the Value Map.
- bool has(Value *Key) { return MapStoreage.count(Key); }
+ bool has(Value *Key) const { return MapStorage.count(Key); }
/// Initializes a new entry in the map. Sets all of the vector parts to the
/// save value in 'Val'.
/// \return A reference to a vector with splat values.
VectorParts &splat(Value *Key, Value *Val) {
- MapStoreage[Key].clear();
- MapStoreage[Key].append(UF, Val);
- return MapStoreage[Key];
+ VectorParts &Entry = MapStorage[Key];
+ Entry.assign(UF, Val);
+ return Entry;
}
///\return A reference to the value that is stored at 'Key'.
VectorParts &get(Value *Key) {
- if (!has(Key))
- MapStoreage[Key].resize(UF);
- return MapStoreage[Key];
+ VectorParts &Entry = MapStorage[Key];
+ if (Entry.empty())
+ Entry.resize(UF);
+ assert(Entry.size() == UF);
+ return Entry;
}
+ private:
/// The unroll factor. Each entry in the map stores this number of vector
/// elements.
unsigned UF;
/// Map storage. We use std::map and not DenseMap because insertions to a
/// dense map invalidates its iterators.
- std::map<Value*, VectorParts> MapStoreage;
+ std::map<Value *, VectorParts> MapStorage;
};
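
The semantics of the reworked map are easiest to see in isolation. Below is a standalone miniature (int stands in for llvm::Value*, std::vector for SmallVector; the logic mirrors the hunk above):

#include <cassert>
#include <map>
#include <vector>

using VectorParts = std::vector<int>; // one value per unroll part

struct ValueMap {
  explicit ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {}

  // splat(): all UF parts are initialized to the same value.
  VectorParts &splat(int Key, int Val) {
    VectorParts &Entry = Storage[Key];
    Entry.assign(UF, Val);
    return Entry;
  }

  // get(): lazily creates a UF-sized entry. Because std::map, unlike
  // DenseMap, never invalidates references on insertion, references
  // handed out here stay valid across later calls.
  VectorParts &get(int Key) {
    VectorParts &Entry = Storage[Key];
    if (Entry.empty())
      Entry.resize(UF);
    assert(Entry.size() == UF);
    return Entry;
  }

private:
  unsigned UF;
  std::map<int, VectorParts> Storage;
};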
/// The original loop.
@@ -289,8 +292,8 @@ private:
BasicBlock *LoopVectorBody;
///The scalar loop body.
BasicBlock *LoopScalarBody;
- ///The first bypass block.
- BasicBlock *LoopBypassBlock;
+ /// A list of all bypass blocks. The first block is the entry of the loop.
+ SmallVector<BasicBlock *, 4> LoopBypassBlocks;
/// The new Induction variable which was added to the new block.
PHINode *Induction;
@@ -316,8 +319,9 @@ private:
class LoopVectorizationLegality {
public:
LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL,
- DominatorTree *DT)
- : TheLoop(L), SE(SE), DL(DL), DT(DT), Induction(0) {}
+ DominatorTree *DT, TargetTransformInfo* TTI,
+ AliasAnalysis* AA)
+ : TheLoop(L), SE(SE), DL(DL), DT(DT), TTI(TTI), AA(AA), Induction(0) {}
/// This enum represents the kinds of reductions that we support.
enum ReductionKind {
@@ -336,7 +340,8 @@ public:
IK_NoInduction, ///< Not an induction variable.
IK_IntInduction, ///< Integer induction variable. Step = 1.
IK_ReverseIntInduction, ///< Reverse int induction variable. Step = -1.
- IK_PtrInduction ///< Pointer induction variable. Step = sizeof(elem).
+ IK_PtrInduction, ///< Pointer induction var. Step = sizeof(elem).
+ IK_ReversePtrInduction ///< Reverse ptr indvar. Step = - sizeof(elem).
};
/// This POD struct holds information about reduction variables.
@@ -400,6 +405,11 @@ public:
/// induction descriptor.
typedef MapVector<PHINode*, InductionInfo> InductionList;
+ /// Alias(Multi)Map stores the values (GEPs or underlying objects) and their
+ /// respective Store/Load instruction(s) to calculate aliasing.
+ typedef DenseMap<Value*, Instruction* > AliasMap;
+ typedef DenseMap<Value*, std::vector<Instruction*> > AliasMultiMap;
+
/// Returns true if it is legal to vectorize this loop.
/// This does not mean that it is profitable to vectorize this
/// loop, only that it is legal to do so.
@@ -473,6 +483,14 @@ private:
InductionKind isInductionVariable(PHINode *Phi);
/// Return true if can compute the address bounds of Ptr within the loop.
bool hasComputableBounds(Value *Ptr);
+ /// Return true if there is the chance of write reorder.
+ bool hasPossibleGlobalWriteReorder(Value *Object,
+ Instruction *Inst,
+ AliasMultiMap &WriteObjects,
+ unsigned MaxByteWidth);
+ /// Return the AA location for a load or a store.
+ AliasAnalysis::Location getLoadStoreLocation(Instruction *Inst);
+
/// The loop that we evaluate.
Loop *TheLoop;
@@ -480,8 +498,12 @@ private:
ScalarEvolution *SE;
/// DataLayout analysis.
DataLayout *DL;
- // Dominators.
+ /// Dominators.
DominatorTree *DT;
+ /// Target Info.
+ TargetTransformInfo *TTI;
+ /// Alias Analysis.
+ AliasAnalysis *AA;
// --- vectorization state --- //
@@ -517,20 +539,34 @@ class LoopVectorizationCostModel {
public:
LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
LoopVectorizationLegality *Legal,
- const TargetTransformInfo &TTI)
- : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI) {}
-
- /// \return The most profitable vectorization factor.
+ const TargetTransformInfo &TTI,
+ DataLayout *DL)
+ : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL) {}
+
+ /// Information about vectorization costs
+ struct VectorizationFactor {
+ unsigned Width; // Vector width with best cost
+ unsigned Cost; // Cost of the loop with that width
+ };
+ /// \return The most profitable vectorization factor and the cost of that VF.
/// This method checks every power of two up to VF. If UserVF is not ZERO
/// then this vectorization factor will be selected if vectorization is
/// possible.
- unsigned selectVectorizationFactor(bool OptForSize, unsigned UserVF);
+ VectorizationFactor selectVectorizationFactor(bool OptForSize,
+ unsigned UserVF);
+ /// \return The size (in bits) of the widest type in the code that
+ /// needs to be vectorized. We ignore values that remain scalar such as
+ /// 64 bit loop indices.
+ unsigned getWidestType();
/// \return The most profitable unroll factor.
/// If UserUF is non-zero then this method finds the best unroll-factor
/// based on register pressure and other parameters.
- unsigned selectUnrollFactor(bool OptForSize, unsigned UserUF);
+ /// VF and LoopCost are the selected vectorization factor and the cost of the
+ /// selected VF.
+ unsigned selectUnrollFactor(bool OptForSize, unsigned UserUF, unsigned VF,
+ unsigned LoopCost);
/// \brief A struct that represents some properties of the register usage
/// of a loop.
@@ -562,6 +598,10 @@ private:
/// the scalar type.
static Type* ToVectorTy(Type *Scalar, unsigned VF);
+ /// Returns whether the instruction is a load or store and will be emitted
+ /// as a vector operation.
+ bool isConsecutiveLoadOrStore(Instruction *I);
+
/// The loop that we evaluate.
Loop *TheLoop;
/// Scev analysis.
@@ -572,6 +612,8 @@ private:
LoopVectorizationLegality *Legal;
/// Vector target information.
const TargetTransformInfo &TTI;
+ /// Target data layout information.
+ DataLayout *DL;
};
/// The LoopVectorize Pass.
@@ -588,6 +630,7 @@ struct LoopVectorize : public LoopPass {
LoopInfo *LI;
TargetTransformInfo *TTI;
DominatorTree *DT;
+ AliasAnalysis *AA;
virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
// We only vectorize innermost loops.
@@ -599,21 +642,22 @@ struct LoopVectorize : public LoopPass {
LI = &getAnalysis<LoopInfo>();
TTI = &getAnalysis<TargetTransformInfo>();
DT = &getAnalysis<DominatorTree>();
+ AA = getAnalysisIfAvailable<AliasAnalysis>();
DEBUG(dbgs() << "LV: Checking a loop in \"" <<
L->getHeader()->getParent()->getName() << "\"\n");
// Check if it is legal to vectorize the loop.
- LoopVectorizationLegality LVL(L, SE, DL, DT);
+ LoopVectorizationLegality LVL(L, SE, DL, DT, TTI, AA);
if (!LVL.canVectorize()) {
DEBUG(dbgs() << "LV: Not vectorizing.\n");
return false;
}
// Use the cost model.
- LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI);
+ LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL);
- // Check the function attribues to find out if this function should be
+ // Check the function attributes to find out if this function should be
// optimized for size.
Function *F = L->getHeader()->getParent();
Attribute::AttrKind SzAttr = Attribute::OptimizeForSize;
@@ -628,20 +672,24 @@ struct LoopVectorize : public LoopPass {
return false;
}
- unsigned VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor);
- unsigned UF = CM.selectUnrollFactor(OptForSize, VectorizationUnroll);
+ // Select the optimal vectorization factor.
+ LoopVectorizationCostModel::VectorizationFactor VF;
+ VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor);
+ // Select the unroll factor.
+ unsigned UF = CM.selectUnrollFactor(OptForSize, VectorizationUnroll,
+ VF.Width, VF.Cost);
- if (VF == 1) {
+ if (VF.Width == 1) {
DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
return false;
}
- DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<<
+ DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF.Width << ") in "<<
F->getParent()->getModuleIdentifier()<<"\n");
DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n");
- // If we decided that it is *legal* to vectorizer the loop then do it.
- InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF, UF);
+ // If we decided that it is *legal* to vectorize the loop then do it.
+ InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF.Width, UF);
LB.vectorize(&LVL);
DEBUG(verifyFunction(*L->getHeader()->getParent()));
@@ -730,6 +778,9 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, unsigned StartIdx,
int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr");
+ // Make sure that the pointer does not point to structs.
+ if (cast<PointerType>(Ptr->getType())->getElementType()->isAggregateType())
+ return 0;
// If this value is a pointer induction variable we know it is consecutive.
PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
@@ -737,6 +788,8 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
InductionInfo II = Inductions[Phi];
if (IK_PtrInduction == II.IK)
return 1;
+ else if (IK_ReversePtrInduction == II.IK)
+ return -1;
}
GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
@@ -746,6 +799,29 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
unsigned NumOperands = Gep->getNumOperands();
Value *LastIndex = Gep->getOperand(NumOperands - 1);
+ Value *GpPtr = Gep->getPointerOperand();
+ // If this GEP value is a consecutive pointer induction variable and all of
+ // the indices are constant then we know it is consecutive.
+ Phi = dyn_cast<PHINode>(GpPtr);
+ if (Phi && Inductions.count(Phi)) {
+
+ // Make sure that the pointer does not point to structs.
+ PointerType *GepPtrType = cast<PointerType>(GpPtr->getType());
+ if (GepPtrType->getElementType()->isAggregateType())
+ return 0;
+
+ // Make sure that all of the index operands are loop invariant.
+ for (unsigned i = 1; i < NumOperands; ++i)
+ if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
+ return 0;
+
+ InductionInfo II = Inductions[Phi];
+ if (IK_PtrInduction == II.IK)
+ return 1;
+ else if (IK_ReversePtrInduction == II.IK)
+ return -1;
+ }
+
// Check that all of the gep indices are uniform except for the last.
for (unsigned i = 0; i < NumOperands - 1; ++i)
if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
@@ -784,8 +860,7 @@ InnerLoopVectorizer::getVectorValue(Value *V) {
// If this scalar is unknown, assume that it is a constant or that it is
// loop invariant. Broadcast V and save the value for future uses.
Value *B = getBroadcastInstrs(V);
- WidenMap.splat(V, B);
- return WidenMap.get(V);
+ return WidenMap.splat(V, B);
}
Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
@@ -799,6 +874,111 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
"reverse");
}
+
+void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
+ LoopVectorizationLegality *Legal) {
+ // Attempt to issue a wide load.
+ LoadInst *LI = dyn_cast<LoadInst>(Instr);
+ StoreInst *SI = dyn_cast<StoreInst>(Instr);
+
+ assert((LI || SI) && "Invalid Load/Store instruction");
+
+ Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType();
+ Type *DataTy = VectorType::get(ScalarDataTy, VF);
+ Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
+ unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
+
+ // If the pointer is loop invariant or if it is non consecutive,
+ // scalarize the load.
+ int Stride = Legal->isConsecutivePtr(Ptr);
+ bool Reverse = Stride < 0;
+ bool UniformLoad = LI && Legal->isUniform(Ptr);
+ if (Stride == 0 || UniformLoad)
+ return scalarizeInstruction(Instr);
+
+ Constant *Zero = Builder.getInt32(0);
+ VectorParts &Entry = WidenMap.get(Instr);
+
+ // Handle consecutive loads/stores.
+ GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+ if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) {
+ Value *PtrOperand = Gep->getPointerOperand();
+ Value *FirstBasePtr = getVectorValue(PtrOperand)[0];
+ FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero);
+
+ // Create the new GEP with the new induction variable.
+ GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
+ Gep2->setOperand(0, FirstBasePtr);
+ Gep2->setName("gep.indvar.base");
+ Ptr = Builder.Insert(Gep2);
+ } else if (Gep) {
+ assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()),
+ OrigLoop) && "Base ptr must be invariant");
+
+ // The last index does not have to be the induction. It can be
+ // consecutive and be a function of the index. For example A[I+1];
+ unsigned NumOperands = Gep->getNumOperands();
+
+ Value *LastGepOperand = Gep->getOperand(NumOperands - 1);
+ VectorParts &GEPParts = getVectorValue(LastGepOperand);
+ Value *LastIndex = GEPParts[0];
+ LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
+
+ // Create the new GEP with the new induction variable.
+ GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
+ Gep2->setOperand(NumOperands - 1, LastIndex);
+ Gep2->setName("gep.indvar.idx");
+ Ptr = Builder.Insert(Gep2);
+ } else {
+ // Use the induction element ptr.
+ assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
+ VectorParts &PtrVal = getVectorValue(Ptr);
+ Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
+ }
+
+ // Handle Stores:
+ if (SI) {
+ assert(!Legal->isUniform(SI->getPointerOperand()) &&
+ "We do not allow storing to uniform addresses");
+
+ VectorParts &StoredVal = getVectorValue(SI->getValueOperand());
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ // Calculate the pointer for the specific unroll-part.
+ Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));
+
+ if (Reverse) {
+ // If we store to reverse consecutive memory locations then we need
+ // to reverse the order of elements in the stored value.
+ StoredVal[Part] = reverseVector(StoredVal[Part]);
+ // If the address is consecutive but reversed, then the
+ // wide store needs to start at the last vector element.
+ PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
+ PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
+ }
+
+ Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo());
+ Builder.CreateStore(StoredVal[Part], VecPtr)->setAlignment(Alignment);
+ }
+ }
+
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ // Calculate the pointer for the specific unroll-part.
+ Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));
+
+ if (Reverse) {
+ // If the address is consecutive but reversed, then the
+ // wide store needs to start at the last vector element.
+ PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
+ PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
+ }
+
+ Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo());
+ Value *LI = Builder.CreateLoad(VecPtr, "wide.load");
+ cast<LoadInst>(LI)->setAlignment(Alignment);
+ Entry[Part] = Reverse ? reverseVector(LI) : LI;
+ }
+}
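
A worked example of the reverse-access arithmetic above, using illustrative numbers (VF = 4, UF = 2) rather than anything from the patch. Each part's pointer is offset by -Part*VF and then by 1-VF, so every wide load begins at the lowest-addressed element of its reversed group:

#include <cstdio>

int main() {
  const int VF = 4, UF = 2;
  for (int Part = 0; Part < UF; ++Part) {
    int Fwd = Part * VF;               // forward: Ptr + Part*VF
    int Rev = -Part * VF + (1 - VF);   // reverse: Ptr - Part*VF + (1 - VF)
    std::printf("part %d: forward [%d..%d], reverse [%d..%d]\n",
                Part, Fwd, Fwd + VF - 1, Rev, Rev + VF - 1);
  }
  // part 0: forward [0..3], reverse [-3..0]
  // part 1: forward [4..7], reverse [-7..-4]
}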
+
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
// Holds vector parameters or scalars, in case of uniform vals.
@@ -870,7 +1050,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
}
}
-Value*
+Instruction *
InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
Instruction *Loc) {
LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
@@ -879,7 +1059,7 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
if (!PtrRtCheck->Need)
return NULL;
- Value *MemoryRuntimeCheck = 0;
+ Instruction *MemoryRuntimeCheck = 0;
unsigned NumPointers = PtrRtCheck->Pointers.size();
SmallVector<Value* , 2> Starts;
SmallVector<Value* , 2> Ends;
@@ -908,28 +1088,23 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
}
}
+ IRBuilder<> ChkBuilder(Loc);
+
for (unsigned i = 0; i < NumPointers; ++i) {
for (unsigned j = i+1; j < NumPointers; ++j) {
- Instruction::CastOps Op = Instruction::BitCast;
- Value *Start0 = CastInst::Create(Op, Starts[i], PtrArithTy, "bc", Loc);
- Value *Start1 = CastInst::Create(Op, Starts[j], PtrArithTy, "bc", Loc);
- Value *End0 = CastInst::Create(Op, Ends[i], PtrArithTy, "bc", Loc);
- Value *End1 = CastInst::Create(Op, Ends[j], PtrArithTy, "bc", Loc);
-
- Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
- Start0, End1, "bound0", Loc);
- Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
- Start1, End0, "bound1", Loc);
- Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1,
- "found.conflict", Loc);
+ Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy, "bc");
+ Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy, "bc");
+ Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy, "bc");
+ Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy, "bc");
+
+ Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0");
+ Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1");
+ Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict");
if (MemoryRuntimeCheck)
- MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or,
- MemoryRuntimeCheck,
- IsConflict,
- "conflict.rdx", Loc);
- else
- MemoryRuntimeCheck = IsConflict;
+ IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict,
+ "conflict.rdx");
+ MemoryRuntimeCheck = cast<Instruction>(IsConflict);
}
}
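
The guard assembled above is a pairwise interval-overlap test. Restated as a standalone sketch with integer bounds (the patch compares bitcast pointers and ORs every pair's result into a single runtime check):

#include <cstdint>

// Ranges [Start0, End0] and [Start1, End1] may conflict exactly when
// each start is at or below the other range's end; this is the same
// bound0/bound1/found.conflict structure the patch emits.
bool mayConflict(uint64_t Start0, uint64_t End0,
                 uint64_t Start1, uint64_t End1) {
  bool Bound0 = Start0 <= End1;
  bool Bound1 = Start1 <= End0;
  return Bound0 && Bound1;
}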
@@ -943,7 +1118,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
the vectorized instructions while the old loop will continue to run the
scalar remainder.
- [ ] <-- vector loop bypass.
+ [ ] <-- vector loop bypass (may consist of multiple blocks).
/ |
/ v
| [ ] <-- vector pre header.
@@ -1004,10 +1179,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
ConstantInt::get(IdxTy, 0);
assert(BypassBlock && "Invalid loop structure");
-
- // Generate the code that checks in runtime if arrays overlap.
- Value *MemoryRuntimeCheck = addRuntimeCheck(Legal,
- BypassBlock->getTerminator());
+ LoopBypassBlocks.push_back(BypassBlock);
// Split the single block loop into the two loop structure described above.
BasicBlock *VectorPH =
@@ -1019,10 +1191,6 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
BasicBlock *ScalarPH =
MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
- // This is the location in which we add all of the logic for bypassing
- // the new vector loop.
- Instruction *Loc = BypassBlock->getTerminator();
-
// Use this IR builder to create the loop instructions (Phi, Br, Cmp)
// inside the loop.
Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
@@ -1033,42 +1201,62 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
// times the unroll factor (num of SIMD instructions).
Constant *Step = ConstantInt::get(IdxTy, VF * UF);
+ // This is the IR builder that we use to add all of the logic for bypassing
+ // the new vector loop.
+ IRBuilder<> BypassBuilder(BypassBlock->getTerminator());
+
// We may need to extend the index in case there is a type mismatch.
// We know that the count starts at zero and does not overflow.
if (Count->getType() != IdxTy) {
// The exit count can be of pointer type. Convert it to the correct
// integer type.
if (ExitCount->getType()->isPointerTy())
- Count = CastInst::CreatePointerCast(Count, IdxTy, "ptrcnt.to.int", Loc);
+ Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int");
else
- Count = CastInst::CreateZExtOrBitCast(Count, IdxTy, "zext.cnt", Loc);
+ Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast");
}
// Add the start index to the loop count to get the new end index.
- Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc);
+ Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx");
// Now we need to generate the expression for N - (N % VF), which is
// the part that the vectorized body will execute.
- Value *R = BinaryOperator::CreateURem(Count, Step, "n.mod.vf", Loc);
- Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc);
- Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx,
- "end.idx.rnd.down", Loc);
+ Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf");
+ Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec");
+ Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx,
+ "end.idx.rnd.down");
// Now, compare the new count to zero. If it is zero skip the vector loop and
// jump to the scalar loop.
- Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
- IdxEndRoundDown,
- StartIdx,
- "cmp.zero", Loc);
-
- // If we are using memory runtime checks, include them in.
- if (MemoryRuntimeCheck)
- Cmp = BinaryOperator::Create(Instruction::Or, Cmp, MemoryRuntimeCheck,
- "CntOrMem", Loc);
+ Value *Cmp = BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx,
+ "cmp.zero");
+
+ BasicBlock *LastBypassBlock = BypassBlock;
+
+ // Generate the code that checks in runtime if arrays overlap. We put the
+ // checks into a separate block to make the more common case of few elements
+ // faster.
+ Instruction *MemRuntimeCheck = addRuntimeCheck(Legal,
+ BypassBlock->getTerminator());
+ if (MemRuntimeCheck) {
+ // Create a new block containing the memory check.
+ BasicBlock *CheckBlock = BypassBlock->splitBasicBlock(MemRuntimeCheck,
+ "vector.memcheck");
+ LoopBypassBlocks.push_back(CheckBlock);
+
+ // Replace the branch into the memory check block with a conditional branch
+ // for the "few elements case".
+ Instruction *OldTerm = BypassBlock->getTerminator();
+ BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm);
+ OldTerm->eraseFromParent();
+
+ Cmp = MemRuntimeCheck;
+ LastBypassBlock = CheckBlock;
+ }
- BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc);
- // Remove the old terminator.
- Loc->eraseFromParent();
+ LastBypassBlock->getTerminator()->eraseFromParent();
+ BranchInst::Create(MiddleBlock, VectorPH, Cmp,
+ LastBypassBlock);
// We are going to resume the execution of the scalar loop.
// Go over all of the induction variables that we found and fix the
@@ -1108,30 +1296,45 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
Value *CRD = CountRoundDown;
if (CRDSize > IISize)
CRD = CastInst::Create(Instruction::Trunc, CountRoundDown,
- II.StartValue->getType(),
- "tr.crd", BypassBlock->getTerminator());
+ II.StartValue->getType(), "tr.crd",
+ LoopBypassBlocks.back()->getTerminator());
else if (CRDSize < IISize)
CRD = CastInst::Create(Instruction::SExt, CountRoundDown,
II.StartValue->getType(),
- "sext.crd", BypassBlock->getTerminator());
+ "sext.crd",
+ LoopBypassBlocks.back()->getTerminator());
// Handle reverse integer induction counter:
- EndValue = BinaryOperator::CreateSub(II.StartValue, CRD, "rev.ind.end",
- BypassBlock->getTerminator());
+ EndValue =
+ BinaryOperator::CreateSub(II.StartValue, CRD, "rev.ind.end",
+ LoopBypassBlocks.back()->getTerminator());
break;
}
case LoopVectorizationLegality::IK_PtrInduction: {
// For pointer induction variables, calculate the offset using
// the end index.
- EndValue = GetElementPtrInst::Create(II.StartValue, CountRoundDown,
- "ptr.ind.end",
- BypassBlock->getTerminator());
+ EndValue =
+ GetElementPtrInst::Create(II.StartValue, CountRoundDown, "ptr.ind.end",
+ LoopBypassBlocks.back()->getTerminator());
+ break;
+ }
+ case LoopVectorizationLegality::IK_ReversePtrInduction: {
+ // The value at the end of the loop for the reverse pointer is calculated
+ // by creating a GEP with a negative index starting from the start value.
+ Value *Zero = ConstantInt::get(CountRoundDown->getType(), 0);
+ Value *NegIdx = BinaryOperator::CreateSub(Zero, CountRoundDown,
+ "rev.ind.end",
+ LoopBypassBlocks.back()->getTerminator());
+ EndValue = GetElementPtrInst::Create(II.StartValue, NegIdx,
+ "rev.ptr.ind.end",
+ LoopBypassBlocks.back()->getTerminator());
break;
}
}// end of case
// The new PHI merges the original incoming value, in case of a bypass,
// or the value at the end of the vectorized loop.
- ResumeVal->addIncoming(II.StartValue, BypassBlock);
+ for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+ ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
ResumeVal->addIncoming(EndValue, VecBody);
// Fix the scalar body counter (PHI node).
@@ -1147,7 +1350,8 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
assert(!ResumeIndex && "Unexpected resume value found");
ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
MiddleBlock->getTerminator());
- ResumeIndex->addIncoming(StartIdx, BypassBlock);
+ for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+ ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]);
ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
}
@@ -1187,6 +1391,8 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
// Insert the new loop into the loop nest and register the new basic blocks.
if (ParentLoop) {
ParentLoop->addChildLoop(Lp);
+ for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
+ ParentLoop->addBasicBlockToLoop(LoopBypassBlocks[I], LI->getBase());
ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
@@ -1203,7 +1409,6 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
LoopExitBlock = ExitBlock;
LoopVectorBody = VecBody;
LoopScalarBody = OldBasicBlock;
- LoopBypassBlock = BypassBlock;
}
/// This function returns the identity element (or neutral element) for
@@ -1295,9 +1500,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
// the cost-model.
//
//===------------------------------------------------===//
- BasicBlock &BB = *OrigLoop->getHeader();
- Constant *Zero =
- ConstantInt::get(IntegerType::getInt32Ty(BB.getContext()), 0);
+ Constant *Zero = Builder.getInt32(0);
// In order to support reduction variables we need to be able to vectorize
// Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
@@ -1343,7 +1546,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
// To do so, we need to generate the 'identity' vector and overide
// one of the elements with the incoming scalar reduction. We need
// to do it in the vector-loop preheader.
- Builder.SetInsertPoint(LoopBypassBlock->getTerminator());
+ Builder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator());
// This is the vector-clone of the value that leaves the loop.
VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
@@ -1391,7 +1594,8 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr);
PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
Value *StartVal = (part == 0) ? VectorStart : Identity;
- NewPhi->addIncoming(StartVal, LoopBypassBlock);
+ for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+ NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
NewPhi->addIncoming(RdxExitVal[part], LoopVectorBody);
RdxParts.push_back(NewPhi);
}
@@ -1533,8 +1737,6 @@ InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
void
InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
BasicBlock *BB, PhiVector *PV) {
- Constant *Zero = Builder.getInt32(0);
-
// For each instruction in the old loop.
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
VectorParts &Entry = WidenMap.get(it);
@@ -1568,7 +1770,7 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
// optimizations will clean it up.
VectorParts Cond = createEdgeMask(P->getIncomingBlock(0),
P->getParent());
-
+
for (unsigned part = 0; part < UF; ++part) {
VectorParts &In0 = getVectorValue(P->getIncomingValue(0));
VectorParts &In1 = getVectorValue(P->getIncomingValue(1));
@@ -1600,6 +1802,7 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
}
case LoopVectorizationLegality::IK_ReverseIntInduction:
case LoopVectorizationLegality::IK_PtrInduction:
+ case LoopVectorizationLegality::IK_ReversePtrInduction:
// Handle reverse integer and pointer inductions.
Value *StartIdx = 0;
// If we have a single integer induction variable then use it.
@@ -1635,15 +1838,23 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
// Handle the pointer induction variable case.
assert(P->getType()->isPointerTy() && "Unexpected type.");
+ // Is this a reverse induction ptr or a consecutive induction ptr?
+ bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction ==
+ II.IK);
+
// This is the vector of results. Notice that we don't generate
// vector geps because scalar geps result in better code.
for (unsigned part = 0; part < UF; ++part) {
Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
for (unsigned int i = 0; i < VF; ++i) {
- Constant *Idx = ConstantInt::get(Induction->getType(),
- i + part * VF);
- Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx,
- "gep.idx");
+ int EltIndex = (i + part * VF) * (Reverse ? -1 : 1);
+ Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
+ Value *GlobalIdx;
+ if (!Reverse)
+ GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
+ else
+ GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx");
+
Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
"next.gep");
VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
@@ -1684,13 +1895,13 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
for (unsigned Part = 0; Part < UF; ++Part) {
Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);
- // Update the NSW, NUW and Exact flags.
- BinaryOperator *VecOp = cast<BinaryOperator>(V);
- if (isa<OverflowingBinaryOperator>(BinOp)) {
+ // Update the NSW, NUW and Exact flags. Notice: V can be an Undef.
+ BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V);
+ if (VecOp && isa<OverflowingBinaryOperator>(BinOp)) {
VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap());
VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap());
}
- if (isa<PossiblyExactOperator>(VecOp))
+ if (VecOp && isa<PossiblyExactOperator>(VecOp))
VecOp->setIsExact(BinOp->isExact());
Entry[Part] = V;
@@ -1740,124 +1951,10 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
break;
}
- case Instruction::Store: {
- // Attempt to issue a wide store.
- StoreInst *SI = dyn_cast<StoreInst>(it);
- Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF);
- Value *Ptr = SI->getPointerOperand();
- unsigned Alignment = SI->getAlignment();
-
- assert(!Legal->isUniform(Ptr) &&
- "We do not allow storing to uniform addresses");
-
-
- int Stride = Legal->isConsecutivePtr(Ptr);
- bool Reverse = Stride < 0;
- if (Stride == 0) {
- scalarizeInstruction(it);
+ case Instruction::Store:
+ case Instruction::Load:
+ vectorizeMemoryInstruction(it, Legal);
break;
- }
-
- // Handle consecutive stores.
-
- GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
- if (Gep) {
- // The last index does not have to be the induction. It can be
- // consecutive and be a function of the index. For example A[I+1];
- unsigned NumOperands = Gep->getNumOperands();
-
- Value *LastGepOperand = Gep->getOperand(NumOperands - 1);
- VectorParts &GEPParts = getVectorValue(LastGepOperand);
- Value *LastIndex = GEPParts[0];
- LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
-
- // Create the new GEP with the new induction variable.
- GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
- Gep2->setOperand(NumOperands - 1, LastIndex);
- Ptr = Builder.Insert(Gep2);
- } else {
- // Use the induction element ptr.
- assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
- VectorParts &PtrVal = getVectorValue(Ptr);
- Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
- }
-
- VectorParts &StoredVal = getVectorValue(SI->getValueOperand());
- for (unsigned Part = 0; Part < UF; ++Part) {
- // Calculate the pointer for the specific unroll-part.
- Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));
-
- if (Reverse) {
- // If we store to reverse consecutive memory locations then we need
- // to reverse the order of elements in the stored value.
- StoredVal[Part] = reverseVector(StoredVal[Part]);
- // If the address is consecutive but reversed, then the
- // wide store needs to start at the last vector element.
- PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
- PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
- }
-
- Value *VecPtr = Builder.CreateBitCast(PartPtr, StTy->getPointerTo());
- Builder.CreateStore(StoredVal[Part], VecPtr)->setAlignment(Alignment);
- }
- break;
- }
- case Instruction::Load: {
- // Attempt to issue a wide load.
- LoadInst *LI = dyn_cast<LoadInst>(it);
- Type *RetTy = VectorType::get(LI->getType(), VF);
- Value *Ptr = LI->getPointerOperand();
- unsigned Alignment = LI->getAlignment();
-
- // If the pointer is loop invariant or if it is non consecutive,
- // scalarize the load.
- int Stride = Legal->isConsecutivePtr(Ptr);
- bool Reverse = Stride < 0;
- if (Legal->isUniform(Ptr) || Stride == 0) {
- scalarizeInstruction(it);
- break;
- }
-
- GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
- if (Gep) {
- // The last index does not have to be the induction. It can be
- // consecutive and be a function of the index. For example A[I+1];
- unsigned NumOperands = Gep->getNumOperands();
-
- Value *LastGepOperand = Gep->getOperand(NumOperands - 1);
- VectorParts &GEPParts = getVectorValue(LastGepOperand);
- Value *LastIndex = GEPParts[0];
- LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
-
- // Create the new GEP with the new induction variable.
- GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
- Gep2->setOperand(NumOperands - 1, LastIndex);
- Ptr = Builder.Insert(Gep2);
- } else {
- // Use the induction element ptr.
- assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
- VectorParts &PtrVal = getVectorValue(Ptr);
- Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
- }
-
- for (unsigned Part = 0; Part < UF; ++Part) {
- // Calculate the pointer for the specific unroll-part.
- Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));
-
- if (Reverse) {
- // If the address is consecutive but reversed, then the
- // wide store needs to start at the last vector element.
- PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
- PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
- }
-
- Value *VecPtr = Builder.CreateBitCast(PartPtr, RetTy->getPointerTo());
- Value *LI = Builder.CreateLoad(VecPtr, "wide.load");
- cast<LoadInst>(LI)->setAlignment(Alignment);
- Entry[Part] = Reverse ? reverseVector(LI) : LI;
- }
- break;
- }
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
@@ -1924,12 +2021,14 @@ void InnerLoopVectorizer::updateAnalysis() {
SE->forgetLoop(OrigLoop);
// Update the dominator tree information.
- assert(DT->properlyDominates(LoopBypassBlock, LoopExitBlock) &&
+ assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
"Entry does not dominate exit.");
- DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlock);
+ for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
+ DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]);
+ DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back());
DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
- DT->addNewBlock(LoopMiddleBlock, LoopBypassBlock);
+ DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks.front());
DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock);
DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
@@ -2196,7 +2295,51 @@ void LoopVectorizationLegality::collectLoopUniforms() {
}
}
+AliasAnalysis::Location
+LoopVectorizationLegality::getLoadStoreLocation(Instruction *Inst) {
+ if (StoreInst *Store = dyn_cast<StoreInst>(Inst))
+ return AA->getLocation(Store);
+ else if (LoadInst *Load = dyn_cast<LoadInst>(Inst))
+ return AA->getLocation(Load);
+
+ llvm_unreachable("Should be either load or store instruction");
+}
+
+bool
+LoopVectorizationLegality::hasPossibleGlobalWriteReorder(
+ Value *Object,
+ Instruction *Inst,
+ AliasMultiMap& WriteObjects,
+ unsigned MaxByteWidth) {
+
+ AliasAnalysis::Location ThisLoc = getLoadStoreLocation(Inst);
+
+ std::vector<Instruction*>::iterator
+ it = WriteObjects[Object].begin(),
+ end = WriteObjects[Object].end();
+
+ for (; it != end; ++it) {
+ Instruction* I = *it;
+ if (I == Inst)
+ continue;
+
+ AliasAnalysis::Location ThatLoc = getLoadStoreLocation(I);
+ if (AA->alias(ThisLoc.getWithNewSize(MaxByteWidth),
+ ThatLoc.getWithNewSize(MaxByteWidth)))
+ return true;
+ }
+ return false;
+}
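
In outline, the routine widens both locations to MaxByteWidth, the largest access the vectorized loop could make, before querying alias analysis. Here is a standalone analogue with AA replaced by literal byte offsets (illustrative, not the patch's code):

#include <cstdint>
#include <vector>

struct Access { uint64_t Offset; }; // stands in for an AA location

// Mirror of the getWithNewSize() widening: every access is treated as
// MaxByteWidth bytes wide, then tested for overlap against the other
// writes recorded for the same underlying object.
bool mayReorder(const Access &This, const std::vector<Access> &Others,
                uint64_t MaxByteWidth) {
  for (const Access &That : Others)
    if (This.Offset < That.Offset + MaxByteWidth &&
        That.Offset < This.Offset + MaxByteWidth)
      return true;
  return false;
}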
+
bool LoopVectorizationLegality::canVectorizeMemory() {
+
+ if (TheLoop->isAnnotatedParallel()) {
+ DEBUG(dbgs()
+ << "LV: A loop annotated parallel, ignore memory dependency "
+ << "checks.\n");
+ return true;
+ }
+
typedef SmallVector<Value*, 16> ValueVector;
typedef SmallPtrSet<Value*, 16> ValueSet;
// Holds the Load and Store *instructions*.
@@ -2250,9 +2393,10 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
return true;
}
- // Holds the read and read-write *pointers* that we find.
- ValueVector Reads;
- ValueVector ReadWrites;
+ // Holds the read and read-write *pointers* that we find. These maps hold
+ // unique values for pointers (so no need for multi-map).
+ AliasMap Reads;
+ AliasMap ReadWrites;
// Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
// multiple times on the same object. If the ptr is accessed twice, once
@@ -2274,7 +2418,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
// If we did *not* see this pointer before, insert it to
// the read-write list. At this phase it is only a 'write' list.
if (Seen.insert(Ptr))
- ReadWrites.push_back(Ptr);
+ ReadWrites.insert(std::make_pair(Ptr, ST));
}
for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
@@ -2289,7 +2433,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
// read a few words, modify, and write a few words, and some of the
// words may be written to the same address.
if (Seen.insert(Ptr) || 0 == isConsecutivePtr(Ptr))
- Reads.push_back(Ptr);
+ Reads.insert(std::make_pair(Ptr, LD));
}
// If we write (or read-write) to a single destination and there are no
@@ -2302,22 +2446,27 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
// Find pointers with computable bounds. We are going to use this information
// to place a runtime bound check.
bool CanDoRT = true;
- for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I)
- if (hasComputableBounds(*I)) {
- PtrRtCheck.insert(SE, TheLoop, *I);
- DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
+ AliasMap::iterator MI, ME;
+ for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) {
+ Value *V = (*MI).first;
+ if (hasComputableBounds(V)) {
+ PtrRtCheck.insert(SE, TheLoop, V);
+ DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n");
} else {
CanDoRT = false;
break;
}
- for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I)
- if (hasComputableBounds(*I)) {
- PtrRtCheck.insert(SE, TheLoop, *I);
- DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
+ }
+ for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) {
+ Value *V = (*MI).first;
+ if (hasComputableBounds(V)) {
+ PtrRtCheck.insert(SE, TheLoop, V);
+ DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n");
} else {
CanDoRT = false;
break;
}
+ }
// Check that we did not collect too many pointers or found a
// unsizeable pointer.
@@ -2332,47 +2481,104 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
bool NeedRTCheck = false;
+ // Biggest vectorized access possible, vector width * unroll factor.
+ // TODO: We're being very pessimistic here, find a way to know the
+ // real access width before getting here.
+ unsigned MaxByteWidth = (TTI->getRegisterBitWidth(true) / 8) *
+ TTI->getMaximumUnrollFactor();
// Now that the pointers are in two lists (Reads and ReadWrites), we
// can check that there are no conflicts between each of the writes and
// between the writes to the reads.
- ValueSet WriteObjects;
+ // Note that WriteObjects duplicates the stores (indexed now by underlying
+ // objects) to avoid pointing to elements inside ReadWrites.
+ // TODO: Maybe create a new type where they can interact without duplication.
+ AliasMultiMap WriteObjects;
ValueVector TempObjects;
// Check that the read-writes do not conflict with other read-write
// pointers.
bool AllWritesIdentified = true;
- for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) {
- GetUnderlyingObjects(*I, TempObjects, DL);
- for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
- it != e; ++it) {
- if (!isIdentifiedObject(*it)) {
- DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n");
+ for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) {
+ Value *Val = (*MI).first;
+ Instruction *Inst = (*MI).second;
+
+ GetUnderlyingObjects(Val, TempObjects, DL);
+ for (ValueVector::iterator UI=TempObjects.begin(), UE=TempObjects.end();
+ UI != UE; ++UI) {
+ if (!isIdentifiedObject(*UI)) {
+ DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **UI <<"\n");
NeedRTCheck = true;
AllWritesIdentified = false;
}
- if (!WriteObjects.insert(*it)) {
+
+ // Never seen it before, can't alias.
+ if (WriteObjects[*UI].empty()) {
+ DEBUG(dbgs() << "LV: Adding Underlying value:" << **UI <<"\n");
+ WriteObjects[*UI].push_back(Inst);
+ continue;
+ }
+ // Direct alias found.
+ if (!AA || dyn_cast<GlobalValue>(*UI) == NULL) {
+ DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
+ << **UI <<"\n");
+ return false;
+ }
+ DEBUG(dbgs() << "LV: Found a conflicting global value:"
+ << **UI <<"\n");
+ DEBUG(dbgs() << "LV: While examining store:" << *Inst <<"\n");
+ DEBUG(dbgs() << "LV: On value:" << *Val <<"\n");
+
+ // If global alias, make sure they do alias.
+ if (hasPossibleGlobalWriteReorder(*UI,
+ Inst,
+ WriteObjects,
+ MaxByteWidth)) {
DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
- << **it <<"\n");
+ << **UI <<"\n");
return false;
}
+
+ // Didn't alias, insert into map for further reference.
+ WriteObjects[*UI].push_back(Inst);
}
TempObjects.clear();
}
/// Check that the reads don't conflict with the read-writes.
- for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) {
- GetUnderlyingObjects(*I, TempObjects, DL);
- for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
- it != e; ++it) {
+ for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) {
+ Value *Val = (*MI).first;
+ GetUnderlyingObjects(Val, TempObjects, DL);
+ for (ValueVector::iterator UI=TempObjects.begin(), UE=TempObjects.end();
+ UI != UE; ++UI) {
// If all of the writes are identified then we don't care if the read
// pointer is identified or not.
- if (!AllWritesIdentified && !isIdentifiedObject(*it)) {
- DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n");
+ if (!AllWritesIdentified && !isIdentifiedObject(*UI)) {
+ DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **UI <<"\n");
NeedRTCheck = true;
}
- if (WriteObjects.count(*it)) {
- DEBUG(dbgs() << "LV: Found a possible read/write reorder:"
- << **it <<"\n");
+
+ // Never seen it before, can't alias.
+ if (WriteObjects[*UI].empty())
+ continue;
+ // Direct alias found.
+ if (!AA || dyn_cast<GlobalValue>(*UI) == NULL) {
+ DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
+ << **UI <<"\n");
+ return false;
+ }
+ DEBUG(dbgs() << "LV: Found a global value: "
+ << **UI <<"\n");
+ Instruction *Inst = (*MI).second;
+ DEBUG(dbgs() << "LV: While examining load:" << *Inst <<"\n");
+ DEBUG(dbgs() << "LV: On value:" << *Val <<"\n");
+
+ // If global alias, make sure they do alias.
+ if (hasPossibleGlobalWriteReorder(*UI,
+ Inst,
+ WriteObjects,
+ MaxByteWidth)) {
+ DEBUG(dbgs() << "LV: Found a possible read-write reorder:"
+ << **UI <<"\n");
return false;
}
}
@@ -2535,7 +2741,7 @@ LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
return IK_NoInduction;
- // Check that the PHI is consecutive and starts at zero.
+ // Check that the PHI is consecutive.
const SCEV *PhiScev = SE->getSCEV(Phi);
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
if (!AR) {
@@ -2562,6 +2768,8 @@ LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType());
if (C->getValue()->equalsInt(Size))
return IK_PtrInduction;
+ else if (C->getValue()->equalsInt(0 - Size))
+ return IK_ReversePtrInduction;
return IK_NoInduction;
}
@@ -2612,18 +2820,34 @@ bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) {
return AR->isAffine();
}
-unsigned
+LoopVectorizationCostModel::VectorizationFactor
LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
unsigned UserVF) {
+ // Width 1 means no vectorization.
+ VectorizationFactor Factor = { 1U, 0U };
if (OptForSize && Legal->getRuntimePointerCheck()->Need) {
DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n");
- return 1;
+ return Factor;
}
// Find the trip count.
unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch());
DEBUG(dbgs() << "LV: Found trip count:"<<TC<<"\n");
+ unsigned WidestType = getWidestType();
+ unsigned WidestRegister = TTI.getRegisterBitWidth(true);
+ unsigned MaxVectorSize = WidestRegister / WidestType;
+ DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n");
+ DEBUG(dbgs() << "LV: The Widest register is:" << WidestRegister << "bits.\n");
+
+ if (MaxVectorSize == 0) {
+ DEBUG(dbgs() << "LV: The target has no vector registers.\n");
+ MaxVectorSize = 1;
+ }
+
+ assert(MaxVectorSize <= 32 && "Did not expect to pack so many elements"
+ " into one vector!");
+
unsigned VF = MaxVectorSize;
// If we optimize the program for size, avoid creating the tail loop.
@@ -2631,7 +2855,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
// If we are unable to calculate the trip count then don't try to vectorize.
if (TC < 2) {
DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
- return 1;
+ return Factor;
}
// Find the maximum SIMD width that can fit within the trip count.
@@ -2644,7 +2868,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
// zero then we require a tail.
if (VF < 2) {
DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
- return 1;
+ return Factor;
}
}
@@ -2652,7 +2876,8 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
DEBUG(dbgs() << "LV: Using user VF "<<UserVF<<".\n");
- return UserVF;
+ Factor.Width = UserVF;
+ return Factor;
}
float Cost = expectedCost(1);
@@ -2672,12 +2897,70 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
}
DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n");
- return Width;
+ Factor.Width = Width;
+ Factor.Cost = Width * Cost;
+ return Factor;
+}
+
+unsigned LoopVectorizationCostModel::getWidestType() {
+ unsigned MaxWidth = 8;
+
+ // For each block.
+ for (Loop::block_iterator bb = TheLoop->block_begin(),
+ be = TheLoop->block_end(); bb != be; ++bb) {
+ BasicBlock *BB = *bb;
+
+ // For each instruction in the loop.
+ for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+ Type *T = it->getType();
+
+ // Only examine Loads, Stores and PHINodes.
+ if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it))
+ continue;
+
+ // Examine PHI nodes that are reduction variables.
+ if (PHINode *PN = dyn_cast<PHINode>(it))
+ if (!Legal->getReductionVars()->count(PN))
+ continue;
+
+ // Examine the stored values.
+ if (StoreInst *ST = dyn_cast<StoreInst>(it))
+ T = ST->getValueOperand()->getType();
+
+ // Ignore loaded pointer types and stored pointer types that are not
+ // consecutive. However, we do want to take consecutive stores/loads of
+ // pointer vectors into account.
+ if (T->isPointerTy() && !isConsecutiveLoadOrStore(it))
+ continue;
+
+ MaxWidth = std::max(MaxWidth,
+ (unsigned)DL->getTypeSizeInBits(T->getScalarType()));
+ }
+ }
+
+ return MaxWidth;
}
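
Plugging illustrative numbers into the two routines above: a loop that loads i8 data but stores i32 results has a widest type of 32 bits, so a target with 256-bit vector registers gives MaxVectorSize = 256 / 32 = 8, and the cost model then compares VF = 1, 2, 4 and 8:

#include <algorithm>
#include <cstdio>

int main() {
  // Widest scalar type seen in the loop: i8 loads, i32 stores, i8 phis.
  unsigned WidestType = std::max({8u, 32u, 8u});        // 32 bits
  unsigned WidestRegister = 256;                        // e.g. 256-bit SIMD
  unsigned MaxVectorSize = WidestRegister / WidestType; // 8
  std::printf("MaxVectorSize = %u\n", MaxVectorSize);
}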
unsigned
LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
- unsigned UserUF) {
+ unsigned UserUF,
+ unsigned VF,
+ unsigned LoopCost) {
+
+ // -- The unroll heuristics --
+ // We unroll the loop in order to expose ILP and reduce the loop overhead.
+ // There are many micro-architectural considerations that we can't predict
+ // at this level. For example frontend pressure (on decode or fetch) due to
+ // code size, or the number and capabilities of the execution ports.
+ //
+ // We use the following heuristics to select the unroll factor:
+ // 1. If the code has reductions then we unroll in order to break the cross
+ // iteration dependency.
+ // 2. If the loop is really small then we unroll in order to reduce the loop
+ // overhead.
+ // 3. We don't unroll if we think that we will spill registers to memory due
+ // to the increased register pressure.
+
// Use the user preference, unless 'auto' is selected.
if (UserUF != 0)
return UserUF;
@@ -2710,17 +2993,39 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
// fit without causing spills.
unsigned UF = (TargetVectorRegisters - R.LoopInvariantRegs) / R.MaxLocalUsers;
- // We don't want to unroll the loops to the point where they do not fit into
- // the decoded cache. Assume that we only allow 32 IR instructions.
- UF = std::min(UF, (MaxLoopSizeThreshold / R.NumInstructions));
-
// Clamp the unroll factor ranges to reasonable factors.
+ unsigned MaxUnrollSize = TTI.getMaximumUnrollFactor();
+
+ // If we did not calculate the cost for VF (because the user selected the VF)
+ // then we calculate the cost of VF here.
+ if (LoopCost == 0)
+ LoopCost = expectedCost(VF);
+
+ // Clamp the calculated UF to be between 1 and the max unroll factor
+ // that the target allows.
if (UF > MaxUnrollSize)
UF = MaxUnrollSize;
else if (UF < 1)
UF = 1;
- return UF;
+ if (Legal->getReductionVars()->size()) {
+ DEBUG(dbgs() << "LV: Unrolling because of reductions. \n");
+ return UF;
+ }
+
+ // We want to unroll tiny loops in order to reduce the loop overhead.
+ // We assume that the cost overhead is 1 and we use the cost model
+ // to estimate the cost of the loop and unroll until the cost of the
+ // loop overhead is about 5% of the cost of the loop.
+ DEBUG(dbgs() << "LV: Loop cost is "<< LoopCost <<" \n");
+ if (LoopCost < 20) {
+ DEBUG(dbgs() << "LV: Unrolling to reduce branch cost. \n");
+ unsigned NewUF = 20/LoopCost + 1;
+ return std::min(NewUF, UF);
+ }
+
+ DEBUG(dbgs() << "LV: Not Unrolling. \n");
+ return 1;
}
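
Tracing the new heuristic with concrete numbers (mine, not the patch's), restated standalone and assuming LoopCost >= 1: a reduction-free loop costing 7 gets NewUF = 20/7 + 1 = 3, clamped by the register-pressure bound, while a loop costing 20 or more is left alone.

#include <algorithm>

unsigned unrollFactor(bool HasReductions, unsigned UF, unsigned LoopCost) {
  if (HasReductions)
    return UF;                        // break cross-iteration dependences
  if (LoopCost < 20)                  // tiny loop: amortize the overhead
    return std::min(20 / LoopCost + 1, UF);
  return 1;                           // otherwise do not unroll
}
// unrollFactor(false, 4, 7)  == 3    (20/7 + 1 = 3, min with UF = 4)
// unrollFactor(false, 4, 25) == 1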
LoopVectorizationCostModel::RegisterUsage
@@ -2878,9 +3183,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
// TODO: We need to estimate the cost of intrinsic calls.
switch (I->getOpcode()) {
case Instruction::GetElementPtr:
- // We mark this instruction as zero-cost because scalar GEPs are usually
- // lowered to the intruction addressing mode. At the moment we don't
- // generate vector geps.
+ // We mark this instruction as zero-cost because the cost of GEPs in
+ // vectorized code depends on whether the corresponding memory instruction
+ // is scalarized or not. Therefore, we handle GEPs with the memory
+ // instruction cost.
return 0;
case Instruction::Br: {
return TTI.getCFInstrCost(I->getOpcode());
@@ -2923,83 +3229,59 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
VectorTy = ToVectorTy(ValTy, VF);
return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
}
- case Instruction::Store: {
- StoreInst *SI = cast<StoreInst>(I);
- Type *ValTy = SI->getValueOperand()->getType();
+ case Instruction::Store:
+ case Instruction::Load: {
+ StoreInst *SI = dyn_cast<StoreInst>(I);
+ LoadInst *LI = dyn_cast<LoadInst>(I);
+ Type *ValTy = (SI ? SI->getValueOperand()->getType() :
+ LI->getType());
VectorTy = ToVectorTy(ValTy, VF);
+ unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment();
+ unsigned AS = SI ? SI->getPointerAddressSpace() :
+ LI->getPointerAddressSpace();
+ Value *Ptr = SI ? SI->getPointerOperand() : LI->getPointerOperand();
+ // We add the cost of address computation here instead of with the gep
+ // instruction because only here we know whether the operation is
+ // scalarized.
if (VF == 1)
- return TTI.getMemoryOpCost(I->getOpcode(), VectorTy,
- SI->getAlignment(),
- SI->getPointerAddressSpace());
+ return TTI.getAddressComputationCost(VectorTy) +
+ TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
- // Scalarized stores.
- int Stride = Legal->isConsecutivePtr(SI->getPointerOperand());
+ // Scalarized loads/stores.
+ int Stride = Legal->isConsecutivePtr(Ptr);
bool Reverse = Stride < 0;
if (0 == Stride) {
unsigned Cost = 0;
-
// The cost of extracting from the value vector and pointer vector.
- Type *PtrTy = ToVectorTy(I->getOperand(0)->getType(), VF);
+ Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
for (unsigned i = 0; i < VF; ++i) {
- Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
- i);
+ // The cost of extracting the pointer operand.
Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i);
+ // In case of STORE, the cost of ExtractElement from the vector.
+ // In case of LOAD, the cost of InsertElement into the returned
+ // vector.
+ Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement :
+ Instruction::InsertElement,
+ VectorTy, i);
}
- // The cost of the scalar stores.
+ // The cost of the scalar loads/stores.
+ Cost += VF * TTI.getAddressComputationCost(ValTy->getScalarType());
Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
- SI->getAlignment(),
- SI->getPointerAddressSpace());
+ Alignment, AS);
return Cost;
}
- // Wide stores.
- unsigned Cost = TTI.getMemoryOpCost(I->getOpcode(), VectorTy,
- SI->getAlignment(),
- SI->getPointerAddressSpace());
+ // Wide load/stores.
+ unsigned Cost = TTI.getAddressComputationCost(VectorTy);
+ Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
+
if (Reverse)
Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
VectorTy, 0);
return Cost;
}
- case Instruction::Load: {
- LoadInst *LI = cast<LoadInst>(I);
-
- if (VF == 1)
- return TTI.getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(),
- LI->getPointerAddressSpace());
-
- // Scalarized loads.
- int Stride = Legal->isConsecutivePtr(LI->getPointerOperand());
- bool Reverse = Stride < 0;
- if (0 == Stride) {
- unsigned Cost = 0;
- Type *PtrTy = ToVectorTy(I->getOperand(0)->getType(), VF);
-
- // The cost of extracting from the pointer vector.
- for (unsigned i = 0; i < VF; ++i)
- Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i);
-
- // The cost of inserting data to the result vector.
- for (unsigned i = 0; i < VF; ++i)
- Cost += TTI.getVectorInstrCost(Instruction::InsertElement, VectorTy, i);
-
- // The cost of the scalar stores.
- Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), RetTy->getScalarType(),
- LI->getAlignment(),
- LI->getPointerAddressSpace());
- return Cost;
- }
-
- // Wide loads.
- unsigned Cost = TTI.getMemoryOpCost(I->getOpcode(), VectorTy,
- LI->getAlignment(),
- LI->getPointerAddressSpace());
- if (Reverse)
- Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
- return Cost;
- }
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
@@ -3077,4 +3359,14 @@ namespace llvm {
}
}
+bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
+ // Check for a store.
+ if (StoreInst *ST = dyn_cast<StoreInst>(Inst))
+ return Legal->isConsecutivePtr(ST->getPointerOperand()) != 0;
+
+ // Check for a load.
+ if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+ return Legal->isConsecutivePtr(LI->getPointerOperand()) != 0;
+ return false;
+}
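
Finally, a worked comparison of the unified load/store costing above. Every unit cost below is a made-up placeholder standing in for a TargetTransformInfo query; the structure of the arithmetic, not the numbers, is the point:

#include <cstdio>

int main() {
  const unsigned VF = 4;
  const unsigned ExtractCost = 1, ScalarMemCost = 1, WideMemCost = 2;
  const unsigned AddrCost = 1, ReverseShuffleCost = 1;

  // Scalarized store: extract each pointer and each stored element,
  // then issue VF scalar stores, each with its own address computation.
  // (For a load, the second extract becomes an insert into the result.)
  unsigned Scalarized = VF * (ExtractCost + ExtractCost) +
                        VF * (AddrCost + ScalarMemCost);   // 16

  // Consecutive store: one address computation plus one wide store,
  // plus a shuffle when the access is reversed.
  unsigned Wide = AddrCost + WideMemCost;                  // 3
  unsigned Reversed = Wide + ReverseShuffleCost;           // 4

  std::printf("scalarized=%u wide=%u reversed=%u\n",
              Scalarized, Wide, Reversed);
}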