diff options
author    | Nadav Rotem <nrotem@apple.com> | 2013-04-09 19:44:35 +0000
committer | Nadav Rotem <nrotem@apple.com> | 2013-04-09 19:44:35 +0000
commit    | 8383b539ff4c039108ee0c202a27b787621d96cf (patch)
tree      | a94c718adf657b35e9c1581987a588bac83242f1
parent    | 376e05fd7ba37b76ea26fa7604671c9abd32307e (diff)
Add support for bottom-up SLP vectorization infrastructure.
This commit adds the infrastructure for performing bottom-up SLP vectorization (and other optimizations) on parallel computations.
The infrastructure has three potential users:
1. The loop vectorizer needs to be able to vectorize AOS data structures such as sum += A[i] + A[i+1] (see the sketch after this list).
2. The BB-vectorizer needs this infrastructure for bottom-up SLP vectorization, because bottom-up vectorization is faster to compute.
3. A loop-roller needs to be able to analyze consecutive chains and roll them into a loop, in order to reduce code size. A loop roller does not need to create vector instructions, and this infrastructure separates the chain analysis from the vectorization.
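For illustration, a minimal sketch (not part of this patch) of the consecutive-access pattern that item 1 refers to:

// Illustrative only: after the loop vectorizer unrolls such a loop, the
// A[i] / A[i+1] accesses form the consecutive chains that this
// infrastructure analyzes. sum_pairs is a made-up name.
int sum_pairs(const int *A, int n) {
  int sum = 0;
  for (int i = 0; i < n; i += 2)
    sum += A[i] + A[i + 1];
  return sum;
}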
This patch also includes a simple (100 LOC) bottom-up SLP vectorizer that uses the infrastructure and can vectorize this code:
void SAXPY(int *x, int *y, int a, int i) {
  x[i]   = a * x[i]   + y[i];
  x[i+1] = a * x[i+1] + y[i+1];
  x[i+2] = a * x[i+2] + y[i+2];
  x[i+3] = a * x[i+3] + y[i+3];
}
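Conceptually, the four scalar statements collapse into single 4-wide operations. A hedged sketch of the result, written with Clang/GCC vector extensions (not the pass's actual output, which is LLVM IR; see the saxpy.ll test below):

// Assumes x[i] and y[i] are suitably aligned for a whole-vector access.
typedef int v4si __attribute__((vector_size(16)));
void SAXPY_vec(int *x, int *y, int a, int i) {
  v4si xv = *(v4si *)&x[i];
  v4si yv = *(v4si *)&y[i];
  v4si av = {a, a, a, a};
  *(v4si *)&x[i] = av * xv + yv;
}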
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179117 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r-- | include/llvm/InitializePasses.h                  |   1
-rw-r--r-- | include/llvm/LinkAllPasses.h                     |   1
-rw-r--r-- | include/llvm/Transforms/Vectorize.h              |   6
-rw-r--r-- | lib/Transforms/Vectorize/CMakeLists.txt          |   2
-rw-r--r-- | lib/Transforms/Vectorize/SLPVectorizer.cpp       | 153
-rw-r--r-- | lib/Transforms/Vectorize/VecUtils.cpp            | 439
-rw-r--r-- | lib/Transforms/Vectorize/VecUtils.h              | 108
-rw-r--r-- | lib/Transforms/Vectorize/Vectorize.cpp           |   5
-rw-r--r-- | test/Transforms/SLPVectorizer/X86/flag.ll        |  51
-rw-r--r-- | test/Transforms/SLPVectorizer/X86/lit.local.cfg  |   6
-rw-r--r-- | test/Transforms/SLPVectorizer/X86/multi_user.ll  |  47
-rw-r--r-- | test/Transforms/SLPVectorizer/X86/saxpy.ll       |  50
-rw-r--r-- | test/Transforms/SLPVectorizer/X86/simple-loop.ll | 100
-rw-r--r-- | test/Transforms/SLPVectorizer/X86/simplebb.ll    |  25
-rw-r--r-- | test/Transforms/SLPVectorizer/lit.local.cfg      |   1
15 files changed, 995 insertions, 0 deletions
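The pass registers under the -slp-vectorizer name, and the new tests drive it through opt. For example, the RUN line from simplebb.ll below (%s is the test file):

; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s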
diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h
index 9cc194b..5b2cd60 100644
--- a/include/llvm/InitializePasses.h
+++ b/include/llvm/InitializePasses.h
@@ -271,6 +271,7 @@ void initializeInstSimplifierPass(PassRegistry&);
 void initializeUnpackMachineBundlesPass(PassRegistry&);
 void initializeFinalizeMachineBundlesPass(PassRegistry&);
 void initializeLoopVectorizePass(PassRegistry&);
+void initializeSLPVectorizerPass(PassRegistry&);
 void initializeBBVectorizePass(PassRegistry&);
 void initializeMachineFunctionPrinterPassPass(PassRegistry&);
 }
diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h
index 1f017e4..ca1c139 100644
--- a/include/llvm/LinkAllPasses.h
+++ b/include/llvm/LinkAllPasses.h
@@ -161,6 +161,7 @@ namespace {
       (void) llvm::createMemDepPrinter();
       (void) llvm::createInstructionSimplifierPass();
       (void) llvm::createLoopVectorizePass();
+      (void) llvm::createSLPVectorizerPass();
      (void) llvm::createBBVectorizePass();

      (void)new llvm::IntervalPartition();
diff --git a/include/llvm/Transforms/Vectorize.h b/include/llvm/Transforms/Vectorize.h
index d205dbd..8d0db16 100644
--- a/include/llvm/Transforms/Vectorize.h
+++ b/include/llvm/Transforms/Vectorize.h
@@ -117,6 +117,12 @@ createBBVectorizePass(const VectorizeConfig &C = VectorizeConfig());
 Pass *createLoopVectorizePass();

 //===----------------------------------------------------------------------===//
+//
+// SLPVectorizer - Create a bottom-up SLP vectorizer pass.
+//
+Pass *createSLPVectorizerPass();
+
+//===----------------------------------------------------------------------===//
 /// @brief Vectorize the BasicBlock.
 ///
 /// @param BB The BasicBlock to be vectorized
diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt
index e64034a..7ae082f 100644
--- a/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/lib/Transforms/Vectorize/CMakeLists.txt
@@ -2,6 +2,8 @@ add_llvm_library(LLVMVectorize
   BBVectorize.cpp
   Vectorize.cpp
   LoopVectorize.cpp
+  SLPVectorizer.cpp
+  VecUtils.cpp
   )

 add_dependencies(LLVMVectorize intrinsics_gen)
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
new file mode 100644
index 0000000..4b61dc9
--- /dev/null
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -0,0 +1,153 @@
+//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
+// stores that can be put together into vector-stores. Next, it attempts to
+// construct a vectorizable tree using the use-def chains. If a profitable tree
+// was found, the SLP vectorizer performs vectorization on the tree.
+//
+// The pass is inspired by the work described in the paper:
+//  "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
+//
+//===----------------------------------------------------------------------===//
+#define SV_NAME "slp-vectorizer"
+#define DEBUG_TYPE SV_NAME
+
+#include "VecUtils.h"
+#include "llvm/Transforms/Vectorize.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <map>
+
+using namespace llvm;
+
+static cl::opt<int>
+SLPCostThreshold("slp-threshold", cl::init(1), cl::Hidden,
+                 cl::desc("Only vectorize trees if the gain is above this "
+                          "number. (gain = -cost of vectorization)"));
+namespace {
+
+/// The SLPVectorizer Pass.
+struct SLPVectorizer : public BasicBlockPass {
+  typedef std::map<Value*, BoUpSLP::StoreList> StoreListMap;
+
+  /// Pass identification, replacement for typeid
+  static char ID;
+
+  explicit SLPVectorizer() : BasicBlockPass(ID) {
+    initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
+  }
+
+  ScalarEvolution *SE;
+  DataLayout *DL;
+  TargetTransformInfo *TTI;
+  AliasAnalysis *AA;
+
+  /// \brief Collect memory references and sort them according to their base
+  /// object. We sort the stores by their base object to reduce the cost of
+  /// the quadratic search on the stores. TODO: We can further reduce this
+  /// cost if we flush the chain creation every time we run into a memory
+  /// barrier.
+  bool CollectStores(BasicBlock *BB, BoUpSLP &R) {
+    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+      // Can't vectorize instructions with side effects.
+      if (it->mayThrow())
+        return false;
+
+      StoreInst *SI = dyn_cast<StoreInst>(it);
+      if (!SI)
+        continue;
+
+      // Check that the pointer points to scalars.
+      if (SI->getValueOperand()->getType()->isAggregateType())
+        return false;
+
+      // Find the base of the GEP.
+      Value *Ptr = SI->getPointerOperand();
+      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
+        Ptr = GEP->getPointerOperand();
+
+      // Save the store locations.
+      StoreRefs[Ptr].push_back(SI);
+    }
+    return true;
+  }
+
+  bool RollStoreChains(BoUpSLP &R) {
+    bool Changed = false;
+    // Attempt to sort and vectorize each of the store-groups.
+    for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
+         it != e; ++it) {
+      if (it->second.size() < 2)
+        continue;
+      Changed |= R.vectorizeStores(it->second, -SLPCostThreshold);
+    }
+    return Changed;
+  }
+
+  virtual bool runOnBasicBlock(BasicBlock &BB) {
+    SE = &getAnalysis<ScalarEvolution>();
+    DL = getAnalysisIfAvailable<DataLayout>();
+    TTI = &getAnalysis<TargetTransformInfo>();
+    AA = &getAnalysis<AliasAnalysis>();
+    StoreRefs.clear();
+
+    // Use the bottom-up SLP vectorizer to construct chains that start with
+    // the store instructions.
+    BoUpSLP R(&BB, SE, DL, TTI, AA);
+
+    if (!CollectStores(&BB, R))
+      return false;
+
+    bool Changed = RollStoreChains(R);
+    if (Changed) {
+      DEBUG(dbgs()<<"Rolled chains in \""<<BB.getParent()->getName()<<"\"\n");
+      DEBUG(verifyFunction(*BB.getParent()));
+    }
+
+    return Changed;
+  }
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    BasicBlockPass::getAnalysisUsage(AU);
+    AU.addRequired<ScalarEvolution>();
+    AU.addRequired<AliasAnalysis>();
+    AU.addRequired<TargetTransformInfo>();
+  }
+
+private:
+  StoreListMap StoreRefs;
+};
+
+} // end anonymous namespace
+
+char SLPVectorizer::ID = 0;
+static const char lv_name[] = "SLP Vectorizer";
+INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
+
+namespace llvm {
+  Pass *createSLPVectorizerPass() {
+    return new SLPVectorizer();
+  }
+}
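A minimal sketch (not part of the patch) of scheduling the new pass programmatically with the legacy pass manager of this era; runSLP and M are made-up names:

#include "llvm/PassManager.h"
#include "llvm/Transforms/Vectorize.h"

void runSLP(llvm::Module &M) {
  llvm::PassManager PM;
  PM.add(llvm::createSLPVectorizerPass()); // entry point added above
  PM.run(M);
}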
diff --git a/lib/Transforms/Vectorize/VecUtils.cpp b/lib/Transforms/Vectorize/VecUtils.cpp
new file mode 100644
index 0000000..7e9f12d
--- /dev/null
+++ b/lib/Transforms/Vectorize/VecUtils.cpp
@@ -0,0 +1,439 @@
+//===- VecUtils.cpp --- Vectorization Utilities ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "VecUtils"
+
+#include "VecUtils.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <map>
+
+using namespace llvm;
+
+namespace llvm {
+
+BoUpSLP::BoUpSLP(BasicBlock *Bb, ScalarEvolution *S, DataLayout *Dl,
+                 TargetTransformInfo *Tti, AliasAnalysis *Aa) :
+  BB(Bb), SE(S), DL(Dl), TTI(Tti), AA(Aa) {
+  numberInstructions();
+}
+
+void BoUpSLP::numberInstructions() {
+  int Loc = 0;
+  InstrIdx.clear();
+  InstrVec.clear();
+  // Number the instructions in the block.
+  for (BasicBlock::iterator it=BB->begin(), e=BB->end(); it != e; ++it) {
+    InstrIdx[it] = Loc++;
+    InstrVec.push_back(it);
+    assert(InstrVec[InstrIdx[it]] == it && "Invalid allocation");
+  }
+}
+
+Value *BoUpSLP::getPointerOperand(Value *I) {
+  if (LoadInst *LI = dyn_cast<LoadInst>(I)) return LI->getPointerOperand();
+  if (StoreInst *SI = dyn_cast<StoreInst>(I)) return SI->getPointerOperand();
+  return 0;
+}
+
+unsigned BoUpSLP::getAddressSpaceOperand(Value *I) {
+  if (LoadInst *L=dyn_cast<LoadInst>(I)) return L->getPointerAddressSpace();
+  if (StoreInst *S=dyn_cast<StoreInst>(I)) return S->getPointerAddressSpace();
+  return -1;
+}
+
+bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
+  Value *PtrA = getPointerOperand(A);
+  Value *PtrB = getPointerOperand(B);
+  unsigned ASA = getAddressSpaceOperand(A);
+  unsigned ASB = getAddressSpaceOperand(B);
+
+  // Check that the address spaces match and that the pointers are valid.
+  if (!PtrA || !PtrB || (ASA != ASB)) return false;
+
+  // Check that A and B are of the same type.
+  if (PtrA->getType() != PtrB->getType()) return false;
+
+  // Calculate the distance.
+  const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
+  const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
+  const SCEV *OffsetSCEV = SE->getMinusSCEV(PtrSCEVA, PtrSCEVB);
+  const SCEVConstant *ConstOffSCEV = dyn_cast<SCEVConstant>(OffsetSCEV);
+
+  // Non constant distance.
+  if (!ConstOffSCEV) return false;
+
+  unsigned Offset = ConstOffSCEV->getValue()->getSExtValue();
+  Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
+  // The instructions are consecutive if the size of the first load/store is
+  // the same as the offset.
+  unsigned Sz = (DL ? DL->getTypeStoreSize(Ty) : Ty->getScalarSizeInBits()/8);
+  return ((-Offset) == Sz);
+}
+
+bool BoUpSLP::vectorizeStores(StoreList &Stores, int costThreshold) {
+  ValueSet Heads, Tails;
+  SmallDenseMap<Value*, Value*> ConsecutiveChain;
+  bool Changed = false;
+
+  // Do a quadratic search on all of the given stores and find
+  // all of the pairs of stores that follow each other.
+  for (unsigned i = 0, e = Stores.size(); i < e; ++i)
+    for (unsigned j = 0; j < e; ++j) {
+      if (i == j) continue;
+      if (isConsecutiveAccess(Stores[i], Stores[j])) {
+        Tails.insert(Stores[j]);
+        Heads.insert(Stores[i]);
+        ConsecutiveChain[Stores[i]] = Stores[j];
+      }
+    }
+
+  // For stores that start but don't end a link in the chain:
+  for (ValueSet::iterator it = Heads.begin(), e = Heads.end();it != e; ++it) {
+    if (Tails.count(*it)) continue;
+
+    // We found a store instr that starts a chain. Now follow the chain and try
+    // to vectorize it.
+    ValueList Operands;
+    Value *I = *it;
+    int MinCost = 0, MinVF = 0;
+    while (Tails.count(I) || Heads.count(I)) {
+      Operands.push_back(I);
+      unsigned VF = Operands.size();
+      if (isPowerOf2_32(VF) && VF > 1) {
+        int cost = getTreeRollCost(Operands, 0);
+        DEBUG(dbgs() << "Found cost=" << cost << " for VF=" << VF << "\n");
+        if (cost < MinCost) { MinCost = cost; MinVF = VF; }
+      }
+      // Move to the next value in the chain.
+      I = ConsecutiveChain[I];
+    }
+
+    if (MinCost <= costThreshold && MinVF > 1) {
+      DEBUG(dbgs() << "Decided to vectorize cost=" << MinCost << "\n");
+      vectorizeTree(Operands, MinVF);
+      Stores.clear();
+      // The current numbering is invalid because we added and removed instrs.
+      numberInstructions();
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
+
+int BoUpSLP::getScalarizationCost(Type *Ty) {
+  int Cost = 0;
+  for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
+    Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+  return Cost;
+}
+
+AliasAnalysis::Location BoUpSLP::getLocation(Instruction *I) {
+  if (StoreInst *SI = dyn_cast<StoreInst>(I)) return AA->getLocation(SI);
+  if (LoadInst *LI = dyn_cast<LoadInst>(I)) return AA->getLocation(LI);
+  return AliasAnalysis::Location();
+}
+
+Value *BoUpSLP::isUnsafeToSink(Instruction *Src, Instruction *Dst) {
+  assert(Src->getParent() == Dst->getParent() && "Not the same BB");
+  BasicBlock::iterator I = Src, E = Dst;
+  /// Scan all of the instructions from SRC to DST and check if
+  /// the source may alias.
+  for (++I; I != E; ++I) {
+    // Ignore store instructions that are marked as 'ignore'.
+    if (MemBarrierIgnoreList.count(I)) continue;
+    if (Src->mayWriteToMemory()) /* Write */ {
+      if (!I->mayReadOrWriteMemory()) continue;
+    } else /* Read */ {
+      if (!I->mayWriteToMemory()) continue;
+    }
+    AliasAnalysis::Location A = getLocation(&*I);
+    AliasAnalysis::Location B = getLocation(Src);
+
+    if (!A.Ptr || !B.Ptr || AA->alias(A, B))
+      return I;
+  }
+  return 0;
+}
+
+int BoUpSLP::getTreeRollCost(ValueList &VL, unsigned Depth) {
+  if (Depth == 6) return max_cost;
+  Type *ScalarTy = VL[0]->getType();
+
+  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+    ScalarTy = SI->getValueOperand()->getType();
+
+  /// Don't mess with vectors.
+  if (ScalarTy->isVectorTy()) return max_cost;
+
+  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+
+  // Check if all of the operands are constants.
+  bool AllConst = true;
+  bool AllSameScalar = true;
+  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+    AllConst &= isa<Constant>(VL[i]);
+    AllSameScalar &= (VL[0] == VL[i]);
+    // Must have a single use.
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    // Need to scalarize instructions with multiple users or from other BBs.
+    if (I && ((I->getNumUses() > 1) || (I->getParent() != BB)))
+      return getScalarizationCost(VecTy);
+  }
+
+  // Is this a simple vector constant?
+  if (AllConst) return 0;
+
+  // If all of the operands are identical we can broadcast them.
+  if (AllSameScalar)
+    return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
+
+  // Scalarize unknown structures.
+  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
+  if (!VL0) return getScalarizationCost(VecTy);
+  assert(VL0->getParent() == BB && "Wrong BB");
+
+  unsigned Opcode = VL0->getOpcode();
+  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    // If not all of the instructions are identical then we have to scalarize.
+    if (!I || Opcode != I->getOpcode()) return getScalarizationCost(VecTy);
+  }
+
+  // Check if it is safe to sink the loads or the stores.
+  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
+    int MaxIdx = InstrIdx[VL0];
+    for (unsigned i = 1, e = VL.size(); i < e; ++i )
+      MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]);
+
+    Instruction *Last = InstrVec[MaxIdx];
+    for (unsigned i = 0, e = VL.size(); i < e; ++i ) {
+      if (VL[i] == Last) continue;
+      Value *Barrier = isUnsafeToSink(cast<Instruction>(VL[i]), Last);
+      if (Barrier) {
+        DEBUG(dbgs() << "LR: Can't sink " << *VL[i] << "\n down to " <<
+              *Last << "\n because of " << *Barrier << "\n");
+        return max_cost;
+      }
+    }
+  }
+
+  switch (Opcode) {
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    ValueList Operands;
+    int Cost = 0;
+    // Calculate the cost of all of the operands.
+    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+      // Prepare the operand vector.
+      for (unsigned j = 0; j < VL.size(); ++j)
+        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+      Cost += getTreeRollCost(Operands, Depth+1);
+      Operands.clear();
+    }
+
+    // Calculate the cost of this instruction.
+    int ScalarCost = VecTy->getNumElements() *
+      TTI->getArithmeticInstrCost(Opcode, ScalarTy);
+    int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy);
+    Cost += (VecCost - ScalarCost);
+    return Cost;
+  }
+  case Instruction::Load: {
+    // If we need to scalarize the loads, add the cost of forming the vector.
+    for (unsigned i = 0, e = VL.size()-1; i < e; ++i)
+      if (!isConsecutiveAccess(VL[i], VL[i+1]))
+        return getScalarizationCost(VecTy);
+
+    // Cost of wide load - cost of scalar loads.
+    int ScalarLdCost = VecTy->getNumElements() *
+      TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
+    int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0);
+    return VecLdCost - ScalarLdCost;
+  }
+  case Instruction::Store: {
+    // We know that we can merge the stores. Calculate the cost.
+    int ScalarStCost = VecTy->getNumElements() *
+      TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
+    int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
+    int StoreCost = VecStCost - ScalarStCost;
+
+    ValueList Operands;
+    for (unsigned j = 0; j < VL.size(); ++j) {
+      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
+      MemBarrierIgnoreList.insert(VL[j]);
+    }
+
+    int TotalCost = StoreCost + getTreeRollCost(Operands, Depth + 1);
+    MemBarrierIgnoreList.clear();
+    return TotalCost;
+  }
+  default:
+    // Unable to vectorize unknown instructions.
+    return getScalarizationCost(VecTy);
+  }
+}
+
+Instruction *BoUpSLP::GetLastInstr(ValueList &VL, unsigned VF) {
+  int MaxIdx = InstrIdx[BB->getFirstNonPHI()];
+  for (unsigned i = 0; i < VF; ++i )
+    MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]);
+  return InstrVec[MaxIdx + 1];
+}
+
+Value *BoUpSLP::Scalarize(ValueList &VL, VectorType *Ty) {
+  IRBuilder<> Builder(GetLastInstr(VL, Ty->getNumElements()));
+  Value *Vec = UndefValue::get(Ty);
+  for (unsigned i=0; i < Ty->getNumElements(); ++i)
+    Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
+  return Vec;
+}
+
+Value *BoUpSLP::vectorizeTree(ValueList &VL, int VF) {
+  Type *ScalarTy = VL[0]->getType();
+  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+    ScalarTy = SI->getValueOperand()->getType();
+  VectorType *VecTy = VectorType::get(ScalarTy, VF);
+
+  // Check if all of the operands are constants or identical.
+  bool AllConst = true;
+  bool AllSameScalar = true;
+  for (unsigned i = 0, e = VF; i < e; ++i) {
+    AllConst &= !!dyn_cast<Constant>(VL[i]);
+    AllSameScalar &= (VL[0] == VL[i]);
+    // Must have a single use.
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    if (I && (I->getNumUses() > 1 || I->getParent() != BB))
+      return Scalarize(VL, VecTy);
+  }
+
+  // Is this a simple vector constant?
+  if (AllConst || AllSameScalar) return Scalarize(VL, VecTy);
+
+  // Scalarize unknown structures.
+  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
+  if (!VL0) return Scalarize(VL, VecTy);
+
+  unsigned Opcode = VL0->getOpcode();
+  for (unsigned i = 0, e = VF; i < e; ++i) {
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    // If not all of the instructions are identical then we have to scalarize.
+    if (!I || Opcode != I->getOpcode()) return Scalarize(VL, VecTy);
+  }
+
+  switch (Opcode) {
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    ValueList LHSVL, RHSVL;
+    for (int i = 0; i < VF; ++i) {
+      RHSVL.push_back(cast<Instruction>(VL[i])->getOperand(0));
+      LHSVL.push_back(cast<Instruction>(VL[i])->getOperand(1));
+    }
+
+    Value *RHS = vectorizeTree(RHSVL, VF);
+    Value *LHS = vectorizeTree(LHSVL, VF);
+    IRBuilder<> Builder(GetLastInstr(VL, VF));
+    BinaryOperator *BinOp = dyn_cast<BinaryOperator>(VL0);
+    return Builder.CreateBinOp(BinOp->getOpcode(), RHS,LHS);
+  }
+  case Instruction::Load: {
+    LoadInst *LI = dyn_cast<LoadInst>(VL0);
+    unsigned Alignment = LI->getAlignment();
+
+    // Check if all of the loads are consecutive.
+    for (unsigned i = 1, e = VF; i < e; ++i)
+      if (!isConsecutiveAccess(VL[i-1], VL[i]))
+        return Scalarize(VL, VecTy);
+
+    IRBuilder<> Builder(GetLastInstr(VL, VF));
+    Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
+                                          VecTy->getPointerTo());
+    LI = Builder.CreateLoad(VecPtr);
+    LI->setAlignment(Alignment);
+    return LI;
+  }
+  case Instruction::Store: {
+    StoreInst *SI = dyn_cast<StoreInst>(VL0);
+    unsigned Alignment = SI->getAlignment();
+
+    ValueList ValueOp;
+    for (int i = 0; i < VF; ++i)
+      ValueOp.push_back(cast<StoreInst>(VL[i])->getValueOperand());
+
+    Value *VecValue = vectorizeTree(ValueOp, VF);
+
+    IRBuilder<> Builder(GetLastInstr(VL, VF));
+    Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
+                                          VecTy->getPointerTo());
+    Builder.CreateStore(VecValue, VecPtr)->setAlignment(Alignment);
+
+    for (int i = 0; i < VF; ++i)
+      cast<Instruction>(VL[i])->eraseFromParent();
+    return 0;
+  }
+  default:
+    return Scalarize(VL, VecTy);
+  }
+}
+
+} // end of namespace
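To make the cost model concrete, a worked example with assumed (made-up) TTI numbers: suppose a scalar i32 store and a <4 x i32> store each cost 1. For the four stores of a width-4 chain:

ScalarStCost = 4 * 1 = 4
VecStCost    = 1
StoreCost    = VecStCost - ScalarStCost = -3

The load and arithmetic sub-trees contribute analogous negative terms, so the total tree cost comes out negative; vectorizeStores accepts the chain when MinCost <= -slp-threshold (the threshold is negated at the call site in SLPVectorizer.cpp above).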
This + /// function does a quadratic scan of the given stores. + /// \returns true if the basic block was modified. + bool vectorizeStores(StoreList &Stores, int costThreshold); + + /// \brief Number all of the instructions in the block. + void numberInstructions(); + +private: + /// \returns the scalarization cost for this type. Scalarization in this + /// context means the creation of vectors from a group of scalars. + int getScalarizationCost(Type *Ty); + + /// \returns the AA location that is being access by the instruction. + AliasAnalysis::Location getLocation(Instruction *I); + + /// \brief Checks if it is possible to sink an instruction from + /// \p Src to \p Dst. + /// \returns the pointer to the barrier instruction if we can't sink. + Value *isUnsafeToSink(Instruction *Src, Instruction *Dst); + + /// \returns the instruction that appears last in the BB from \p VL. + /// Only consider the first \p VF elements. + Instruction *GetLastInstr(ValueList &VL, unsigned VF); + + /// \returns a vector from a collection of scalars in \p VL. + Value *Scalarize(ValueList &VL, VectorType *Ty); + + // Maps instructions to numbers and back. + SmallDenseMap<Value*, int> InstrIdx; + std::vector<Instruction*> InstrVec; + // A list of instructions to ignore while sinking + // memory instructions. + SmallSet<Value*, 8> MemBarrierIgnoreList; + // Analysis and block reference. + BasicBlock *BB; + ScalarEvolution *SE; + DataLayout *DL; + TargetTransformInfo *TTI; + AliasAnalysis *AA; +}; + +} // end of namespace +# endif //LLVM_TRANSFORMS_VECTORIZE_AOSVECTORIZER_H + diff --git a/lib/Transforms/Vectorize/Vectorize.cpp b/lib/Transforms/Vectorize/Vectorize.cpp index 19eefd2..3aff636 100644 --- a/lib/Transforms/Vectorize/Vectorize.cpp +++ b/lib/Transforms/Vectorize/Vectorize.cpp @@ -28,6 +28,7 @@ using namespace llvm; void llvm::initializeVectorization(PassRegistry &Registry) { initializeBBVectorizePass(Registry); initializeLoopVectorizePass(Registry); + initializeSLPVectorizerPass(Registry); } void LLVMInitializeVectorization(LLVMPassRegistryRef R) { @@ -41,3 +42,7 @@ void LLVMAddBBVectorizePass(LLVMPassManagerRef PM) { void LLVMAddLoopVectorizePass(LLVMPassManagerRef PM) { unwrap(PM)->add(createLoopVectorizePass()); } + +void LLVMAddLoopRollerPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createSLPVectorizerPass()); +} diff --git a/test/Transforms/SLPVectorizer/X86/flag.ll b/test/Transforms/SLPVectorizer/X86/flag.ll new file mode 100644 index 0000000..a76ebd7 --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/flag.ll @@ -0,0 +1,51 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -slp-threshold=1000 -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +; Check that the command line flag works. 
diff --git a/test/Transforms/SLPVectorizer/X86/flag.ll b/test/Transforms/SLPVectorizer/X86/flag.ll
new file mode 100644
index 0000000..a76ebd7
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/flag.ll
@@ -0,0 +1,51 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -slp-threshold=1000 -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Check that the command line flag works.
+;CHECK:rollable
+;CHECK-NOT:load <4 x i32>
+;CHECK: ret
+
+define i32 @rollable(i32* noalias nocapture %in, i32* noalias nocapture %out, i64 %n) nounwind ssp uwtable {
+  %1 = icmp eq i64 %n, 0
+  br i1 %1, label %._crit_edge, label %.lr.ph
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %i.019 = phi i64 [ %26, %.lr.ph ], [ 0, %0 ]
+  %2 = shl i64 %i.019, 2
+  %3 = getelementptr inbounds i32* %in, i64 %2
+  %4 = load i32* %3, align 4
+  %5 = or i64 %2, 1
+  %6 = getelementptr inbounds i32* %in, i64 %5
+  %7 = load i32* %6, align 4
+  %8 = or i64 %2, 2
+  %9 = getelementptr inbounds i32* %in, i64 %8
+  %10 = load i32* %9, align 4
+  %11 = or i64 %2, 3
+  %12 = getelementptr inbounds i32* %in, i64 %11
+  %13 = load i32* %12, align 4
+  %14 = mul i32 %4, 7
+  %15 = add i32 %14, 7
+  %16 = mul i32 %7, 7
+  %17 = add i32 %16, 14
+  %18 = mul i32 %10, 7
+  %19 = add i32 %18, 21
+  %20 = mul i32 %13, 7
+  %21 = add i32 %20, 28
+  %22 = getelementptr inbounds i32* %out, i64 %2
+  store i32 %15, i32* %22, align 4
+  %23 = getelementptr inbounds i32* %out, i64 %5
+  store i32 %17, i32* %23, align 4
+  %24 = getelementptr inbounds i32* %out, i64 %8
+  store i32 %19, i32* %24, align 4
+  %25 = getelementptr inbounds i32* %out, i64 %11
+  store i32 %21, i32* %25, align 4
+  %26 = add i64 %i.019, 1
+  %exitcond = icmp eq i64 %26, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret i32 undef
+}
diff --git a/test/Transforms/SLPVectorizer/X86/lit.local.cfg b/test/Transforms/SLPVectorizer/X86/lit.local.cfg
new file mode 100644
index 0000000..a8ad0f1
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/lit.local.cfg
@@ -0,0 +1,6 @@
+config.suffixes = ['.ll', '.c', '.cpp']
+
+targets = set(config.root.targets_to_build.split())
+if not 'X86' in targets:
+    config.unsupported = True
+
diff --git a/test/Transforms/SLPVectorizer/X86/multi_user.ll b/test/Transforms/SLPVectorizer/X86/multi_user.ll
new file mode 100644
index 0000000..cd47eb3
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/multi_user.ll
@@ -0,0 +1,47 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.7.0"
+
+;int foo (int *A, int n) {
+;  A[0] += n * 5 + 7;
+;  A[1] += n * 5 + 8;
+;  A[2] += n * 5 + 9;
+;  A[3] += n * 5 + 10;
+;  A[4] += n * 5 + 11;
+;}
+
+;CHECK: @foo
+;CHECK: insertelement <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: add <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret
+define i32 @foo(i32* nocapture %A, i32 %n) nounwind ssp uwtable {
+  %1 = mul nsw i32 %n, 5
+  %2 = add nsw i32 %1, 7
+  %3 = load i32* %A, align 4
+  %4 = add nsw i32 %2, %3
+  store i32 %4, i32* %A, align 4
+  %5 = add nsw i32 %1, 8
+  %6 = getelementptr inbounds i32* %A, i64 1
+  %7 = load i32* %6, align 4
+  %8 = add nsw i32 %5, %7
+  store i32 %8, i32* %6, align 4
+  %9 = add nsw i32 %1, 9
+  %10 = getelementptr inbounds i32* %A, i64 2
+  %11 = load i32* %10, align 4
+  %12 = add nsw i32 %9, %11
+  store i32 %12, i32* %10, align 4
+  %13 = add nsw i32 %1, 10
+  %14 = getelementptr inbounds i32* %A, i64 3
+  %15 = load i32* %14, align 4
+  %16 = add nsw i32 %13, %15
+  store i32 %16, i32* %14, align 4
+  %17 = add nsw i32 %1, 11
+  %18 = getelementptr inbounds i32* %A, i64 4
+  %19 = load i32* %18, align 4
+  %20 = add nsw i32 %17, %19
+  store i32 %20, i32* %18, align 4
+  ret i32 undef
+}
diff --git a/test/Transforms/SLPVectorizer/X86/saxpy.ll b/test/Transforms/SLPVectorizer/X86/saxpy.ll
new file mode 100644
index 0000000..5f06f9f
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/saxpy.ll
@@ -0,0 +1,50 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; SLP vectorization example from http://cs.stanford.edu/people/eschkufz/research/asplos291-schkufza.pdf
+;CHECK: SAXPY
+;CHECK: mul <4 x i32>
+;CHECK: ret
+
+define void @SAXPY(i32* noalias nocapture %x, i32* noalias nocapture %y, i32 %a, i64 %i) #0 {
+  %1 = getelementptr inbounds i32* %x, i64 %i
+  %2 = load i32* %1, align 4, !tbaa !0
+  %3 = mul nsw i32 %2, %a
+  %4 = getelementptr inbounds i32* %y, i64 %i
+  %5 = load i32* %4, align 4, !tbaa !0
+  %6 = add nsw i32 %3, %5
+  store i32 %6, i32* %1, align 4, !tbaa !0
+  %7 = add i64 %i, 1
+  %8 = getelementptr inbounds i32* %x, i64 %7
+  %9 = load i32* %8, align 4, !tbaa !0
+  %10 = mul nsw i32 %9, %a
+  %11 = getelementptr inbounds i32* %y, i64 %7
+  %12 = load i32* %11, align 4, !tbaa !0
+  %13 = add nsw i32 %10, %12
+  store i32 %13, i32* %8, align 4, !tbaa !0
+  %14 = add i64 %i, 2
+  %15 = getelementptr inbounds i32* %x, i64 %14
+  %16 = load i32* %15, align 4, !tbaa !0
+  %17 = mul nsw i32 %16, %a
+  %18 = getelementptr inbounds i32* %y, i64 %14
+  %19 = load i32* %18, align 4, !tbaa !0
+  %20 = add nsw i32 %17, %19
+  store i32 %20, i32* %15, align 4, !tbaa !0
+  %21 = add i64 %i, 3
+  %22 = getelementptr inbounds i32* %x, i64 %21
+  %23 = load i32* %22, align 4, !tbaa !0
+  %24 = mul nsw i32 %23, %a
+  %25 = getelementptr inbounds i32* %y, i64 %21
+  %26 = load i32* %25, align 4, !tbaa !0
+  %27 = add nsw i32 %24, %26
+  store i32 %27, i32* %22, align 4, !tbaa !0
+  ret void
+}
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/Transforms/SLPVectorizer/X86/simple-loop.ll b/test/Transforms/SLPVectorizer/X86/simple-loop.ll
new file mode 100644
index 0000000..4c15ed0
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/simple-loop.ll
@@ -0,0 +1,100 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;CHECK:rollable
+define i32 @rollable(i32* noalias nocapture %in, i32* noalias nocapture %out, i64 %n) nounwind ssp uwtable {
+  %1 = icmp eq i64 %n, 0
+  br i1 %1, label %._crit_edge, label %.lr.ph
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %i.019 = phi i64 [ %26, %.lr.ph ], [ 0, %0 ]
+  %2 = shl i64 %i.019, 2
+  %3 = getelementptr inbounds i32* %in, i64 %2
+;CHECK:load <4 x i32>
+  %4 = load i32* %3, align 4
+  %5 = or i64 %2, 1
+  %6 = getelementptr inbounds i32* %in, i64 %5
+  %7 = load i32* %6, align 4
+  %8 = or i64 %2, 2
+  %9 = getelementptr inbounds i32* %in, i64 %8
+  %10 = load i32* %9, align 4
+  %11 = or i64 %2, 3
+  %12 = getelementptr inbounds i32* %in, i64 %11
+  %13 = load i32* %12, align 4
+;CHECK:mul <4 x i32>
+  %14 = mul i32 %4, 7
+;CHECK:add <4 x i32>
+  %15 = add i32 %14, 7
+  %16 = mul i32 %7, 7
+  %17 = add i32 %16, 14
+  %18 = mul i32 %10, 7
+  %19 = add i32 %18, 21
+  %20 = mul i32 %13, 7
+  %21 = add i32 %20, 28
+  %22 = getelementptr inbounds i32* %out, i64 %2
+;CHECK:store <4 x i32>
+  store i32 %15, i32* %22, align 4
+  %23 = getelementptr inbounds i32* %out, i64 %5
+  store i32 %17, i32* %23, align 4
+  %24 = getelementptr inbounds i32* %out, i64 %8
+  store i32 %19, i32* %24, align 4
+  %25 = getelementptr inbounds i32* %out, i64 %11
+  store i32 %21, i32* %25, align 4
+  %26 = add i64 %i.019, 1
+  %exitcond = icmp eq i64 %26, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+;CHECK: ret
+  ret i32 undef
+}
+
+;CHECK:unrollable
+;CHECK-NOT: <4 x i32>
+;CHECK: ret
+define i32 @unrollable(i32* %in, i32* %out, i64 %n) nounwind ssp uwtable {
+  %1 = icmp eq i64 %n, 0
+  br i1 %1, label %._crit_edge, label %.lr.ph
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %i.019 = phi i64 [ %26, %.lr.ph ], [ 0, %0 ]
+  %2 = shl i64 %i.019, 2
+  %3 = getelementptr inbounds i32* %in, i64 %2
+  %4 = load i32* %3, align 4
+  %5 = or i64 %2, 1
+  %6 = getelementptr inbounds i32* %in, i64 %5
+  %7 = load i32* %6, align 4
+  %8 = or i64 %2, 2
+  %9 = getelementptr inbounds i32* %in, i64 %8
+  %10 = load i32* %9, align 4
+  %11 = or i64 %2, 3
+  %12 = getelementptr inbounds i32* %in, i64 %11
+  %13 = load i32* %12, align 4
+  %14 = mul i32 %4, 7
+  %15 = add i32 %14, 7
+  %16 = mul i32 %7, 7
+  %17 = add i32 %16, 14
+  %18 = mul i32 %10, 7
+  %19 = add i32 %18, 21
+  %20 = mul i32 %13, 7
+  %21 = add i32 %20, 28
+  %22 = getelementptr inbounds i32* %out, i64 %2
+  store i32 %15, i32* %22, align 4
+  %23 = getelementptr inbounds i32* %out, i64 %5
+  store i32 %17, i32* %23, align 4
+  %barrier = call i32 @goo(i32 0)                 ; <---------------- memory barrier.
+  %24 = getelementptr inbounds i32* %out, i64 %8
+  store i32 %19, i32* %24, align 4
+  %25 = getelementptr inbounds i32* %out, i64 %11
+  store i32 %21, i32* %25, align 4
+  %26 = add i64 %i.019, 1
+  %exitcond = icmp eq i64 %26, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret i32 undef
+}
+
+declare i32 @goo(i32)
diff --git a/test/Transforms/SLPVectorizer/X86/simplebb.ll b/test/Transforms/SLPVectorizer/X86/simplebb.ll
new file mode 100644
index 0000000..0af30ab
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/simplebb.ll
@@ -0,0 +1,25 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Simple 3-pair chain with loads and stores
+; CHECK: test1
+; CHECK: store <2 x double>
+; CHECK: ret
+define void @test1(double* %a, double* %b, double* %c) nounwind uwtable readonly {
+entry:
+  %i0 = load double* %a, align 8
+  %i1 = load double* %b, align 8
+  %mul = fmul double %i0, %i1
+  %arrayidx3 = getelementptr inbounds double* %a, i64 1
+  %i3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %b, i64 1
+  %i4 = load double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  store double %mul, double* %c, align 8
+  %arrayidx5 = getelementptr inbounds double* %c, i64 1
+  store double %mul5, double* %arrayidx5, align 8
+  ret void
+}
diff --git a/test/Transforms/SLPVectorizer/lit.local.cfg b/test/Transforms/SLPVectorizer/lit.local.cfg
new file mode 100644
index 0000000..19eebc0
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/lit.local.cfg
@@ -0,0 +1 @@
+config.suffixes = ['.ll', '.c', '.cpp']