 include/llvm/Transforms/Vectorize.h      | 68
 lib/Transforms/Vectorize/BBVectorize.cpp | 92
 2 files changed, 126 insertions(+), 34 deletions(-)
diff --git a/include/llvm/Transforms/Vectorize.h b/include/llvm/Transforms/Vectorize.h
index ad06937..6691258 100644
--- a/include/llvm/Transforms/Vectorize.h
+++ b/include/llvm/Transforms/Vectorize.h
@@ -20,10 +20,73 @@
 class BasicBlock;
 class BasicBlockPass;
 
 //===----------------------------------------------------------------------===//
+/// @brief Vectorize configuration.
+struct VectorizeConfig {
+  //===--------------------------------------------------------------------===//
+  // Target architecture related parameters
+
+  /// @brief The size of the native vector registers.
+  unsigned VectorBits;
+
+  /// @brief Don't try to vectorize integer values.
+  bool NoInts;
+
+  /// @brief Don't try to vectorize floating-point values.
+  bool NoFloats;
+
+  /// @brief Don't try to vectorize casting (conversion) operations.
+  bool NoCasts;
+
+  /// @brief Don't try to vectorize floating-point math intrinsics.
+  bool NoMath;
+
+  /// @brief Don't try to vectorize the fused-multiply-add intrinsic.
+  bool NoFMA;
+
+  /// @brief Don't try to vectorize loads and stores.
+  bool NoMemOps;
+
+  /// @brief Only generate aligned loads and stores.
+  bool AlignedOnly;
+
+  //===--------------------------------------------------------------------===//
+  // Misc parameters
+
+  /// @brief The required chain depth for vectorization.
+  unsigned ReqChainDepth;
+
+  /// @brief The maximum search distance for instruction pairs.
+  unsigned SearchLimit;
+
+  /// @brief The maximum number of candidate pairs with which to use a full
+  /// cycle check.
+  unsigned MaxCandPairsForCycleCheck;
+
+  /// @brief Replicating one element to a pair breaks the chain.
+  bool SplatBreaksChain;
+
+  /// @brief The maximum number of pairable instructions per group.
+  unsigned MaxInsts;
+
+  /// @brief The maximum number of pairing iterations.
+  unsigned MaxIter;
+
+  /// @brief Don't boost the chain-depth contribution of loads and stores.
+  bool NoMemOpBoost;
+
+  /// @brief Use a fast instruction dependency analysis.
+  bool FastDep;
+
+  /// @brief Initialize the VectorizeConfig from command line options.
+  VectorizeConfig();
+};
+
+//===----------------------------------------------------------------------===//
 //
 // BBVectorize - A basic-block vectorization pass.
 //
-BasicBlockPass *createBBVectorizePass();
+BasicBlockPass *
+createBBVectorizePass(const VectorizeConfig &C = VectorizeConfig());
 
 //===----------------------------------------------------------------------===//
 /// @brief Vectorize the BasicBlock.
@@ -35,7 +98,8 @@ BasicBlockPass *createBBVectorizePass();
 ///
 /// @return True if the BB is changed, false otherwise.
 ///
-bool vectorizeBasicBlock(Pass *P, BasicBlock &BB);
+bool vectorizeBasicBlock(Pass *P, BasicBlock &BB,
+                         const VectorizeConfig &C = VectorizeConfig());
 
 } // End llvm namespace
 
diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp
index 5abb242..7d5bb31 100644
--- a/lib/Transforms/Vectorize/BBVectorize.cpp
+++ b/lib/Transforms/Vectorize/BBVectorize.cpp
@@ -140,11 +140,16 @@ STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize");
 namespace {
   struct BBVectorize : public BasicBlockPass {
     static char ID; // Pass identification, replacement for typeid
-    BBVectorize() : BasicBlockPass(ID) {
+
+    VectorizeConfig Config;
+
+    BBVectorize(const VectorizeConfig &C = VectorizeConfig())
+      : BasicBlockPass(ID), Config(C) {
       initializeBBVectorizePass(*PassRegistry::getPassRegistry());
     }
 
-    BBVectorize(Pass *P) : BasicBlockPass(ID) {
+    BBVectorize(Pass *P, const VectorizeConfig &C)
+      : BasicBlockPass(ID), Config(C) {
       AA = &P->getAnalysis<AliasAnalysis>();
       SE = &P->getAnalysis<ScalarEvolution>();
       TD = P->getAnalysisIfAvailable<TargetData>();
@@ -291,9 +296,10 @@
       // Iterate a sufficient number of times to merge types of size 1 bit,
       // then 2 bits, then 4, etc. up to half of the target vector width of the
       // target vector register.
-      for (unsigned v = 2, n = 1; v <= VectorBits && (!MaxIter || n <= MaxIter);
+      for (unsigned v = 2, n = 1;
+           v <= Config.VectorBits && (!Config.MaxIter || n <= Config.MaxIter);
            v *= 2, ++n) {
-        DEBUG(dbgs() << "BBV: fusing loop #" << n <<
+        DEBUG(dbgs() << "BBV: fusing loop #" << n <<
               " for " << BB.getName() << " in " <<
               BB.getParent()->getName() << "...\n");
         if (vectorizePairs(BB))
@@ -343,7 +349,7 @@
     // candidate chains where longer chains are considered to be better.
     // Note: when this function returns 0, the resulting instructions are
     // not actually fused.
-    static inline size_t getDepthFactor(Value *V) {
+    inline size_t getDepthFactor(Value *V) {
      // InsertElement and ExtractElement have a depth factor of zero. This is
      // for two reasons: First, they cannot be usefully fused. Second, because
      // the pass generates a lot of these, they can confuse the simple metric
@@ -357,8 +363,8 @@
 
       // Give a load or store half of the required depth so that load/store
       // pairs will vectorize.
-      if (!NoMemOpBoost && (isa<LoadInst>(V) || isa<StoreInst>(V)))
-        return ReqChainDepth/2;
+      if (!Config.NoMemOpBoost && (isa<LoadInst>(V) || isa<StoreInst>(V)))
+        return Config.ReqChainDepth/2;
 
       return 1;
     }
@@ -431,9 +437,9 @@
       case Intrinsic::exp:
       case Intrinsic::exp2:
       case Intrinsic::pow:
-        return !NoMath;
+        return !Config.NoMath;
       case Intrinsic::fma:
-        return !NoFMA;
+        return !Config.NoFMA;
       }
     }
 
@@ -527,16 +533,16 @@
       } else if (LoadInst *L = dyn_cast<LoadInst>(I)) {
         // Vectorize simple loads if possbile:
         IsSimpleLoadStore = L->isSimple();
-        if (!IsSimpleLoadStore || NoMemOps)
+        if (!IsSimpleLoadStore || Config.NoMemOps)
           return false;
       } else if (StoreInst *S = dyn_cast<StoreInst>(I)) {
         // Vectorize simple stores if possbile:
         IsSimpleLoadStore = S->isSimple();
-        if (!IsSimpleLoadStore || NoMemOps)
+        if (!IsSimpleLoadStore || Config.NoMemOps)
           return false;
       } else if (CastInst *C = dyn_cast<CastInst>(I)) {
         // We can vectorize casts, but not casts of pointer types, etc.
-        if (NoCasts)
+        if (Config.NoCasts)
           return false;
 
         Type *SrcTy = C->getSrcTy();
@@ -576,14 +582,14 @@
           !(VectorType::isValidElementType(T2) || T2->isVectorTy()))
         return false;
 
-      if (NoInts && (T1->isIntOrIntVectorTy() || T2->isIntOrIntVectorTy()))
+      if (Config.NoInts && (T1->isIntOrIntVectorTy() || T2->isIntOrIntVectorTy()))
         return false;
 
-      if (NoFloats && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy()))
+      if (Config.NoFloats && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy()))
         return false;
 
-      if (T1->getPrimitiveSizeInBits() > VectorBits/2 ||
-          T2->getPrimitiveSizeInBits() > VectorBits/2)
+      if (T1->getPrimitiveSizeInBits() > Config.VectorBits/2 ||
+          T2->getPrimitiveSizeInBits() > Config.VectorBits/2)
         return false;
 
       return true;
@@ -611,7 +617,7 @@
             LI->isVolatile() != LJ->isVolatile() ||
             LI->getOrdering() != LJ->getOrdering() ||
             LI->getSynchScope() != LJ->getSynchScope())
-          return false;
+          return false;
       } else if ((SI = dyn_cast<StoreInst>(I)) &&
                  (SJ = dyn_cast<StoreInst>(J))) {
         if (SI->getValueOperand()->getType() != SJ->getValueOperand()->getType() ||
@@ -632,7 +638,7 @@
       int64_t OffsetInElmts = 0;
       if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
             OffsetInElmts) && abs64(OffsetInElmts) == 1) {
-        if (AlignedOnly) {
+        if (Config.AlignedOnly) {
           Type *aType = isa<StoreInst>(I) ?
             cast<StoreInst>(I)->getValueOperand()->getType() : I->getType();
           // An aligned load or store is possible only if the instruction
@@ -753,12 +759,12 @@
       AliasSetTracker WriteSet(*AA);
       bool JAfterStart = IAfterStart;
       BasicBlock::iterator J = llvm::next(I);
-      for (unsigned ss = 0; J != E && ss <= SearchLimit; ++J, ++ss) {
+      for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) {
         if (J == Start) JAfterStart = true;
 
         // Determine if J uses I, if so, exit the loop.
-        bool UsesI = trackUsesOfI(Users, WriteSet, I, J, !FastDep);
-        if (FastDep) {
+        bool UsesI = trackUsesOfI(Users, WriteSet, I, J, !Config.FastDep);
+        if (Config.FastDep) {
           // Note: For this heuristic to be effective, independent operations
           // must tend to be intermixed. This is likely to be true from some
           // kinds of grouped loop unrolling (but not the generic LLVM pass),
@@ -796,7 +802,7 @@
         // If we have already found too many pairs, break here and this function
         // will be called again starting after the last instruction selected
         // during this invocation.
-        if (PairableInsts.size() >= MaxInsts) {
+        if (PairableInsts.size() >= Config.MaxInsts) {
           ShouldContinue = true;
           break;
         }
@@ -841,7 +847,7 @@
             ConnectedPairs.insert(VPPair(P, ValuePair(*J, *I)));
         }
 
-        if (SplatBreaksChain) continue;
+        if (Config.SplatBreaksChain) continue;
         // Look for cases where just the first value in the pair is used by
         // both members of another pair (splatting).
         for (Value::use_iterator J = P.first->use_begin(); J != E; ++J) {
@@ -850,7 +856,7 @@
           }
         }
 
-        if (SplatBreaksChain) return;
+        if (Config.SplatBreaksChain) return;
         // Look for cases where just the second value in the pair is used by
         // both members of another pair (splatting).
         for (Value::use_iterator I = P.second->use_begin(),
@@ -1280,7 +1286,7 @@
                    << *J->first << " <-> " << *J->second << "} of depth " <<
                    MaxDepth << " and size " << PrunedTree.size() <<
                    " (effective size: " << EffSize << ")\n");
-      if (MaxDepth >= ReqChainDepth && EffSize > BestEffSize) {
+      if (MaxDepth >= Config.ReqChainDepth && EffSize > BestEffSize) {
         BestMaxDepth = MaxDepth;
         BestEffSize = EffSize;
         BestTree = PrunedTree;
@@ -1296,7 +1302,8 @@
                     std::multimap<ValuePair, ValuePair> &ConnectedPairs,
                     DenseSet<ValuePair> &PairableInstUsers,
                     DenseMap<Value *, Value *>& ChosenPairs) {
-      bool UseCycleCheck = CandidatePairs.size() <= MaxCandPairsForCycleCheck;
+      bool UseCycleCheck =
+        CandidatePairs.size() <= Config.MaxCandPairsForCycleCheck;
       std::multimap<ValuePair, ValuePair> PairableInstUserMap;
       for (std::vector<Value *>::iterator I = PairableInsts.begin(),
            E = PairableInsts.end(); I != E; ++I) {
@@ -1547,11 +1554,11 @@
           unsigned IID = F->getIntrinsicID();
           if (o == NumOperands-1) {
             BasicBlock &BB = *I->getParent();
-
+
             Module *M = BB.getParent()->getParent();
             Type *ArgType = I->getType();
             Type *VArgType = getVecTypeForPair(ArgType);
-
+
             // FIXME: is it safe to do this here?
             ReplacedOperands[o] = Intrinsic::getDeclaration(M,
               (Intrinsic::ID) IID, VArgType);
@@ -1867,11 +1874,32 @@ INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
 INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
 
-BasicBlockPass *llvm::createBBVectorizePass() {
-  return new BBVectorize();
+BasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) {
+  return new BBVectorize(C);
 }
 
-bool llvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB) {
-  BBVectorize BBVectorizer(P);
+bool
+llvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C) {
+  BBVectorize BBVectorizer(P, C);
   return BBVectorizer.vectorizeBB(BB);
 }
+
+//===----------------------------------------------------------------------===//
+VectorizeConfig::VectorizeConfig() {
+  VectorBits = ::VectorBits;
+  NoInts = ::NoInts;
+  NoFloats = ::NoFloats;
+  NoCasts = ::NoCasts;
+  NoMath = ::NoMath;
+  NoFMA = ::NoFMA;
+  NoMemOps = ::NoMemOps;
+  AlignedOnly = ::AlignedOnly;
+  ReqChainDepth= ::ReqChainDepth;
+  SearchLimit = ::SearchLimit;
+  MaxCandPairsForCycleCheck = ::MaxCandPairsForCycleCheck;
+  SplatBreaksChain = ::SplatBreaksChain;
+  MaxInsts = ::MaxInsts;
+  MaxIter = ::MaxIter;
+  NoMemOpBoost = ::NoMemOpBoost;
+  FastDep = ::FastDep;
+}
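In short, the patch routes every tunable of the basic-block vectorizer through a VectorizeConfig object instead of the pass's file-static cl::opt globals, so library clients can configure the pass programmatically while the default-constructed config preserves the old command-line behavior. A minimal sketch of how a client might drive the new interface; createCustomBBVectorizePass is a hypothetical helper, not part of this patch, and the field values are purely illustrative:

    #include "llvm/Transforms/Vectorize.h"

    using namespace llvm;

    // Hypothetical helper: build a config for a target with 256-bit
    // vector registers that should vectorize only integer code and
    // emit only aligned loads and stores. Fields left untouched keep
    // the defaults that VectorizeConfig() reads from the cl::opt
    // command-line options.
    static BasicBlockPass *createCustomBBVectorizePass() {
      VectorizeConfig C;      // defaults from the command-line options
      C.VectorBits = 256;     // native vector register width in bits
      C.NoFloats = true;      // skip floating-point values
      C.AlignedOnly = true;   // only generate aligned memory operations
      return createBBVectorizePass(C);
    }

The utility entry point works the same way: vectorizeBasicBlock(P, BB, C) vectorizes a single block under the given config. Because both functions default their config argument to VectorizeConfig(), existing callers of createBBVectorizePass() and vectorizeBasicBlock(P, BB) compile unchanged and behave exactly as before.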