Diffstat (limited to 'lib/Transforms')
-rw-r--r-- | lib/Transforms/Vectorize/SLPVectorizer.cpp | 2052
1 file changed, 1041 insertions, 1011 deletions
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 85c01bd..d2e7450 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -59,7 +59,7 @@ static const unsigned RecursionMaxDepth = 12; class BuilderLocGuard { public: BuilderLocGuard(IRBuilder<> &B) : Builder(B), Loc(B.GetInsertPoint()) {} - ~BuilderLocGuard() { Builder.SetInsertPoint(Loc); } + ~BuilderLocGuard() { if (Loc) Builder.SetInsertPoint(Loc); } private: // Prevent copying. @@ -91,6 +91,7 @@ struct BlockNumbering { } int getIndex(Instruction *I) { + assert(I->getParent() == BB && "Invalid instruction"); if (!Valid) numberInstructions(); assert(InstrIdx.count(I) && "Unknown instruction"); @@ -117,72 +118,169 @@ private: std::vector<Instruction *> InstrVec; }; -class FuncSLP { +/// \returns the parent basic block if all of the instructions in \p VL +/// are in the same block or null otherwise. +static BasicBlock *getSameBlock(ArrayRef<Value *> VL) { + Instruction *I0 = dyn_cast<Instruction>(VL[0]); + if (!I0) + return 0; + BasicBlock *BB = I0->getParent(); + for (int i = 1, e = VL.size(); i < e; i++) { + Instruction *I = dyn_cast<Instruction>(VL[i]); + if (!I) + return 0; + + if (BB != I->getParent()) + return 0; + } + return BB; +} + +/// \returns True if all of the values in \p VL are constants. +static bool allConstant(ArrayRef<Value *> VL) { + for (unsigned i = 0, e = VL.size(); i < e; ++i) + if (!isa<Constant>(VL[i])) + return false; + return true; +} + +/// \returns True if all of the values in \p VL are identical. +static bool isSplat(ArrayRef<Value *> VL) { + for (unsigned i = 1, e = VL.size(); i < e; ++i) + if (VL[i] != VL[0]) + return false; + return true; +} + +/// \returns The opcode if all of the Instructions in \p VL have the same +/// opcode, or zero. +static unsigned getSameOpcode(ArrayRef<Value *> VL) { + Instruction *I0 = dyn_cast<Instruction>(VL[0]); + if (!I0) + return 0; + unsigned Opcode = I0->getOpcode(); + for (int i = 1, e = VL.size(); i < e; i++) { + Instruction *I = dyn_cast<Instruction>(VL[i]); + if (!I || Opcode != I->getOpcode()) + return 0; + } + return Opcode; +} + +/// \returns The type that all of the values in \p VL have or null if there +/// are different types. +static Type* getSameType(ArrayRef<Value *> VL) { + Type *Ty = VL[0]->getType(); + for (int i = 1, e = VL.size(); i < e; i++) + if (VL[0]->getType() != Ty) + return 0; + + return Ty; +} + +/// \returns True if the ExtractElement instructions in VL can be vectorized +/// to use the original vector. +static bool CanReuseExtract(ArrayRef<Value *> VL) { + assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode"); + // Check if all of the extracts come from the same vector and from the + // correct offset. + Value *VL0 = VL[0]; + ExtractElementInst *E0 = cast<ExtractElementInst>(VL0); + Value *Vec = E0->getOperand(0); + + // We have to extract from the same vector type. + unsigned NElts = Vec->getType()->getVectorNumElements(); + + if (NElts != VL.size()) + return false; + + // Check that all of the indices extract from the correct offset. 
+ ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1)); + if (!CI || CI->getZExtValue()) + return false; + + for (unsigned i = 1, e = VL.size(); i < e; ++i) { + ExtractElementInst *E = cast<ExtractElementInst>(VL[i]); + ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1)); + + if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec) + return false; + } + + return true; +} + +/// Bottom Up SLP Vectorizer. +class BoUpSLP { +public: typedef SmallVector<Value *, 8> ValueList; typedef SmallVector<Instruction *, 16> InstrList; typedef SmallPtrSet<Value *, 16> ValueSet; typedef SmallVector<StoreInst *, 8> StoreList; -public: - static const int MAX_COST = INT_MIN; - - FuncSLP(Function *Func, ScalarEvolution *Se, DataLayout *Dl, - TargetTransformInfo *Tti, AliasAnalysis *Aa, LoopInfo *Li, + BoUpSLP(Function *Func, ScalarEvolution *Se, DataLayout *Dl, + TargetTransformInfo *Tti, AliasAnalysis *Aa, LoopInfo *Li, DominatorTree *Dt) : F(Func), SE(Se), DL(Dl), TTI(Tti), AA(Aa), LI(Li), DT(Dt), Builder(Se->getContext()) { - for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) { - BasicBlock *BB = it; - BlocksNumbers[BB] = BlockNumbering(BB); + // Setup the block numbering utility for all of the blocks in the + // function. + for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) { + BasicBlock *BB = it; + BlocksNumbers[BB] = BlockNumbering(BB); + } } - } - - /// \brief Take the pointer operand from the Load/Store instruction. - /// \returns NULL if this is not a valid Load/Store instruction. - static Value *getPointerOperand(Value *I); - - /// \brief Take the address space operand from the Load/Store instruction. - /// \returns -1 if this is not a valid Load/Store instruction. - static unsigned getAddressSpaceOperand(Value *I); - - /// \returns true if the memory operations A and B are consecutive. - bool isConsecutiveAccess(Value *A, Value *B); /// \brief Vectorize the tree that starts with the elements in \p VL. - /// \returns the vectorized value. - Value *vectorizeTree(ArrayRef<Value *> VL); + void vectorizeTree(); /// \returns the vectorization cost of the subtree that starts at \p VL. /// A negative number means that this is profitable. - int getTreeCost(ArrayRef<Value *> VL); + int getTreeCost(); + + /// Construct a vectorizable tree that starts at \p Roots. + void buildTree(ArrayRef<Value *> Roots); + + /// Clear the internal data structures that are created by 'buildTree'. + void deleteTree() { + VectorizableTree.clear(); + ScalarToTreeEntry.clear(); + MustGather.clear(); + MemBarrierIgnoreList.clear(); + } /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the /// roots. This method calculates the cost of extracting the values. int getGatherCost(ArrayRef<Value *> VL); - /// \brief Attempts to order and vectorize a sequence of stores. This - /// function does a quadratic scan of the given stores. - /// \returns true if the basic block was modified. - bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold); + /// \returns true if the memory operations A and B are consecutive. + bool isConsecutiveAccess(Value *A, Value *B); - /// \brief Vectorize a group of scalars into a vector tree. - /// \returns the vectorized value. - Value *vectorizeArith(ArrayRef<Value *> Operands); + /// \brief Perform LICM and CSE on the newly generated gather sequences. 
+ void optimizeGatherSequence(); +private: + struct TreeEntry; - /// \brief This method contains the recursive part of getTreeCost. - int getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth); + /// \returns the cost of the vectorizable entry. + int getEntryCost(TreeEntry *E); - /// \brief This recursive method looks for vectorization hazards such as - /// values that are used by multiple users and checks that values are used - /// by only one vector lane. It updates the variables LaneMap, MultiUserVals. - void getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth); + /// This is the recursive part of buildTree. + void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth); - /// \brief This method contains the recursive part of vectorizeTree. - Value *vectorizeTree_rec(ArrayRef<Value *> VL); + /// Vectorizer a single entry in the tree. + Value *vectorizeTree(TreeEntry *E); - /// \brief Vectorize a sorted sequence of stores. - bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold); + /// Vectorizer a single entry in the tree, starting in \p VL. + Value *vectorizeTree(ArrayRef<Value *> VL); + + /// \brief Take the pointer operand from the Load/Store instruction. + /// \returns NULL if this is not a valid Load/Store instruction. + static Value *getPointerOperand(Value *I); + + /// \brief Take the address space operand from the Load/Store instruction. + /// \returns -1 if this is not a valid Load/Store instruction. + static unsigned getAddressSpaceOperand(Value *I); /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. @@ -211,57 +309,65 @@ public: /// \returns a vector from a collection of scalars in \p VL. Value *Gather(ArrayRef<Value *> VL, VectorType *Ty); - /// \brief Perform LICM and CSE on the newly generated gather sequences. - void optimizeGatherSequence(); + struct TreeEntry { + TreeEntry() : Scalars(), VectorizedValue(0), LastScalarIndex(0), + NeedToGather(0) {} - bool needToGatherAny(ArrayRef<Value *> VL) { - for (int i = 0, e = VL.size(); i < e; ++i) - if (MustGather.count(VL[i])) - return true; - return false; - } + /// \returns true if the scalars in VL are equal to this entry. + bool isSame(ArrayRef<Value *> VL) { + assert(VL.size() == Scalars.size() && "Invalid size"); + for (int i = 0, e = VL.size(); i != e; ++i) + if (VL[i] != Scalars[i]) + return false; + return true; + } + + /// A vector of scalars. + ValueList Scalars; + + /// The Scalars are vectorized into this value. It is initialized to Null. + Value *VectorizedValue; + + /// The index in the basic block of the last scalar. + int LastScalarIndex; - void forgetNumbering() { - for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) - BlocksNumbers[it].forget(); + /// Do we need to gather this sequence ? + bool NeedToGather; + }; + + /// Create a new VectorizableTree entry. 
+ TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) { + VectorizableTree.push_back(TreeEntry()); + int idx = VectorizableTree.size() - 1; + TreeEntry *Last = &VectorizableTree[idx]; + Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); + Last->NeedToGather = !Vectorized; + if (Vectorized) { + Last->LastScalarIndex = getLastIndex(VL); + for (int i = 0, e = VL.size(); i != e; ++i) { + assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!"); + ScalarToTreeEntry[VL[i]] = idx; + } + } else { + Last->LastScalarIndex = 0; + MustGather.insert(VL.begin(), VL.end()); + } + return Last; } /// -- Vectorization State -- + /// Holds all of the tree entries. + std::vector<TreeEntry> VectorizableTree; - /// Maps values in the tree to the vector lanes that uses them. This map must - /// be reset between runs of getCost. - std::map<Value *, int> LaneMap; - /// A list of instructions to ignore while sinking - /// memory instructions. This map must be reset between runs of getCost. - ValueSet MemBarrierIgnoreList; - - /// Maps between the first scalar to the vector. This map must be reset - /// between runs. - DenseMap<Value *, Value *> VectorizedValues; + /// Maps a specific scalar to its tree entry. + SmallDenseMap<Value*, int> ScalarToTreeEntry; - /// Contains values that must be gathered because they are used - /// by multiple lanes, or by users outside the tree. - /// NOTICE: The vectorization methods also use this set. + /// A list of scalars that we found that we need to keep as scalars. ValueSet MustGather; - /// Contains PHINodes that are being processed. We use this data structure - /// to stop cycles in the graph. - ValueSet VisitedPHIs; - - /// Contains a list of values that are used outside the current tree, the - /// first element in the bundle and the insertion point for extracts. This - /// set must be reset between runs. - struct UseInfo{ - UseInfo(Instruction *VL0, int I) : - Leader(VL0), LastIndex(I) {} - UseInfo() : Leader(0), LastIndex(0) {} - /// The first element in the bundle. - Instruction *Leader; - /// The insertion index. - int LastIndex; - }; - MapVector<Instruction*, UseInfo> MultiUserVals; - SetVector<Instruction*> ExtractedLane; + /// A list of instructions to ignore while sinking + /// memory instructions. This map must be reset between runs of getCost. + ValueSet MemBarrierIgnoreList; /// Holds all of the instructions that we gathered. SetVector<Instruction *> GatherSeq; @@ -281,14 +387,478 @@ public: IRBuilder<> Builder; }; -int FuncSLP::getGatherCost(Type *Ty) { +void BoUpSLP::buildTree(ArrayRef<Value *> Roots) { + deleteTree(); + buildTree_rec(Roots, 0); +} + + +void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { + bool SameTy = getSameType(VL); (void)SameTy; + assert(SameTy && "Invalid types!"); + + if (Depth == RecursionMaxDepth) { + DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); + newTreeEntry(VL, false); + return; + } + + // Don't handle vectors. + if (VL[0]->getType()->isVectorTy()) { + DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); + newTreeEntry(VL, false); + return; + } + + if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) + if (SI->getValueOperand()->getType()->isVectorTy()) { + DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); + newTreeEntry(VL, false); + return; + } + + // If all of the operands are identical or constant we have a simple solution. 
+ if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || + !getSameOpcode(VL)) { + DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); + newTreeEntry(VL, false); + return; + } + + // We now know that this is a vector of instructions of the same type from + // the same block. + + // Check if this is a duplicate of another entry. + if (ScalarToTreeEntry.count(VL[0])) { + int Idx = ScalarToTreeEntry[VL[0]]; + TreeEntry *E = &VectorizableTree[Idx]; + for (unsigned i = 0, e = VL.size(); i != e; ++i) { + DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n"); + if (E->Scalars[i] != VL[i]) { + DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); + newTreeEntry(VL, false); + return; + } + } + DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n"); + return; + } + + // Check that none of the instructions in the bundle are already in the tree. + for (unsigned i = 0, e = VL.size(); i != e; ++i) { + if (ScalarToTreeEntry.count(VL[i])) { + DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << + ") is already in tree.\n"); + newTreeEntry(VL, false); + return; + } + } + + // If any of the scalars appears in the table OR it is marked as a value that + // needs to stat scalar then we need to gather the scalars. + for (unsigned i = 0, e = VL.size(); i != e; ++i) { + if (ScalarToTreeEntry.count(VL[i]) || MustGather.count(VL[i])) { + DEBUG(dbgs() << "SLP: Gathering due to gathered scalar. \n"); + newTreeEntry(VL, false); + return; + } + } + + // Check that all of the users of the scalars that we want to vectorize are + // schedulable. + Instruction *VL0 = cast<Instruction>(VL[0]); + int MyLastIndex = getLastIndex(VL); + BasicBlock *BB = cast<Instruction>(VL0)->getParent(); + + for (unsigned i = 0, e = VL.size(); i != e; ++i) { + Instruction *Scalar = cast<Instruction>(VL[i]); + DEBUG(dbgs() << "SLP: Checking users of " << *Scalar << ". \n"); + for (Value::use_iterator U = Scalar->use_begin(), UE = Scalar->use_end(); + U != UE; ++U) { + DEBUG(dbgs() << "SLP: \tUser " << **U << ". \n"); + Instruction *User = dyn_cast<Instruction>(*U); + if (!User) { + DEBUG(dbgs() << "SLP: Gathering due unknown user. \n"); + newTreeEntry(VL, false); + return; + } + + // We don't care if the user is in a different basic block. + BasicBlock *UserBlock = User->getParent(); + if (UserBlock != BB) { + DEBUG(dbgs() << "SLP: User from a different basic block " + << *User << ". \n"); + continue; + } + + // If this is a PHINode within this basic block then we can place the + // extract wherever we want. + if (isa<PHINode>(*User)) { + DEBUG(dbgs() << "SLP: \tWe can schedule PHIs:" << *User << ". \n"); + continue; + } + + // Check if this is a safe in-tree user. + if (ScalarToTreeEntry.count(User)) { + int Idx = ScalarToTreeEntry[User]; + int VecLocation = VectorizableTree[Idx].LastScalarIndex; + if (VecLocation <= MyLastIndex) { + DEBUG(dbgs() << "SLP: Gathering due to unschedulable vector. \n"); + newTreeEntry(VL, false); + return; + } + DEBUG(dbgs() << "SLP: In-tree user (" << *User << ") at #" << + VecLocation << " vector value (" << *Scalar << ") at #" + << MyLastIndex << ".\n"); + continue; + } + + // Make sure that we can schedule this unknown user. + BlockNumbering &BN = BlocksNumbers[BB]; + int UserIndex = BN.getIndex(User); + if (UserIndex < MyLastIndex) { + + DEBUG(dbgs() << "SLP: Can't schedule extractelement for " + << *User << ". \n"); + newTreeEntry(VL, false); + return; + } + } + } + + // Check that every instructions appears once in this bundle. 
+ for (unsigned i = 0, e = VL.size(); i < e; ++i) + for (unsigned j = i+1; j < e; ++j) + if (VL[i] == VL[j]) { + DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); + newTreeEntry(VL, false); + return; + } + + // Check that instructions in this bundle don't reference other instructions. + // The runtime of this check is O(N * N-1 * uses(N)) and a typical N is 4. + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + for (Value::use_iterator U = VL[i]->use_begin(), UE = VL[i]->use_end(); + U != UE; ++U) { + for (unsigned j = 0; j < e; ++j) { + if (i != j && *U == VL[j]) { + DEBUG(dbgs() << "SLP: Intra-bundle dependencies!" << **U << ". \n"); + newTreeEntry(VL, false); + return; + } + } + } + } + + DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); + + unsigned Opcode = getSameOpcode(VL); + + // Check if it is safe to sink the loads or the stores. + if (Opcode == Instruction::Load || Opcode == Instruction::Store) { + Instruction *Last = getLastInstruction(VL); + + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + if (VL[i] == Last) + continue; + Value *Barrier = getSinkBarrier(cast<Instruction>(VL[i]), Last); + if (Barrier) { + DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " << *Last + << "\n because of " << *Barrier << ". Gathering.\n"); + newTreeEntry(VL, false); + return; + } + } + } + + switch (Opcode) { + case Instruction::PHI: { + PHINode *PH = dyn_cast<PHINode>(VL0); + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); + + for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast<PHINode>(VL[j])->getIncomingValue(i)); + + buildTree_rec(Operands, Depth + 1); + } + return; + } + case Instruction::ExtractElement: { + bool Reuse = CanReuseExtract(VL); + if (Reuse) { + DEBUG(dbgs() << "SLP: Reusing extract sequence.\n"); + } + newTreeEntry(VL, Reuse); + return; + } + case Instruction::Load: { + // Check if the loads are consecutive or of we need to swizzle them. + for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) + if (!isConsecutiveAccess(VL[i], VL[i + 1])) { + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: Need to swizzle loads.\n"); + return; + } + + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of loads.\n"); + return; + } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + Type *SrcTy = VL0->getOperand(0)->getType(); + for (unsigned i = 0; i < VL.size(); ++i) { + Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType(); + if (Ty != SrcTy || Ty->isAggregateType() || Ty->isVectorTy()) { + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); + return; + } + } + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of casts.\n"); + + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); + + buildTree_rec(Operands, Depth+1); + } + return; + } + case Instruction::ICmp: + case Instruction::FCmp: { + // Check that all of the compares have the same predicate. 
+ CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate(); + for (unsigned i = 1, e = VL.size(); i < e; ++i) { + CmpInst *Cmp = cast<CmpInst>(VL[i]); + if (Cmp->getPredicate() != P0) { + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); + return; + } + } + + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of compares.\n"); + + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); + + buildTree_rec(Operands, Depth+1); + } + return; + } + case Instruction::Select: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); + + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); + + buildTree_rec(Operands, Depth+1); + } + return; + } + case Instruction::Store: { + // Check if the stores are consecutive or of we need to swizzle them. + for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) + if (!isConsecutiveAccess(VL[i], VL[i + 1])) { + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: Non consecutive store.\n"); + return; + } + + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of stores.\n"); + + ValueList Operands; + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast<Instruction>(VL[j])->getOperand(0)); + + // We can ignore these values because we are sinking them down. 
+ MemBarrierIgnoreList.insert(VL.begin(), VL.end()); + buildTree_rec(Operands, Depth + 1); + return; + } + default: + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); + return; + } +} + +int BoUpSLP::getEntryCost(TreeEntry *E) { + ArrayRef<Value*> VL = E->Scalars; + + Type *ScalarTy = VL[0]->getType(); + if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) + ScalarTy = SI->getValueOperand()->getType(); + VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + + if (E->NeedToGather) { + if (allConstant(VL)) + return 0; + if (isSplat(VL)) { + return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); + } + return getGatherCost(E->Scalars); + } + + assert(getSameOpcode(VL) && getSameType(VL) && getSameBlock(VL) && + "Invalid VL"); + Instruction *VL0 = cast<Instruction>(VL[0]); + unsigned Opcode = VL0->getOpcode(); + switch (Opcode) { + case Instruction::PHI: { + return 0; + } + case Instruction::ExtractElement: { + if (CanReuseExtract(VL)) + return 0; + return getGatherCost(VecTy); + } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + Type *SrcTy = VL0->getOperand(0)->getType(); + + // Calculate the cost of this instruction. + int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(), + VL0->getType(), SrcTy); + + VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); + int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy); + return VecCost - ScalarCost; + } + case Instruction::FCmp: + case Instruction::ICmp: + case Instruction::Select: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + // Calculate the cost of this instruction. + int ScalarCost = 0; + int VecCost = 0; + if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp || + Opcode == Instruction::Select) { + VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); + ScalarCost = VecTy->getNumElements() * + TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty()); + VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy); + } else { + ScalarCost = VecTy->getNumElements() * + TTI->getArithmeticInstrCost(Opcode, ScalarTy); + VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy); + } + return VecCost - ScalarCost; + } + case Instruction::Load: { + // Cost of wide load - cost of scalar loads. + int ScalarLdCost = VecTy->getNumElements() * + TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); + int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); + return VecLdCost - ScalarLdCost; + } + case Instruction::Store: { + // We know that we can merge the stores. Calculate the cost. 
+ int ScalarStCost = VecTy->getNumElements() * + TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0); + int VecStCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0); + return VecStCost - ScalarStCost; + } + default: + llvm_unreachable("Unknown instruction"); + } +} + +int BoUpSLP::getTreeCost() { + int Cost = 0; + DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << + VectorizableTree.size() << ".\n"); + + for (unsigned i = 0, e = VectorizableTree.size(); i != e; ++i) { + int C = getEntryCost(&VectorizableTree[i]); + DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with " + << *VectorizableTree[i].Scalars[0] << " .\n"); + Cost += C; + } + DEBUG(dbgs() << "SLP: Total Cost " << Cost << ".\n"); + return Cost; +} + +int BoUpSLP::getGatherCost(Type *Ty) { int Cost = 0; for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i) Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); return Cost; } -int FuncSLP::getGatherCost(ArrayRef<Value *> VL) { +int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) { // Find the type of the operands in VL. Type *ScalarTy = VL[0]->getType(); if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) @@ -298,7 +868,7 @@ int FuncSLP::getGatherCost(ArrayRef<Value *> VL) { return getGatherCost(VecTy); } -AliasAnalysis::Location FuncSLP::getLocation(Instruction *I) { +AliasAnalysis::Location BoUpSLP::getLocation(Instruction *I) { if (StoreInst *SI = dyn_cast<StoreInst>(I)) return AA->getLocation(SI); if (LoadInst *LI = dyn_cast<LoadInst>(I)) @@ -306,7 +876,7 @@ AliasAnalysis::Location FuncSLP::getLocation(Instruction *I) { return AliasAnalysis::Location(); } -Value *FuncSLP::getPointerOperand(Value *I) { +Value *BoUpSLP::getPointerOperand(Value *I) { if (LoadInst *LI = dyn_cast<LoadInst>(I)) return LI->getPointerOperand(); if (StoreInst *SI = dyn_cast<StoreInst>(I)) @@ -314,7 +884,7 @@ Value *FuncSLP::getPointerOperand(Value *I) { return 0; } -unsigned FuncSLP::getAddressSpaceOperand(Value *I) { +unsigned BoUpSLP::getAddressSpaceOperand(Value *I) { if (LoadInst *L = dyn_cast<LoadInst>(I)) return L->getPointerAddressSpace(); if (StoreInst *S = dyn_cast<StoreInst>(I)) @@ -322,7 +892,7 @@ unsigned FuncSLP::getAddressSpaceOperand(Value *I) { return -1; } -bool FuncSLP::isConsecutiveAccess(Value *A, Value *B) { +bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) { Value *PtrA = getPointerOperand(A); Value *PtrB = getPointerOperand(B); unsigned ASA = getAddressSpaceOperand(A); @@ -354,7 +924,7 @@ bool FuncSLP::isConsecutiveAccess(Value *A, Value *B) { return ((-Offset) == Sz); } -Value *FuncSLP::getSinkBarrier(Instruction *Src, Instruction *Dst) { +Value *BoUpSLP::getSinkBarrier(Instruction *Src, Instruction *Dst) { assert(Src->getParent() == Dst->getParent() && "Not the same BB"); BasicBlock::iterator I = Src, E = Dst; /// Scan all of the instruction from SRC to DST and check if @@ -379,234 +949,7 @@ Value *FuncSLP::getSinkBarrier(Instruction *Src, Instruction *Dst) { return 0; } -static BasicBlock *getSameBlock(ArrayRef<Value *> VL) { - BasicBlock *BB = 0; - for (int i = 0, e = VL.size(); i < e; i++) { - Instruction *I = dyn_cast<Instruction>(VL[i]); - if (!I) - return 0; - - if (!BB) { - BB = I->getParent(); - continue; - } - - if (BB != I->getParent()) - return 0; - } - return BB; -} - -static bool allConstant(ArrayRef<Value *> VL) { - for (unsigned i = 0, e = VL.size(); i < e; ++i) - if (!isa<Constant>(VL[i])) - return false; - return true; -} - -static bool isSplat(ArrayRef<Value *> VL) { - for 
(unsigned i = 1, e = VL.size(); i < e; ++i) - if (VL[i] != VL[0]) - return false; - return true; -} - -static unsigned getSameOpcode(ArrayRef<Value *> VL) { - unsigned Opcode = 0; - for (int i = 0, e = VL.size(); i < e; i++) { - if (Instruction *I = dyn_cast<Instruction>(VL[i])) { - if (!Opcode) { - Opcode = I->getOpcode(); - continue; - } - if (Opcode != I->getOpcode()) - return 0; - } - } - return Opcode; -} - -static bool CanReuseExtract(ArrayRef<Value *> VL, unsigned VF, - VectorType *VecTy) { - assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode"); - // Check if all of the extracts come from the same vector and from the - // correct offset. - Value *VL0 = VL[0]; - ExtractElementInst *E0 = cast<ExtractElementInst>(VL0); - Value *Vec = E0->getOperand(0); - - // We have to extract from the same vector type. - if (Vec->getType() != VecTy) - return false; - - // Check that all of the indices extract from the correct offset. - ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1)); - if (!CI || CI->getZExtValue()) - return false; - - for (unsigned i = 1, e = VF; i < e; ++i) { - ExtractElementInst *E = cast<ExtractElementInst>(VL[i]); - ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1)); - - if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec) - return false; - } - - return true; -} - -void FuncSLP::getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth) { - if (Depth == RecursionMaxDepth) - return MustGather.insert(VL.begin(), VL.end()); - - // Don't handle vectors. - if (VL[0]->getType()->isVectorTy()) - return; - - if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) - if (SI->getValueOperand()->getType()->isVectorTy()) - return; - - // If all of the operands are identical or constant we have a simple solution. - if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL)) - return MustGather.insert(VL.begin(), VL.end()); - - // Stop the scan at unknown IR. - Instruction *VL0 = dyn_cast<Instruction>(VL[0]); - assert(VL0 && "Invalid instruction"); - - // Mark instructions with multiple users. - int LastIndex = getLastIndex(VL); - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - if (PHINode *PN = dyn_cast<PHINode>(VL[i])) { - unsigned NumUses = 0; - // Check that PHINodes have only one external (non-self) use. - for (Value::use_iterator U = VL[i]->use_begin(), UE = VL[i]->use_end(); - U != UE; ++U) { - // Don't count self uses. - if (*U == PN) - continue; - NumUses++; - } - if (NumUses > 1) { - DEBUG(dbgs() << "SLP: Adding PHI to MultiUserVals " - "because it has " << NumUses << " users:" << *PN << " \n"); - UseInfo UI(VL0, 0); - MultiUserVals[PN] = UI; - } - continue; - } - - Instruction *I = dyn_cast<Instruction>(VL[i]); - // Remember to check if all of the users of this instruction are vectorized - // within our tree. At depth zero we have no local users, only external - // users that we don't care about. - if (Depth && I && I->getNumUses() > 1) { - DEBUG(dbgs() << "SLP: Adding to MultiUserVals " - "because it has " << I->getNumUses() << " users:" << *I << " \n"); - UseInfo UI(VL0, LastIndex); - MultiUserVals[I] = UI; - } - } - - // Check that the instruction is only used within one lane. - for (int i = 0, e = VL.size(); i < e; ++i) { - if (LaneMap.count(VL[i]) && LaneMap[VL[i]] != i) { - DEBUG(dbgs() << "SLP: Value used by multiple lanes:" << *VL[i] << "\n"); - return MustGather.insert(VL.begin(), VL.end()); - } - // Make this instruction as 'seen' and remember the lane. 
- LaneMap[VL[i]] = i; - } - - unsigned Opcode = getSameOpcode(VL); - if (!Opcode) - return MustGather.insert(VL.begin(), VL.end()); - - switch (Opcode) { - case Instruction::PHI: { - PHINode *PH = dyn_cast<PHINode>(VL0); - - // Stop self cycles. - if (VisitedPHIs.count(PH)) - return; - - VisitedPHIs.insert(PH); - for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { - ValueList Operands; - // Prepare the operand vector. - for (unsigned j = 0; j < VL.size(); ++j) - Operands.push_back(cast<PHINode>(VL[j])->getIncomingValue(i)); - - getTreeUses_rec(Operands, Depth + 1); - } - return; - } - case Instruction::ExtractElement: { - VectorType *VecTy = VectorType::get(VL[0]->getType(), VL.size()); - // No need to follow ExtractElements that are going to be optimized away. - if (CanReuseExtract(VL, VL.size(), VecTy)) - return; - // Fall through. - } - case Instruction::Load: - return; - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: - case Instruction::Select: - case Instruction::ICmp: - case Instruction::FCmp: - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { - for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { - ValueList Operands; - // Prepare the operand vector. 
- for (unsigned j = 0; j < VL.size(); ++j) - Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); - - getTreeUses_rec(Operands, Depth + 1); - } - return; - } - case Instruction::Store: { - ValueList Operands; - for (unsigned j = 0; j < VL.size(); ++j) - Operands.push_back(cast<Instruction>(VL[j])->getOperand(0)); - getTreeUses_rec(Operands, Depth + 1); - return; - } - default: - return MustGather.insert(VL.begin(), VL.end()); - } -} - -int FuncSLP::getLastIndex(ArrayRef<Value *> VL) { +int BoUpSLP::getLastIndex(ArrayRef<Value *> VL) { BasicBlock *BB = cast<Instruction>(VL[0])->getParent(); assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block"); BlockNumbering &BN = BlocksNumbers[BB]; @@ -617,7 +960,7 @@ int FuncSLP::getLastIndex(ArrayRef<Value *> VL) { return MaxIdx; } -Instruction *FuncSLP::getLastInstruction(ArrayRef<Value *> VL) { +Instruction *BoUpSLP::getLastInstruction(ArrayRef<Value *> VL) { BasicBlock *BB = cast<Instruction>(VL[0])->getParent(); assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block"); BlockNumbering &BN = BlocksNumbers[BB]; @@ -625,15 +968,17 @@ Instruction *FuncSLP::getLastInstruction(ArrayRef<Value *> VL) { int MaxIdx = BN.getIndex(cast<Instruction>(VL[0])); for (unsigned i = 1, e = VL.size(); i < e; ++i) MaxIdx = std::max(MaxIdx, BN.getIndex(cast<Instruction>(VL[i]))); - return BN.getInstruction(MaxIdx); + Instruction *I = BN.getInstruction(MaxIdx); + assert(I && "bad location"); + return I; } -Instruction *FuncSLP::getInstructionForIndex(unsigned Index, BasicBlock *BB) { +Instruction *BoUpSLP::getInstructionForIndex(unsigned Index, BasicBlock *BB) { BlockNumbering &BN = BlocksNumbers[BB]; return BN.getInstruction(Index); } -int FuncSLP::getFirstUserIndex(ArrayRef<Value *> VL) { +int BoUpSLP::getFirstUserIndex(ArrayRef<Value *> VL) { BasicBlock *BB = getSameBlock(VL); assert(BB && "All instructions must come from the same block"); BlockNumbering &BN = BlocksNumbers[BB]; @@ -654,444 +999,7 @@ int FuncSLP::getFirstUserIndex(ArrayRef<Value *> VL) { return FirstUser; } -int FuncSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) { - Type *ScalarTy = VL[0]->getType(); - - if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) - ScalarTy = SI->getValueOperand()->getType(); - - /// Don't mess with vectors. - if (ScalarTy->isVectorTy()) - return FuncSLP::MAX_COST; - - if (allConstant(VL)) - return 0; - - VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); - - if (isSplat(VL)) - return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); - - int GatherCost = getGatherCost(VecTy); - if (Depth == RecursionMaxDepth || needToGatherAny(VL)) - return GatherCost; - - BasicBlock *BB = getSameBlock(VL); - unsigned Opcode = getSameOpcode(VL); - assert(Opcode && BB && "Invalid Instruction Value"); - - // Check if it is safe to sink the loads or the stores. - if (Opcode == Instruction::Load || Opcode == Instruction::Store) { - int MaxIdx = getLastIndex(VL); - Instruction *Last = getInstructionForIndex(MaxIdx, BB); - - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - if (VL[i] == Last) - continue; - Value *Barrier = getSinkBarrier(cast<Instruction>(VL[i]), Last); - if (Barrier) { - DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " << *Last - << "\n because of " << *Barrier << "\n"); - return MAX_COST; - } - } - } - - // Calculate the extract cost. 
- unsigned ExternalUserExtractCost = 0; - for (unsigned i = 0, e = VL.size(); i < e; ++i) - if (ExtractedLane.count(cast<Instruction>(VL[i]))) - ExternalUserExtractCost += - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i); - - Instruction *VL0 = cast<Instruction>(VL[0]); - switch (Opcode) { - case Instruction::PHI: { - PHINode *PH = dyn_cast<PHINode>(VL0); - - // Stop self cycles. - if (VisitedPHIs.count(PH)) - return 0; - - VisitedPHIs.insert(PH); - int TotalCost = 0; - // Calculate the cost of all of the operands. - for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { - ValueList Operands; - // Prepare the operand vector. - for (unsigned j = 0; j < VL.size(); ++j) - Operands.push_back(cast<PHINode>(VL[j])->getIncomingValue(i)); - - int Cost = getTreeCost_rec(Operands, Depth + 1); - if (Cost == MAX_COST) - return MAX_COST; - TotalCost += TotalCost; - } - - if (TotalCost > GatherCost) { - MustGather.insert(VL.begin(), VL.end()); - return GatherCost; - } - - return TotalCost + ExternalUserExtractCost; - } - case Instruction::ExtractElement: { - if (CanReuseExtract(VL, VL.size(), VecTy)) - return 0; - return getGatherCost(VecTy); - } - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: { - ValueList Operands; - Type *SrcTy = VL0->getOperand(0)->getType(); - // Prepare the operand vector. - for (unsigned j = 0; j < VL.size(); ++j) { - Operands.push_back(cast<Instruction>(VL[j])->getOperand(0)); - // Check that the casted type is the same for all users. - if (cast<Instruction>(VL[j])->getOperand(0)->getType() != SrcTy) - return getGatherCost(VecTy); - } - - int Cost = getTreeCost_rec(Operands, Depth + 1); - if (Cost == MAX_COST) - return MAX_COST; - - // Calculate the cost of this instruction. - int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(), - VL0->getType(), SrcTy); - - VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); - int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy); - Cost += (VecCost - ScalarCost); - - if (Cost > GatherCost) { - MustGather.insert(VL.begin(), VL.end()); - return GatherCost; - } - - return Cost + ExternalUserExtractCost; - } - case Instruction::FCmp: - case Instruction::ICmp: { - // Check that all of the compares have the same predicate. - CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate(); - for (unsigned i = 1, e = VL.size(); i < e; ++i) { - CmpInst *Cmp = cast<CmpInst>(VL[i]); - if (Cmp->getPredicate() != P0) - return getGatherCost(VecTy); - } - // Fall through. - } - case Instruction::Select: - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { - int TotalCost = 0; - // Calculate the cost of all of the operands. - for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { - ValueList Operands; - // Prepare the operand vector. 
- for (unsigned j = 0; j < VL.size(); ++j) - Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); - - int Cost = getTreeCost_rec(Operands, Depth + 1); - if (Cost == MAX_COST) - return MAX_COST; - TotalCost += Cost; - } - - // Calculate the cost of this instruction. - int ScalarCost = 0; - int VecCost = 0; - if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp || - Opcode == Instruction::Select) { - VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); - ScalarCost = - VecTy->getNumElements() * - TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty()); - VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy); - } else { - ScalarCost = VecTy->getNumElements() * - TTI->getArithmeticInstrCost(Opcode, ScalarTy); - VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy); - } - TotalCost += (VecCost - ScalarCost); - - if (TotalCost > GatherCost) { - MustGather.insert(VL.begin(), VL.end()); - return GatherCost; - } - - return TotalCost + ExternalUserExtractCost; - } - case Instruction::Load: { - // If we are scalarize the loads, add the cost of forming the vector. - for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) - if (!isConsecutiveAccess(VL[i], VL[i + 1])) - return getGatherCost(VecTy); - - // Cost of wide load - cost of scalar loads. - int ScalarLdCost = VecTy->getNumElements() * - TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); - int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); - int TotalCost = VecLdCost - ScalarLdCost; - - if (TotalCost > GatherCost) { - MustGather.insert(VL.begin(), VL.end()); - return GatherCost; - } - - return TotalCost + ExternalUserExtractCost; - } - case Instruction::Store: { - // We know that we can merge the stores. Calculate the cost. - int ScalarStCost = VecTy->getNumElements() * - TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0); - int VecStCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0); - int StoreCost = VecStCost - ScalarStCost; - - ValueList Operands; - for (unsigned j = 0; j < VL.size(); ++j) { - Operands.push_back(cast<Instruction>(VL[j])->getOperand(0)); - MemBarrierIgnoreList.insert(VL[j]); - } - - int Cost = getTreeCost_rec(Operands, Depth + 1); - if (Cost == MAX_COST) - return MAX_COST; - - int TotalCost = StoreCost + Cost; - return TotalCost + ExternalUserExtractCost; - } - default: - // Unable to vectorize unknown instructions. - return getGatherCost(VecTy); - } -} - -int FuncSLP::getTreeCost(ArrayRef<Value *> VL) { - // Get rid of the list of stores that were removed, and from the - // lists of instructions with multiple users. - MemBarrierIgnoreList.clear(); - LaneMap.clear(); - MultiUserVals.clear(); - ExtractedLane.clear(); - MustGather.clear(); - VisitedPHIs.clear(); - - if (!getSameBlock(VL)) - return MAX_COST; - - // Find the location of the last root. - int LastRootIndex = getLastIndex(VL); - int FirstUserIndex = getFirstUserIndex(VL); - - // Don't vectorize if there are users of the tree roots inside the tree - // itself. - if (LastRootIndex > FirstUserIndex) - return MAX_COST; - - // Scan the tree and find which value is used by which lane, and which values - // must be scalarized. - getTreeUses_rec(VL, 0); - - // Check that instructions with multiple users can be vectorized. Mark - // unsafe instructions. 
- for (MapVector<Instruction *, UseInfo>::iterator UI = MultiUserVals.begin(), - e = MultiUserVals.end(); UI != e; ++UI) { - Instruction *Scalar = UI->first; - - if (MustGather.count(Scalar)) - continue; - - assert(LaneMap.count(Scalar) && "Unknown scalar"); - int ScalarLane = LaneMap[Scalar]; - - bool ExternalUse = false; - // Check that all of the users of this instr are within the tree. - for (Value::use_iterator Usr = Scalar->use_begin(), - UE = Scalar->use_end(); Usr != UE; ++Usr) { - // If this user is within the tree, make sure it is from the same lane. - // Notice that we have both in-tree and out-of-tree users. - if (LaneMap.count(*Usr)) { - if (LaneMap[*Usr] != ScalarLane) { - DEBUG(dbgs() << "SLP: Adding to MustExtract " - "because of an out-of-lane usage.\n"); - MustGather.insert(Scalar); - break; - } - continue; - } - - // We have an out-of-tree user. Check if we can place an 'extract'. - Instruction *User = cast<Instruction>(*Usr); - // We care about the order only if the user is in the same block. - if (User->getParent() == Scalar->getParent()) { - int LastLoc = UI->second.LastIndex; - BlockNumbering &BN = BlocksNumbers[User->getParent()]; - int UserIdx = BN.getIndex(User); - if (UserIdx <= LastLoc) { - DEBUG(dbgs() << "SLP: Adding to MustExtract because of an external " - "user that we can't schedule.\n"); - MustGather.insert(Scalar); - break; - } - } - // We have an external user. - ExternalUse = true; - } - - if (ExternalUse) { - // Items that are left in MultiUserVals are to be extracted. - // ExtractLane is used for the lookup. - ExtractedLane.insert(Scalar); - } - - } - - // Now calculate the cost of vectorizing the tree. - return getTreeCost_rec(VL, 0); -} -bool FuncSLP::vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold) { - unsigned ChainLen = Chain.size(); - DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen - << "\n"); - Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType(); - unsigned Sz = DL->getTypeSizeInBits(StoreTy); - unsigned VF = MinVecRegSize / Sz; - - if (!isPowerOf2_32(Sz) || VF < 2) - return false; - - bool Changed = false; - // Look for profitable vectorizable trees at all offsets, starting at zero. - for (unsigned i = 0, e = ChainLen; i < e; ++i) { - if (i + VF > e) - break; - DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i - << "\n"); - ArrayRef<Value *> Operands = Chain.slice(i, VF); - - int Cost = getTreeCost(Operands); - if (Cost == FuncSLP::MAX_COST) - continue; - DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); - if (Cost < CostThreshold) { - DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); - vectorizeTree(Operands); - - // Remove the scalar stores. - for (int j = 0, e = VF; j < e; ++j) - cast<Instruction>(Operands[j])->eraseFromParent(); - - // Move to the next bundle. - i += VF - 1; - Changed = true; - } - } - - if (Changed || ChainLen > VF) - return Changed; - - // Handle short chains. This helps us catch types such as <3 x float> that - // are smaller than vector size. - int Cost = getTreeCost(Chain); - if (Cost == FuncSLP::MAX_COST) - return false; - if (Cost < CostThreshold) { - DEBUG(dbgs() << "SLP: Found store chain cost = " << Cost - << " for size = " << ChainLen << "\n"); - vectorizeTree(Chain); - - // Remove all of the scalar stores. 
- for (int i = 0, e = Chain.size(); i < e; ++i) - cast<Instruction>(Chain[i])->eraseFromParent(); - - return true; - } - - return false; -} - -bool FuncSLP::vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold) { - SetVector<Value *> Heads, Tails; - SmallDenseMap<Value *, Value *> ConsecutiveChain; - - // We may run into multiple chains that merge into a single chain. We mark the - // stores that we vectorized so that we don't visit the same store twice. - ValueSet VectorizedStores; - bool Changed = false; - - // Do a quadratic search on all of the given stores and find - // all of the pairs of loads that follow each other. - for (unsigned i = 0, e = Stores.size(); i < e; ++i) - for (unsigned j = 0; j < e; ++j) { - if (i == j) - continue; - - if (isConsecutiveAccess(Stores[i], Stores[j])) { - Tails.insert(Stores[j]); - Heads.insert(Stores[i]); - ConsecutiveChain[Stores[i]] = Stores[j]; - } - } - - // For stores that start but don't end a link in the chain: - for (SetVector<Value *>::iterator it = Heads.begin(), e = Heads.end(); - it != e; ++it) { - if (Tails.count(*it)) - continue; - - // We found a store instr that starts a chain. Now follow the chain and try - // to vectorize it. - ValueList Operands; - Value *I = *it; - // Collect the chain into a list. - while (Tails.count(I) || Heads.count(I)) { - if (VectorizedStores.count(I)) - break; - Operands.push_back(I); - // Move to the next value in the chain. - I = ConsecutiveChain[I]; - } - - bool Vectorized = vectorizeStoreChain(Operands, costThreshold); - - // Mark the vectorized stores so that we don't vectorize them again. - if (Vectorized) - VectorizedStores.insert(Operands.begin(), Operands.end()); - Changed |= Vectorized; - } - - return Changed; -} - -Value *FuncSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) { +Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) { Value *Vec = UndefValue::get(Ty); // Generate the 'InsertElement' instruction. 
for (unsigned i = 0; i < Ty->getNumElements(); ++i) { @@ -1103,282 +1011,292 @@ Value *FuncSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) { return Vec; } -Value *FuncSLP::vectorizeTree_rec(ArrayRef<Value *> VL) { - BuilderLocGuard Guard(Builder); +Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { + if (ScalarToTreeEntry.count(VL[0])) { + int Idx = ScalarToTreeEntry[VL[0]]; + TreeEntry *E = &VectorizableTree[Idx]; + if (E->isSame(VL)) + return vectorizeTree(E); + } Type *ScalarTy = VL[0]->getType(); if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) ScalarTy = SI->getValueOperand()->getType(); VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); - if (needToGatherAny(VL)) - return Gather(VL, VecTy); + return Gather(VL, VecTy); +} + +Value *BoUpSLP::vectorizeTree(TreeEntry *E) { + BuilderLocGuard Guard(Builder); - if (VectorizedValues.count(VL[0])) { - DEBUG(dbgs() << "SLP: Diamond merged at depth.\n"); - return VectorizedValues[VL[0]]; + if (E->VectorizedValue) { + DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); + return E->VectorizedValue; } - Instruction *VL0 = cast<Instruction>(VL[0]); - unsigned Opcode = VL0->getOpcode(); - assert(Opcode == getSameOpcode(VL) && "Invalid opcode"); + Type *ScalarTy = E->Scalars[0]->getType(); + if (StoreInst *SI = dyn_cast<StoreInst>(E->Scalars[0])) + ScalarTy = SI->getValueOperand()->getType(); + VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size()); - switch (Opcode) { - case Instruction::PHI: { - PHINode *PH = dyn_cast<PHINode>(VL0); - Builder.SetInsertPoint(PH->getParent()->getFirstInsertionPt()); - PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); - VectorizedValues[VL0] = NewPhi; + if (E->NeedToGather) { + return Gather(E->Scalars, VecTy); + } - for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { - ValueList Operands; - BasicBlock *IBB = PH->getIncomingBlock(i); + Instruction *VL0 = cast<Instruction>(E->Scalars[0]); + unsigned Opcode = VL0->getOpcode(); + assert(Opcode == getSameOpcode(E->Scalars) && "Invalid opcode"); - // Prepare the operand vector. - for (unsigned j = 0; j < VL.size(); ++j) - Operands.push_back(cast<PHINode>(VL[j])->getIncomingValueForBlock(IBB)); + switch (Opcode) { + case Instruction::PHI: { + PHINode *PH = dyn_cast<PHINode>(VL0); + Builder.SetInsertPoint(PH->getParent()->getFirstInsertionPt()); + PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); + E->VectorizedValue = NewPhi; + + for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { + ValueList Operands; + BasicBlock *IBB = PH->getIncomingBlock(i); + + // Prepare the operand vector. 
+ for (unsigned j = 0; j < E->Scalars.size(); ++j) + Operands.push_back(cast<PHINode>(E->Scalars[j])-> + getIncomingValueForBlock(IBB)); + + Builder.SetInsertPoint(IBB->getTerminator()); + Value *Vec = vectorizeTree(Operands); + NewPhi->addIncoming(Vec, IBB); + } - Builder.SetInsertPoint(IBB->getTerminator()); - Value *Vec = vectorizeTree_rec(Operands); - NewPhi->addIncoming(Vec, IBB); + assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() && + "Invalid number of incoming values"); + return NewPhi; } - assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() && - "Invalid number of incoming values"); - return NewPhi; - } - - case Instruction::ExtractElement: { - if (CanReuseExtract(VL, VL.size(), VecTy)) - return VL0->getOperand(0); - return Gather(VL, VecTy); - } - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: { - ValueList INVL; - for (int i = 0, e = VL.size(); i < e; ++i) - INVL.push_back(cast<Instruction>(VL[i])->getOperand(0)); - - Builder.SetInsertPoint(getLastInstruction(VL)); - Value *InVec = vectorizeTree_rec(INVL); - CastInst *CI = dyn_cast<CastInst>(VL0); - Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); - VectorizedValues[VL0] = V; - return V; - } - case Instruction::FCmp: - case Instruction::ICmp: { - // Check that all of the compares have the same predicate. - CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate(); - for (unsigned i = 1, e = VL.size(); i < e; ++i) { - CmpInst *Cmp = cast<CmpInst>(VL[i]); - if (Cmp->getPredicate() != P0) - return Gather(VL, VecTy); + case Instruction::ExtractElement: { + if (CanReuseExtract(E->Scalars)) { + Value *V = VL0->getOperand(0); + E->VectorizedValue = V; + return V; + } + return Gather(E->Scalars, VecTy); } - - ValueList LHSV, RHSV; - for (int i = 0, e = VL.size(); i < e; ++i) { - LHSV.push_back(cast<Instruction>(VL[i])->getOperand(0)); - RHSV.push_back(cast<Instruction>(VL[i])->getOperand(1)); + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + ValueList INVL; + for (int i = 0, e = E->Scalars.size(); i < e; ++i) + INVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); + + Builder.SetInsertPoint(getLastInstruction(E->Scalars)); + Value *InVec = vectorizeTree(INVL); + CastInst *CI = dyn_cast<CastInst>(VL0); + Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); + E->VectorizedValue = V; + return V; } + case Instruction::FCmp: + case Instruction::ICmp: { + ValueList LHSV, RHSV; + for (int i = 0, e = E->Scalars.size(); i < e; ++i) { + LHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); + RHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); + } - Builder.SetInsertPoint(getLastInstruction(VL)); - Value *L = vectorizeTree_rec(LHSV); - Value *R = vectorizeTree_rec(RHSV); - Value *V; + Builder.SetInsertPoint(getLastInstruction(E->Scalars)); + Value *L = vectorizeTree(LHSV); + Value *R = vectorizeTree(RHSV); + Value *V; - if (Opcode == Instruction::FCmp) - V = Builder.CreateFCmp(P0, L, 
R);
-    else
-      V = Builder.CreateICmp(P0, L, R);
+    CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate();
+    if (Opcode == Instruction::FCmp)
+      V = Builder.CreateFCmp(P0, L, R);
+    else
+      V = Builder.CreateICmp(P0, L, R);
-    VectorizedValues[VL0] = V;
-    return V;
-  }
-  case Instruction::Select: {
-    ValueList TrueVec, FalseVec, CondVec;
-    for (int i = 0, e = VL.size(); i < e; ++i) {
-      CondVec.push_back(cast<Instruction>(VL[i])->getOperand(0));
-      TrueVec.push_back(cast<Instruction>(VL[i])->getOperand(1));
-      FalseVec.push_back(cast<Instruction>(VL[i])->getOperand(2));
+    E->VectorizedValue = V;
+    return V;
  }
+  case Instruction::Select: {
+    ValueList TrueVec, FalseVec, CondVec;
+    for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
+      CondVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
+      TrueVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
+      FalseVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(2));
+    }
-    Builder.SetInsertPoint(getLastInstruction(VL));
-    Value *True = vectorizeTree_rec(TrueVec);
-    Value *False = vectorizeTree_rec(FalseVec);
-    Value *Cond = vectorizeTree_rec(CondVec);
-    Value *V = Builder.CreateSelect(Cond, True, False);
-    VectorizedValues[VL0] = V;
-    return V;
-  }
-  case Instruction::Add:
-  case Instruction::FAdd:
-  case Instruction::Sub:
-  case Instruction::FSub:
-  case Instruction::Mul:
-  case Instruction::FMul:
-  case Instruction::UDiv:
-  case Instruction::SDiv:
-  case Instruction::FDiv:
-  case Instruction::URem:
-  case Instruction::SRem:
-  case Instruction::FRem:
-  case Instruction::Shl:
-  case Instruction::LShr:
-  case Instruction::AShr:
-  case Instruction::And:
-  case Instruction::Or:
-  case Instruction::Xor: {
-    ValueList LHSVL, RHSVL;
-    for (int i = 0, e = VL.size(); i < e; ++i) {
-      LHSVL.push_back(cast<Instruction>(VL[i])->getOperand(0));
-      RHSVL.push_back(cast<Instruction>(VL[i])->getOperand(1));
+    Builder.SetInsertPoint(getLastInstruction(E->Scalars));
+    Value *Cond = vectorizeTree(CondVec);
+    Value *True = vectorizeTree(TrueVec);
+    Value *False = vectorizeTree(FalseVec);
+    Value *V = Builder.CreateSelect(Cond, True, False);
+    E->VectorizedValue = V;
+    return V;
  }
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    ValueList LHSVL, RHSVL;
+    for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
+      LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
+      RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
+    }
-    Builder.SetInsertPoint(getLastInstruction(VL));
-    Value *LHS = vectorizeTree_rec(LHSVL);
-    Value *RHS = vectorizeTree_rec(RHSVL);
+    Builder.SetInsertPoint(getLastInstruction(E->Scalars));
+    Value *LHS = vectorizeTree(LHSVL);
+    Value *RHS = vectorizeTree(RHSVL);
-    if (LHS == RHS) {
-      assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order");
-    }
+    if (LHS == RHS && isa<Instruction>(LHS)) {
+      assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order");
+    }
-    BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
-    Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
-    VectorizedValues[VL0] = V;
-    return V;
-  }
-  case Instruction::Load: {
-    // Check if all of the loads are consecutive.
-    for (unsigned i = 1, e = VL.size(); i < e; ++i)
-      if (!isConsecutiveAccess(VL[i - 1], VL[i]))
-        return Gather(VL, VecTy);
-
-    // Loads are inserted at the head of the tree because we don't want to
-    // sink them all the way down past store instructions.
-    Builder.SetInsertPoint(getLastInstruction(VL));
-    LoadInst *LI = cast<LoadInst>(VL0);
-    Value *VecPtr =
-        Builder.CreateBitCast(LI->getPointerOperand(), VecTy->getPointerTo());
-    unsigned Alignment = LI->getAlignment();
-    LI = Builder.CreateLoad(VecPtr);
-    LI->setAlignment(Alignment);
-
-    VectorizedValues[VL0] = LI;
-    return LI;
+    BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
+    Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
+    E->VectorizedValue = V;
+    return V;
+  }
+  case Instruction::Load: {
+    // Loads are inserted at the head of the tree because we don't want to
+    // sink them all the way down past store instructions.
+    Builder.SetInsertPoint(getLastInstruction(E->Scalars));
+    LoadInst *LI = cast<LoadInst>(VL0);
+    Value *VecPtr =
+        Builder.CreateBitCast(LI->getPointerOperand(), VecTy->getPointerTo());
+    unsigned Alignment = LI->getAlignment();
+    LI = Builder.CreateLoad(VecPtr);
+    LI->setAlignment(Alignment);
+    E->VectorizedValue = LI;
+    return LI;
+  }
+  case Instruction::Store: {
+    StoreInst *SI = cast<StoreInst>(VL0);
+    unsigned Alignment = SI->getAlignment();
+
+    ValueList ValueOp;
+    for (int i = 0, e = E->Scalars.size(); i < e; ++i)
+      ValueOp.push_back(cast<StoreInst>(E->Scalars[i])->getValueOperand());
+
+    Builder.SetInsertPoint(getLastInstruction(E->Scalars));
+    Value *VecValue = vectorizeTree(ValueOp);
+    Value *VecPtr =
+        Builder.CreateBitCast(SI->getPointerOperand(), VecTy->getPointerTo());
+    StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
+    S->setAlignment(Alignment);
+    E->VectorizedValue = S;
+    return S;
+  }
+  default:
+    llvm_unreachable("unknown inst");
  }
-  case Instruction::Store: {
-    StoreInst *SI = cast<StoreInst>(VL0);
-    unsigned Alignment = SI->getAlignment();
+  return 0;
+}
-    ValueList ValueOp;
-    for (int i = 0, e = VL.size(); i < e; ++i)
-      ValueOp.push_back(cast<StoreInst>(VL[i])->getValueOperand());
+void BoUpSLP::vectorizeTree() {
+  vectorizeTree(&VectorizableTree[0]);
-    Value *VecValue = vectorizeTree_rec(ValueOp);
+  // For each vectorized value:
+  for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
+    TreeEntry *Entry = &VectorizableTree[EIdx];
-    Builder.SetInsertPoint(getLastInstruction(VL));
-    Value *VecPtr =
-        Builder.CreateBitCast(SI->getPointerOperand(), VecTy->getPointerTo());
-    Builder.CreateStore(VecValue, VecPtr)->setAlignment(Alignment);
-    return 0;
-  }
-  default:
-    return Gather(VL, VecTy);
-  }
-}
+    // For each lane:
+    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
+      Value *Scalar = Entry->Scalars[Lane];
-Value *FuncSLP::vectorizeTree(ArrayRef<Value *> VL) {
-  Builder.SetInsertPoint(getLastInstruction(VL));
-  Value *V = vectorizeTree_rec(VL);
+      // No need to handle users of gathered values.
+      if (Entry->NeedToGather)
+        continue;
-  DEBUG(dbgs() << "SLP: Placing 'extracts'\n");
-  for (SetVector<Instruction*>::iterator it = ExtractedLane.begin(), e =
-       ExtractedLane.end(); it != e; ++it) {
-    Instruction *Scalar = *it;
-    DEBUG(dbgs() << "SLP: Looking at " << *Scalar);
+      Value *Vec = Entry->VectorizedValue;
+      assert(Vec && "Can't find vectorizable value");
-    if (!Scalar)
-      continue;
+      SmallVector<User*, 16> Users(Scalar->use_begin(), Scalar->use_end());
-    Instruction *Loc = 0;
+      for (SmallVector<User*, 16>::iterator User = Users.begin(),
+           UE = Users.end(); User != UE; ++User) {
+        DEBUG(dbgs() << "SLP: \tupdating user " << **User << ".\n");
-    assert(MultiUserVals.count(Scalar) && "Can't find the lane to extract");
-    Instruction *Leader = MultiUserVals[Scalar].Leader;
+        bool Gathered = MustGather.count(*User);
-    // This value is gathered so we don't need to extract from anywhere.
-    if (!VectorizedValues.count(Leader))
-      continue;
+        // Skip in-tree scalars that become vectors.
+        if (ScalarToTreeEntry.count(*User) && !Gathered) {
+          DEBUG(dbgs() << "SLP: \tUser will be removed soon:" <<
+                **User << ".\n");
+          int Idx = ScalarToTreeEntry[*User]; (void) Idx;
+          assert(!VectorizableTree[Idx].NeedToGather && "bad state ?");
+          continue;
+        }
-    Value *Vec = VectorizedValues[Leader];
-    if (PHINode *PN = dyn_cast<PHINode>(Vec)) {
-      Loc = PN->getParent()->getFirstInsertionPt();
-    } else {
-      Instruction *I = cast<Instruction>(Vec);
-      BasicBlock::iterator L = *I;
-      Loc = ++L;
-    }
+        if (!isa<Instruction>(*User))
+          continue;
-    Builder.SetInsertPoint(Loc);
-    assert(LaneMap.count(Scalar) && "Can't find the extracted lane.");
-    int Lane = LaneMap[Scalar];
-    Value *Idx = Builder.getInt32(Lane);
-    Value *Extract = Builder.CreateExtractElement(Vec, Idx);
+        // Generate extracts for out-of-tree users.
+        // Find the insertion point for the extractelement lane.
+        Instruction *Loc = 0;
+        if (PHINode *PN = dyn_cast<PHINode>(Vec)) {
+          Loc = PN->getParent()->getFirstInsertionPt();
+        } else if (Instruction *Iv = dyn_cast<Instruction>(Vec)){
+          Loc = ++((BasicBlock::iterator)*Iv);
+        } else {
+          Loc = F->getEntryBlock().begin();
+        }
-    bool Replaced = false;;
-    for (Value::use_iterator U = Scalar->use_begin(), UE = Scalar->use_end();
-         U != UE; ++U) {
-      Instruction *UI = cast<Instruction>(*U);
-      // No need to replace instructions that are inside our lane map.
-      if (LaneMap.count(UI))
-        continue;
+        Builder.SetInsertPoint(Loc);
+        Value *Ex = Builder.CreateExtractElement(Vec, Builder.getInt32(Lane));
+        (*User)->replaceUsesOfWith(Scalar, Ex);
+        DEBUG(dbgs() << "SLP: \tupdated user:" << **User << ".\n");
+      }
-      UI->replaceUsesOfWith(Scalar ,Extract);
-      Replaced = true;
+      Type *Ty = Scalar->getType();
+      if (!Ty->isVoidTy()) {
+        for (Value::use_iterator User = Scalar->use_begin(), UE = Scalar->use_end();
+             User != UE; ++User) {
+          DEBUG(dbgs() << "SLP: \tvalidating user:" << **User << ".\n");
+          assert(!MustGather.count(*User) &&
+                 "Replacing gathered value with undef");
+          assert(ScalarToTreeEntry.count(*User) &&
+                 "Replacing out-of-tree value with undef");
+        }
+        Value *Undef = UndefValue::get(Ty);
+        Scalar->replaceAllUsesWith(Undef);
+      }
+      DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
+      cast<Instruction>(Scalar)->eraseFromParent();
    }
-    assert(Replaced && "Must replace at least one outside user");
-    (void)Replaced;
  }
-  // We moved some instructions around. We have to number them again
-  // before we can do any analysis.
-  forgetNumbering();
-
-  // Clear the state.
-  MustGather.clear();
-  VisitedPHIs.clear();
-  VectorizedValues.clear();
-  MemBarrierIgnoreList.clear();
-  return V;
-}
-
-Value *FuncSLP::vectorizeArith(ArrayRef<Value *> Operands) {
-  Instruction *LastInst = getLastInstruction(Operands);
-  Value *Vec = vectorizeTree(Operands);
-  // After vectorizing the operands we need to generate extractelement
-  // instructions and replace all of the uses of the scalar values with
-  // the values that we extracted from the vectorized tree.
-  Builder.SetInsertPoint(LastInst);
-  for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
-    Value *S = Builder.CreateExtractElement(Vec, Builder.getInt32(i));
-    Operands[i]->replaceAllUsesWith(S);
+  for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) {
+    BlocksNumbers[it].forget();
  }
-
-  forgetNumbering();
-  return Vec;
}
-void FuncSLP::optimizeGatherSequence() {
+void BoUpSLP::optimizeGatherSequence() {
+  DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
+        << " gather sequences instructions.\n");
  // LICM InsertElementInst sequences.
  for (SetVector<Instruction *>::iterator it = GatherSeq.begin(),
       e = GatherSeq.end(); it != e; ++it) {
@@ -1449,8 +1367,6 @@ void FuncSLP::optimizeGatherSequence() {
    assert((*v)->getNumUses() == 0 && "Can't remove instructions with uses");
    (*v)->eraseFromParent();
  }
-
-  forgetNumbering();
}

/// The SLPVectorizer Pass.
@@ -1492,7 +1408,7 @@ struct SLPVectorizer : public FunctionPass {
    // Use the bottom up slp vectorizer to construct chains that start with
    // the store instructions.
-    FuncSLP R(&F, SE, DL, TTI, AA, LI, DT);
+    BoUpSLP R(&F, SE, DL, TTI, AA, LI, DT);
    // Scan the blocks in the function in post order.
    for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()),
@@ -1536,31 +1452,146 @@ private:
  /// object. We sort the stores to their base objects to reduce the cost of the
  /// quadratic search on the stores. TODO: We can further reduce this cost
  /// if we flush the chain creation every time we run into a memory barrier.
-  unsigned collectStores(BasicBlock *BB, FuncSLP &R);
+  unsigned collectStores(BasicBlock *BB, BoUpSLP &R);
  /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
-  bool tryToVectorizePair(Value *A, Value *B, FuncSLP &R);
+  bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
  /// \brief Try to vectorize a list of operands. If \p NeedExtracts is true
  /// then we calculate the cost of extracting the scalars from the vector.
  /// \returns true if a value was vectorized.
-  bool tryToVectorizeList(ArrayRef<Value *> VL, FuncSLP &R, bool NeedExtracts);
+  bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, bool NeedExtracts);
  /// \brief Try to vectorize a chain that may start at the operands of \p V.
-  bool tryToVectorize(BinaryOperator *V, FuncSLP &R);
+  bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
  /// \brief Vectorize the stores that were collected in StoreRefs.
-  bool vectorizeStoreChains(FuncSLP &R);
+  bool vectorizeStoreChains(BoUpSLP &R);
  /// \brief Scan the basic block and look for patterns that are likely to start
  /// a vectorization chain.
-  bool vectorizeChainsInBlock(BasicBlock *BB, FuncSLP &R);
+  bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
+
+  bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
+                           BoUpSLP &R);
+  bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
+                       BoUpSLP &R);
private:
  StoreListMap StoreRefs;
};
-unsigned SLPVectorizer::collectStores(BasicBlock *BB, FuncSLP &R) {
+bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
+                                        int CostThreshold, BoUpSLP &R) {
+  unsigned ChainLen = Chain.size();
+  DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
+        << "\n");
+  Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
+  unsigned Sz = DL->getTypeSizeInBits(StoreTy);
+  unsigned VF = MinVecRegSize / Sz;
+
+  if (!isPowerOf2_32(Sz) || VF < 2)
+    return false;
+
+  bool Changed = false;
+  // Look for profitable vectorizable trees at all offsets, starting at zero.
+  for (unsigned i = 0, e = ChainLen; i < e; ++i) {
+    if (i + VF > e)
+      break;
+    DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
+          << "\n");
+    ArrayRef<Value *> Operands = Chain.slice(i, VF);
+
+    R.buildTree(Operands);
+
+    int Cost = R.getTreeCost();
+
+    DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
+    if (Cost < CostThreshold) {
+      DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
+      R.vectorizeTree();
+
+      // Move to the next bundle.
+      i += VF - 1;
+      Changed = true;
+    }
+  }
+
+  if (Changed || ChainLen > VF)
+    return Changed;
+
+  // Handle short chains. This helps us catch types such as <3 x float> that
+  // are smaller than vector size.
+  R.buildTree(Chain);
+
+  int Cost = R.getTreeCost();
+
+  if (Cost < CostThreshold) {
+    DEBUG(dbgs() << "SLP: Found store chain cost = " << Cost
+          << " for size = " << ChainLen << "\n");
+    R.vectorizeTree();
+    return true;
+  }
+
+  return false;
+}
+
+bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
+                                    int costThreshold, BoUpSLP &R) {
+  SetVector<Value *> Heads, Tails;
+  SmallDenseMap<Value *, Value *> ConsecutiveChain;
+
+  // We may run into multiple chains that merge into a single chain. We mark the
+  // stores that we vectorized so that we don't visit the same store twice.
+  BoUpSLP::ValueSet VectorizedStores;
+  bool Changed = false;
+
+  // Do a quadratic search on all of the given stores and find
+  // all of the pairs of loads that follow each other.
+  for (unsigned i = 0, e = Stores.size(); i < e; ++i)
+    for (unsigned j = 0; j < e; ++j) {
+      if (i == j)
+        continue;
+
+      if (R.isConsecutiveAccess(Stores[i], Stores[j])) {
+        Tails.insert(Stores[j]);
+        Heads.insert(Stores[i]);
+        ConsecutiveChain[Stores[i]] = Stores[j];
+      }
+    }
+
+  // For stores that start but don't end a link in the chain:
+  for (SetVector<Value *>::iterator it = Heads.begin(), e = Heads.end();
+       it != e; ++it) {
+    if (Tails.count(*it))
+      continue;
+
+    // We found a store instr that starts a chain. Now follow the chain and try
+    // to vectorize it.
+    BoUpSLP::ValueList Operands;
+    Value *I = *it;
+    // Collect the chain into a list.
+    while (Tails.count(I) || Heads.count(I)) {
+      if (VectorizedStores.count(I))
+        break;
+      Operands.push_back(I);
+      // Move to the next value in the chain.
+      I = ConsecutiveChain[I];
+    }
+
+    bool Vectorized = vectorizeStoreChain(Operands, costThreshold, R);
+
+    // Mark the vectorized stores so that we don't vectorize them again.
+    if (Vectorized)
+      VectorizedStores.insert(Operands.begin(), Operands.end());
+    Changed |= Vectorized;
+  }
+
+  return Changed;
+}
+
+
+unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
  unsigned count = 0;
  StoreRefs.clear();
  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
@@ -1585,14 +1616,14 @@ unsigned SLPVectorizer::collectStores(BasicBlock *BB, FuncSLP &R) {
  return count;
}
-bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, FuncSLP &R) {
+bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
  if (!A || !B)
    return false;
  Value *VL[] = { A, B };
  return tryToVectorizeList(VL, R, true);
}
-bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, FuncSLP &R,
+bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                       bool NeedExtracts) {
  if (VL.size() < 2)
    return false;
@@ -1615,9 +1646,8 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, FuncSLP &R,
    return 0;
  }
-  int Cost = R.getTreeCost(VL);
-  if (Cost == FuncSLP::MAX_COST)
-    return false;
+  R.buildTree(VL);
+  int Cost = R.getTreeCost();
  int ExtrCost = NeedExtracts ? R.getGatherCost(VL) : 0;
  DEBUG(dbgs() << "SLP: Cost of pair:" << Cost
@@ -1625,11 +1655,11 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, FuncSLP &R,
  if ((Cost + ExtrCost) >= -SLPCostThreshold)
    return false;
  DEBUG(dbgs() << "SLP: Vectorizing pair.\n");
-  R.vectorizeArith(VL);
+  R.vectorizeTree();
  return true;
}
-bool SLPVectorizer::tryToVectorize(BinaryOperator *V, FuncSLP &R) {
+bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
  if (!V)
    return false;
@@ -1669,7 +1699,7 @@ bool SLPVectorizer::tryToVectorize(BinaryOperator *V, FuncSLP &R) {
  return 0;
}
-bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, FuncSLP &R) {
+bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
    if (isa<DbgInfoIntrinsic>(it))
@@ -1737,7 +1767,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, FuncSLP &R) {
  return Changed;
}
-bool SLPVectorizer::vectorizeStoreChains(FuncSLP &R) {
+bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Attempt to sort and vectorize each of the store-groups.
  for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
@@ -1748,7 +1778,7 @@ bool SLPVectorizer::vectorizeStoreChains(FuncSLP &R) {
    DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
          << it->second.size() << ".\n");
-    Changed |= R.vectorizeStores(it->second, -SLPCostThreshold);
+    Changed |= vectorizeStores(it->second, -SLPCostThreshold, R);
  }
  return Changed;
}
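The control flow added in vectorizeStoreChain above is easiest to see in isolation: slide a window of VF consecutive stores over the chain, ask the tree builder for a cost, and commit a bundle only when the cost beats the threshold, then jump past the committed stores. The standalone sketch below mirrors that loop with a made-up cost model; the names (toyCost, vectorizeChain) and the cost function are illustrative assumptions, not LLVM's TTI-based getTreeCost or any part of the pass.

```cpp
// Illustrative sketch only: mimics the sliding-window decision loop of
// SLPVectorizer::vectorizeStoreChain with a toy cost model. Negative cost
// means "profitable", matching the convention used by getTreeCost().
#include <cstdio>
#include <vector>

// Pretend cost of turning VF adjacent "stores" starting at offset i into one
// wide store: a fixed overhead minus a per-lane saving (hypothetical numbers).
static int toyCost(const std::vector<int> &Chain, unsigned i, unsigned VF) {
  int Cost = 2;                  // fixed packing/shuffle overhead
  for (unsigned k = 0; k < VF; ++k)
    Cost -= Chain[i + k];        // each lane contributes its saving
  return Cost;
}

// Walk the chain at every offset; when a window is profitable, "vectorize" it
// and step past it, exactly like the `i += VF - 1` advance in the pass.
static unsigned vectorizeChain(const std::vector<int> &Chain, unsigned VF,
                               int CostThreshold) {
  unsigned Bundles = 0;
  for (unsigned i = 0, e = Chain.size(); i + VF <= e; ++i) {
    int Cost = toyCost(Chain, i, VF);
    std::printf("offset %u: cost=%d\n", i, Cost);
    if (Cost < CostThreshold) {  // profitable: commit this bundle
      ++Bundles;
      i += VF - 1;               // skip the stores we just consumed
    }
  }
  return Bundles;
}

int main() {
  // Per-store savings for a hypothetical chain of eight stores.
  std::vector<int> Chain = {1, 1, 1, 1, 0, 0, 1, 1};
  unsigned Bundles = vectorizeChain(Chain, /*VF=*/4, /*CostThreshold=*/0);
  std::printf("vectorized %u bundle(s)\n", Bundles);
  return 0;
}
```

In the real pass the commit step is R.buildTree(Operands) followed by R.getTreeCost() and R.vectorizeTree(), and a final whole-chain attempt handles short chains such as <3 x float>; the sketch only reproduces the window-and-threshold shape of that decision.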