17 files changed, 1686 insertions, 1019 deletions
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 85c01bd..d2e7450 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -59,7 +59,7 @@ static const unsigned RecursionMaxDepth = 12; class BuilderLocGuard { public: BuilderLocGuard(IRBuilder<> &B) : Builder(B), Loc(B.GetInsertPoint()) {} - ~BuilderLocGuard() { Builder.SetInsertPoint(Loc); } + ~BuilderLocGuard() { if (Loc) Builder.SetInsertPoint(Loc); } private: // Prevent copying. @@ -91,6 +91,7 @@ struct BlockNumbering { } int getIndex(Instruction *I) { + assert(I->getParent() == BB && "Invalid instruction"); if (!Valid) numberInstructions(); assert(InstrIdx.count(I) && "Unknown instruction"); @@ -117,72 +118,169 @@ private: std::vector<Instruction *> InstrVec; }; -class FuncSLP { +/// \returns the parent basic block if all of the instructions in \p VL +/// are in the same block or null otherwise. +static BasicBlock *getSameBlock(ArrayRef<Value *> VL) { + Instruction *I0 = dyn_cast<Instruction>(VL[0]); + if (!I0) + return 0; + BasicBlock *BB = I0->getParent(); + for (int i = 1, e = VL.size(); i < e; i++) { + Instruction *I = dyn_cast<Instruction>(VL[i]); + if (!I) + return 0; + + if (BB != I->getParent()) + return 0; + } + return BB; +} + +/// \returns True if all of the values in \p VL are constants. +static bool allConstant(ArrayRef<Value *> VL) { + for (unsigned i = 0, e = VL.size(); i < e; ++i) + if (!isa<Constant>(VL[i])) + return false; + return true; +} + +/// \returns True if all of the values in \p VL are identical. +static bool isSplat(ArrayRef<Value *> VL) { + for (unsigned i = 1, e = VL.size(); i < e; ++i) + if (VL[i] != VL[0]) + return false; + return true; +} + +/// \returns The opcode if all of the Instructions in \p VL have the same +/// opcode, or zero. +static unsigned getSameOpcode(ArrayRef<Value *> VL) { + Instruction *I0 = dyn_cast<Instruction>(VL[0]); + if (!I0) + return 0; + unsigned Opcode = I0->getOpcode(); + for (int i = 1, e = VL.size(); i < e; i++) { + Instruction *I = dyn_cast<Instruction>(VL[i]); + if (!I || Opcode != I->getOpcode()) + return 0; + } + return Opcode; } + +/// \returns The type that all of the values in \p VL have or null if there +/// are different types. +static Type* getSameType(ArrayRef<Value *> VL) { + Type *Ty = VL[0]->getType(); + for (int i = 1, e = VL.size(); i < e; i++) + if (VL[i]->getType() != Ty) + return 0; + + return Ty; +} + +/// \returns True if the ExtractElement instructions in VL can be vectorized +/// to use the original vector. +static bool CanReuseExtract(ArrayRef<Value *> VL) { + assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode"); + // Check if all of the extracts come from the same vector and from the + // correct offset. + Value *VL0 = VL[0]; + ExtractElementInst *E0 = cast<ExtractElementInst>(VL0); + Value *Vec = E0->getOperand(0); + + // We have to extract from the same vector type. + unsigned NElts = Vec->getType()->getVectorNumElements(); + + if (NElts != VL.size()) + return false; + + // Check that all of the indices extract from the correct offset.
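+ // The first extract must use index zero, and each later extract must use the next consecutive lane of the same source vector.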
+ ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1)); + if (!CI || CI->getZExtValue()) + return false; + + for (unsigned i = 1, e = VL.size(); i < e; ++i) { + ExtractElementInst *E = cast<ExtractElementInst>(VL[i]); + ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1)); + + if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec) + return false; + } + + return true; +} + +/// Bottom Up SLP Vectorizer. +class BoUpSLP { +public: typedef SmallVector<Value *, 8> ValueList; typedef SmallVector<Instruction *, 16> InstrList; typedef SmallPtrSet<Value *, 16> ValueSet; typedef SmallVector<StoreInst *, 8> StoreList; -public: - static const int MAX_COST = INT_MIN; - - FuncSLP(Function *Func, ScalarEvolution *Se, DataLayout *Dl, - TargetTransformInfo *Tti, AliasAnalysis *Aa, LoopInfo *Li, + BoUpSLP(Function *Func, ScalarEvolution *Se, DataLayout *Dl, + TargetTransformInfo *Tti, AliasAnalysis *Aa, LoopInfo *Li, DominatorTree *Dt) : F(Func), SE(Se), DL(Dl), TTI(Tti), AA(Aa), LI(Li), DT(Dt), Builder(Se->getContext()) { - for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) { - BasicBlock *BB = it; - BlocksNumbers[BB] = BlockNumbering(BB); + // Set up the block numbering utility for all of the blocks in the + // function. + for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) { + BasicBlock *BB = it; + BlocksNumbers[BB] = BlockNumbering(BB); + } } - } - - /// \brief Take the pointer operand from the Load/Store instruction. - /// \returns NULL if this is not a valid Load/Store instruction. - static Value *getPointerOperand(Value *I); - - /// \brief Take the address space operand from the Load/Store instruction. - /// \returns -1 if this is not a valid Load/Store instruction. - static unsigned getAddressSpaceOperand(Value *I); - - /// \returns true if the memory operations A and B are consecutive. - bool isConsecutiveAccess(Value *A, Value *B); - /// \brief Vectorize the tree that starts with the elements in \p VL. - /// \returns the vectorized value. - Value *vectorizeTree(ArrayRef<Value *> VL); + /// \brief Vectorize the tree that was built by buildTree. + void vectorizeTree(); - /// \returns the vectorization cost of the subtree that starts at \p VL. + /// \returns the vectorization cost of the tree that was built by buildTree. /// A negative number means that this is profitable. - int getTreeCost(ArrayRef<Value *> VL); + int getTreeCost(); + + /// Construct a vectorizable tree that starts at \p Roots. + void buildTree(ArrayRef<Value *> Roots); + + /// Clear the internal data structures that are created by 'buildTree'. + void deleteTree() { + VectorizableTree.clear(); + ScalarToTreeEntry.clear(); + MustGather.clear(); + MemBarrierIgnoreList.clear(); + } /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the /// roots. This method calculates the cost of extracting the values. int getGatherCost(ArrayRef<Value *> VL); - /// \brief Attempts to order and vectorize a sequence of stores. This - /// function does a quadratic scan of the given stores. - /// \returns true if the basic block was modified. - bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold); + /// \returns true if the memory operations A and B are consecutive. + bool isConsecutiveAccess(Value *A, Value *B); - /// \brief Vectorize a group of scalars into a vector tree. - /// \returns the vectorized value. - Value *vectorizeArith(ArrayRef<Value *> Operands); + /// \brief Perform LICM and CSE on the newly generated gather sequences.
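+ /// Gather sequences that only use loop-invariant values are hoisted out of their loop, and identical gather sequences are merged.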
+ void optimizeGatherSequence(); +private: + struct TreeEntry; - /// \brief This method contains the recursive part of getTreeCost. - int getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth); + /// \returns the cost of the vectorizable entry. + int getEntryCost(TreeEntry *E); - /// \brief This recursive method looks for vectorization hazards such as - /// values that are used by multiple users and checks that values are used - /// by only one vector lane. It updates the variables LaneMap, MultiUserVals. - void getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth); + /// This is the recursive part of buildTree. + void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth); - /// \brief This method contains the recursive part of vectorizeTree. - Value *vectorizeTree_rec(ArrayRef<Value *> VL); + /// Vectorize a single entry in the tree. + Value *vectorizeTree(TreeEntry *E); - /// \brief Vectorize a sorted sequence of stores. - bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold); + /// Vectorize a single entry in the tree, starting at \p VL. + Value *vectorizeTree(ArrayRef<Value *> VL); + + /// \brief Take the pointer operand from the Load/Store instruction. + /// \returns NULL if this is not a valid Load/Store instruction. + static Value *getPointerOperand(Value *I); + + /// \brief Take the address space operand from the Load/Store instruction. + /// \returns -1 if this is not a valid Load/Store instruction. + static unsigned getAddressSpaceOperand(Value *I); /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. @@ -211,57 +309,65 @@ public: /// \returns a vector from a collection of scalars in \p VL. Value *Gather(ArrayRef<Value *> VL, VectorType *Ty); - /// \brief Perform LICM and CSE on the newly generated gather sequences. - void optimizeGatherSequence(); + struct TreeEntry { + TreeEntry() : Scalars(), VectorizedValue(0), LastScalarIndex(0), + NeedToGather(0) {} - bool needToGatherAny(ArrayRef<Value *> VL) { - for (int i = 0, e = VL.size(); i < e; ++i) - if (MustGather.count(VL[i])) - return true; - return false; - } + /// \returns true if the scalars in VL are equal to this entry. + bool isSame(ArrayRef<Value *> VL) { + assert(VL.size() == Scalars.size() && "Invalid size"); + for (int i = 0, e = VL.size(); i != e; ++i) + if (VL[i] != Scalars[i]) + return false; + return true; + } + + /// A vector of scalars. + ValueList Scalars; + + /// The Scalars are vectorized into this value. It is initialized to Null. + Value *VectorizedValue; + + /// The index in the basic block of the last scalar. + int LastScalarIndex; - void forgetNumbering() { - for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) - BlocksNumbers[it].forget(); + /// Do we need to gather this sequence? + bool NeedToGather; + }; + + /// Create a new VectorizableTree entry.
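+ /// Vectorizable entries map each scalar to their tree index in ScalarToTreeEntry; gather entries record their scalars in MustGather instead.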
+ TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) { + VectorizableTree.push_back(TreeEntry()); + int idx = VectorizableTree.size() - 1; + TreeEntry *Last = &VectorizableTree[idx]; + Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); + Last->NeedToGather = !Vectorized; + if (Vectorized) { + Last->LastScalarIndex = getLastIndex(VL); + for (int i = 0, e = VL.size(); i != e; ++i) { + assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!"); + ScalarToTreeEntry[VL[i]] = idx; + } + } else { + Last->LastScalarIndex = 0; + MustGather.insert(VL.begin(), VL.end()); + } + return Last; } /// -- Vectorization State -- + /// Holds all of the tree entries. + std::vector<TreeEntry> VectorizableTree; - /// Maps values in the tree to the vector lanes that uses them. This map must - /// be reset between runs of getCost. - std::map<Value *, int> LaneMap; - /// A list of instructions to ignore while sinking - /// memory instructions. This map must be reset between runs of getCost. - ValueSet MemBarrierIgnoreList; - - /// Maps between the first scalar to the vector. This map must be reset - /// between runs. - DenseMap<Value *, Value *> VectorizedValues; + /// Maps a specific scalar to its tree entry. + SmallDenseMap<Value*, int> ScalarToTreeEntry; - /// Contains values that must be gathered because they are used - /// by multiple lanes, or by users outside the tree. - /// NOTICE: The vectorization methods also use this set. + /// A list of scalars that we found that we need to keep as scalars. ValueSet MustGather; - /// Contains PHINodes that are being processed. We use this data structure - /// to stop cycles in the graph. - ValueSet VisitedPHIs; - - /// Contains a list of values that are used outside the current tree, the - /// first element in the bundle and the insertion point for extracts. This - /// set must be reset between runs. - struct UseInfo{ - UseInfo(Instruction *VL0, int I) : - Leader(VL0), LastIndex(I) {} - UseInfo() : Leader(0), LastIndex(0) {} - /// The first element in the bundle. - Instruction *Leader; - /// The insertion index. - int LastIndex; - }; - MapVector<Instruction*, UseInfo> MultiUserVals; - SetVector<Instruction*> ExtractedLane; + /// A list of instructions to ignore while sinking + /// memory instructions. This map must be reset between runs of getCost. + ValueSet MemBarrierIgnoreList; /// Holds all of the instructions that we gathered. SetVector<Instruction *> GatherSeq; @@ -281,14 +387,478 @@ public: IRBuilder<> Builder; }; -int FuncSLP::getGatherCost(Type *Ty) { +void BoUpSLP::buildTree(ArrayRef<Value *> Roots) { + deleteTree(); + buildTree_rec(Roots, 0); +} + + +void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { + bool SameTy = getSameType(VL); (void)SameTy; + assert(SameTy && "Invalid types!"); + + if (Depth == RecursionMaxDepth) { + DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); + newTreeEntry(VL, false); + return; + } + + // Don't handle vectors. + if (VL[0]->getType()->isVectorTy()) { + DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); + newTreeEntry(VL, false); + return; + } + + if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) + if (SI->getValueOperand()->getType()->isVectorTy()) { + DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); + newTreeEntry(VL, false); + return; + } + + // If all of the operands are identical or constant we have a simple solution. 
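+ // Constants and splats are cheap to materialize with a gather, and bundles that span blocks or mix opcodes cannot form a single vector instruction.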
+ if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || + !getSameOpcode(VL)) { + DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); + newTreeEntry(VL, false); + return; + } + + // We now know that this is a vector of instructions of the same type from + // the same block. + + // Check if this is a duplicate of another entry. + if (ScalarToTreeEntry.count(VL[0])) { + int Idx = ScalarToTreeEntry[VL[0]]; + TreeEntry *E = &VectorizableTree[Idx]; + for (unsigned i = 0, e = VL.size(); i != e; ++i) { + DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n"); + if (E->Scalars[i] != VL[i]) { + DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); + newTreeEntry(VL, false); + return; + } + } + DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n"); + return; + } + + // Check that none of the instructions in the bundle are already in the tree. + for (unsigned i = 0, e = VL.size(); i != e; ++i) { + if (ScalarToTreeEntry.count(VL[i])) { + DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << + ") is already in tree.\n"); + newTreeEntry(VL, false); + return; + } + } + + // If any of the scalars appears in the table OR it is marked as a value that + // needs to stay scalar then we need to gather the scalars. + for (unsigned i = 0, e = VL.size(); i != e; ++i) { + if (ScalarToTreeEntry.count(VL[i]) || MustGather.count(VL[i])) { + DEBUG(dbgs() << "SLP: Gathering due to gathered scalar. \n"); + newTreeEntry(VL, false); + return; + } + } + + // Check that all of the users of the scalars that we want to vectorize are + // schedulable. + Instruction *VL0 = cast<Instruction>(VL[0]); + int MyLastIndex = getLastIndex(VL); + BasicBlock *BB = cast<Instruction>(VL0)->getParent(); + + for (unsigned i = 0, e = VL.size(); i != e; ++i) { + Instruction *Scalar = cast<Instruction>(VL[i]); + DEBUG(dbgs() << "SLP: Checking users of " << *Scalar << ". \n"); + for (Value::use_iterator U = Scalar->use_begin(), UE = Scalar->use_end(); + U != UE; ++U) { + DEBUG(dbgs() << "SLP: \tUser " << **U << ". \n"); + Instruction *User = dyn_cast<Instruction>(*U); + if (!User) { + DEBUG(dbgs() << "SLP: Gathering due to unknown user. \n"); + newTreeEntry(VL, false); + return; + } + + // We don't care if the user is in a different basic block. + BasicBlock *UserBlock = User->getParent(); + if (UserBlock != BB) { + DEBUG(dbgs() << "SLP: User from a different basic block " + << *User << ". \n"); + continue; + } + + // If this is a PHINode within this basic block then we can place the + // extract wherever we want. + if (isa<PHINode>(*User)) { + DEBUG(dbgs() << "SLP: \tWe can schedule PHIs:" << *User << ". \n"); + continue; + } + + // Check if this is a safe in-tree user. + if (ScalarToTreeEntry.count(User)) { + int Idx = ScalarToTreeEntry[User]; + int VecLocation = VectorizableTree[Idx].LastScalarIndex; + if (VecLocation <= MyLastIndex) { + DEBUG(dbgs() << "SLP: Gathering due to unschedulable vector. \n"); + newTreeEntry(VL, false); + return; + } + DEBUG(dbgs() << "SLP: In-tree user (" << *User << ") at #" << + VecLocation << " vector value (" << *Scalar << ") at #" + << MyLastIndex << ".\n"); + continue; + } + + // Make sure that we can schedule this unknown user. + BlockNumbering &BN = BlocksNumbers[BB]; + int UserIndex = BN.getIndex(User); + if (UserIndex < MyLastIndex) { + + DEBUG(dbgs() << "SLP: Can't schedule extractelement for " + << *User << ". \n"); + newTreeEntry(VL, false); + return; + } + } + } + + // Check that every instruction appears once in this bundle.
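+ // A scalar that appears twice would have to occupy two different lanes of the same vector value.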
+ for (unsigned i = 0, e = VL.size(); i < e; ++i) + for (unsigned j = i+1; j < e; ++j) + if (VL[i] == VL[j]) { + DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); + newTreeEntry(VL, false); + return; + } + + // Check that instructions in this bundle don't reference other instructions. + // The runtime of this check is O(N * N-1 * uses(N)) and a typical N is 4. + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + for (Value::use_iterator U = VL[i]->use_begin(), UE = VL[i]->use_end(); + U != UE; ++U) { + for (unsigned j = 0; j < e; ++j) { + if (i != j && *U == VL[j]) { + DEBUG(dbgs() << "SLP: Intra-bundle dependencies!" << **U << ". \n"); + newTreeEntry(VL, false); + return; + } + } + } + } + + DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); + + unsigned Opcode = getSameOpcode(VL); + + // Check if it is safe to sink the loads or the stores. + if (Opcode == Instruction::Load || Opcode == Instruction::Store) { + Instruction *Last = getLastInstruction(VL); + + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + if (VL[i] == Last) + continue; + Value *Barrier = getSinkBarrier(cast<Instruction>(VL[i]), Last); + if (Barrier) { + DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " << *Last + << "\n because of " << *Barrier << ". Gathering.\n"); + newTreeEntry(VL, false); + return; + } + } + } + + switch (Opcode) { + case Instruction::PHI: { + PHINode *PH = dyn_cast<PHINode>(VL0); + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); + + for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast<PHINode>(VL[j])->getIncomingValue(i)); + + buildTree_rec(Operands, Depth + 1); + } + return; + } + case Instruction::ExtractElement: { + bool Reuse = CanReuseExtract(VL); + if (Reuse) { + DEBUG(dbgs() << "SLP: Reusing extract sequence.\n"); + } + newTreeEntry(VL, Reuse); + return; + } + case Instruction::Load: { + // Check if the loads are consecutive or if we need to swizzle them. + for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) + if (!isConsecutiveAccess(VL[i], VL[i + 1])) { + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: Need to swizzle loads.\n"); + return; + } + + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of loads.\n"); + return; + } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + Type *SrcTy = VL0->getOperand(0)->getType(); + for (unsigned i = 0; i < VL.size(); ++i) { + Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType(); + if (Ty != SrcTy || Ty->isAggregateType() || Ty->isVectorTy()) { + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); + return; + } + } + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of casts.\n"); + + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); + + buildTree_rec(Operands, Depth+1); + } + return; + } + case Instruction::ICmp: + case Instruction::FCmp: { + // Check that all of the compares have the same predicate.
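+ // A vector compare applies a single predicate to all lanes, so mixed predicates force a gather.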
+ CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate(); + for (unsigned i = 1, e = VL.size(); i < e; ++i) { + CmpInst *Cmp = cast<CmpInst>(VL[i]); + if (Cmp->getPredicate() != P0) { + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); + return; + } + } + + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of compares.\n"); + + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); + + buildTree_rec(Operands, Depth+1); + } + return; + } + case Instruction::Select: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); + + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); + + buildTree_rec(Operands, Depth+1); + } + return; + } + case Instruction::Store: { + // Check if the stores are consecutive or if we need to swizzle them. + for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) + if (!isConsecutiveAccess(VL[i], VL[i + 1])) { + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: Non consecutive store.\n"); + return; + } + + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of stores.\n"); + + ValueList Operands; + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast<Instruction>(VL[j])->getOperand(0)); + + // We can ignore these values because we are sinking them down.
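+ // The scalar stores are replaced by one wide store at the position of the last one, so they cannot act as sink barriers for their own bundle.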
+ MemBarrierIgnoreList.insert(VL.begin(), VL.end()); + buildTree_rec(Operands, Depth + 1); + return; + } + default: + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); + return; + } +} + +int BoUpSLP::getEntryCost(TreeEntry *E) { + ArrayRef<Value*> VL = E->Scalars; + + Type *ScalarTy = VL[0]->getType(); + if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) + ScalarTy = SI->getValueOperand()->getType(); + VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + + if (E->NeedToGather) { + if (allConstant(VL)) + return 0; + if (isSplat(VL)) { + return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); + } + return getGatherCost(E->Scalars); + } + + assert(getSameOpcode(VL) && getSameType(VL) && getSameBlock(VL) && + "Invalid VL"); + Instruction *VL0 = cast<Instruction>(VL[0]); + unsigned Opcode = VL0->getOpcode(); + switch (Opcode) { + case Instruction::PHI: { + return 0; + } + case Instruction::ExtractElement: { + if (CanReuseExtract(VL)) + return 0; + return getGatherCost(VecTy); + } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + Type *SrcTy = VL0->getOperand(0)->getType(); + + // Calculate the cost of this instruction. + int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(), + VL0->getType(), SrcTy); + + VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); + int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy); + return VecCost - ScalarCost; + } + case Instruction::FCmp: + case Instruction::ICmp: + case Instruction::Select: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + // Calculate the cost of this instruction. + int ScalarCost = 0; + int VecCost = 0; + if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp || + Opcode == Instruction::Select) { + VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); + ScalarCost = VecTy->getNumElements() * + TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty()); + VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy); + } else { + ScalarCost = VecTy->getNumElements() * + TTI->getArithmeticInstrCost(Opcode, ScalarTy); + VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy); + } + return VecCost - ScalarCost; + } + case Instruction::Load: { + // Cost of wide load - cost of scalar loads. + int ScalarLdCost = VecTy->getNumElements() * + TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); + int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0); + return VecLdCost - ScalarLdCost; + } + case Instruction::Store: { + // We know that we can merge the stores. Calculate the cost.
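+ // As with loads: one wide store replaces getNumElements() scalar stores.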
+ int ScalarStCost = VecTy->getNumElements() * + TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0); + int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0); + return VecStCost - ScalarStCost; + } + default: + llvm_unreachable("Unknown instruction"); + } +} + +int BoUpSLP::getTreeCost() { + int Cost = 0; + DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << + VectorizableTree.size() << ".\n"); + + for (unsigned i = 0, e = VectorizableTree.size(); i != e; ++i) { + int C = getEntryCost(&VectorizableTree[i]); + DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with " + << *VectorizableTree[i].Scalars[0] << " .\n"); + Cost += C; + } + DEBUG(dbgs() << "SLP: Total Cost " << Cost << ".\n"); + return Cost; +} + +int BoUpSLP::getGatherCost(Type *Ty) { int Cost = 0; for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i) Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); return Cost; } -int FuncSLP::getGatherCost(ArrayRef<Value *> VL) { +int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) { // Find the type of the operands in VL. Type *ScalarTy = VL[0]->getType(); if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) @@ -298,7 +868,7 @@ int FuncSLP::getGatherCost(ArrayRef<Value *> VL) { return getGatherCost(VecTy); } -AliasAnalysis::Location FuncSLP::getLocation(Instruction *I) { +AliasAnalysis::Location BoUpSLP::getLocation(Instruction *I) { if (StoreInst *SI = dyn_cast<StoreInst>(I)) return AA->getLocation(SI); if (LoadInst *LI = dyn_cast<LoadInst>(I)) @@ -306,7 +876,7 @@ AliasAnalysis::Location FuncSLP::getLocation(Instruction *I) { return AliasAnalysis::Location(); } -Value *FuncSLP::getPointerOperand(Value *I) { +Value *BoUpSLP::getPointerOperand(Value *I) { if (LoadInst *LI = dyn_cast<LoadInst>(I)) return LI->getPointerOperand(); if (StoreInst *SI = dyn_cast<StoreInst>(I)) @@ -314,7 +884,7 @@ Value *FuncSLP::getPointerOperand(Value *I) { return 0; } -unsigned FuncSLP::getAddressSpaceOperand(Value *I) { +unsigned BoUpSLP::getAddressSpaceOperand(Value *I) { if (LoadInst *L = dyn_cast<LoadInst>(I)) return L->getPointerAddressSpace(); if (StoreInst *S = dyn_cast<StoreInst>(I)) @@ -322,7 +892,7 @@ unsigned FuncSLP::getAddressSpaceOperand(Value *I) { return -1; } -bool FuncSLP::isConsecutiveAccess(Value *A, Value *B) { +bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) { Value *PtrA = getPointerOperand(A); Value *PtrB = getPointerOperand(B); unsigned ASA = getAddressSpaceOperand(A); @@ -354,7 +924,7 @@ bool FuncSLP::isConsecutiveAccess(Value *A, Value *B) { return ((-Offset) == Sz); } -Value *FuncSLP::getSinkBarrier(Instruction *Src, Instruction *Dst) { +Value *BoUpSLP::getSinkBarrier(Instruction *Src, Instruction *Dst) { assert(Src->getParent() == Dst->getParent() && "Not the same BB"); BasicBlock::iterator I = Src, E = Dst; /// Scan all of the instruction from SRC to DST and check if @@ -379,234 +949,7 @@ Value *FuncSLP::getSinkBarrier(Instruction *Src, Instruction *Dst) { return 0; } -static BasicBlock *getSameBlock(ArrayRef<Value *> VL) { - BasicBlock *BB = 0; - for (int i = 0, e = VL.size(); i < e; i++) { - Instruction *I = dyn_cast<Instruction>(VL[i]); - if (!I) - return 0; - - if (!BB) { - BB = I->getParent(); - continue; - } - - if (BB != I->getParent()) - return 0; - } - return BB; -} - -static bool allConstant(ArrayRef<Value *> VL) { - for (unsigned i = 0, e = VL.size(); i < e; ++i) - if (!isa<Constant>(VL[i])) - return false; - return true; -} - -static bool isSplat(ArrayRef<Value *> VL) { - for
(unsigned i = 1, e = VL.size(); i < e; ++i) - if (VL[i] != VL[0]) - return false; - return true; -} - -static unsigned getSameOpcode(ArrayRef<Value *> VL) { - unsigned Opcode = 0; - for (int i = 0, e = VL.size(); i < e; i++) { - if (Instruction *I = dyn_cast<Instruction>(VL[i])) { - if (!Opcode) { - Opcode = I->getOpcode(); - continue; - } - if (Opcode != I->getOpcode()) - return 0; - } - } - return Opcode; -} - -static bool CanReuseExtract(ArrayRef<Value *> VL, unsigned VF, - VectorType *VecTy) { - assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode"); - // Check if all of the extracts come from the same vector and from the - // correct offset. - Value *VL0 = VL[0]; - ExtractElementInst *E0 = cast<ExtractElementInst>(VL0); - Value *Vec = E0->getOperand(0); - - // We have to extract from the same vector type. - if (Vec->getType() != VecTy) - return false; - - // Check that all of the indices extract from the correct offset. - ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1)); - if (!CI || CI->getZExtValue()) - return false; - - for (unsigned i = 1, e = VF; i < e; ++i) { - ExtractElementInst *E = cast<ExtractElementInst>(VL[i]); - ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1)); - - if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec) - return false; - } - - return true; -} - -void FuncSLP::getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth) { - if (Depth == RecursionMaxDepth) - return MustGather.insert(VL.begin(), VL.end()); - - // Don't handle vectors. - if (VL[0]->getType()->isVectorTy()) - return; - - if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) - if (SI->getValueOperand()->getType()->isVectorTy()) - return; - - // If all of the operands are identical or constant we have a simple solution. - if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL)) - return MustGather.insert(VL.begin(), VL.end()); - - // Stop the scan at unknown IR. - Instruction *VL0 = dyn_cast<Instruction>(VL[0]); - assert(VL0 && "Invalid instruction"); - - // Mark instructions with multiple users. - int LastIndex = getLastIndex(VL); - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - if (PHINode *PN = dyn_cast<PHINode>(VL[i])) { - unsigned NumUses = 0; - // Check that PHINodes have only one external (non-self) use. - for (Value::use_iterator U = VL[i]->use_begin(), UE = VL[i]->use_end(); - U != UE; ++U) { - // Don't count self uses. - if (*U == PN) - continue; - NumUses++; - } - if (NumUses > 1) { - DEBUG(dbgs() << "SLP: Adding PHI to MultiUserVals " - "because it has " << NumUses << " users:" << *PN << " \n"); - UseInfo UI(VL0, 0); - MultiUserVals[PN] = UI; - } - continue; - } - - Instruction *I = dyn_cast<Instruction>(VL[i]); - // Remember to check if all of the users of this instruction are vectorized - // within our tree. At depth zero we have no local users, only external - // users that we don't care about. - if (Depth && I && I->getNumUses() > 1) { - DEBUG(dbgs() << "SLP: Adding to MultiUserVals " - "because it has " << I->getNumUses() << " users:" << *I << " \n"); - UseInfo UI(VL0, LastIndex); - MultiUserVals[I] = UI; - } - } - - // Check that the instruction is only used within one lane. - for (int i = 0, e = VL.size(); i < e; ++i) { - if (LaneMap.count(VL[i]) && LaneMap[VL[i]] != i) { - DEBUG(dbgs() << "SLP: Value used by multiple lanes:" << *VL[i] << "\n"); - return MustGather.insert(VL.begin(), VL.end()); - } - // Make this instruction as 'seen' and remember the lane. 
- LaneMap[VL[i]] = i; - } - - unsigned Opcode = getSameOpcode(VL); - if (!Opcode) - return MustGather.insert(VL.begin(), VL.end()); - - switch (Opcode) { - case Instruction::PHI: { - PHINode *PH = dyn_cast<PHINode>(VL0); - - // Stop self cycles. - if (VisitedPHIs.count(PH)) - return; - - VisitedPHIs.insert(PH); - for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { - ValueList Operands; - // Prepare the operand vector. - for (unsigned j = 0; j < VL.size(); ++j) - Operands.push_back(cast<PHINode>(VL[j])->getIncomingValue(i)); - - getTreeUses_rec(Operands, Depth + 1); - } - return; - } - case Instruction::ExtractElement: { - VectorType *VecTy = VectorType::get(VL[0]->getType(), VL.size()); - // No need to follow ExtractElements that are going to be optimized away. - if (CanReuseExtract(VL, VL.size(), VecTy)) - return; - // Fall through. - } - case Instruction::Load: - return; - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: - case Instruction::Select: - case Instruction::ICmp: - case Instruction::FCmp: - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { - for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { - ValueList Operands; - // Prepare the operand vector. 
- for (unsigned j = 0; j < VL.size(); ++j) - Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); - - getTreeUses_rec(Operands, Depth + 1); - } - return; - } - case Instruction::Store: { - ValueList Operands; - for (unsigned j = 0; j < VL.size(); ++j) - Operands.push_back(cast<Instruction>(VL[j])->getOperand(0)); - getTreeUses_rec(Operands, Depth + 1); - return; - } - default: - return MustGather.insert(VL.begin(), VL.end()); - } -} - -int FuncSLP::getLastIndex(ArrayRef<Value *> VL) { +int BoUpSLP::getLastIndex(ArrayRef<Value *> VL) { BasicBlock *BB = cast<Instruction>(VL[0])->getParent(); assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block"); BlockNumbering &BN = BlocksNumbers[BB]; @@ -617,7 +960,7 @@ int FuncSLP::getLastIndex(ArrayRef<Value *> VL) { return MaxIdx; } -Instruction *FuncSLP::getLastInstruction(ArrayRef<Value *> VL) { +Instruction *BoUpSLP::getLastInstruction(ArrayRef<Value *> VL) { BasicBlock *BB = cast<Instruction>(VL[0])->getParent(); assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block"); BlockNumbering &BN = BlocksNumbers[BB]; @@ -625,15 +968,17 @@ Instruction *FuncSLP::getLastInstruction(ArrayRef<Value *> VL) { int MaxIdx = BN.getIndex(cast<Instruction>(VL[0])); for (unsigned i = 1, e = VL.size(); i < e; ++i) MaxIdx = std::max(MaxIdx, BN.getIndex(cast<Instruction>(VL[i]))); - return BN.getInstruction(MaxIdx); + Instruction *I = BN.getInstruction(MaxIdx); + assert(I && "bad location"); + return I; } -Instruction *FuncSLP::getInstructionForIndex(unsigned Index, BasicBlock *BB) { +Instruction *BoUpSLP::getInstructionForIndex(unsigned Index, BasicBlock *BB) { BlockNumbering &BN = BlocksNumbers[BB]; return BN.getInstruction(Index); } -int FuncSLP::getFirstUserIndex(ArrayRef<Value *> VL) { +int BoUpSLP::getFirstUserIndex(ArrayRef<Value *> VL) { BasicBlock *BB = getSameBlock(VL); assert(BB && "All instructions must come from the same block"); BlockNumbering &BN = BlocksNumbers[BB]; @@ -654,444 +999,7 @@ int FuncSLP::getFirstUserIndex(ArrayRef<Value *> VL) { return FirstUser; } -int FuncSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) { - Type *ScalarTy = VL[0]->getType(); - - if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) - ScalarTy = SI->getValueOperand()->getType(); - - /// Don't mess with vectors. - if (ScalarTy->isVectorTy()) - return FuncSLP::MAX_COST; - - if (allConstant(VL)) - return 0; - - VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); - - if (isSplat(VL)) - return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); - - int GatherCost = getGatherCost(VecTy); - if (Depth == RecursionMaxDepth || needToGatherAny(VL)) - return GatherCost; - - BasicBlock *BB = getSameBlock(VL); - unsigned Opcode = getSameOpcode(VL); - assert(Opcode && BB && "Invalid Instruction Value"); - - // Check if it is safe to sink the loads or the stores. - if (Opcode == Instruction::Load || Opcode == Instruction::Store) { - int MaxIdx = getLastIndex(VL); - Instruction *Last = getInstructionForIndex(MaxIdx, BB); - - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - if (VL[i] == Last) - continue; - Value *Barrier = getSinkBarrier(cast<Instruction>(VL[i]), Last); - if (Barrier) { - DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " << *Last - << "\n because of " << *Barrier << "\n"); - return MAX_COST; - } - } - } - - // Calculate the extract cost. 
- unsigned ExternalUserExtractCost = 0; - for (unsigned i = 0, e = VL.size(); i < e; ++i) - if (ExtractedLane.count(cast<Instruction>(VL[i]))) - ExternalUserExtractCost += - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i); - - Instruction *VL0 = cast<Instruction>(VL[0]); - switch (Opcode) { - case Instruction::PHI: { - PHINode *PH = dyn_cast<PHINode>(VL0); - - // Stop self cycles. - if (VisitedPHIs.count(PH)) - return 0; - - VisitedPHIs.insert(PH); - int TotalCost = 0; - // Calculate the cost of all of the operands. - for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { - ValueList Operands; - // Prepare the operand vector. - for (unsigned j = 0; j < VL.size(); ++j) - Operands.push_back(cast<PHINode>(VL[j])->getIncomingValue(i)); - - int Cost = getTreeCost_rec(Operands, Depth + 1); - if (Cost == MAX_COST) - return MAX_COST; - TotalCost += TotalCost; - } - - if (TotalCost > GatherCost) { - MustGather.insert(VL.begin(), VL.end()); - return GatherCost; - } - - return TotalCost + ExternalUserExtractCost; - } - case Instruction::ExtractElement: { - if (CanReuseExtract(VL, VL.size(), VecTy)) - return 0; - return getGatherCost(VecTy); - } - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: { - ValueList Operands; - Type *SrcTy = VL0->getOperand(0)->getType(); - // Prepare the operand vector. - for (unsigned j = 0; j < VL.size(); ++j) { - Operands.push_back(cast<Instruction>(VL[j])->getOperand(0)); - // Check that the casted type is the same for all users. - if (cast<Instruction>(VL[j])->getOperand(0)->getType() != SrcTy) - return getGatherCost(VecTy); - } - - int Cost = getTreeCost_rec(Operands, Depth + 1); - if (Cost == MAX_COST) - return MAX_COST; - - // Calculate the cost of this instruction. - int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(), - VL0->getType(), SrcTy); - - VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); - int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy); - Cost += (VecCost - ScalarCost); - - if (Cost > GatherCost) { - MustGather.insert(VL.begin(), VL.end()); - return GatherCost; - } - - return Cost + ExternalUserExtractCost; - } - case Instruction::FCmp: - case Instruction::ICmp: { - // Check that all of the compares have the same predicate. - CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate(); - for (unsigned i = 1, e = VL.size(); i < e; ++i) { - CmpInst *Cmp = cast<CmpInst>(VL[i]); - if (Cmp->getPredicate() != P0) - return getGatherCost(VecTy); - } - // Fall through. - } - case Instruction::Select: - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { - int TotalCost = 0; - // Calculate the cost of all of the operands. - for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { - ValueList Operands; - // Prepare the operand vector. 
- for (unsigned j = 0; j < VL.size(); ++j) - Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); - - int Cost = getTreeCost_rec(Operands, Depth + 1); - if (Cost == MAX_COST) - return MAX_COST; - TotalCost += Cost; - } - - // Calculate the cost of this instruction. - int ScalarCost = 0; - int VecCost = 0; - if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp || - Opcode == Instruction::Select) { - VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); - ScalarCost = - VecTy->getNumElements() * - TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty()); - VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy); - } else { - ScalarCost = VecTy->getNumElements() * - TTI->getArithmeticInstrCost(Opcode, ScalarTy); - VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy); - } - TotalCost += (VecCost - ScalarCost); - - if (TotalCost > GatherCost) { - MustGather.insert(VL.begin(), VL.end()); - return GatherCost; - } - - return TotalCost + ExternalUserExtractCost; - } - case Instruction::Load: { - // If we are scalarize the loads, add the cost of forming the vector. - for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) - if (!isConsecutiveAccess(VL[i], VL[i + 1])) - return getGatherCost(VecTy); - - // Cost of wide load - cost of scalar loads. - int ScalarLdCost = VecTy->getNumElements() * - TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); - int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); - int TotalCost = VecLdCost - ScalarLdCost; - - if (TotalCost > GatherCost) { - MustGather.insert(VL.begin(), VL.end()); - return GatherCost; - } - - return TotalCost + ExternalUserExtractCost; - } - case Instruction::Store: { - // We know that we can merge the stores. Calculate the cost. - int ScalarStCost = VecTy->getNumElements() * - TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0); - int VecStCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0); - int StoreCost = VecStCost - ScalarStCost; - - ValueList Operands; - for (unsigned j = 0; j < VL.size(); ++j) { - Operands.push_back(cast<Instruction>(VL[j])->getOperand(0)); - MemBarrierIgnoreList.insert(VL[j]); - } - - int Cost = getTreeCost_rec(Operands, Depth + 1); - if (Cost == MAX_COST) - return MAX_COST; - - int TotalCost = StoreCost + Cost; - return TotalCost + ExternalUserExtractCost; - } - default: - // Unable to vectorize unknown instructions. - return getGatherCost(VecTy); - } -} - -int FuncSLP::getTreeCost(ArrayRef<Value *> VL) { - // Get rid of the list of stores that were removed, and from the - // lists of instructions with multiple users. - MemBarrierIgnoreList.clear(); - LaneMap.clear(); - MultiUserVals.clear(); - ExtractedLane.clear(); - MustGather.clear(); - VisitedPHIs.clear(); - - if (!getSameBlock(VL)) - return MAX_COST; - - // Find the location of the last root. - int LastRootIndex = getLastIndex(VL); - int FirstUserIndex = getFirstUserIndex(VL); - - // Don't vectorize if there are users of the tree roots inside the tree - // itself. - if (LastRootIndex > FirstUserIndex) - return MAX_COST; - - // Scan the tree and find which value is used by which lane, and which values - // must be scalarized. - getTreeUses_rec(VL, 0); - - // Check that instructions with multiple users can be vectorized. Mark - // unsafe instructions. 
- for (MapVector<Instruction *, UseInfo>::iterator UI = MultiUserVals.begin(), - e = MultiUserVals.end(); UI != e; ++UI) { - Instruction *Scalar = UI->first; - - if (MustGather.count(Scalar)) - continue; - - assert(LaneMap.count(Scalar) && "Unknown scalar"); - int ScalarLane = LaneMap[Scalar]; - - bool ExternalUse = false; - // Check that all of the users of this instr are within the tree. - for (Value::use_iterator Usr = Scalar->use_begin(), - UE = Scalar->use_end(); Usr != UE; ++Usr) { - // If this user is within the tree, make sure it is from the same lane. - // Notice that we have both in-tree and out-of-tree users. - if (LaneMap.count(*Usr)) { - if (LaneMap[*Usr] != ScalarLane) { - DEBUG(dbgs() << "SLP: Adding to MustExtract " - "because of an out-of-lane usage.\n"); - MustGather.insert(Scalar); - break; - } - continue; - } - - // We have an out-of-tree user. Check if we can place an 'extract'. - Instruction *User = cast<Instruction>(*Usr); - // We care about the order only if the user is in the same block. - if (User->getParent() == Scalar->getParent()) { - int LastLoc = UI->second.LastIndex; - BlockNumbering &BN = BlocksNumbers[User->getParent()]; - int UserIdx = BN.getIndex(User); - if (UserIdx <= LastLoc) { - DEBUG(dbgs() << "SLP: Adding to MustExtract because of an external " - "user that we can't schedule.\n"); - MustGather.insert(Scalar); - break; - } - } - // We have an external user. - ExternalUse = true; - } - - if (ExternalUse) { - // Items that are left in MultiUserVals are to be extracted. - // ExtractLane is used for the lookup. - ExtractedLane.insert(Scalar); - } - - } - - // Now calculate the cost of vectorizing the tree. - return getTreeCost_rec(VL, 0); -} -bool FuncSLP::vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold) { - unsigned ChainLen = Chain.size(); - DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen - << "\n"); - Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType(); - unsigned Sz = DL->getTypeSizeInBits(StoreTy); - unsigned VF = MinVecRegSize / Sz; - - if (!isPowerOf2_32(Sz) || VF < 2) - return false; - - bool Changed = false; - // Look for profitable vectorizable trees at all offsets, starting at zero. - for (unsigned i = 0, e = ChainLen; i < e; ++i) { - if (i + VF > e) - break; - DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i - << "\n"); - ArrayRef<Value *> Operands = Chain.slice(i, VF); - - int Cost = getTreeCost(Operands); - if (Cost == FuncSLP::MAX_COST) - continue; - DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); - if (Cost < CostThreshold) { - DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); - vectorizeTree(Operands); - - // Remove the scalar stores. - for (int j = 0, e = VF; j < e; ++j) - cast<Instruction>(Operands[j])->eraseFromParent(); - - // Move to the next bundle. - i += VF - 1; - Changed = true; - } - } - - if (Changed || ChainLen > VF) - return Changed; - - // Handle short chains. This helps us catch types such as <3 x float> that - // are smaller than vector size. - int Cost = getTreeCost(Chain); - if (Cost == FuncSLP::MAX_COST) - return false; - if (Cost < CostThreshold) { - DEBUG(dbgs() << "SLP: Found store chain cost = " << Cost - << " for size = " << ChainLen << "\n"); - vectorizeTree(Chain); - - // Remove all of the scalar stores. 
- for (int i = 0, e = Chain.size(); i < e; ++i) - cast<Instruction>(Chain[i])->eraseFromParent(); - - return true; - } - - return false; -} - -bool FuncSLP::vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold) { - SetVector<Value *> Heads, Tails; - SmallDenseMap<Value *, Value *> ConsecutiveChain; - - // We may run into multiple chains that merge into a single chain. We mark the - // stores that we vectorized so that we don't visit the same store twice. - ValueSet VectorizedStores; - bool Changed = false; - - // Do a quadratic search on all of the given stores and find - // all of the pairs of loads that follow each other. - for (unsigned i = 0, e = Stores.size(); i < e; ++i) - for (unsigned j = 0; j < e; ++j) { - if (i == j) - continue; - - if (isConsecutiveAccess(Stores[i], Stores[j])) { - Tails.insert(Stores[j]); - Heads.insert(Stores[i]); - ConsecutiveChain[Stores[i]] = Stores[j]; - } - } - - // For stores that start but don't end a link in the chain: - for (SetVector<Value *>::iterator it = Heads.begin(), e = Heads.end(); - it != e; ++it) { - if (Tails.count(*it)) - continue; - - // We found a store instr that starts a chain. Now follow the chain and try - // to vectorize it. - ValueList Operands; - Value *I = *it; - // Collect the chain into a list. - while (Tails.count(I) || Heads.count(I)) { - if (VectorizedStores.count(I)) - break; - Operands.push_back(I); - // Move to the next value in the chain. - I = ConsecutiveChain[I]; - } - - bool Vectorized = vectorizeStoreChain(Operands, costThreshold); - - // Mark the vectorized stores so that we don't vectorize them again. - if (Vectorized) - VectorizedStores.insert(Operands.begin(), Operands.end()); - Changed |= Vectorized; - } - - return Changed; -} - -Value *FuncSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) { +Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) { Value *Vec = UndefValue::get(Ty); // Generate the 'InsertElement' instruction. 
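// Build the vector from undef by inserting one scalar per lane.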
for (unsigned i = 0; i < Ty->getNumElements(); ++i) { @@ -1103,282 +1011,292 @@ Value *FuncSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) { return Vec; } -Value *FuncSLP::vectorizeTree_rec(ArrayRef<Value *> VL) { - BuilderLocGuard Guard(Builder); +Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { + if (ScalarToTreeEntry.count(VL[0])) { + int Idx = ScalarToTreeEntry[VL[0]]; + TreeEntry *E = &VectorizableTree[Idx]; + if (E->isSame(VL)) + return vectorizeTree(E); + } Type *ScalarTy = VL[0]->getType(); if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) ScalarTy = SI->getValueOperand()->getType(); VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); - if (needToGatherAny(VL)) - return Gather(VL, VecTy); + return Gather(VL, VecTy); +} + +Value *BoUpSLP::vectorizeTree(TreeEntry *E) { + BuilderLocGuard Guard(Builder); - if (VectorizedValues.count(VL[0])) { - DEBUG(dbgs() << "SLP: Diamond merged at depth.\n"); - return VectorizedValues[VL[0]]; + if (E->VectorizedValue) { + DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); + return E->VectorizedValue; } - Instruction *VL0 = cast<Instruction>(VL[0]); - unsigned Opcode = VL0->getOpcode(); - assert(Opcode == getSameOpcode(VL) && "Invalid opcode"); + Type *ScalarTy = E->Scalars[0]->getType(); + if (StoreInst *SI = dyn_cast<StoreInst>(E->Scalars[0])) + ScalarTy = SI->getValueOperand()->getType(); + VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size()); - switch (Opcode) { - case Instruction::PHI: { - PHINode *PH = dyn_cast<PHINode>(VL0); - Builder.SetInsertPoint(PH->getParent()->getFirstInsertionPt()); - PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); - VectorizedValues[VL0] = NewPhi; + if (E->NeedToGather) { + return Gather(E->Scalars, VecTy); + } - for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { - ValueList Operands; - BasicBlock *IBB = PH->getIncomingBlock(i); + Instruction *VL0 = cast<Instruction>(E->Scalars[0]); + unsigned Opcode = VL0->getOpcode(); + assert(Opcode == getSameOpcode(E->Scalars) && "Invalid opcode"); - // Prepare the operand vector. - for (unsigned j = 0; j < VL.size(); ++j) - Operands.push_back(cast<PHINode>(VL[j])->getIncomingValueForBlock(IBB)); + switch (Opcode) { + case Instruction::PHI: { + PHINode *PH = dyn_cast<PHINode>(VL0); + Builder.SetInsertPoint(PH->getParent()->getFirstInsertionPt()); + PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); + E->VectorizedValue = NewPhi; + + for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { + ValueList Operands; + BasicBlock *IBB = PH->getIncomingBlock(i); + + // Prepare the operand vector. 
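+ // Collect the value that each scalar PHI receives from this predecessor.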
+ for (unsigned j = 0; j < E->Scalars.size(); ++j) + Operands.push_back(cast<PHINode>(E->Scalars[j])-> + getIncomingValueForBlock(IBB)); + + Builder.SetInsertPoint(IBB->getTerminator()); + Value *Vec = vectorizeTree(Operands); + NewPhi->addIncoming(Vec, IBB); + } - Builder.SetInsertPoint(IBB->getTerminator()); - Value *Vec = vectorizeTree_rec(Operands); - NewPhi->addIncoming(Vec, IBB); + assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() && + "Invalid number of incoming values"); + return NewPhi; } - assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() && - "Invalid number of incoming values"); - return NewPhi; - } - - case Instruction::ExtractElement: { - if (CanReuseExtract(VL, VL.size(), VecTy)) - return VL0->getOperand(0); - return Gather(VL, VecTy); - } - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: { - ValueList INVL; - for (int i = 0, e = VL.size(); i < e; ++i) - INVL.push_back(cast<Instruction>(VL[i])->getOperand(0)); - - Builder.SetInsertPoint(getLastInstruction(VL)); - Value *InVec = vectorizeTree_rec(INVL); - CastInst *CI = dyn_cast<CastInst>(VL0); - Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); - VectorizedValues[VL0] = V; - return V; - } - case Instruction::FCmp: - case Instruction::ICmp: { - // Check that all of the compares have the same predicate. - CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate(); - for (unsigned i = 1, e = VL.size(); i < e; ++i) { - CmpInst *Cmp = cast<CmpInst>(VL[i]); - if (Cmp->getPredicate() != P0) - return Gather(VL, VecTy); + case Instruction::ExtractElement: { + if (CanReuseExtract(E->Scalars)) { + Value *V = VL0->getOperand(0); + E->VectorizedValue = V; + return V; + } + return Gather(E->Scalars, VecTy); } - - ValueList LHSV, RHSV; - for (int i = 0, e = VL.size(); i < e; ++i) { - LHSV.push_back(cast<Instruction>(VL[i])->getOperand(0)); - RHSV.push_back(cast<Instruction>(VL[i])->getOperand(1)); + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + ValueList INVL; + for (int i = 0, e = E->Scalars.size(); i < e; ++i) + INVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); + + Builder.SetInsertPoint(getLastInstruction(E->Scalars)); + Value *InVec = vectorizeTree(INVL); + CastInst *CI = dyn_cast<CastInst>(VL0); + Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); + E->VectorizedValue = V; + return V; } + case Instruction::FCmp: + case Instruction::ICmp: { + ValueList LHSV, RHSV; + for (int i = 0, e = E->Scalars.size(); i < e; ++i) { + LHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); + RHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); + } - Builder.SetInsertPoint(getLastInstruction(VL)); - Value *L = vectorizeTree_rec(LHSV); - Value *R = vectorizeTree_rec(RHSV); - Value *V; + Builder.SetInsertPoint(getLastInstruction(E->Scalars)); + Value *L = vectorizeTree(LHSV); + Value *R = vectorizeTree(RHSV); + Value *V; - if (Opcode == Instruction::FCmp) - V = Builder.CreateFCmp(P0, L, 
R);
-    else
-      V = Builder.CreateICmp(P0, L, R);
+    CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate();
+    if (Opcode == Instruction::FCmp)
+      V = Builder.CreateFCmp(P0, L, R);
+    else
+      V = Builder.CreateICmp(P0, L, R);

-    VectorizedValues[VL0] = V;
-    return V;
-  }
-  case Instruction::Select: {
-    ValueList TrueVec, FalseVec, CondVec;
-    for (int i = 0, e = VL.size(); i < e; ++i) {
-      CondVec.push_back(cast<Instruction>(VL[i])->getOperand(0));
-      TrueVec.push_back(cast<Instruction>(VL[i])->getOperand(1));
-      FalseVec.push_back(cast<Instruction>(VL[i])->getOperand(2));
+    E->VectorizedValue = V;
+    return V;
   }
+  case Instruction::Select: {
+    ValueList TrueVec, FalseVec, CondVec;
+    for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
+      CondVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
+      TrueVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
+      FalseVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(2));
+    }

-    Builder.SetInsertPoint(getLastInstruction(VL));
-    Value *True = vectorizeTree_rec(TrueVec);
-    Value *False = vectorizeTree_rec(FalseVec);
-    Value *Cond = vectorizeTree_rec(CondVec);
-    Value *V = Builder.CreateSelect(Cond, True, False);
-    VectorizedValues[VL0] = V;
-    return V;
-  }
-  case Instruction::Add:
-  case Instruction::FAdd:
-  case Instruction::Sub:
-  case Instruction::FSub:
-  case Instruction::Mul:
-  case Instruction::FMul:
-  case Instruction::UDiv:
-  case Instruction::SDiv:
-  case Instruction::FDiv:
-  case Instruction::URem:
-  case Instruction::SRem:
-  case Instruction::FRem:
-  case Instruction::Shl:
-  case Instruction::LShr:
-  case Instruction::AShr:
-  case Instruction::And:
-  case Instruction::Or:
-  case Instruction::Xor: {
-    ValueList LHSVL, RHSVL;
-    for (int i = 0, e = VL.size(); i < e; ++i) {
-      LHSVL.push_back(cast<Instruction>(VL[i])->getOperand(0));
-      RHSVL.push_back(cast<Instruction>(VL[i])->getOperand(1));
+    Builder.SetInsertPoint(getLastInstruction(E->Scalars));
+    Value *Cond = vectorizeTree(CondVec);
+    Value *True = vectorizeTree(TrueVec);
+    Value *False = vectorizeTree(FalseVec);
+    Value *V = Builder.CreateSelect(Cond, True, False);
+    E->VectorizedValue = V;
+    return V;
   }
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    ValueList LHSVL, RHSVL;
+    for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
+      LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
+      RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
+    }

-    Builder.SetInsertPoint(getLastInstruction(VL));
-    Value *LHS = vectorizeTree_rec(LHSVL);
-    Value *RHS = vectorizeTree_rec(RHSVL);
+    Builder.SetInsertPoint(getLastInstruction(E->Scalars));
+    Value *LHS = vectorizeTree(LHSVL);
+    Value *RHS = vectorizeTree(RHSVL);

-    if (LHS == RHS) {
-      assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order");
-    }
+    if (LHS == RHS && isa<Instruction>(LHS)) {
+      assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order");
+    }

-    BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
-    Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
-    VectorizedValues[VL0] = V;
-    return V;
-  }
-  case Instruction::Load: {
-    // Check if all of the loads are consecutive.
-    for (unsigned i = 1, e = VL.size(); i < e; ++i)
-      if (!isConsecutiveAccess(VL[i - 1], VL[i]))
-        return Gather(VL, VecTy);
-
-    // Loads are inserted at the head of the tree because we don't want to
-    // sink them all the way down past store instructions.
-    Builder.SetInsertPoint(getLastInstruction(VL));
-    LoadInst *LI = cast<LoadInst>(VL0);
-    Value *VecPtr =
-        Builder.CreateBitCast(LI->getPointerOperand(), VecTy->getPointerTo());
-    unsigned Alignment = LI->getAlignment();
-    LI = Builder.CreateLoad(VecPtr);
-    LI->setAlignment(Alignment);
-
-    VectorizedValues[VL0] = LI;
-    return LI;
+    BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
+    Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
+    E->VectorizedValue = V;
+    return V;
+  }
+  case Instruction::Load: {
+    // Loads are inserted at the head of the tree because we don't want to
+    // sink them all the way down past store instructions.
+    Builder.SetInsertPoint(getLastInstruction(E->Scalars));
+    LoadInst *LI = cast<LoadInst>(VL0);
+    Value *VecPtr =
+        Builder.CreateBitCast(LI->getPointerOperand(), VecTy->getPointerTo());
+    unsigned Alignment = LI->getAlignment();
+    LI = Builder.CreateLoad(VecPtr);
+    LI->setAlignment(Alignment);
+    E->VectorizedValue = LI;
+    return LI;
+  }
+  case Instruction::Store: {
+    StoreInst *SI = cast<StoreInst>(VL0);
+    unsigned Alignment = SI->getAlignment();
+
+    ValueList ValueOp;
+    for (int i = 0, e = E->Scalars.size(); i < e; ++i)
+      ValueOp.push_back(cast<StoreInst>(E->Scalars[i])->getValueOperand());
+
+    Builder.SetInsertPoint(getLastInstruction(E->Scalars));
+    Value *VecValue = vectorizeTree(ValueOp);
+    Value *VecPtr =
+        Builder.CreateBitCast(SI->getPointerOperand(), VecTy->getPointerTo());
+    StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
+    S->setAlignment(Alignment);
+    E->VectorizedValue = S;
+    return S;
+  }
+  default:
+    llvm_unreachable("unknown inst");
   }
-  case Instruction::Store: {
-    StoreInst *SI = cast<StoreInst>(VL0);
-    unsigned Alignment = SI->getAlignment();
+  return 0;
+}

-    ValueList ValueOp;
-    for (int i = 0, e = VL.size(); i < e; ++i)
-      ValueOp.push_back(cast<StoreInst>(VL[i])->getValueOperand());
+void BoUpSLP::vectorizeTree() {
+  vectorizeTree(&VectorizableTree[0]);

-    Value *VecValue = vectorizeTree_rec(ValueOp);
+  // For each vectorized value:
+  for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
+    TreeEntry *Entry = &VectorizableTree[EIdx];

-    Builder.SetInsertPoint(getLastInstruction(VL));
-    Value *VecPtr =
-        Builder.CreateBitCast(SI->getPointerOperand(), VecTy->getPointerTo());
-    Builder.CreateStore(VecValue, VecPtr)->setAlignment(Alignment);
-    return 0;
-  }
-  default:
-    return Gather(VL, VecTy);
-  }
-}
+    // For each lane:
+    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
+      Value *Scalar = Entry->Scalars[Lane];

-Value *FuncSLP::vectorizeTree(ArrayRef<Value *> VL) {
-  Builder.SetInsertPoint(getLastInstruction(VL));
-  Value *V = vectorizeTree_rec(VL);
+      // No need to handle users of gathered values.
+      if (Entry->NeedToGather)
+        continue;

-  DEBUG(dbgs() << "SLP: Placing 'extracts'\n");
-  for (SetVector<Instruction*>::iterator it = ExtractedLane.begin(), e =
-       ExtractedLane.end(); it != e; ++it) {
-    Instruction *Scalar = *it;
-    DEBUG(dbgs() << "SLP: Looking at " << *Scalar);
+      Value *Vec = Entry->VectorizedValue;
+      assert(Vec && "Can't find vectorizable value");

-    if (!Scalar)
-      continue;
+      SmallVector<User*, 16> Users(Scalar->use_begin(), Scalar->use_end());

-    Instruction *Loc = 0;
+      for (SmallVector<User*, 16>::iterator User = Users.begin(),
+           UE = Users.end(); User != UE; ++User) {
+        DEBUG(dbgs() << "SLP: \tupdating user " << **User << ".\n");

-    assert(MultiUserVals.count(Scalar) && "Can't find the lane to extract");
-    Instruction *Leader = MultiUserVals[Scalar].Leader;
+        bool Gathered = MustGather.count(*User);

-    // This value is gathered so we don't need to extract from anywhere.
-    if (!VectorizedValues.count(Leader))
-      continue;
+        // Skip in-tree scalars that become vectors.
+        if (ScalarToTreeEntry.count(*User) && !Gathered) {
+          DEBUG(dbgs() << "SLP: \tUser will be removed soon:" <<
+                **User << ".\n");
+          int Idx = ScalarToTreeEntry[*User]; (void) Idx;
+          assert(!VectorizableTree[Idx].NeedToGather && "bad state ?");
+          continue;
+        }

-    Value *Vec = VectorizedValues[Leader];
-    if (PHINode *PN = dyn_cast<PHINode>(Vec)) {
-      Loc = PN->getParent()->getFirstInsertionPt();
-    } else {
-      Instruction *I = cast<Instruction>(Vec);
-      BasicBlock::iterator L = *I;
-      Loc = ++L;
-    }
+        if (!isa<Instruction>(*User))
+          continue;

-    Builder.SetInsertPoint(Loc);
-    assert(LaneMap.count(Scalar) && "Can't find the extracted lane.");
-    int Lane = LaneMap[Scalar];
-    Value *Idx = Builder.getInt32(Lane);
-    Value *Extract = Builder.CreateExtractElement(Vec, Idx);
+        // Generate extracts for out-of-tree users.
+        // Find the insertion point for the extractelement lane.
+        Instruction *Loc = 0;
+        if (PHINode *PN = dyn_cast<PHINode>(Vec)) {
+          Loc = PN->getParent()->getFirstInsertionPt();
+        } else if (Instruction *Iv = dyn_cast<Instruction>(Vec)){
+          Loc = ++((BasicBlock::iterator)*Iv);
+        } else {
+          Loc = F->getEntryBlock().begin();
+        }

-    bool Replaced = false;;
-    for (Value::use_iterator U = Scalar->use_begin(), UE = Scalar->use_end();
-         U != UE; ++U) {
-      Instruction *UI = cast<Instruction>(*U);
-      // No need to replace instructions that are inside our lane map.
-      if (LaneMap.count(UI))
-        continue;
+        Builder.SetInsertPoint(Loc);
+        Value *Ex = Builder.CreateExtractElement(Vec, Builder.getInt32(Lane));
+        (*User)->replaceUsesOfWith(Scalar, Ex);
+        DEBUG(dbgs() << "SLP: \tupdated user:" << **User << ".\n");
+      }

-      UI->replaceUsesOfWith(Scalar ,Extract);
-      Replaced = true;
+      Type *Ty = Scalar->getType();
+      if (!Ty->isVoidTy()) {
+        for (Value::use_iterator User = Scalar->use_begin(), UE = Scalar->use_end();
+             User != UE; ++User) {
+          DEBUG(dbgs() << "SLP: \tvalidating user:" << **User << ".\n");
+          assert(!MustGather.count(*User) &&
+                 "Replacing gathered value with undef");
+          assert(ScalarToTreeEntry.count(*User) &&
+                 "Replacing out-of-tree value with undef");
+        }
+        Value *Undef = UndefValue::get(Ty);
+        Scalar->replaceAllUsesWith(Undef);
+      }
+      DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
+      cast<Instruction>(Scalar)->eraseFromParent();
     }
-    assert(Replaced && "Must replace at least one outside user");
-    (void)Replaced;
   }

-  // We moved some instructions around. We have to number them again
-  // before we can do any analysis.
-  forgetNumbering();
-
-  // Clear the state.
-  MustGather.clear();
-  VisitedPHIs.clear();
-  VectorizedValues.clear();
-  MemBarrierIgnoreList.clear();
-  return V;
-}
-
-Value *FuncSLP::vectorizeArith(ArrayRef<Value *> Operands) {
-  Instruction *LastInst = getLastInstruction(Operands);
-  Value *Vec = vectorizeTree(Operands);
-  // After vectorizing the operands we need to generate extractelement
-  // instructions and replace all of the uses of the scalar values with
-  // the values that we extracted from the vectorized tree.
-  Builder.SetInsertPoint(LastInst);
-  for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
-    Value *S = Builder.CreateExtractElement(Vec, Builder.getInt32(i));
-    Operands[i]->replaceAllUsesWith(S);
+  for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) {
+    BlocksNumbers[it].forget();
   }
-
-  forgetNumbering();
-  return Vec;
 }

-void FuncSLP::optimizeGatherSequence() {
+void BoUpSLP::optimizeGatherSequence() {
+  DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
+        << " gather sequence instructions.\n");
   // LICM InsertElementInst sequences.
   for (SetVector<Instruction *>::iterator it = GatherSeq.begin(),
        e = GatherSeq.end(); it != e; ++it) {
@@ -1449,8 +1367,6 @@ void FuncSLP::optimizeGatherSequence() {
     assert((*v)->getNumUses() == 0 && "Can't remove instructions with uses");
     (*v)->eraseFromParent();
   }
-
-  forgetNumbering();
 }

 /// The SLPVectorizer Pass.
@@ -1492,7 +1408,7 @@ struct SLPVectorizer : public FunctionPass {

     // Use the bottom up slp vectorizer to construct chains that start with
     // the store instructions.
-    FuncSLP R(&F, SE, DL, TTI, AA, LI, DT);
+    BoUpSLP R(&F, SE, DL, TTI, AA, LI, DT);

     // Scan the blocks in the function in post order.
     for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()),
@@ -1536,31 +1452,146 @@ private:
   /// object. We sort the stores to their base objects to reduce the cost of the
   /// quadratic search on the stores. TODO: We can further reduce this cost
   /// if we flush the chain creation every time we run into a memory barrier.
-  unsigned collectStores(BasicBlock *BB, FuncSLP &R);
+  unsigned collectStores(BasicBlock *BB, BoUpSLP &R);

   /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
-  bool tryToVectorizePair(Value *A, Value *B, FuncSLP &R);
+  bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);

   /// \brief Try to vectorize a list of operands. If \p NeedExtracts is true
   /// then we calculate the cost of extracting the scalars from the vector.
   /// \returns true if a value was vectorized.
-  bool tryToVectorizeList(ArrayRef<Value *> VL, FuncSLP &R, bool NeedExtracts);
+  bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, bool NeedExtracts);

   /// \brief Try to vectorize a chain that may start at the operands of \p V.
-  bool tryToVectorize(BinaryOperator *V, FuncSLP &R);
+  bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);

   /// \brief Vectorize the stores that were collected in StoreRefs.
-  bool vectorizeStoreChains(FuncSLP &R);
+  bool vectorizeStoreChains(BoUpSLP &R);

   /// \brief Scan the basic block and look for patterns that are likely to start
   /// a vectorization chain.
-  bool vectorizeChainsInBlock(BasicBlock *BB, FuncSLP &R);
+  bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
+
+  bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
+                           BoUpSLP &R);
+  bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
+                       BoUpSLP &R);
 private:
   StoreListMap StoreRefs;
 };

-unsigned SLPVectorizer::collectStores(BasicBlock *BB, FuncSLP &R) {
+bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
+                                        int CostThreshold, BoUpSLP &R) {
+  unsigned ChainLen = Chain.size();
+  DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
+        << "\n");
+  Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
+  unsigned Sz = DL->getTypeSizeInBits(StoreTy);
+  unsigned VF = MinVecRegSize / Sz;
+
+  if (!isPowerOf2_32(Sz) || VF < 2)
+    return false;
+
+  bool Changed = false;
+  // Look for profitable vectorizable trees at all offsets, starting at zero.
+  for (unsigned i = 0, e = ChainLen; i < e; ++i) {
+    if (i + VF > e)
+      break;
+    DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
+          << "\n");
+    ArrayRef<Value *> Operands = Chain.slice(i, VF);
+
+    R.buildTree(Operands);
+
+    int Cost = R.getTreeCost();
+
+    DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
+    if (Cost < CostThreshold) {
+      DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
+      R.vectorizeTree();
+
+      // Move to the next bundle.
+      i += VF - 1;
+      Changed = true;
+    }
+  }
+
+  if (Changed || ChainLen > VF)
+    return Changed;
+
+  // Handle short chains. This helps us catch types such as <3 x float> that
+  // are smaller than vector size.
+  R.buildTree(Chain);
+
+  int Cost = R.getTreeCost();
+
+  if (Cost < CostThreshold) {
+    DEBUG(dbgs() << "SLP: Found store chain cost = " << Cost
+          << " for size = " << ChainLen << "\n");
+    R.vectorizeTree();
+    return true;
+  }
+
+  return false;
+}
+
+bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
+                                    int costThreshold, BoUpSLP &R) {
+  SetVector<Value *> Heads, Tails;
+  SmallDenseMap<Value *, Value *> ConsecutiveChain;
+
+  // We may run into multiple chains that merge into a single chain. We mark the
+  // stores that we vectorized so that we don't visit the same store twice.
+  BoUpSLP::ValueSet VectorizedStores;
+  bool Changed = false;
+
+  // Do a quadratic search on all of the given stores and find
+  // all of the pairs of stores that follow each other.
+  for (unsigned i = 0, e = Stores.size(); i < e; ++i)
+    for (unsigned j = 0; j < e; ++j) {
+      if (i == j)
+        continue;
+
+      if (R.isConsecutiveAccess(Stores[i], Stores[j])) {
+        Tails.insert(Stores[j]);
+        Heads.insert(Stores[i]);
+        ConsecutiveChain[Stores[i]] = Stores[j];
+      }
+    }
+
+  // For stores that start but don't end a link in the chain:
+  for (SetVector<Value *>::iterator it = Heads.begin(), e = Heads.end();
+       it != e; ++it) {
+    if (Tails.count(*it))
+      continue;
+
+    // We found a store instruction that starts a chain. Now follow the chain
+    // and try to vectorize it.
+    BoUpSLP::ValueList Operands;
+    Value *I = *it;
+    // Collect the chain into a list.
+    while (Tails.count(I) || Heads.count(I)) {
+      if (VectorizedStores.count(I))
+        break;
+      Operands.push_back(I);
+      // Move to the next value in the chain.
+      I = ConsecutiveChain[I];
+    }
+
+    bool Vectorized = vectorizeStoreChain(Operands, costThreshold, R);
+
+    // Mark the vectorized stores so that we don't vectorize them again.
+    if (Vectorized)
+      VectorizedStores.insert(Operands.begin(), Operands.end());
+    Changed |= Vectorized;
+  }
+
+  return Changed;
+}
+
+
+unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
   unsigned count = 0;
   StoreRefs.clear();
   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
@@ -1585,14 +1616,14 @@ unsigned SLPVectorizer::collectStores(BasicBlock *BB, FuncSLP &R) {
   return count;
 }

-bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, FuncSLP &R) {
+bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
   if (!A || !B)
     return false;
   Value *VL[] = { A, B };
   return tryToVectorizeList(VL, R, true);
 }

-bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, FuncSLP &R,
+bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                        bool NeedExtracts) {
   if (VL.size() < 2)
     return false;
@@ -1615,9 +1646,8 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, FuncSLP &R,
     return 0;
   }

-  int Cost = R.getTreeCost(VL);
-  if (Cost == FuncSLP::MAX_COST)
-    return false;
+  R.buildTree(VL);
+  int Cost = R.getTreeCost();

   int ExtrCost = NeedExtracts ? R.getGatherCost(VL) : 0;
   DEBUG(dbgs() << "SLP: Cost of pair:" << Cost
@@ -1625,11 +1655,11 @@
   if ((Cost + ExtrCost) >= -SLPCostThreshold)
     return false;
   DEBUG(dbgs() << "SLP: Vectorizing pair.\n");
-  R.vectorizeArith(VL);
+  R.vectorizeTree();
   return true;
 }

-bool SLPVectorizer::tryToVectorize(BinaryOperator *V, FuncSLP &R) {
+bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
   if (!V)
     return false;
@@ -1669,7 +1699,7 @@ bool SLPVectorizer::tryToVectorize(BinaryOperator *V, FuncSLP &R) {
   return 0;
 }

-bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, FuncSLP &R) {
+bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
   bool Changed = false;
   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
     if (isa<DbgInfoIntrinsic>(it))
@@ -1737,7 +1767,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, FuncSLP &R) {
   return Changed;
 }

-bool SLPVectorizer::vectorizeStoreChains(FuncSLP &R) {
+bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
   bool Changed = false;
   // Attempt to sort and vectorize each of the store-groups.
for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end(); @@ -1748,7 +1778,7 @@ bool SLPVectorizer::vectorizeStoreChains(FuncSLP &R) { DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << it->second.size() << ".\n"); - Changed |= R.vectorizeStores(it->second, -SLPCostThreshold); + Changed |= vectorizeStores(it->second, -SLPCostThreshold, R); } return Changed; } diff --git a/test/Transforms/SLPVectorizer/X86/crash_7zip.ll b/test/Transforms/SLPVectorizer/X86/crash_7zip.ll new file mode 100644 index 0000000..51b1c08 --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/crash_7zip.ll @@ -0,0 +1,38 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +%struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334 = type { %struct._CLzmaProps.0.27.54.81.102.123.144.165.180.195.228.258.333, i16*, i8*, i8*, i32, i32, i64, i64, i32, i32, i32, [4 x i32], i32, i32, i32, i32, i32, [20 x i8] } +%struct._CLzmaProps.0.27.54.81.102.123.144.165.180.195.228.258.333 = type { i32, i32, i32, i32 } + +define fastcc void @LzmaDec_DecodeReal2(%struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334* %p) { +entry: + %range20.i = getelementptr inbounds %struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334* %p, i64 0, i32 4 + %code21.i = getelementptr inbounds %struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334* %p, i64 0, i32 5 + br label %do.body66.i + +do.body66.i: ; preds = %do.cond.i, %entry + %range.2.i = phi i32 [ %range.4.i, %do.cond.i ], [ undef, %entry ] + %code.2.i = phi i32 [ %code.4.i, %do.cond.i ], [ undef, %entry ] + %.range.2.i = select i1 undef, i32 undef, i32 %range.2.i + %.code.2.i = select i1 undef, i32 undef, i32 %code.2.i + br i1 undef, label %do.cond.i, label %if.else.i + +if.else.i: ; preds = %do.body66.i + %sub91.i = sub i32 %.range.2.i, undef + %sub92.i = sub i32 %.code.2.i, undef + br label %do.cond.i + +do.cond.i: ; preds = %if.else.i, %do.body66.i + %range.4.i = phi i32 [ %sub91.i, %if.else.i ], [ undef, %do.body66.i ] + %code.4.i = phi i32 [ %sub92.i, %if.else.i ], [ %.code.2.i, %do.body66.i ] + br i1 undef, label %do.body66.i, label %do.end1006.i + +do.end1006.i: ; preds = %do.cond.i + %.range.4.i = select i1 undef, i32 undef, i32 %range.4.i + %.code.4.i = select i1 undef, i32 undef, i32 %code.4.i + store i32 %.range.4.i, i32* %range20.i, align 4 + store i32 %.code.4.i, i32* %code21.i, align 4 + ret void +} diff --git a/test/Transforms/SLPVectorizer/X86/crash_bullet.ll b/test/Transforms/SLPVectorizer/X86/crash_bullet.ll new file mode 100644 index 0000000..565905d --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/crash_bullet.ll @@ -0,0 +1,38 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +%"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960" = type { i32, i32 } + +define void @_ZN23btGeneric6DofConstraint8getInfo1EPN17btTypedConstraint17btConstraintInfo1E(%"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960"* nocapture %info) { +entry: + br i1 undef, label %if.else, 
label %if.then + +if.then: ; preds = %entry + ret void + +if.else: ; preds = %entry + %m_numConstraintRows4 = getelementptr inbounds %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960"* %info, i64 0, i32 0 + %nub5 = getelementptr inbounds %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960"* %info, i64 0, i32 1 + br i1 undef, label %land.lhs.true.i.1, label %if.then7.1 + +land.lhs.true.i.1: ; preds = %if.else + br i1 undef, label %for.inc.1, label %if.then7.1 + +if.then7.1: ; preds = %land.lhs.true.i.1, %if.else + %inc.1 = add nsw i32 0, 1 + store i32 %inc.1, i32* %m_numConstraintRows4, align 4 + %dec.1 = add nsw i32 6, -1 + store i32 %dec.1, i32* %nub5, align 4 + br label %for.inc.1 + +for.inc.1: ; preds = %if.then7.1, %land.lhs.true.i.1 + %0 = phi i32 [ %dec.1, %if.then7.1 ], [ 6, %land.lhs.true.i.1 ] + %1 = phi i32 [ %inc.1, %if.then7.1 ], [ 0, %land.lhs.true.i.1 ] + %inc.2 = add nsw i32 %1, 1 + store i32 %inc.2, i32* %m_numConstraintRows4, align 4 + %dec.2 = add nsw i32 %0, -1 + store i32 %dec.2, i32* %nub5, align 4 + unreachable +} diff --git a/test/Transforms/SLPVectorizer/X86/crash_bullet2.ll b/test/Transforms/SLPVectorizer/X86/crash_bullet2.ll new file mode 100644 index 0000000..df026d1 --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/crash_bullet2.ll @@ -0,0 +1,38 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +%class.GIM_TRIANGLE_CALCULATION_CACHE.9.34.69.94.119.144.179.189.264.284.332 = type { float, [3 x %class.btVector3.5.30.65.90.115.140.175.185.260.280.330], [3 x %class.btVector3.5.30.65.90.115.140.175.185.260.280.330], %class.btVector4.7.32.67.92.117.142.177.187.262.282.331, %class.btVector4.7.32.67.92.117.142.177.187.262.282.331, %class.btVector3.5.30.65.90.115.140.175.185.260.280.330, %class.btVector3.5.30.65.90.115.140.175.185.260.280.330, %class.btVector3.5.30.65.90.115.140.175.185.260.280.330, %class.btVector3.5.30.65.90.115.140.175.185.260.280.330, [4 x float], float, float, [4 x float], float, float, [16 x %class.btVector3.5.30.65.90.115.140.175.185.260.280.330], [16 x %class.btVector3.5.30.65.90.115.140.175.185.260.280.330], [16 x %class.btVector3.5.30.65.90.115.140.175.185.260.280.330] } +%class.btVector3.5.30.65.90.115.140.175.185.260.280.330 = type { [4 x float] } +%class.btVector4.7.32.67.92.117.142.177.187.262.282.331 = type { %class.btVector3.5.30.65.90.115.140.175.185.260.280.330 } + +define void @_ZN30GIM_TRIANGLE_CALCULATION_CACHE18triangle_collisionERK9btVector3S2_S2_fS2_S2_S2_fR25GIM_TRIANGLE_CONTACT_DATA(%class.GIM_TRIANGLE_CALCULATION_CACHE.9.34.69.94.119.144.179.189.264.284.332* %this) { +entry: + %arrayidx26 = getelementptr inbounds %class.GIM_TRIANGLE_CALCULATION_CACHE.9.34.69.94.119.144.179.189.264.284.332* %this, i64 0, i32 2, i64 0, i32 0, i64 1 + %arrayidx36 = getelementptr inbounds %class.GIM_TRIANGLE_CALCULATION_CACHE.9.34.69.94.119.144.179.189.264.284.332* %this, i64 0, i32 2, i64 0, i32 0, i64 2 + %0 = load float* %arrayidx36, align 4 + %add587 = fadd float undef, undef + %sub600 = fsub float %add587, undef + store float %sub600, float* undef, align 4 + %sub613 = fsub float %add587, %sub600 + store float %sub613, float* %arrayidx26, align 4 + %add626 = fadd float %0, undef + %sub639 = fsub float %add626, undef + %sub652 = fsub float 
%add626, %sub639 + store float %sub652, float* %arrayidx36, align 4 + br i1 undef, label %if.else1609, label %if.then1595 + +if.then1595: ; preds = %entry + br i1 undef, label %return, label %for.body.lr.ph.i.i1702 + +for.body.lr.ph.i.i1702: ; preds = %if.then1595 + unreachable + +if.else1609: ; preds = %entry + unreachable + +return: ; preds = %if.then1595 + ret void +} + diff --git a/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll b/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll new file mode 100644 index 0000000..ce01590 --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll @@ -0,0 +1,40 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" +%"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731" = type { double*, double*, double*, double** } + +; Function Attrs: nounwind ssp uwtable +define void @_ZSt6uniqueISt15_Deque_iteratorIdRdPdEET_S4_S4_(%"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* %__first, %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* nocapture %__last) { +entry: + %_M_cur2.i.i = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* %__first, i64 0, i32 0 + %0 = load double** %_M_cur2.i.i, align 8 + %_M_first3.i.i = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* %__first, i64 0, i32 1 + %_M_cur2.i.i81 = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* %__last, i64 0, i32 0 + %1 = load double** %_M_cur2.i.i81, align 8 + %_M_first3.i.i83 = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* %__last, i64 0, i32 1 + %2 = load double** %_M_first3.i.i83, align 8 + br i1 undef, label %_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit, label %while.cond.i.preheader + +while.cond.i.preheader: ; preds = %entry + br label %while.cond.i + +while.cond.i: ; preds = %while.body.i, %while.cond.i.preheader + br i1 undef, label %_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit, label %while.body.i + +while.body.i: ; preds = %while.cond.i + br i1 undef, label %_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit, label %while.cond.i + +_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit: ; preds = %while.body.i, %while.cond.i, %entry + %3 = phi double* [ %2, %entry ], [ %2, %while.cond.i ], [ undef, %while.body.i ] + %4 = phi double* [ %0, %entry ], [ %1, %while.cond.i ], [ undef, %while.body.i ] + store double* %4, double** %_M_cur2.i.i, align 8 + store double* %3, double** %_M_first3.i.i, align 8 + br i1 undef, label %if.then.i55, label %while.cond + +if.then.i55: ; preds = %_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit + br label %while.cond + +while.cond: ; preds = %while.cond, %if.then.i55, %_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit + br label %while.cond +} diff --git a/test/Transforms/SLPVectorizer/X86/crash_flop7.ll b/test/Transforms/SLPVectorizer/X86/crash_flop7.ll new file mode 100644 index 0000000..e11be48 --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/crash_flop7.ll @@ -0,0 +1,46 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 + +target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +; Function Attrs: nounwind ssp uwtable +define void @main() #0 { +entry: + br i1 undef, label %while.body, label %while.end + +while.body: ; preds = %entry + unreachable + +while.end: ; preds = %entry + br i1 undef, label %for.end80, label %for.body75.lr.ph + +for.body75.lr.ph: ; preds = %while.end + br label %for.body75 + +for.body75: ; preds = %for.body75, %for.body75.lr.ph + br label %for.body75 + +for.end80: ; preds = %while.end + br i1 undef, label %for.end300, label %for.body267.lr.ph + +for.body267.lr.ph: ; preds = %for.end80 + br label %for.body267 + +for.body267: ; preds = %for.body267, %for.body267.lr.ph + %s.71010 = phi double [ 0.000000e+00, %for.body267.lr.ph ], [ %add297, %for.body267 ] + %mul269 = fmul double undef, undef + %mul270 = fmul double %mul269, %mul269 + %add282 = fadd double undef, undef + %mul283 = fmul double %mul269, %add282 + %add293 = fadd double undef, undef + %mul294 = fmul double %mul270, %add293 + %add295 = fadd double undef, %mul294 + %div296 = fdiv double %mul283, %add295 + %add297 = fadd double %s.71010, %div296 + br i1 undef, label %for.body267, label %for.end300 + +for.end300: ; preds = %for.body267, %for.end80 + unreachable +} + +attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/test/Transforms/SLPVectorizer/X86/crash_lame.ll b/test/Transforms/SLPVectorizer/X86/crash_lame.ll new file mode 100644 index 0000000..cfc3fa3 --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/crash_lame.ll @@ -0,0 +1,24 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +; Function Attrs: nounwind ssp uwtable +define fastcc void @dct36(double* %inbuf) #0 { +entry: + %arrayidx41 = getelementptr inbounds double* %inbuf, i64 2 + %arrayidx44 = getelementptr inbounds double* %inbuf, i64 1 + %0 = load double* %arrayidx44, align 8, !tbaa !0 + %add46 = fadd double %0, undef + store double %add46, double* %arrayidx41, align 8, !tbaa !0 + %1 = load double* %inbuf, align 8, !tbaa !0 + %add49 = fadd double %1, %0 + store double %add49, double* %arrayidx44, align 8, !tbaa !0 + ret void +} + +attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!0 = metadata !{metadata !"double", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} diff --git a/test/Transforms/SLPVectorizer/X86/crash_lencod.ll b/test/Transforms/SLPVectorizer/X86/crash_lencod.ll new file mode 100644 index 0000000..b35a5d7 --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/crash_lencod.ll @@ -0,0 +1,66 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 + +target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +; Function Attrs: nounwind ssp uwtable +define void @RCModelEstimator() { +entry: + br i1 undef, label %for.body.lr.ph, label %for.end.thread + +for.end.thread: ; preds = %entry + unreachable + +for.body.lr.ph: ; preds = %entry + br i1 undef, label %for.end, label %for.body + +for.body: ; preds = %for.body, %for.body.lr.ph + br i1 undef, label %for.end, label %for.body + +for.end: ; preds = %for.body, %for.body.lr.ph + br i1 undef, label %for.body3, label %if.end103 + +for.cond14.preheader: ; preds = %for.inc11 + br i1 undef, label %for.body16.lr.ph, label %if.end103 + +for.body16.lr.ph: ; preds = %for.cond14.preheader + br label %for.body16 + +for.body3: ; preds = %for.inc11, %for.end + br i1 undef, label %if.then7, label %for.inc11 + +if.then7: ; preds = %for.body3 + br label %for.inc11 + +for.inc11: ; preds = %if.then7, %for.body3 + br i1 false, label %for.cond14.preheader, label %for.body3 + +for.body16: ; preds = %for.body16, %for.body16.lr.ph + br i1 undef, label %for.end39, label %for.body16 + +for.end39: ; preds = %for.body16 + br i1 undef, label %if.end103, label %for.cond45.preheader + +for.cond45.preheader: ; preds = %for.end39 + br i1 undef, label %if.then88, label %if.else + +if.then88: ; preds = %for.cond45.preheader + %mul89 = fmul double 0.000000e+00, 0.000000e+00 + %mul90 = fmul double 0.000000e+00, 0.000000e+00 + %sub91 = fsub double %mul89, %mul90 + %div92 = fdiv double %sub91, undef + %mul94 = fmul double 0.000000e+00, 0.000000e+00 + %mul95 = fmul double 0.000000e+00, 0.000000e+00 + %sub96 = fsub double %mul94, %mul95 + %div97 = fdiv double %sub96, undef + br label %if.end103 + +if.else: ; preds = %for.cond45.preheader + br label %if.end103 + +if.end103: ; preds = %if.else, %if.then88, %for.end39, %for.cond14.preheader, %for.end + %0 = phi double [ 0.000000e+00, %for.end39 ], [ %div97, %if.then88 ], [ 0.000000e+00, %if.else ], [ 0.000000e+00, %for.cond14.preheader ], [ 0.000000e+00, %for.end ] + %1 = phi double [ undef, %for.end39 ], [ %div92, %if.then88 ], [ undef, %if.else ], [ 0.000000e+00, %for.cond14.preheader ], [ 0.000000e+00, %for.end ] + ret void +} + diff --git a/test/Transforms/SLPVectorizer/X86/crash_lencod2.ll b/test/Transforms/SLPVectorizer/X86/crash_lencod2.ll new file mode 100644 index 0000000..d1e719c --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/crash_lencod2.ll @@ -0,0 +1,23 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +; Function Attrs: nounwind ssp uwtable +define void @intrapred_luma() #0 { +entry: + %conv153 = trunc i32 undef to i16 + %arrayidx154 = getelementptr inbounds [13 x i16]* undef, i64 0, i64 12 + store i16 %conv153, i16* %arrayidx154, align 8, !tbaa !0 + %arrayidx155 = getelementptr inbounds [13 x i16]* undef, i64 0, i64 11 + store i16 %conv153, i16* %arrayidx155, align 2, !tbaa !0 + %arrayidx156 = getelementptr inbounds [13 x i16]* undef, i64 0, i64 10 + store i16 %conv153, i16* %arrayidx156, align 4, !tbaa !0 + ret void +} + +attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" 
"no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!0 = metadata !{metadata !"short", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} diff --git a/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll b/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll new file mode 100644 index 0000000..b3ca235 --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll @@ -0,0 +1,53 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +define void @main() { +entry: + br label %for.body + +for.body: ; preds = %for.end44, %entry + br label %for.cond4.preheader + +for.cond4.preheader: ; preds = %if.then25, %for.body + br label %for.body6 + +for.body6: ; preds = %for.inc21, %for.cond4.preheader + br label %for.body12 + +for.body12: ; preds = %if.end, %for.body6 + %fZImg.069 = phi double [ undef, %for.body6 ], [ %add19, %if.end ] + %fZReal.068 = phi double [ undef, %for.body6 ], [ %add20, %if.end ] + %mul13 = fmul double %fZReal.068, %fZReal.068 + %mul14 = fmul double %fZImg.069, %fZImg.069 + %add15 = fadd double %mul13, %mul14 + %cmp16 = fcmp ogt double %add15, 4.000000e+00 + br i1 %cmp16, label %for.inc21, label %if.end + +if.end: ; preds = %for.body12 + %mul18 = fmul double undef, %fZImg.069 + %add19 = fadd double undef, %mul18 + %sub = fsub double %mul13, %mul14 + %add20 = fadd double undef, %sub + br i1 undef, label %for.body12, label %for.inc21 + +for.inc21: ; preds = %if.end, %for.body12 + br i1 undef, label %for.end23, label %for.body6 + +for.end23: ; preds = %for.inc21 + br i1 undef, label %if.then25, label %if.then26 + +if.then25: ; preds = %for.end23 + br i1 undef, label %for.end44, label %for.cond4.preheader + +if.then26: ; preds = %for.end23 + unreachable + +for.end44: ; preds = %if.then25 + br i1 undef, label %for.end48, label %for.body + +for.end48: ; preds = %for.end44 + ret void +} + diff --git a/test/Transforms/SLPVectorizer/X86/crash_rc4.ll b/test/Transforms/SLPVectorizer/X86/crash_rc4.ll new file mode 100644 index 0000000..2037470 --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/crash_rc4.ll @@ -0,0 +1,28 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +%struct.rc4_state.0.24 = type { i32, i32, [256 x i32] } + +define void @rc4_crypt(%struct.rc4_state.0.24* nocapture %s) { +entry: + %x1 = getelementptr inbounds %struct.rc4_state.0.24* %s, i64 0, i32 0 + %y2 = getelementptr inbounds %struct.rc4_state.0.24* %s, i64 0, i32 1 + br i1 undef, label %for.body, label %for.end + +for.body: ; preds = %for.body, %entry + %x.045 = phi i32 [ %conv4, %for.body ], [ undef, %entry ] + %conv4 = and i32 undef, 255 + %conv7 = and i32 undef, 255 + %idxprom842 = zext i32 %conv7 to i64 + br i1 undef, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %x.0.lcssa = phi i32 [ undef, %entry ], [ %conv4, %for.body ] + %y.0.lcssa = phi i32 [ undef, %entry ], [ %conv7, %for.body ] + store i32 %x.0.lcssa, i32* %x1, align 
4 + store i32 %y.0.lcssa, i32* %y2, align 4 + ret void +} + diff --git a/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll b/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll new file mode 100644 index 0000000..0541545 --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll @@ -0,0 +1,113 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +%struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171 = type { i32, i32, i32, i32, i32, i32, [8 x i8] } + +define void @SIM4() { +entry: + br i1 undef, label %return, label %lor.lhs.false + +lor.lhs.false: ; preds = %entry + br i1 undef, label %return, label %if.end + +if.end: ; preds = %lor.lhs.false + br i1 undef, label %for.end605, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %if.end + br label %for.body + +for.body: ; preds = %for.inc603, %for.body.lr.ph + br i1 undef, label %for.inc603, label %if.end12 + +if.end12: ; preds = %for.body + br i1 undef, label %land.lhs.true, label %land.lhs.true167 + +land.lhs.true: ; preds = %if.end12 + br i1 undef, label %if.then17, label %land.lhs.true167 + +if.then17: ; preds = %land.lhs.true + br i1 undef, label %if.end98, label %land.rhs.lr.ph + +land.rhs.lr.ph: ; preds = %if.then17 + unreachable + +if.end98: ; preds = %if.then17 + %from299 = getelementptr inbounds %struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171* undef, i64 0, i32 1 + br i1 undef, label %land.lhs.true167, label %if.then103 + +if.then103: ; preds = %if.end98 + %.sub100 = select i1 undef, i32 250, i32 undef + %mul114 = shl nsw i32 %.sub100, 2 + %from1115 = getelementptr inbounds %struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171* undef, i64 0, i32 0 + %cond125 = select i1 undef, i32 undef, i32 %mul114 + br label %for.cond.i + +for.cond.i: ; preds = %land.rhs.i874, %if.then103 + %row.0.i = phi i32 [ undef, %land.rhs.i874 ], [ %.sub100, %if.then103 ] + %col.0.i = phi i32 [ undef, %land.rhs.i874 ], [ %cond125, %if.then103 ] + br i1 undef, label %land.rhs.i874, label %for.end.i + +land.rhs.i874: ; preds = %for.cond.i + br i1 undef, label %for.cond.i, label %for.end.i + +for.end.i: ; preds = %land.rhs.i874, %for.cond.i + br i1 undef, label %if.then.i, label %if.end.i + +if.then.i: ; preds = %for.end.i + %add14.i = add nsw i32 %row.0.i, undef + %add15.i = add nsw i32 %col.0.i, undef + br label %extend_bw.exit + +if.end.i: ; preds = %for.end.i + %add16.i = add i32 %cond125, %.sub100 + %cmp26514.i = icmp slt i32 %add16.i, 0 + br i1 %cmp26514.i, label %for.end33.i, label %for.body28.lr.ph.i + +for.body28.lr.ph.i: ; preds = %if.end.i + br label %for.end33.i + +for.end33.i: ; preds = %for.body28.lr.ph.i, %if.end.i + br i1 undef, label %for.end58.i, label %for.body52.lr.ph.i + +for.body52.lr.ph.i: ; preds = %for.end33.i + br label %for.end58.i + +for.end58.i: ; preds = %for.body52.lr.ph.i, %for.end33.i + br label %while.cond260.i + +while.cond260.i: ; preds = %land.rhs263.i, %for.end58.i + br 
i1 undef, label %land.rhs263.i, label %while.end275.i + +land.rhs263.i: ; preds = %while.cond260.i + br i1 undef, label %while.cond260.i, label %while.end275.i + +while.end275.i: ; preds = %land.rhs263.i, %while.cond260.i + br label %extend_bw.exit + +extend_bw.exit: ; preds = %while.end275.i, %if.then.i + %add14.i1262 = phi i32 [ %add14.i, %if.then.i ], [ undef, %while.end275.i ] + %add15.i1261 = phi i32 [ %add15.i, %if.then.i ], [ undef, %while.end275.i ] + br i1 false, label %if.then157, label %land.lhs.true167 + +if.then157: ; preds = %extend_bw.exit + %add158 = add nsw i32 %add14.i1262, 1 + store i32 %add158, i32* %from299, align 4 + %add160 = add nsw i32 %add15.i1261, 1 + store i32 %add160, i32* %from1115, align 4 + br label %land.lhs.true167 + +land.lhs.true167: ; preds = %if.then157, %extend_bw.exit, %if.end98, %land.lhs.true, %if.end12 + unreachable + +for.inc603: ; preds = %for.body + br i1 undef, label %for.body, label %for.end605 + +for.end605: ; preds = %for.inc603, %if.end + unreachable + +return: ; preds = %lor.lhs.false, %entry + ret void +} + diff --git a/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll b/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll new file mode 100644 index 0000000..ac7e412 --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll @@ -0,0 +1,65 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +%struct.Ray.5.11.53.113.119.137.149.185.329.389.415 = type { %struct.Vec.0.6.48.108.114.132.144.180.324.384.414, %struct.Vec.0.6.48.108.114.132.144.180.324.384.414 } +%struct.Vec.0.6.48.108.114.132.144.180.324.384.414 = type { double, double, double } + +; Function Attrs: ssp uwtable +define void @main() #0 { +entry: + br i1 undef, label %cond.true, label %cond.end + +cond.true: ; preds = %entry + unreachable + +cond.end: ; preds = %entry + br label %invoke.cont + +invoke.cont: ; preds = %invoke.cont, %cond.end + br i1 undef, label %arrayctor.cont, label %invoke.cont + +arrayctor.cont: ; preds = %invoke.cont + %agg.tmp99208.sroa.0.0.idx = getelementptr inbounds %struct.Ray.5.11.53.113.119.137.149.185.329.389.415* undef, i64 0, i32 0, i32 0 + %agg.tmp99208.sroa.1.8.idx388 = getelementptr inbounds %struct.Ray.5.11.53.113.119.137.149.185.329.389.415* undef, i64 0, i32 0, i32 1 + %agg.tmp101211.sroa.0.0.idx = getelementptr inbounds %struct.Ray.5.11.53.113.119.137.149.185.329.389.415* undef, i64 0, i32 1, i32 0 + %agg.tmp101211.sroa.1.8.idx390 = getelementptr inbounds %struct.Ray.5.11.53.113.119.137.149.185.329.389.415* undef, i64 0, i32 1, i32 1 + br label %for.cond36.preheader + +for.cond36.preheader: ; preds = %_Z5clampd.exit.1, %arrayctor.cont + br i1 undef, label %for.body42.lr.ph.us, label %_Z5clampd.exit.1 + +cond.false51.us: ; preds = %for.body42.lr.ph.us + unreachable + +cond.true48.us: ; preds = %for.body42.lr.ph.us + br i1 undef, label %cond.true63.us, label %cond.false66.us + +cond.false66.us: ; preds = %cond.true48.us + %add.i276.us = fadd double 0.000000e+00, undef + %add.i264.us = fadd double %add.i276.us, 0.000000e+00 + %add4.i267.us = fadd double undef, 0xBFA5CC2D1960285F + %mul.i254.us = fmul double %add.i264.us, 1.400000e+02 + %mul2.i256.us = fmul double %add4.i267.us, 1.400000e+02 + %add.i243.us = fadd double %mul.i254.us, 5.000000e+01 + %add4.i246.us = fadd double 
%mul2.i256.us, 5.200000e+01 + %mul.i.i.us = fmul double undef, %add.i264.us + %mul2.i.i.us = fmul double undef, %add4.i267.us + store double %add.i243.us, double* %agg.tmp99208.sroa.0.0.idx, align 8 + store double %add4.i246.us, double* %agg.tmp99208.sroa.1.8.idx388, align 8 + store double %mul.i.i.us, double* %agg.tmp101211.sroa.0.0.idx, align 8 + store double %mul2.i.i.us, double* %agg.tmp101211.sroa.1.8.idx390, align 8 + unreachable + +cond.true63.us: ; preds = %cond.true48.us + unreachable + +for.body42.lr.ph.us: ; preds = %for.cond36.preheader + br i1 undef, label %cond.true48.us, label %cond.false51.us + +_Z5clampd.exit.1: ; preds = %for.cond36.preheader + br label %for.cond36.preheader +} + +attributes #0 = { ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/test/Transforms/SLPVectorizer/X86/crash_smallpt2.ll b/test/Transforms/SLPVectorizer/X86/crash_smallpt2.ll new file mode 100644 index 0000000..84c7b3a --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/crash_smallpt2.ll @@ -0,0 +1,46 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +%struct.Ray.5.11.53.95.137.191.197.203.239.257.263.269.275.281.287.293.383.437.443.455.461.599.601 = type { %struct.Vec.0.6.48.90.132.186.192.198.234.252.258.264.270.276.282.288.378.432.438.450.456.594.600, %struct.Vec.0.6.48.90.132.186.192.198.234.252.258.264.270.276.282.288.378.432.438.450.456.594.600 } +%struct.Vec.0.6.48.90.132.186.192.198.234.252.258.264.270.276.282.288.378.432.438.450.456.594.600 = type { double, double, double } + +; Function Attrs: ssp uwtable +define void @_Z8radianceRK3RayiPt() #0 { +entry: + br i1 undef, label %if.then78, label %if.then38 + +if.then38: ; preds = %entry + %mul.i.i790 = fmul double undef, undef + %mul3.i.i792 = fmul double undef, undef + %mul.i764 = fmul double undef, %mul3.i.i792 + %mul4.i767 = fmul double undef, undef + %sub.i768 = fsub double %mul.i764, %mul4.i767 + %mul6.i770 = fmul double undef, %mul.i.i790 + %mul9.i772 = fmul double undef, %mul3.i.i792 + %sub10.i773 = fsub double %mul6.i770, %mul9.i772 + %mul.i736 = fmul double undef, %sub.i768 + %mul2.i738 = fmul double undef, %sub10.i773 + %mul.i727 = fmul double undef, %mul.i736 + %mul2.i729 = fmul double undef, %mul2.i738 + %add.i716 = fadd double undef, %mul.i727 + %add4.i719 = fadd double undef, %mul2.i729 + %add.i695 = fadd double undef, %add.i716 + %add4.i698 = fadd double undef, %add4.i719 + %mul.i.i679 = fmul double undef, %add.i695 + %mul2.i.i680 = fmul double undef, %add4.i698 + %agg.tmp74663.sroa.0.0.idx = getelementptr inbounds %struct.Ray.5.11.53.95.137.191.197.203.239.257.263.269.275.281.287.293.383.437.443.455.461.599.601* undef, i64 0, i32 1, i32 0 + store double %mul.i.i679, double* %agg.tmp74663.sroa.0.0.idx, align 8 + %agg.tmp74663.sroa.1.8.idx943 = getelementptr inbounds %struct.Ray.5.11.53.95.137.191.197.203.239.257.263.269.275.281.287.293.383.437.443.455.461.599.601* undef, i64 0, i32 1, i32 1 + store double %mul2.i.i680, double* %agg.tmp74663.sroa.1.8.idx943, align 8 + br label %return + +if.then78: ; preds = %entry + br label %return + +return: ; preds = %if.then78, %if.then38 + ret 
void
+}
+
+attributes #0 = { ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/Transforms/SLPVectorizer/X86/diamond.ll b/test/Transforms/SLPVectorizer/X86/diamond.ll
index 008f09d..2a237ea 100644
--- a/test/Transforms/SLPVectorizer/X86/diamond.ll
+++ b/test/Transforms/SLPVectorizer/X86/diamond.ll
@@ -50,7 +50,8 @@ entry:
 ; }

 ; CHECK: @extr_user
-; CHECK: load i32*
+; CHECK: load <4 x i32>
+; CHECK-NEXT: extractelement <4 x i32>
 ; CHECK: store <4 x i32>
 ; CHECK-NEXT: ret
 define i32 @extr_user(i32* noalias nocapture %B, i32* noalias nocapture %A, i32 %n, i32 %m) {
@@ -79,7 +80,8 @@ entry:

 ; In this example we have an external user that is not the first element in the vector.
 ; CHECK: @extr_user1
-; CHECK: load i32*
+; CHECK: load <4 x i32>
+; CHECK-NEXT: extractelement <4 x i32>
 ; CHECK: store <4 x i32>
 ; CHECK-NEXT: ret
 define i32 @extr_user1(i32* noalias nocapture %B, i32* noalias nocapture %A, i32 %n, i32 %m) {
diff --git a/test/Transforms/SLPVectorizer/X86/long_chains.ll b/test/Transforms/SLPVectorizer/X86/long_chains.ll
index 0a2ace3..5af3e6d 100644
--- a/test/Transforms/SLPVectorizer/X86/long_chains.ll
+++ b/test/Transforms/SLPVectorizer/X86/long_chains.ll
@@ -3,12 +3,13 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"

+; At this point we can't vectorize only parts of the tree.
+
 ; CHECK: test
-; CHECK: sitofp i8
-; CHECK-NEXT: sitofp i8
-; CHECK-NEXT: insertelement
-; CHECK-NEXT: insertelement
-; CHECK-NEXT: fmul <2 x double>
+; CHECK: insertelement <2 x i8>
+; CHECK: insertelement <2 x i8>
+; CHECK: sitofp <2 x i8>
+; CHECK: fmul <2 x double>
 ; CHECK: ret
 define i32 @test(double* nocapture %A, i8* nocapture %B) {
 entry:
@@ -18,7 +19,7 @@ entry:
   %add = add i8 %0, 3
   %add4 = add i8 %1, 3
   %conv6 = sitofp i8 %add to double
-  %conv7 = sitofp i8 %add4 to double ; <--- This is inefficient. The chain stops here.
+  %conv7 = sitofp i8 %add4 to double
   %mul = fmul double %conv6, %conv6
   %add8 = fadd double %mul, 1.000000e+00
   %mul9 = fmul double %conv7, %conv7
diff --git a/test/Transforms/SLPVectorizer/X86/saxpy.ll b/test/Transforms/SLPVectorizer/X86/saxpy.ll
index b520913..4626341 100644
--- a/test/Transforms/SLPVectorizer/X86/saxpy.ll
+++ b/test/Transforms/SLPVectorizer/X86/saxpy.ll
@@ -43,3 +43,19 @@ define void @SAXPY(i32* noalias nocapture %x, i32* noalias nocapture %y, i32 %a,
   ret void
 }

+; Make sure we don't crash on this one.
+define void @SAXPY_crash(i32* noalias nocapture %x, i32* noalias nocapture %y, i64 %i) {
+  %1 = add i64 %i, 1
+  %2 = getelementptr inbounds i32* %x, i64 %1
+  %3 = getelementptr inbounds i32* %y, i64 %1
+  %4 = load i32* %3, align 4
+  %5 = add nsw i32 undef, %4
+  store i32 %5, i32* %2, align 4
+  %6 = add i64 %i, 2
+  %7 = getelementptr inbounds i32* %x, i64 %6
+  %8 = getelementptr inbounds i32* %y, i64 %6
+  %9 = load i32* %8, align 4
+  %10 = add nsw i32 undef, %9
+  store i32 %10, i32* %7, align 4
+  ret void
+}