From d15c0c7ac118cb23241b002e7206221283e36e2d Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Wed, 17 Oct 2012 18:25:06 +0000 Subject: Add a loop vectorizer. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166112 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 801 +++++++++++++++++++++++++++++ 1 file changed, 801 insertions(+) create mode 100644 lib/Transforms/Vectorize/LoopVectorize.cpp (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp new file mode 100644 index 0000000..60405e7 --- /dev/null +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -0,0 +1,801 @@ +//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This is a simple loop vectorizer. We currently only support single block +// loops. We have a very simple and restrictive legality check: we need to read +// and write from disjoint memory locations. We still don't have a cost model. +// This pass has three parts: +// 1. The main loop pass that drives the different parts. +// 2. LoopVectorizationLegality - A helper class that checks for the legality +// of the vectorization. +// 3. SingleBlockLoopVectorizer - A helper class that performs the actual +// widening of instructions. 
+// +//===----------------------------------------------------------------------===// +#define LV_NAME "loop-vectorize" +#define DEBUG_TYPE LV_NAME +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Instructions.h" +#include "llvm/LLVMContext.h" +#include "llvm/Pass.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Value.h" +#include "llvm/Function.h" +#include "llvm/Module.h" +#include "llvm/Type.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/DataLayout.h" +#include "llvm/Transforms/Utils/Local.h" +#include +using namespace llvm; + +static cl::opt +DefaultVectorizationFactor("default-loop-vectorize-width", + cl::init(4), cl::Hidden, + cl::desc("Set the default loop vectorization width")); + +namespace { + +/// Vectorize a simple loop. This class performs the widening of simple single +/// basic block loops into vectors. It does not perform any +/// vectorization-legality checks, and just does it. It widens the vectors +/// to a given vectorization factor (VF). +class SingleBlockLoopVectorizer { +public: + + /// Ctor. + SingleBlockLoopVectorizer(Loop *OrigLoop, ScalarEvolution *Se, LoopInfo *Li, + unsigned VecWidth): + Orig(OrigLoop), SE(Se), LI(Li), VF(VecWidth), + Builder(0), Induction(0), OldInduction(0) { } + + ~SingleBlockLoopVectorizer() { + delete Builder; + } + + // Perform the actual loop widening (vectorization). + void vectorize() { + ///Create a new empty loop. 
Unlink the old loop and connect the new one. + copyEmptyLoop(); + /// Widen each instruction in the old loop to a new one in the new loop. + vectorizeLoop(); + // Delete the old loop. + deleteOldLoop(); + } + +private: + /// Create an empty loop, based on the loop ranges of the old loop. + void copyEmptyLoop(); + /// Copy and widen the instructions from the old loop. + void vectorizeLoop(); + /// Delete the old loop. + void deleteOldLoop(); + + /// This instruction is un-vectorizable. Implement it as a sequence + /// of scalars. + void scalarizeInstruction(Instruction *Instr); + + /// Create a broadcast instruction. This method generates a broadcast + /// instruction (shuffle) for loop invariant values and for the induction + /// value. If this is the induction variable then we extend it to N, N+1, ... + /// this is needed because each iteration in the loop corresponds to a SIMD + /// element. + Value *getBroadcastInstrs(Value *V); + + /// This is a helper function used by getBroadcastInstrs. It adds 0, 1, 2 .. + /// for each element in the vector. Starting from zero. + Value *getConsecutiveVector(Value* Val); + + /// Check that the GEP operands are all uniform except for the last index + /// which has to be the induction variable. + bool isConsecutiveGep(GetElementPtrInst *Gep); + + /// When we go over instructions in the basic block we rely on previous + /// values within the current basic block or on loop invariant values. + /// When we widen (vectorize) values we place them in the map. If the values + /// are not within the map, they have to be loop invariant, so we simply + /// broadcast them into a vector. + Value *getVectorValue(Value *V); + + /// The original loop. + Loop *Orig; + // Scev analysis to use. + ScalarEvolution *SE; + // Loop Info. + LoopInfo *LI; + // The vectorization factor to use. 
+ unsigned VF; + + // The builder that we use + IRBuilder<> *Builder; + + // --- Vectorization state --- + + /// The new Induction variable which was added to the new block. + Instruction *Induction; + /// The induction variable of the old basic block. + Instruction *OldInduction; + // Maps scalars to widened vectors. + DenseMap WidenMap; +}; + + +/// Perform the vectorization legality check. This class does not look at the +/// profitability of vectorization, only the legality. At the moment the checks +/// are very simple and focus on single basic block loops with a constant +/// iteration count and no reductions. +class LoopVectorizationLegality { +public: + LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl): + TheLoop(Lp), SE(Se), DL(Dl) { } + + /// Returns the maximum vectorization factor that we *can* use to vectorize + /// this loop. This does not mean that it is profitable to vectorize this + /// loop, only that it is legal to do so. This may be a large number. We + /// can vectorize to any SIMD width below this number. + unsigned getLoopMaxVF(); + +private: + /// Check if a single basic block loop is vectorizable. + /// At this point we know that this is a loop with a constant trip count + /// and we only need to check individual instructions. + bool canVectorizeBlock(BasicBlock &BB); + + // Check if a pointer value is known to be disjoint. + // Example: Alloca, Global, NoAlias. + bool isKnownDisjoint(Value* Val); + + /// The loop that we evaluate. + Loop *TheLoop; + /// Scev analysis. + ScalarEvolution *SE; + /// DataLayout analysis. 
+ DataLayout *DL; +}; + +struct LoopVectorize : public LoopPass { + static char ID; // Pass identification, replacement for typeid + + LoopVectorize() : LoopPass(ID) { + initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); + } + + AliasAnalysis *AA; + ScalarEvolution *SE; + DataLayout *DL; + LoopInfo *LI; + + virtual bool runOnLoop(Loop *L, LPPassManager &LPM) { + // Only vectorize innermost loops. + if (!L->empty()) + return false; + + AA = &getAnalysis(); + SE = &getAnalysis(); + DL = getAnalysisIfAvailable(); + LI = &getAnalysis(); + + BasicBlock *Header = L->getHeader(); + DEBUG(dbgs() << "LV: Checking a loop in \"" << + Header->getParent()->getName() << "\"\n"); + + // Check if it is legal to vectorize the loop. + LoopVectorizationLegality LVL(L, SE, DL); + unsigned MaxVF = LVL.getLoopMaxVF(); + + // Check that we can vectorize using the chosen vectorization width. + if ((MaxVF < DefaultVectorizationFactor) || + (MaxVF % DefaultVectorizationFactor)) { + DEBUG(dbgs() << "LV: non-vectorizable MaxVF ("<< MaxVF << ").\n"); + return false; + } + + DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< MaxVF << ").\n"); + + // If we decided that is is *legal* to vectorizer the loop. Do it. + SingleBlockLoopVectorizer LB(L, SE, LI, DefaultVectorizationFactor); + LB.vectorize(); + + // The loop is now vectorized. Remove it from LMP. + LPM.deleteLoopFromQueue(L); + return true; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + LoopPass::getAnalysisUsage(AU); + AU.addRequiredID(LoopSimplifyID); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + } + +}; + +Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) { + // Instructions that access the old induction variable + // actually want to get the new one. + if (V == OldInduction) + V = Induction; + // Create the types. 
+ LLVMContext &C = V->getContext(); + Type *VTy = VectorType::get(V->getType(), VF); + Type *I32 = IntegerType::getInt32Ty(C); + Constant *Zero = ConstantInt::get(I32, 0); + Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF)); + Value *UndefVal = UndefValue::get(VTy); + // Insert the value into a new vector. + Value *SingleElem = Builder->CreateInsertElement(UndefVal, V, Zero); + // Broadcast the scalar into all locations in the vector. + Value *Shuf = Builder->CreateShuffleVector(SingleElem, UndefVal, Zeros, + "broadcast"); + // We are accessing the induction variable. Make sure to promote the + // index for each consecutive SIMD lane. This adds 0,1,2 ... to all lanes. + if (V == Induction) + return getConsecutiveVector(Shuf); + return Shuf; +} + +Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) { + assert(Val->getType()->isVectorTy() && "Must be a vector"); + assert(Val->getType()->getScalarType()->isIntegerTy() && + "Elem must be an integer"); + // Create the types. + Type *ITy = Val->getType()->getScalarType(); + VectorType *Ty = cast(Val->getType()); + unsigned VLen = Ty->getNumElements(); + SmallVector Indices; + + // Create a vector of consecutive numbers from zero to VF. + for (unsigned i = 0; i < VLen; ++i) + Indices.push_back(ConstantInt::get(ITy, i)); + + // Add the consecutive indices to the vector value. + Constant *Cv = ConstantVector::get(Indices); + assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); + return Builder->CreateAdd(Val, Cv, "induction"); +} + + +bool SingleBlockLoopVectorizer::isConsecutiveGep(GetElementPtrInst *Gep) { + if (!Gep) + return false; + + unsigned NumOperands = Gep->getNumOperands(); + Value *LastIndex = Gep->getOperand(NumOperands - 1); + + // Check that all of the gep indices are uniform except for the last. 
+ for (unsigned i = 0; i < NumOperands - 1; ++i) + if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), Orig)) + return false; + + // The last operand has to be the induction in order to emit + // a wide load/store. + const SCEV *Last = SE->getSCEV(LastIndex); + if (const SCEVAddRecExpr *AR = dyn_cast(Last)) { + const SCEV *Step = AR->getStepRecurrence(*SE); + + // The memory is consecutive because the last index is consecutive + // and all other indices are loop invariant. + if (Step->isOne()) + return true; + } + + return false; +} + +Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) { + if (WidenMap.count(V)) + return WidenMap[V]; + return getBroadcastInstrs(V); +} + +void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { + assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); + // Holds vector parameters or scalars, in case of uniform vals. + SmallVector Params; + + // Find all of the vectorized parameters. + for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { + Value *SrcOp = Instr->getOperand(op); + + // If we are accessing the old induction variable, use the new one. + if (SrcOp == OldInduction) { + Params.push_back(getBroadcastInstrs(Induction)); + continue; + } + + // Try using previously calculated values. + Instruction *SrcInst = dyn_cast(SrcOp); + + // If the src is an instruction that appeared earlier in the basic block + // then it should already be vectorized. + if (SrcInst && SrcInst->getParent() == Instr->getParent()) { + assert(WidenMap.count(SrcInst) && "Source operand is unavailable"); + // The parameter is a vector value from earlier. + Params.push_back(WidenMap[SrcInst]); + } else { + // The parameter is a scalar from outside the loop. Maybe even a constant. + Params.push_back(SrcOp); + } + } + + assert(Params.size() == Instr->getNumOperands() && + "Invalid number of operands"); + + // Does this instruction return a value ? 
+ bool IsVoidRetTy = Instr->getType()->isVoidTy(); + Value *VecResults = 0; + + // If we have a return value, create an empty vector. We place the scalarized + // instructions in this vector. + if (!IsVoidRetTy) + VecResults = UndefValue::get(VectorType::get(Instr->getType(), VF)); + + // For each scalar that we create. + for (unsigned i = 0; i < VF; ++i) { + Instruction *Cloned = Instr->clone(); + if (!IsVoidRetTy) + Cloned->setName(Instr->getName() + ".cloned"); + // Replace the operands of the cloned instrucions with extracted scalars. + for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { + Value *Op = Params[op]; + // Param is a vector. Need to extract the right lane. + if (Op->getType()->isVectorTy()) + Op = Builder->CreateExtractElement(Op, Builder->getInt32(i)); + Cloned->setOperand(op, Op); + } + + // Place the clonsed scalar in the new loop. + Builder->Insert(Cloned); + + // If the original scalar returns a value we need to place it in a vector + // so that future users will be able to use it. + if (!IsVoidRetTy) + VecResults = Builder->CreateInsertElement(VecResults, Cloned, + Builder->getInt32(i)); + } + + if (!IsVoidRetTy) + WidenMap[Instr] = VecResults; +} + +void SingleBlockLoopVectorizer::copyEmptyLoop() { + assert(Orig->getNumBlocks() == 1 && "Invalid loop"); + BasicBlock *PH = Orig->getLoopPreheader(); + BasicBlock *ExitBlock = Orig->getExitBlock(); + assert(ExitBlock && "Invalid loop exit"); + + // Create a new single-basic block loop. + BasicBlock *BB = BasicBlock::Create(PH->getContext(), "vectorizedloop", + PH->getParent(), ExitBlock); + + // Find the induction variable. + BasicBlock *OldBasicBlock = Orig->getHeader(); + PHINode *OldInd = dyn_cast(OldBasicBlock->begin()); + assert(OldInd && "We must have a single phi node."); + Type *IdxTy = OldInd->getType(); + + // Use this IR builder to create the loop instructions (Phi, Br, Cmp) + // inside the loop. 
+ Builder = new IRBuilder<>(BB); + Builder->SetInsertPoint(BB); + + // Generate the induction variable. + PHINode *Phi = Builder->CreatePHI(IdxTy, 2, "index"); + Constant *Zero = ConstantInt::get(IdxTy, 0); + Constant *Step = ConstantInt::get(IdxTy, VF); + + // Find the loop boundaries. + const SCEV *ExitCount = SE->getExitCount(Orig, Orig->getHeader()); + assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); + + // Get the trip count from the count by adding 1. + ExitCount = SE->getAddExpr(ExitCount, + SE->getConstant(ExitCount->getType(), 1)); + + // Expand the trip count and place the new instructions in the preheader. + // Notice that the pre-header does not change, only the loop body. + SCEVExpander Exp(*SE, "induction"); + Instruction *Loc = Orig->getLoopPreheader()->getTerminator(); + if (ExitCount->getType() != Phi->getType()) + ExitCount = SE->getSignExtendExpr(ExitCount, Phi->getType()); + Value *Count = Exp.expandCodeFor(ExitCount, Phi->getType(), Loc); + + // Create i+1 and fill the PHINode. + Value *Next = Builder->CreateAdd(Phi, Step, "index.next"); + Phi->addIncoming(Zero, PH); + Phi->addIncoming(Next, BB); + // Create the compare. + Value *ICmp = Builder->CreateICmpEQ(Next, Count); + Builder->CreateCondBr(ICmp, ExitBlock, BB); + // Fix preheader. + PH->getTerminator()->setSuccessor(0, BB); + Builder->SetInsertPoint(BB->getFirstInsertionPt()); + + // Save the indiction variables. + Induction = Phi; + OldInduction = OldInd; +} + +void SingleBlockLoopVectorizer::vectorizeLoop() { + BasicBlock &BB = *Orig->getHeader(); + + // For each instruction in the old loop. + for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { + Instruction *Inst = it; + + switch (Inst->getOpcode()) { + case Instruction::PHI: + case Instruction::Br: + // Nothing to do for PHIs and BR, since we already took care of the + // loop control flow instructions. 
+ continue; + + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + // Just widen binops. + BinaryOperator *BinOp = dyn_cast(Inst); + Value *A = getVectorValue(Inst->getOperand(0)); + Value *B = getVectorValue(Inst->getOperand(1)); + // Use this vector value for all users of the original instruction. + WidenMap[Inst] = Builder->CreateBinOp(BinOp->getOpcode(), A, B); + break; + } + case Instruction::Select: { + // Widen selects. + Value *A = getVectorValue(Inst->getOperand(0)); + Value *B = getVectorValue(Inst->getOperand(1)); + Value *C = getVectorValue(Inst->getOperand(2)); + WidenMap[Inst] = Builder->CreateSelect(A, B, C); + break; + } + + case Instruction::ICmp: + case Instruction::FCmp: { + // Widen compares. Generate vector compares. + bool FCmp = (Inst->getOpcode() == Instruction::FCmp); + CmpInst *Cmp = dyn_cast(Inst); + Value *A = getVectorValue(Inst->getOperand(0)); + Value *B = getVectorValue(Inst->getOperand(1)); + if (FCmp) + WidenMap[Inst] = Builder->CreateFCmp(Cmp->getPredicate(), A, B); + else + WidenMap[Inst] = Builder->CreateICmp(Cmp->getPredicate(), A, B); + break; + } + + case Instruction::Store: { + // Attempt to issue a wide store. + StoreInst *SI = dyn_cast(Inst); + Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF); + Value *Ptr = SI->getPointerOperand(); + unsigned Alignment = SI->getAlignment(); + GetElementPtrInst *Gep = dyn_cast(Ptr); + // This store does not use GEPs. + if (!isConsecutiveGep(Gep)) { + scalarizeInstruction(Inst); + break; + } + + // Create the new GEP with the new induction variable. 
+ GetElementPtrInst *Gep2 = cast(Gep->clone()); + unsigned NumOperands = Gep->getNumOperands(); + Gep2->setOperand(NumOperands - 1, Induction); + Ptr = Builder->Insert(Gep2); + Ptr = Builder->CreateBitCast(Ptr, StTy->getPointerTo()); + Value *Val = getVectorValue(SI->getValueOperand()); + Builder->CreateStore(Val, Ptr)->setAlignment(Alignment); + break; + } + case Instruction::Load: { + // Attempt to issue a wide load. + LoadInst *LI = dyn_cast(Inst); + Type *RetTy = VectorType::get(LI->getType(), VF); + Value *Ptr = LI->getPointerOperand(); + unsigned Alignment = LI->getAlignment(); + GetElementPtrInst *Gep = dyn_cast(Ptr); + + // We don't have a gep. Scalarize the load. + if (!isConsecutiveGep(Gep)) { + scalarizeInstruction(Inst); + break; + } + + // Create the new GEP with the new induction variable. + GetElementPtrInst *Gep2 = cast(Gep->clone()); + unsigned NumOperands = Gep->getNumOperands(); + Gep2->setOperand(NumOperands - 1, Induction); + Ptr = Builder->Insert(Gep2); + Ptr = Builder->CreateBitCast(Ptr, RetTy->getPointerTo()); + LI = Builder->CreateLoad(Ptr); + LI->setAlignment(Alignment); + // Use this vector value for all users of the load. + WidenMap[Inst] = LI; + break; + } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + /// Vectorize bitcasts. + CastInst *CI = dyn_cast(Inst); + Value *A = getVectorValue(Inst->getOperand(0)); + Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF); + WidenMap[Inst] = Builder->CreateCast(CI->getOpcode(), A, DestTy); + break; + } + + default: + /// All other instructions are unsupported. Scalarize them. + scalarizeInstruction(Inst); + break; + }// end of switch. + }// end of for_each instr. 
+} + +void SingleBlockLoopVectorizer::deleteOldLoop() { + // The original basic block. + BasicBlock *BB = Orig->getHeader(); + SE->forgetLoop(Orig); + + LI->removeBlock(BB); + Orig->addBasicBlockToLoop(Induction->getParent(), LI->getBase()); + + // Remove the old loop block. + DeleteDeadBlock(BB); +} + +unsigned LoopVectorizationLegality::getLoopMaxVF() { + if (!TheLoop->getLoopPreheader()) { + assert(false && "No preheader!!"); + DEBUG(dbgs() << "LV: Loop not normalized." << "\n"); + return 1; + } + + // We can only vectorize single basic block loops. + unsigned NumBlocks = TheLoop->getNumBlocks(); + if (NumBlocks != 1) { + DEBUG(dbgs() << "LV: Too many blocks:" << NumBlocks << "\n"); + return 1; + } + + // We need to have a loop header. + BasicBlock *BB = TheLoop->getHeader(); + DEBUG(dbgs() << "LV: Found a loop: " << BB->getName() << "\n"); + + // Find the max vectorization factor. + unsigned MaxVF = SE->getSmallConstantTripMultiple(TheLoop, BB); + + + // Perform an early check. Do not scan the block if we did not find a loop. + if (MaxVF < 2) { + DEBUG(dbgs() << "LV: Can't find a vectorizable loop structure\n"); + return 1; + } + + // Go over each instruction and look at memory deps. + if (!canVectorizeBlock(*BB)) { + DEBUG(dbgs() << "LV: Can't vectorize this loop header\n"); + return 1; + } + + DEBUG(dbgs() << "LV: We can vectorize this loop! VF="< ValueVector; + ValueVector Reads; + ValueVector Writes; + + unsigned NumPhis = 0; + for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { + Instruction *I = it; + + PHINode *Phi = dyn_cast(I); + if (Phi) { + NumPhis++; + // We only look at integer phi nodes. + if (!Phi->getType()->isIntegerTy()) { + DEBUG(dbgs() << "LV: Found an non-int PHI.\n"); + return false; + } + + // If we found an induction variable. + if (NumPhis > 1) { + DEBUG(dbgs() << "LV: Found more than one PHI.\n"); + return false; + } + + // This should not happen because the loop should be normalized. 
+ if (Phi->getNumIncomingValues() != 2) { + DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); + return false; + } + + // Check that the PHI is consecutive and starts at zero. + const SCEV *PhiScev = SE->getSCEV(Phi); + const SCEVAddRecExpr *AR = dyn_cast(PhiScev); + if (!AR) { + DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n"); + return false; + } + + const SCEV *Step = AR->getStepRecurrence(*SE); + const SCEV *Start = AR->getStart(); + + if (!Step->isOne() || !Start->isZero()) { + DEBUG(dbgs() << "LV: PHI does not start at zero or steps by one.\n"); + return false; + } + } + + // IF this is a load, record its pointer. If it is not a load, abort. + // Notice that we don't handle function calls that read or write. + if (I->mayReadFromMemory()) { + LoadInst *Ld = dyn_cast(I); + if (!Ld) return false; + if (!Ld->isSimple()) { + DEBUG(dbgs() << "LV: Found a non-simple load.\n"); + return false; + } + GetUnderlyingObjects(Ld->getPointerOperand(), Reads, DL); + } + + // Record store pointers. Abort on all other instructions that write to + // memory. + if (I->mayWriteToMemory()) { + StoreInst *St = dyn_cast(I); + if (!St) return false; + if (!St->isSimple()) { + DEBUG(dbgs() << "LV: Found a non-simple store.\n"); + return false; + } + GetUnderlyingObjects(St->getPointerOperand(), Writes, DL); + } + + // We still don't handle functions. + CallInst *CI = dyn_cast(I); + if (CI) { + DEBUG(dbgs() << "LV: Found a call site:"<< + CI->getCalledFunction()->getName() << "\n"); + return false; + } + + // We do not re-vectorize vectors. + if (!VectorType::isValidElementType(I->getType()) && + !I->getType()->isVoidTy()) { + DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n"); + return false; + } + //Check that all of the users of the loop are inside the BB. 
+ for (Value::use_iterator it = I->use_begin(), e = I->use_end(); + it != e; ++it) { + Instruction *U = cast(*it); + BasicBlock *Parent = U->getParent(); + if (Parent != &BB) { + DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n"); + return false; + } + } + } // next instr. + + // Check that the underlying objects of the reads and writes are either + // disjoint memory locations, or that they are no-alias arguments. + ValueVector::iterator r, re, w, we; + for (r = Reads.begin(), re = Reads.end(); r != re; ++r) { + if (!isKnownDisjoint(*r)) { + DEBUG(dbgs() << "LV: Found a bad read Ptr: "<< **r << "\n"); + return false; + } + } + + for (w = Writes.begin(), we = Writes.end(); w != we; ++w) { + if (!isKnownDisjoint(*w)) { + DEBUG(dbgs() << "LV: Found a bad write Ptr: "<< **w << "\n"); + return false; + } + } + + // Check that there are no multiple write locations to the same pointer. + SmallPtrSet BasePointers; + for (w = Writes.begin(), we = Writes.end(); w != we; ++w) { + if (BasePointers.count(*w)) { + DEBUG(dbgs() << "LV: Multiple writes to the same index :"<< **w << "\n"); + return false; + } + BasePointers.insert(*w); + } + + // Sort the writes vector so that we can use a binary search. + std::sort(Writes.begin(), Writes.end()); + // Check that the reads and the writes are disjoint. + for (r = Reads.begin(), re = Reads.end(); r != re; ++r) { + if (std::binary_search(Writes.begin(), Writes.end(), *r)) { + DEBUG(dbgs() << "Vectorizer: Found a read/write ptr:"<< **r << "\n"); + return false; + } + } + + // All is okay. + return true; +} + +/// Checks if the value is a Global variable or if it is an Arguments +/// marked with the NoAlias attribute. 
+bool LoopVectorizationLegality::isKnownDisjoint(Value* Val) { + assert(Val && "Invalid value"); + if (dyn_cast(Val)) + return true; + if (dyn_cast(Val)) + return true; + Argument *A = dyn_cast(Val); + if (!A) + return false; + return A->hasNoAliasAttr(); +} + +} // namespace + +char LoopVectorize::ID = 0; +static const char lv_name[] = "Loop Vectorization"; +INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) + +namespace llvm { + Pass *createLoopVectorizePass() { + return new LoopVectorize(); + } + +} + -- cgit v1.1 From a5401d67304b9814fc13521563a9634f37f2047b Mon Sep 17 00:00:00 2001 From: Roman Divacky Date: Wed, 17 Oct 2012 21:07:35 +0000 Subject: Fix some typos and wrong indenting. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166128 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 60405e7..ce44db0 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -365,7 +365,7 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { Cloned->setOperand(op, Op); } - // Place the clonsed scalar in the new loop. + // Place the cloned scalar in the new loop. Builder->Insert(Cloned); // If the original scalar returns a value we need to place it in a vector @@ -432,7 +432,7 @@ void SingleBlockLoopVectorizer::copyEmptyLoop() { PH->getTerminator()->setSuccessor(0, BB); Builder->SetInsertPoint(BB->getFirstInsertionPt()); - // Save the indiction variables. + // Save the induction variables. 
Induction = Phi; OldInduction = OldInd; } @@ -442,7 +442,7 @@ void SingleBlockLoopVectorizer::vectorizeLoop() { // For each instruction in the old loop. for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { - Instruction *Inst = it; + Instruction *Inst = it; switch (Inst->getOpcode()) { case Instruction::PHI: @@ -677,7 +677,7 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { } } - // IF this is a load, record its pointer. If it is not a load, abort. + // If this is a load, record its pointer. If it is not a load, abort. // Notice that we don't handle function calls that read or write. if (I->mayReadFromMemory()) { LoadInst *Ld = dyn_cast(I); -- cgit v1.1 From c2358eb361ee3304e553c0d283c5c3a44f28950f Mon Sep 17 00:00:00 2001 From: Jakub Staszak Date: Wed, 17 Oct 2012 23:06:37 +0000 Subject: Remove redundant SetInsertPoint call. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166138 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 1 - 1 file changed, 1 deletion(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index ce44db0..b256155 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -398,7 +398,6 @@ void SingleBlockLoopVectorizer::copyEmptyLoop() { // Use this IR builder to create the loop instructions (Phi, Br, Cmp) // inside the loop. Builder = new IRBuilder<>(BB); - Builder->SetInsertPoint(BB); // Generate the induction variable. PHINode *Phi = Builder->CreatePHI(IdxTy, 2, "index"); -- cgit v1.1 From 6dc0050f6d42520ad92e6786551f27ee3f365ecd Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Wed, 17 Oct 2012 23:40:15 +0000 Subject: LoopVectorize.cpp: Fix a warning. 
[-Wunused-variable] git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166153 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index b256155..f84e392 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -195,9 +195,8 @@ struct LoopVectorize : public LoopPass { DL = getAnalysisIfAvailable(); LI = &getAnalysis(); - BasicBlock *Header = L->getHeader(); DEBUG(dbgs() << "LV: Checking a loop in \"" << - Header->getParent()->getName() << "\"\n"); + L->getHeader()->getParent()->getName() << "\"\n"); // Check if it is legal to vectorize the loop. LoopVectorizationLegality LVL(L, SE, DL); -- cgit v1.1 From 1953ace81d3121808967a7fa47ad6d631499933d Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Thu, 18 Oct 2012 05:29:12 +0000 Subject: Vectorizer: Add support for loops with an unknown count. For example: for (i=0; i WidenMap; }; @@ -184,6 +188,7 @@ struct LoopVectorize : public LoopPass { ScalarEvolution *SE; DataLayout *DL; LoopInfo *LI; + DominatorTree *DT; virtual bool runOnLoop(Loop *L, LPPassManager &LPM) { // Only vectorize innermost loops. @@ -194,6 +199,7 @@ struct LoopVectorize : public LoopPass { SE = &getAnalysis(); DL = getAnalysisIfAvailable(); LI = &getAnalysis(); + DT = &getAnalysis(); DEBUG(dbgs() << "LV: Checking a loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); @@ -203,8 +209,7 @@ struct LoopVectorize : public LoopPass { unsigned MaxVF = LVL.getLoopMaxVF(); // Check that we can vectorize using the chosen vectorization width. 
- if ((MaxVF < DefaultVectorizationFactor) || - (MaxVF % DefaultVectorizationFactor)) { + if (MaxVF < DefaultVectorizationFactor) { DEBUG(dbgs() << "LV: non-vectorizable MaxVF ("<< MaxVF << ").\n"); return false; } @@ -212,11 +217,10 @@ struct LoopVectorize : public LoopPass { DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< MaxVF << ").\n"); // If we decided that is is *legal* to vectorizer the loop. Do it. - SingleBlockLoopVectorizer LB(L, SE, LI, DefaultVectorizationFactor); + SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, DefaultVectorizationFactor); LB.vectorize(); - // The loop is now vectorized. Remove it from LMP. - LPM.deleteLoopFromQueue(L); + DEBUG(verifyFunction(*L->getHeader()->getParent())); return true; } @@ -226,6 +230,7 @@ struct LoopVectorize : public LoopPass { AU.addRequired(); AU.addRequired(); AU.addRequired(); + AU.addRequired(); } }; @@ -327,7 +332,7 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { Instruction *SrcInst = dyn_cast(SrcOp); // If the src is an instruction that appeared earlier in the basic block - // then it should already be vectorized. + // then it should already be vectorized. if (SrcInst && SrcInst->getParent() == Instr->getParent()) { assert(WidenMap.count(SrcInst) && "Source operand is unavailable"); // The parameter is a vector value from earlier. @@ -378,28 +383,71 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { WidenMap[Instr] = VecResults; } -void SingleBlockLoopVectorizer::copyEmptyLoop() { - assert(Orig->getNumBlocks() == 1 && "Invalid loop"); - BasicBlock *PH = Orig->getLoopPreheader(); +void SingleBlockLoopVectorizer::createEmptyLoop() { + /* + In this function we generate a new loop. The new loop will contain + the vectorized instructions while the old loop will continue to run the + scalar remainder. + + [ ] <-- vector loop bypass. + / | + / v +| [ ] <-- vector pre header. +| | +| v +| [ ] \ +| [ ]_| <-- vector loop. 
+| | + \ v + >[ ] <--- middle-block. + / | + / v +| [ ] <--- new preheader. +| | +| v +| [ ] \ +| [ ]_| <-- old scalar loop to handle remainder. () + \ | + \ v + >[ ] <-- exit block. + ... + */ + + // This is the original scalar-loop preheader. + BasicBlock *BypassBlock = Orig->getLoopPreheader(); BasicBlock *ExitBlock = Orig->getExitBlock(); - assert(ExitBlock && "Invalid loop exit"); + assert(ExitBlock && "Must have an exit block"); + + BasicBlock *ScalarBody = Orig->getHeader(); + assert(Orig->getNumBlocks() == 1 && "Invalid loop"); + assert(ScalarBody && BypassBlock && "Invalid loop structure"); + + BasicBlock *VectorPH = + BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph"); + BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(), + "vector.body"); - // Create a new single-basic block loop. - BasicBlock *BB = BasicBlock::Create(PH->getContext(), "vectorizedloop", - PH->getParent(), ExitBlock); + BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(), + "middle.block"); + + + BasicBlock *ScalarPH = + MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), + "scalar.preheader"); // Find the induction variable. BasicBlock *OldBasicBlock = Orig->getHeader(); - PHINode *OldInd = dyn_cast(OldBasicBlock->begin()); - assert(OldInd && "We must have a single phi node."); - Type *IdxTy = OldInd->getType(); + OldInduction = dyn_cast(OldBasicBlock->begin()); + assert(OldInduction && "We must have a single phi node."); + Type *IdxTy = OldInduction->getType(); // Use this IR builder to create the loop instructions (Phi, Br, Cmp) // inside the loop. - Builder = new IRBuilder<>(BB); + Builder = new IRBuilder<>(VecBody); + Builder->SetInsertPoint(VecBody->getFirstInsertionPt()); // Generate the induction variable. 
- PHINode *Phi = Builder->CreatePHI(IdxTy, 2, "index"); + Induction = Builder->CreatePHI(IdxTy, 2, "index"); Constant *Zero = ConstantInt::get(IdxTy, 0); Constant *Step = ConstantInt::get(IdxTy, VF); @@ -407,32 +455,78 @@ void SingleBlockLoopVectorizer::copyEmptyLoop() { const SCEV *ExitCount = SE->getExitCount(Orig, Orig->getHeader()); assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); - // Get the trip count from the count by adding 1. + // Get the total trip count from the count by adding 1. ExitCount = SE->getAddExpr(ExitCount, SE->getConstant(ExitCount->getType(), 1)); // Expand the trip count and place the new instructions in the preheader. // Notice that the pre-header does not change, only the loop body. SCEVExpander Exp(*SE, "induction"); - Instruction *Loc = Orig->getLoopPreheader()->getTerminator(); - if (ExitCount->getType() != Phi->getType()) - ExitCount = SE->getSignExtendExpr(ExitCount, Phi->getType()); - Value *Count = Exp.expandCodeFor(ExitCount, Phi->getType(), Loc); - + Instruction *Loc = BypassBlock->getTerminator(); + + // We may need to extend the index in case there is a type mismatch. + // We know that the count starts at zero and does not overflow. + // We are using Zext because it should be less expensive. + if (ExitCount->getType() != Induction->getType()) + ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy); + + // Count holds the overall loop count (N). + Value *Count = Exp.expandCodeFor(ExitCount, Induction->getType(), Loc); + // Now we need to generate the expression for N - (N % VF), which is + // the part that the vectorized body will execute. + Constant *CIVF = ConstantInt::get(IdxTy, VF); + Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc); + Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc); + + // Now, compare the new count to zero. If it is zero, jump to the scalar part. 
+ Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, + CountRoundDown, ConstantInt::getNullValue(IdxTy), + "cmp.zero", Loc); + BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc); + // Remove the old terminator. + Loc->eraseFromParent(); + + // Add a check in the middle block to see if we have completed + // all of the iterations in the first vector loop. + // If (N - N%VF) == N, then we *don't* need to run the remainder. + Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, + CountRoundDown, "cmp.n", + MiddleBlock->getTerminator()); + + BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator()); + // Remove the old terminator. + MiddleBlock->getTerminator()->eraseFromParent(); + // Create i+1 and fill the PHINode. - Value *Next = Builder->CreateAdd(Phi, Step, "index.next"); - Phi->addIncoming(Zero, PH); - Phi->addIncoming(Next, BB); + Value *NextIdx = Builder->CreateAdd(Induction, Step, "index.next"); + Induction->addIncoming(Zero, VectorPH); + Induction->addIncoming(NextIdx, VecBody); // Create the compare. - Value *ICmp = Builder->CreateICmpEQ(Next, Count); - Builder->CreateCondBr(ICmp, ExitBlock, BB); - // Fix preheader. - PH->getTerminator()->setSuccessor(0, BB); - Builder->SetInsertPoint(BB->getFirstInsertionPt()); - - // Save the induction variables. - Induction = Phi; - OldInduction = OldInd; + Value *ICmp = Builder->CreateICmpEQ(NextIdx, CountRoundDown); + Builder->CreateCondBr(ICmp, MiddleBlock, VecBody); + + // Now we have two terminators. Remove the old one from the block. + VecBody->getTerminator()->eraseFromParent(); + + // Fix the scalar body iteration count. + unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH); + OldInduction->setIncomingValue(BlockIdx, CountRoundDown); + + // Get ready to start creating new instructions into the vectorized body. + Builder->SetInsertPoint(VecBody->getFirstInsertionPt()); + + // Register the new loop. 
+ Loop* Lp = new Loop(); + LPM->insertLoop(Lp, Orig->getParentLoop()); + + Lp->addBasicBlockToLoop(VecBody, LI->getBase()); + + Loop *ParentLoop = Orig->getParentLoop(); + if (ParentLoop) { + ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase()); + ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); + ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); + } } void SingleBlockLoopVectorizer::vectorizeLoop() { @@ -575,16 +669,9 @@ void SingleBlockLoopVectorizer::vectorizeLoop() { }// end of for_each instr. } -void SingleBlockLoopVectorizer::deleteOldLoop() { +void SingleBlockLoopVectorizer::cleanup() { // The original basic block. - BasicBlock *BB = Orig->getHeader(); SE->forgetLoop(Orig); - - LI->removeBlock(BB); - Orig->addBasicBlockToLoop(Induction->getParent(), LI->getBase()); - - // Remove the old loop block. - DeleteDeadBlock(BB); } unsigned LoopVectorizationLegality::getLoopMaxVF() { @@ -605,26 +692,25 @@ unsigned LoopVectorizationLegality::getLoopMaxVF() { BasicBlock *BB = TheLoop->getHeader(); DEBUG(dbgs() << "LV: Found a loop: " << BB->getName() << "\n"); - // Find the max vectorization factor. - unsigned MaxVF = SE->getSmallConstantTripMultiple(TheLoop, BB); - - - // Perform an early check. Do not scan the block if we did not find a loop. - if (MaxVF < 2) { - DEBUG(dbgs() << "LV: Can't find a vectorizable loop structure\n"); - return 1; - } - // Go over each instruction and look at memory deps. if (!canVectorizeBlock(*BB)) { DEBUG(dbgs() << "LV: Can't vectorize this loop header\n"); return 1; } - DEBUG(dbgs() << "LV: We can vectorize this loop! VF="<getExitCount(TheLoop, BB); + if (ExitCount == SE->getCouldNotCompute()) { + DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); + return 1; + } + + DEBUG(dbgs() << "LV: We can vectorize this loop!\n"); + + // Okay! We can vectorize. 
At this point we don't have any other mem analysis + // which may limit our maximum vectorization factor, so just return the + // maximum SIMD size. + return DefaultVectorizationFactor; } bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { @@ -725,6 +811,11 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { } } // next instr. + if (NumPhis != 1) { + DEBUG(dbgs() << "LV: Did not find a Phi node.\n"); + return false; + } + // Check that the underlying objects of the reads and writes are either // disjoint memory locations, or that they are no-alias arguments. ValueVector::iterator r, re, w, we; -- cgit v1.1 From d22d5f9122de2c8527f1d8890ce2802a89e0db0c Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Thu, 18 Oct 2012 05:33:02 +0000 Subject: Remove the use of dominators and AA. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166167 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 8 -------- 1 file changed, 8 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 80fdad3..6ce303c 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -28,7 +28,6 @@ #include "llvm/Analysis/LoopPass.h" #include "llvm/Value.h" #include "llvm/Function.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/Verifier.h" #include "llvm/Module.h" #include "llvm/Type.h" @@ -143,7 +142,6 @@ private: DenseMap WidenMap; }; - /// Perform the vectorization legality check. This class does not look at the /// profitability of vectorization, only the legality. 
At the moment the checks /// are very simple and focus on single basic block loops with a constant @@ -184,22 +182,18 @@ struct LoopVectorize : public LoopPass { initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); } - AliasAnalysis *AA; ScalarEvolution *SE; DataLayout *DL; LoopInfo *LI; - DominatorTree *DT; virtual bool runOnLoop(Loop *L, LPPassManager &LPM) { // Only vectorize innermost loops. if (!L->empty()) return false; - AA = &getAnalysis(); SE = &getAnalysis(); DL = getAnalysisIfAvailable(); LI = &getAnalysis(); - DT = &getAnalysis(); DEBUG(dbgs() << "LV: Checking a loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); @@ -227,10 +221,8 @@ struct LoopVectorize : public LoopPass { virtual void getAnalysisUsage(AnalysisUsage &AU) const { LoopPass::getAnalysisUsage(AU); AU.addRequiredID(LoopSimplifyID); - AU.addRequired(); AU.addRequired(); AU.addRequired(); - AU.addRequired(); } }; -- cgit v1.1 From 3141d2db037bb3c31a1efd50118df45c1aebbb51 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Thu, 18 Oct 2012 06:09:21 +0000 Subject: remove unused variable to fix a warning. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166170 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 6ce303c..e3b8fc7 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -410,9 +410,8 @@ void SingleBlockLoopVectorizer::createEmptyLoop() { BasicBlock *ExitBlock = Orig->getExitBlock(); assert(ExitBlock && "Must have an exit block"); - BasicBlock *ScalarBody = Orig->getHeader(); assert(Orig->getNumBlocks() == 1 && "Invalid loop"); - assert(ScalarBody && BypassBlock && "Invalid loop structure"); + assert(BypassBlock && "Invalid loop structure"); BasicBlock *VectorPH = BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph"); -- cgit v1.1 From bef36ac2a47dd647af332b8594031df6fe2522c5 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Thu, 18 Oct 2012 17:31:49 +0000 Subject: When looking for a vector representation of a scalar, do a single lookup. Also, cache the result of the broadcast instruction. No functionality change. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166191 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index e3b8fc7..cb67163 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -63,7 +63,6 @@ namespace { /// to a given vectorization factor (VF). class SingleBlockLoopVectorizer { public: - /// Ctor. 
SingleBlockLoopVectorizer(Loop *OrigLoop, ScalarEvolution *Se, LoopInfo *Li, LPPassManager *Lpm, unsigned VecWidth): @@ -118,6 +117,8 @@ private: /// broadcast them into a vector. Value *getVectorValue(Value *V); + typedef DenseMap ValueMap; + /// The original loop. Loop *Orig; // Scev analysis to use. @@ -139,7 +140,7 @@ private: /// The induction variable of the old basic block. PHINode *OldInduction; // Maps scalars to widened vectors. - DenseMap WidenMap; + ValueMap WidenMap; }; /// Perform the vectorization legality check. This class does not look at the @@ -284,8 +285,8 @@ bool SingleBlockLoopVectorizer::isConsecutiveGep(GetElementPtrInst *Gep) { if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), Orig)) return false; - // The last operand has to be the induction in order to emit - // a wide load/store. + // We can emit wide load/stores only of the last index is the induction + // variable. const SCEV *Last = SE->getSCEV(LastIndex); if (const SCEVAddRecExpr *AR = dyn_cast(Last)) { const SCEV *Step = AR->getStepRecurrence(*SE); @@ -300,9 +301,15 @@ bool SingleBlockLoopVectorizer::isConsecutiveGep(GetElementPtrInst *Gep) { } Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) { - if (WidenMap.count(V)) - return WidenMap[V]; - return getBroadcastInstrs(V); + // If we saved a vectorized copy of V, use it. + ValueMap::iterator it = WidenMap.find(V); + if (it != WidenMap.end()) + return it->second; + + // Broadcast V and save the value for future uses. + Value *B = getBroadcastInstrs(V); + WidenMap[V] = B; + return B; } void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { -- cgit v1.1 From b943d9d497175ce44cca7b7bb14b83a86dba7d76 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Thu, 18 Oct 2012 18:34:50 +0000 Subject: Avoid reconstructing the pointer set when searching for duplicated read/write pointers. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166205 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index cb67163..9bbd9ab 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -166,7 +166,7 @@ private: // Check if a pointer value is known to be disjoint. // Example: Alloca, Global, NoAlias. - bool isKnownDisjoint(Value* Val); + bool isidentifiedSafeObject(Value* Val); /// The loop that we evaluate. Loop *TheLoop; @@ -818,34 +818,31 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { // disjoint memory locations, or that they are no-alias arguments. ValueVector::iterator r, re, w, we; for (r = Reads.begin(), re = Reads.end(); r != re; ++r) { - if (!isKnownDisjoint(*r)) { + if (!isidentifiedSafeObject(*r)) { DEBUG(dbgs() << "LV: Found a bad read Ptr: "<< **r << "\n"); return false; } } for (w = Writes.begin(), we = Writes.end(); w != we; ++w) { - if (!isKnownDisjoint(*w)) { + if (!isidentifiedSafeObject(*w)) { DEBUG(dbgs() << "LV: Found a bad write Ptr: "<< **w << "\n"); return false; } } // Check that there are no multiple write locations to the same pointer. - SmallPtrSet BasePointers; + SmallPtrSet WritePointerSet; for (w = Writes.begin(), we = Writes.end(); w != we; ++w) { - if (BasePointers.count(*w)) { + if (!WritePointerSet.insert(*w)) { DEBUG(dbgs() << "LV: Multiple writes to the same index :"<< **w << "\n"); return false; } - BasePointers.insert(*w); } - // Sort the writes vector so that we can use a binary search. - std::sort(Writes.begin(), Writes.end()); // Check that the reads and the writes are disjoint. 
for (r = Reads.begin(), re = Reads.end(); r != re; ++r) { - if (std::binary_search(Writes.begin(), Writes.end(), *r)) { + if (WritePointerSet.count(*r)) { DEBUG(dbgs() << "Vectorizer: Found a read/write ptr:"<< **r << "\n"); return false; } @@ -857,7 +854,7 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { /// Checks if the value is a Global variable or if it is an Arguments /// marked with the NoAlias attribute. -bool LoopVectorizationLegality::isKnownDisjoint(Value* Val) { +bool LoopVectorizationLegality::isidentifiedSafeObject(Value* Val) { assert(Val && "Invalid value"); if (dyn_cast(Val)) return true; -- cgit v1.1 From 6220fb16c1d08534e770785f47717ee80baac517 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Thu, 18 Oct 2012 21:45:31 +0000 Subject: fix a naming typo git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166232 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 9bbd9ab..7af4fa1 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -166,7 +166,7 @@ private: // Check if a pointer value is known to be disjoint. // Example: Alloca, Global, NoAlias. - bool isidentifiedSafeObject(Value* Val); + bool isIdentifiedSafeObject(Value* Val); /// The loop that we evaluate. Loop *TheLoop; @@ -427,8 +427,6 @@ void SingleBlockLoopVectorizer::createEmptyLoop() { BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block"); - - BasicBlock *ScalarPH = MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.preheader"); @@ -818,14 +816,14 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { // disjoint memory locations, or that they are no-alias arguments. 
ValueVector::iterator r, re, w, we; for (r = Reads.begin(), re = Reads.end(); r != re; ++r) { - if (!isidentifiedSafeObject(*r)) { + if (!isIdentifiedSafeObject(*r)) { DEBUG(dbgs() << "LV: Found a bad read Ptr: "<< **r << "\n"); return false; } } for (w = Writes.begin(), we = Writes.end(); w != we; ++w) { - if (!isidentifiedSafeObject(*w)) { + if (!isIdentifiedSafeObject(*w)) { DEBUG(dbgs() << "LV: Found a bad write Ptr: "<< **w << "\n"); return false; } @@ -854,7 +852,7 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { /// Checks if the value is a Global variable or if it is an Arguments /// marked with the NoAlias attribute. -bool LoopVectorizationLegality::isidentifiedSafeObject(Value* Val) { +bool LoopVectorizationLegality::isIdentifiedSafeObject(Value* Val) { assert(Val && "Invalid value"); if (dyn_cast(Val)) return true; -- cgit v1.1 From ebd3f27c7e2c6c3a1b76786da0d0205a5fdb1ef5 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Thu, 18 Oct 2012 23:21:01 +0000 Subject: cleanup the comment. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166247 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 7af4fa1..f5c9bb3 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -405,7 +405,7 @@ void SingleBlockLoopVectorizer::createEmptyLoop() { | | | v | [ ] \ -| [ ]_| <-- old scalar loop to handle remainder. () +| [ ]_| <-- old scalar loop to handle remainder. \ | \ v >[ ] <-- exit block. -- cgit v1.1 From 89e7b356f270e29c2e9e18c6bbd30e5925585f06 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Fri, 19 Oct 2012 01:24:18 +0000 Subject: vectorizer: Add support for reading and writing from the same memory location. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166255 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index f5c9bb3..5152ec1 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -715,6 +715,7 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { ValueVector Reads; ValueVector Writes; + SmallPtrSet AnalyzedPtrs; unsigned NumPhis = 0; for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { Instruction *I = it; @@ -766,7 +767,10 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { DEBUG(dbgs() << "LV: Found a non-simple load.\n"); return false; } - GetUnderlyingObjects(Ld->getPointerOperand(), Reads, DL); + + Value* Ptr = Ld->getPointerOperand(); + if (AnalyzedPtrs.insert(Ptr)) + GetUnderlyingObjects(Ptr, Reads, DL); } // Record store pointers. Abort on all other instructions that write to @@ -778,7 +782,10 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { DEBUG(dbgs() << "LV: Found a non-simple store.\n"); return false; } - GetUnderlyingObjects(St->getPointerOperand(), Writes, DL); + + Value* Ptr = St->getPointerOperand(); + if (AnalyzedPtrs.insert(Ptr)) + GetUnderlyingObjects(St->getPointerOperand(), Writes, DL); } // We still don't handle functions. -- cgit v1.1 From 17f68c52d2f3b6b2472b07492d76520df230afcd Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Fri, 19 Oct 2012 08:42:02 +0000 Subject: LoopVectorize: Keep the IRBuilder on the stack. No functionality change. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166274 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 77 ++++++++++++++---------------- 1 file changed, 36 insertions(+), 41 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 5152ec1..1602e29 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -67,11 +67,7 @@ public: SingleBlockLoopVectorizer(Loop *OrigLoop, ScalarEvolution *Se, LoopInfo *Li, LPPassManager *Lpm, unsigned VecWidth): Orig(OrigLoop), SE(Se), LI(Li), LPM(Lpm), VF(VecWidth), - Builder(0), Induction(0), OldInduction(0) { } - - ~SingleBlockLoopVectorizer() { - delete Builder; - } + Builder(Se->getContext()), Induction(0), OldInduction(0) { } // Perform the actual loop widening (vectorization). void vectorize() { @@ -81,7 +77,7 @@ public: vectorizeLoop(); // register the new loop. cleanup(); - } + } private: /// Create an empty loop, based on the loop ranges of the old loop. @@ -131,7 +127,7 @@ private: unsigned VF; // The builder that we use - IRBuilder<> *Builder; + IRBuilder<> Builder; // --- Vectorization state --- @@ -241,10 +237,10 @@ Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) { Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF)); Value *UndefVal = UndefValue::get(VTy); // Insert the value into a new vector. - Value *SingleElem = Builder->CreateInsertElement(UndefVal, V, Zero); + Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero); // Broadcast the scalar into all locations in the vector. - Value *Shuf = Builder->CreateShuffleVector(SingleElem, UndefVal, Zeros, - "broadcast"); + Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros, + "broadcast"); // We are accessing the induction variable. Make sure to promote the // index for each consecutive SIMD lane. 
This adds 0,1,2 ... to all lanes. if (V == Induction) @@ -269,7 +265,7 @@ Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) { // Add the consecutive indices to the vector value. Constant *Cv = ConstantVector::get(Indices); assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); - return Builder->CreateAdd(Val, Cv, "induction"); + return Builder.CreateAdd(Val, Cv, "induction"); } @@ -304,7 +300,7 @@ Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) { // If we saved a vectorized copy of V, use it. ValueMap::iterator it = WidenMap.find(V); if (it != WidenMap.end()) - return it->second; + return it->second; // Broadcast V and save the value for future uses. Value *B = getBroadcastInstrs(V); @@ -364,18 +360,18 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { Value *Op = Params[op]; // Param is a vector. Need to extract the right lane. if (Op->getType()->isVectorTy()) - Op = Builder->CreateExtractElement(Op, Builder->getInt32(i)); + Op = Builder.CreateExtractElement(Op, Builder.getInt32(i)); Cloned->setOperand(op, Op); } // Place the cloned scalar in the new loop. - Builder->Insert(Cloned); + Builder.Insert(Cloned); // If the original scalar returns a value we need to place it in a vector // so that future users will be able to use it. 
if (!IsVoidRetTy) - VecResults = Builder->CreateInsertElement(VecResults, Cloned, - Builder->getInt32(i)); + VecResults = Builder.CreateInsertElement(VecResults, Cloned, + Builder.getInt32(i)); } if (!IsVoidRetTy) @@ -421,15 +417,15 @@ void SingleBlockLoopVectorizer::createEmptyLoop() { assert(BypassBlock && "Invalid loop structure"); BasicBlock *VectorPH = - BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph"); + BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph"); BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(), - "vector.body"); + "vector.body"); BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(), - "middle.block"); + "middle.block"); BasicBlock *ScalarPH = - MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), - "scalar.preheader"); + MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), + "scalar.preheader"); // Find the induction variable. BasicBlock *OldBasicBlock = Orig->getHeader(); @@ -439,11 +435,10 @@ void SingleBlockLoopVectorizer::createEmptyLoop() { // Use this IR builder to create the loop instructions (Phi, Br, Cmp) // inside the loop. - Builder = new IRBuilder<>(VecBody); - Builder->SetInsertPoint(VecBody->getFirstInsertionPt()); + Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); // Generate the induction variable. - Induction = Builder->CreatePHI(IdxTy, 2, "index"); + Induction = Builder.CreatePHI(IdxTy, 2, "index"); Constant *Zero = ConstantInt::get(IdxTy, 0); Constant *Step = ConstantInt::get(IdxTy, VF); @@ -494,12 +489,12 @@ void SingleBlockLoopVectorizer::createEmptyLoop() { MiddleBlock->getTerminator()->eraseFromParent(); // Create i+1 and fill the PHINode. - Value *NextIdx = Builder->CreateAdd(Induction, Step, "index.next"); + Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next"); Induction->addIncoming(Zero, VectorPH); Induction->addIncoming(NextIdx, VecBody); // Create the compare. 
- Value *ICmp = Builder->CreateICmpEQ(NextIdx, CountRoundDown); - Builder->CreateCondBr(ICmp, MiddleBlock, VecBody); + Value *ICmp = Builder.CreateICmpEQ(NextIdx, CountRoundDown); + Builder.CreateCondBr(ICmp, MiddleBlock, VecBody); // Now we have two terminators. Remove the old one from the block. VecBody->getTerminator()->eraseFromParent(); @@ -509,7 +504,7 @@ void SingleBlockLoopVectorizer::createEmptyLoop() { OldInduction->setIncomingValue(BlockIdx, CountRoundDown); // Get ready to start creating new instructions into the vectorized body. - Builder->SetInsertPoint(VecBody->getFirstInsertionPt()); + Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); // Register the new loop. Loop* Lp = new Loop(); @@ -562,7 +557,7 @@ void SingleBlockLoopVectorizer::vectorizeLoop() { Value *A = getVectorValue(Inst->getOperand(0)); Value *B = getVectorValue(Inst->getOperand(1)); // Use this vector value for all users of the original instruction. - WidenMap[Inst] = Builder->CreateBinOp(BinOp->getOpcode(), A, B); + WidenMap[Inst] = Builder.CreateBinOp(BinOp->getOpcode(), A, B); break; } case Instruction::Select: { @@ -570,7 +565,7 @@ void SingleBlockLoopVectorizer::vectorizeLoop() { Value *A = getVectorValue(Inst->getOperand(0)); Value *B = getVectorValue(Inst->getOperand(1)); Value *C = getVectorValue(Inst->getOperand(2)); - WidenMap[Inst] = Builder->CreateSelect(A, B, C); + WidenMap[Inst] = Builder.CreateSelect(A, B, C); break; } @@ -582,9 +577,9 @@ void SingleBlockLoopVectorizer::vectorizeLoop() { Value *A = getVectorValue(Inst->getOperand(0)); Value *B = getVectorValue(Inst->getOperand(1)); if (FCmp) - WidenMap[Inst] = Builder->CreateFCmp(Cmp->getPredicate(), A, B); + WidenMap[Inst] = Builder.CreateFCmp(Cmp->getPredicate(), A, B); else - WidenMap[Inst] = Builder->CreateICmp(Cmp->getPredicate(), A, B); + WidenMap[Inst] = Builder.CreateICmp(Cmp->getPredicate(), A, B); break; } @@ -605,10 +600,10 @@ void SingleBlockLoopVectorizer::vectorizeLoop() { GetElementPtrInst *Gep2 = 
cast(Gep->clone()); unsigned NumOperands = Gep->getNumOperands(); Gep2->setOperand(NumOperands - 1, Induction); - Ptr = Builder->Insert(Gep2); - Ptr = Builder->CreateBitCast(Ptr, StTy->getPointerTo()); + Ptr = Builder.Insert(Gep2); + Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo()); Value *Val = getVectorValue(SI->getValueOperand()); - Builder->CreateStore(Val, Ptr)->setAlignment(Alignment); + Builder.CreateStore(Val, Ptr)->setAlignment(Alignment); break; } case Instruction::Load: { @@ -629,9 +624,9 @@ void SingleBlockLoopVectorizer::vectorizeLoop() { GetElementPtrInst *Gep2 = cast(Gep->clone()); unsigned NumOperands = Gep->getNumOperands(); Gep2->setOperand(NumOperands - 1, Induction); - Ptr = Builder->Insert(Gep2); - Ptr = Builder->CreateBitCast(Ptr, RetTy->getPointerTo()); - LI = Builder->CreateLoad(Ptr); + Ptr = Builder.Insert(Gep2); + Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo()); + LI = Builder.CreateLoad(Ptr); LI->setAlignment(Alignment); // Use this vector value for all users of the load. WidenMap[Inst] = LI; @@ -653,7 +648,7 @@ void SingleBlockLoopVectorizer::vectorizeLoop() { CastInst *CI = dyn_cast(Inst); Value *A = getVectorValue(Inst->getOperand(0)); Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF); - WidenMap[Inst] = Builder->CreateCast(CI->getOpcode(), A, DestTy); + WidenMap[Inst] = Builder.CreateCast(CI->getOpcode(), A, DestTy); break; } @@ -815,8 +810,8 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { } // next instr. if (NumPhis != 1) { - DEBUG(dbgs() << "LV: Did not find a Phi node.\n"); - return false; + DEBUG(dbgs() << "LV: Did not find a Phi node.\n"); + return false; } // Check that the underlying objects of the reads and writes are either -- cgit v1.1 From 5dbe64e2bc2e4b96654703e85f909536df7ddb84 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Fri, 19 Oct 2012 23:05:40 +0000 Subject: Vectorizer: Add support for loop reductions. 
For example: for (i=0; i DefaultVectorizationFactor("default-loop-vectorize-width", cl::init(4), cl::Hidden, cl::desc("Set the default loop vectorization width")); - namespace { +// Forward declaration. +class LoopVectorizationLegality; + /// Vectorize a simple loop. This class performs the widening of simple single /// basic block loops into vectors. It does not perform any /// vectorization-legality checks, and just does it. It widens the vectors @@ -67,23 +71,28 @@ public: SingleBlockLoopVectorizer(Loop *OrigLoop, ScalarEvolution *Se, LoopInfo *Li, LPPassManager *Lpm, unsigned VecWidth): Orig(OrigLoop), SE(Se), LI(Li), LPM(Lpm), VF(VecWidth), - Builder(Se->getContext()), Induction(0), OldInduction(0) { } + Builder(0), Induction(0), OldInduction(0) { } + + ~SingleBlockLoopVectorizer() { + delete Builder; + } // Perform the actual loop widening (vectorization). - void vectorize() { + void vectorize(LoopVectorizationLegality *Legal) { ///Create a new empty loop. Unlink the old loop and connect the new one. createEmptyLoop(); /// Widen each instruction in the old loop to a new one in the new loop. - vectorizeLoop(); + /// Use the Legality module to find the induction and reduction variables. + vectorizeLoop(Legal); // register the new loop. cleanup(); - } + } private: /// Create an empty loop, based on the loop ranges of the old loop. void createEmptyLoop(); /// Copy and widen the instructions from the old loop. - void vectorizeLoop(); + void vectorizeLoop(LoopVectorizationLegality *Legal); /// Insert the new loop to the loop hierarchy and pass manager. void cleanup(); @@ -113,6 +122,10 @@ private: /// broadcast them into a vector. Value *getVectorValue(Value *V); + /// Get a uniform vector of constant integers. We use this to get + /// vectors of ones and zeros for the reduction code. + Constant* getUniformVector(unsigned Val, Type* ScalarTy); + typedef DenseMap ValueMap; /// The original loop. 
@@ -127,10 +140,21 @@ private: unsigned VF; // The builder that we use - IRBuilder<> Builder; + IRBuilder<> *Builder; // --- Vectorization state --- + /// Middle Block between the vector and the scalar. + BasicBlock *LoopMiddleBlock; + ///The ExitBlock of the scalar loop. + BasicBlock *LoopExitBlock; + ///The vector loop body. + BasicBlock *LoopVectorBody; + ///The scalar loop body. + BasicBlock *LoopScalarBody; + ///The first bypass block. + BasicBlock *LoopBypassBlock; + /// The new Induction variable which was added to the new block. PHINode *Induction; /// The induction variable of the old basic block. @@ -146,7 +170,23 @@ private: class LoopVectorizationLegality { public: LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl): - TheLoop(Lp), SE(Se), DL(Dl) { } + TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { } + + /// This represents the kinds of reductions that we support. + enum ReductionKind { + IntegerAdd, /// Sum of numbers. + IntegerMult, /// Product of numbers. + NoReduction /// Not a reduction. + }; + + // Holds a pairing of reduction instruction and the reduction kind. + typedef std::pair ReductionPair; + + /// ReductionList contains the reduction variables + /// as well as a single EXIT (from the block) value and the kind of + /// reduction variable.. + /// Notice that the EXIT instruction can also be the PHI itself. + typedef DenseMap ReductionList; /// Returns the maximum vectorization factor that we *can* use to vectorize /// this loop. This does not mean that it is profitable to vectorize this @@ -154,6 +194,12 @@ public: /// can vectorize to any SIMD width below this number. unsigned getLoopMaxVF(); + /// Returns the Induction variable. + PHINode *getInduction() {return Induction;} + + /// Returns the reduction variables found in the loop. + ReductionList *getReductionVars() { return &Reductions; } + private: /// Check if a single basic block loop is vectorizable. 
/// At this point we know that this is a loop with a constant trip count @@ -164,12 +210,32 @@ private: // Example: Alloca, Global, NoAlias. bool isIdentifiedSafeObject(Value* Val); + /// Returns True, if 'Phi' is the kind of reduction variable for type + /// 'Kind'. If this is a reduction variable, it adds it to ReductionList. + bool AddReductionVar(PHINode *Phi, ReductionKind Kind); + /// Checks if a constant matches the reduction kind. + /// Sums starts with zero. Products start at one. + bool isReductionConstant(Value *V, ReductionKind Kind); + /// Returns true if the instruction I can be a reduction variable of type + /// 'Kind'. + bool isReductionInstr(Instruction *I, ReductionKind Kind); + /// The loop that we evaluate. Loop *TheLoop; /// Scev analysis. ScalarEvolution *SE; /// DataLayout analysis. DataLayout *DL; + + // --- vectorization state --- // + + /// Holds the induction variable. + PHINode *Induction; + /// Holds the reduction variables. + ReductionList Reductions; + /// Allowed outside users. This holds the reduction + /// vars which can be accessed from outside the loop. + SmallPtrSet AllowedExit; }; struct LoopVectorize : public LoopPass { @@ -184,6 +250,7 @@ struct LoopVectorize : public LoopPass { LoopInfo *LI; virtual bool runOnLoop(Loop *L, LPPassManager &LPM) { + // Only vectorize innermost loops. if (!L->empty()) return false; @@ -209,7 +276,7 @@ struct LoopVectorize : public LoopPass { // If we decided that is is *legal* to vectorizer the loop. Do it. 
SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, DefaultVectorizationFactor); - LB.vectorize(); + LB.vectorize(&LVL); DEBUG(verifyFunction(*L->getHeader()->getParent())); return true; @@ -218,6 +285,7 @@ struct LoopVectorize : public LoopPass { virtual void getAnalysisUsage(AnalysisUsage &AU) const { LoopPass::getAnalysisUsage(AU); AU.addRequiredID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); AU.addRequired(); AU.addRequired(); } @@ -237,10 +305,10 @@ Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) { Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF)); Value *UndefVal = UndefValue::get(VTy); // Insert the value into a new vector. - Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero); + Value *SingleElem = Builder->CreateInsertElement(UndefVal, V, Zero); // Broadcast the scalar into all locations in the vector. - Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros, - "broadcast"); + Value *Shuf = Builder->CreateShuffleVector(SingleElem, UndefVal, Zeros, + "broadcast"); // We are accessing the induction variable. Make sure to promote the // index for each consecutive SIMD lane. This adds 0,1,2 ... to all lanes. if (V == Induction) @@ -265,7 +333,7 @@ Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) { // Add the consecutive indices to the vector value. Constant *Cv = ConstantVector::get(Indices); assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); - return Builder.CreateAdd(Val, Cv, "induction"); + return Builder->CreateAdd(Val, Cv, "induction"); } @@ -297,10 +365,11 @@ bool SingleBlockLoopVectorizer::isConsecutiveGep(GetElementPtrInst *Gep) { } Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) { + assert(!V->getType()->isVectorTy() && "Can't widen a vector"); // If we saved a vectorized copy of V, use it. 
ValueMap::iterator it = WidenMap.find(V); if (it != WidenMap.end()) - return it->second; + return it->second; // Broadcast V and save the value for future uses. Value *B = getBroadcastInstrs(V); @@ -308,6 +377,17 @@ Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) { return B; } +Constant* +SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) { + SmallVector Indices; + // Create a vector of consecutive numbers from zero to VF. + for (unsigned i = 0; i < VF; ++i) + Indices.push_back(ConstantInt::get(ScalarTy, Val)); + + // Add the consecutive indices to the vector value. + return ConstantVector::get(Indices); +} + void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); // Holds vector parameters or scalars, in case of uniform vals. @@ -360,18 +440,18 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { Value *Op = Params[op]; // Param is a vector. Need to extract the right lane. if (Op->getType()->isVectorTy()) - Op = Builder.CreateExtractElement(Op, Builder.getInt32(i)); + Op = Builder->CreateExtractElement(Op, Builder->getInt32(i)); Cloned->setOperand(op, Op); } // Place the cloned scalar in the new loop. - Builder.Insert(Cloned); + Builder->Insert(Cloned); // If the original scalar returns a value we need to place it in a vector // so that future users will be able to use it. 
if (!IsVoidRetTy) - VecResults = Builder.CreateInsertElement(VecResults, Cloned, - Builder.getInt32(i)); + VecResults = Builder->CreateInsertElement(VecResults, Cloned, + Builder->getInt32(i)); } if (!IsVoidRetTy) @@ -417,16 +497,15 @@ void SingleBlockLoopVectorizer::createEmptyLoop() { assert(BypassBlock && "Invalid loop structure"); BasicBlock *VectorPH = - BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph"); + BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph"); BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(), - "vector.body"); + "vector.body"); BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(), - "middle.block"); + "middle.block"); BasicBlock *ScalarPH = - MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), - "scalar.preheader"); - + MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), + "scalar.preheader"); // Find the induction variable. BasicBlock *OldBasicBlock = Orig->getHeader(); OldInduction = dyn_cast(OldBasicBlock->begin()); @@ -435,10 +514,11 @@ void SingleBlockLoopVectorizer::createEmptyLoop() { // Use this IR builder to create the loop instructions (Phi, Br, Cmp) // inside the loop. - Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); + Builder = new IRBuilder<>(VecBody); + Builder->SetInsertPoint(VecBody->getFirstInsertionPt()); // Generate the induction variable. - Induction = Builder.CreatePHI(IdxTy, 2, "index"); + Induction = Builder->CreatePHI(IdxTy, 2, "index"); Constant *Zero = ConstantInt::get(IdxTy, 0); Constant *Step = ConstantInt::get(IdxTy, VF); @@ -489,12 +569,12 @@ void SingleBlockLoopVectorizer::createEmptyLoop() { MiddleBlock->getTerminator()->eraseFromParent(); // Create i+1 and fill the PHINode. 
- Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next"); + Value *NextIdx = Builder->CreateAdd(Induction, Step, "index.next"); Induction->addIncoming(Zero, VectorPH); Induction->addIncoming(NextIdx, VecBody); // Create the compare. - Value *ICmp = Builder.CreateICmpEQ(NextIdx, CountRoundDown); - Builder.CreateCondBr(ICmp, MiddleBlock, VecBody); + Value *ICmp = Builder->CreateICmpEQ(NextIdx, CountRoundDown); + Builder->CreateCondBr(ICmp, MiddleBlock, VecBody); // Now we have two terminators. Remove the old one from the block. VecBody->getTerminator()->eraseFromParent(); @@ -504,7 +584,7 @@ void SingleBlockLoopVectorizer::createEmptyLoop() { OldInduction->setIncomingValue(BlockIdx, CountRoundDown); // Get ready to start creating new instructions into the vectorized body. - Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); + Builder->SetInsertPoint(VecBody->getFirstInsertionPt()); // Register the new loop. Loop* Lp = new Loop(); @@ -518,22 +598,52 @@ void SingleBlockLoopVectorizer::createEmptyLoop() { ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); } + + // Save the state. + LoopMiddleBlock = MiddleBlock; + LoopExitBlock = ExitBlock; + LoopVectorBody = VecBody; + LoopScalarBody = OldBasicBlock; + LoopBypassBlock = BypassBlock; } -void SingleBlockLoopVectorizer::vectorizeLoop() { +void +SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { + typedef SmallVector PhiVector; BasicBlock &BB = *Orig->getHeader(); + // In order to support reduction variables we need to be able to vectorize + // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two + // stages. First, we create a new vector PHI node with no incoming edges. + // We use this value when we vectorize all of the instructions that use the + // PHI. Next, after all of the instructions in the block are complete we + // add the new incoming edges to the PHI.
At this point all of the + // instructions in the basic block are vectorized, so we can use them to + // construct the PHI. + PhiVector PHIsToFix; + // For each instruction in the old loop. for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { Instruction *Inst = it; switch (Inst->getOpcode()) { - case Instruction::PHI: case Instruction::Br: // Nothing to do for PHIs and BR, since we already took care of the // loop control flow instructions. continue; - + case Instruction::PHI:{ + PHINode* P = cast(Inst); + // Special handling for the induction var. + if (OldInduction == Inst) + continue; + // This is phase I of vectorizing PHIs. + // This has to be a reduction variable. + assert(Legal->getReductionVars()->count(P) && "Not a Reduction"); + Type *VecTy = VectorType::get(Inst->getType(), VF); + WidenMap[Inst] = Builder->CreatePHI(VecTy, 2, "vec.phi"); + PHIsToFix.push_back(P); + continue; + } case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: @@ -557,15 +667,17 @@ void SingleBlockLoopVectorizer::vectorizeLoop() { Value *A = getVectorValue(Inst->getOperand(0)); Value *B = getVectorValue(Inst->getOperand(1)); // Use this vector value for all users of the original instruction. - WidenMap[Inst] = Builder.CreateBinOp(BinOp->getOpcode(), A, B); + WidenMap[Inst] = Builder->CreateBinOp(BinOp->getOpcode(), A, B); break; } case Instruction::Select: { // Widen selects. + // TODO: If the selector is loop invariant we can issue a select + // instruction with a scalar condition. 
Value *A = getVectorValue(Inst->getOperand(0)); Value *B = getVectorValue(Inst->getOperand(1)); Value *C = getVectorValue(Inst->getOperand(2)); - WidenMap[Inst] = Builder.CreateSelect(A, B, C); + WidenMap[Inst] = Builder->CreateSelect(A, B, C); break; } @@ -577,9 +689,9 @@ void SingleBlockLoopVectorizer::vectorizeLoop() { Value *A = getVectorValue(Inst->getOperand(0)); Value *B = getVectorValue(Inst->getOperand(1)); if (FCmp) - WidenMap[Inst] = Builder.CreateFCmp(Cmp->getPredicate(), A, B); + WidenMap[Inst] = Builder->CreateFCmp(Cmp->getPredicate(), A, B); else - WidenMap[Inst] = Builder.CreateICmp(Cmp->getPredicate(), A, B); + WidenMap[Inst] = Builder->CreateICmp(Cmp->getPredicate(), A, B); break; } @@ -600,10 +712,10 @@ void SingleBlockLoopVectorizer::vectorizeLoop() { GetElementPtrInst *Gep2 = cast(Gep->clone()); unsigned NumOperands = Gep->getNumOperands(); Gep2->setOperand(NumOperands - 1, Induction); - Ptr = Builder.Insert(Gep2); - Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo()); + Ptr = Builder->Insert(Gep2); + Ptr = Builder->CreateBitCast(Ptr, StTy->getPointerTo()); Value *Val = getVectorValue(SI->getValueOperand()); - Builder.CreateStore(Val, Ptr)->setAlignment(Alignment); + Builder->CreateStore(Val, Ptr)->setAlignment(Alignment); break; } case Instruction::Load: { @@ -624,9 +736,9 @@ void SingleBlockLoopVectorizer::vectorizeLoop() { GetElementPtrInst *Gep2 = cast(Gep->clone()); unsigned NumOperands = Gep->getNumOperands(); Gep2->setOperand(NumOperands - 1, Induction); - Ptr = Builder.Insert(Gep2); - Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo()); - LI = Builder.CreateLoad(Ptr); + Ptr = Builder->Insert(Gep2); + Ptr = Builder->CreateBitCast(Ptr, RetTy->getPointerTo()); + LI = Builder->CreateLoad(Ptr); LI->setAlignment(Alignment); // Use this vector value for all users of the load. 
WidenMap[Inst] = LI; @@ -648,7 +760,7 @@ void SingleBlockLoopVectorizer::vectorizeLoop() { CastInst *CI = dyn_cast(Inst); Value *A = getVectorValue(Inst->getOperand(0)); Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF); - WidenMap[Inst] = Builder.CreateCast(CI->getOpcode(), A, DestTy); + WidenMap[Inst] = Builder->CreateCast(CI->getOpcode(), A, DestTy); break; } @@ -658,6 +770,102 @@ void SingleBlockLoopVectorizer::vectorizeLoop() { break; }// end of switch. }// end of for_each instr. + + // At this point every instruction in the original loop is widened to + // a vector form. We are almost done. Now, we need to fix the PHI nodes + // that we vectorized. The PHI nodes are currently empty because we did + // not want to introduce cycles. Notice that the remaining PHI nodes + // that we need to fix are reduction variables. + + // Create the 'reduced' values for each of the reduction vars. + // The reduced values are the vector values that we scalarize and combine + // after the loop is finished. + for (PhiVector::iterator it = PHIsToFix.begin(), e = PHIsToFix.end(); + it != e; ++it) { + PHINode *RdxPhi = *it; + PHINode *VecRdxPhi = dyn_cast(WidenMap[RdxPhi]); + assert(RdxPhi && "Unable to recover vectorized PHI"); + + // Find the reduction variable. + assert(Legal->getReductionVars()->count(RdxPhi) && + "Unable to find the reduction variable"); + LoopVectorizationLegality::ReductionPair ReductionVar = + (*Legal->getReductionVars())[RdxPhi]; + + // This is the vector-clone of the value that leaves the loop. + Value *VectorExit = getVectorValue(ReductionVar.first); + Type *VecTy = VectorExit->getType(); + + // This is the kind of reduction. + LoopVectorizationLegality::ReductionKind RdxKind = ReductionVar.second; + // Find the reduction identity variable. + // Zero for addition. One for Multiplication. + unsigned IdentitySclr = + (RdxKind == LoopVectorizationLegality::IntegerAdd ?
0 : 1); + Constant *Identity = getUniformVector(IdentitySclr, VecTy->getScalarType()); + + // Fix the vector-loop phi. + // We created the induction variable so we know that the + // preheader is the first entry. + BasicBlock *VecPreheader = Induction->getIncomingBlock(0); + VecRdxPhi->addIncoming(Identity, VecPreheader); + unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody); + Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx)); + VecRdxPhi->addIncoming(Val, LoopVectorBody); + + // Before each round, move the insertion point right between + // the PHIs and the values we are going to write. + // This allows us to write both PHINodes and the extractelement + // instructions. + Builder->SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); + + // This PHINode contains the vectorized reduction variable, or + // the identity vector, if we bypass the vector loop. + PHINode *NewPhi = Builder->CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); + NewPhi->addIncoming(Identity, LoopBypassBlock); + NewPhi->addIncoming(getVectorValue(ReductionVar.first), LoopVectorBody); + + // Extract the first scalar. + Value *Scalar0 = + Builder->CreateExtractElement(NewPhi, Builder->getInt32(0)); + // Extract and sum the remaining vector elements. + for (unsigned i=1; i < VF; ++i) { + Value *Scalar1 = + Builder->CreateExtractElement(NewPhi, Builder->getInt32(i)); + if (RdxKind == LoopVectorizationLegality::IntegerAdd) { + Scalar0 = Builder->CreateAdd(Scalar0, Scalar1); + } else { + Scalar0 = Builder->CreateMul(Scalar0, Scalar1); + } + } + + // Now, we need to fix the users of the reduction variable + // inside and outside of the scalar remainder loop. + // We know that the loop is in LCSSA form. We need to update the + // PHI nodes in the exit blocks. 
+ for (BasicBlock::iterator LEI = LoopExitBlock->begin(), + LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { + PHINode *LCSSAPhi = dyn_cast(LEI); + if (!LCSSAPhi) continue; + + // All PHINodes need to have a single entry edge, or two if we already fixed them. + assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); + + // We found our reduction value exit-PHI. Update it with the incoming bypass edge. + if (LCSSAPhi->getIncomingValue(0) == ReductionVar.first) { + // Add an edge coming from the bypass. + LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock); + break; + } + }// end of the LCSSA phi scan. + + // Fix the scalar loop reduction variable with the incoming reduction sum + // from the vector body and from the backedge value. + int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody); + int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block. + (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0); + (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, ReductionVar.first); + }// end of for each redux variable. } void SingleBlockLoopVectorizer::cleanup() { @@ -710,31 +918,35 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { ValueVector Reads; ValueVector Writes; - SmallPtrSet AnalyzedPtrs; - unsigned NumPhis = 0; for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { Instruction *I = it; PHINode *Phi = dyn_cast(I); if (Phi) { - NumPhis++; + // This should not happen because the loop should be normalized. + if (Phi->getNumIncomingValues() != 2) { + DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); + return false; + } // We only look at integer phi nodes. if (!Phi->getType()->isIntegerTy()) { DEBUG(dbgs() << "LV: Found an non-int PHI.\n"); return false; } - - // If we found an induction variable. 
- if (NumPhis > 1) { - DEBUG(dbgs() << "LV: Found more than one PHI.\n"); - return false; + if (AddReductionVar(Phi, IntegerAdd)) { + DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n"); + continue; } - - // This should not happen because the loop should be normalized. - if (Phi->getNumIncomingValues() != 2) { - DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); + if (AddReductionVar(Phi, IntegerMult)) { + DEBUG(dbgs() << "LV: Found an Mult reduction PHI."<< *Phi <<"\n"); + continue; + } + if (Induction) { + DEBUG(dbgs() << "LV: Found too many PHIs.\n"); return false; } + // Found the induction variable. + Induction = Phi; // Check that the PHI is consecutive and starts at zero. const SCEV *PhiScev = SE->getSCEV(Phi); @@ -751,7 +963,7 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { DEBUG(dbgs() << "LV: PHI does not start at zero or steps by one.\n"); return false; } - } + }// end of PHI handling // If this is a load, record its pointer. If it is not a load, abort. // Notice that we don't handle function calls that read or write. @@ -764,8 +976,7 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { } Value* Ptr = Ld->getPointerOperand(); - if (AnalyzedPtrs.insert(Ptr)) - GetUnderlyingObjects(Ptr, Reads, DL); + GetUnderlyingObjects(Ptr, Reads, DL); } // Record store pointers. Abort on all other instructions that write to @@ -779,8 +990,7 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { } Value* Ptr = St->getPointerOperand(); - if (AnalyzedPtrs.insert(Ptr)) - GetUnderlyingObjects(St->getPointerOperand(), Writes, DL); + GetUnderlyingObjects(Ptr, Writes, DL); } // We still don't handle functions. @@ -797,21 +1007,26 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n"); return false; } - //Check that all of the users of the loop are inside the BB. 
- for (Value::use_iterator it = I->use_begin(), e = I->use_end(); - it != e; ++it) { - Instruction *U = cast(*it); - BasicBlock *Parent = U->getParent(); - if (Parent != &BB) { - DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n"); - return false; - } + + // Reduction instructions are allowed to have exit users. + // All other instructions must not have external users. + if (!AllowedExit.count(I)) + //Check that all of the users of the loop are inside the BB. + for (Value::use_iterator it = I->use_begin(), e = I->use_end(); + it != e; ++it) { + Instruction *U = cast(*it); + // This user may be a reduction exit value. + BasicBlock *Parent = U->getParent(); + if (Parent != &BB) { + DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n"); + return false; + } } } // next instr. - if (NumPhis != 1) { - DEBUG(dbgs() << "LV: Did not find a Phi node.\n"); - return false; + if (!Induction) { + DEBUG(dbgs() << "LV: Did not find an induction var.\n"); + return false; } // Check that the underlying objects of the reads and writes are either @@ -866,6 +1081,110 @@ bool LoopVectorizationLegality::isIdentifiedSafeObject(Value* Val) { return A->hasNoAliasAttr(); } +bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, + ReductionKind Kind) { + if (Phi->getNumIncomingValues() != 2) + return false; + + // Find the possible incoming reduction variable. + BasicBlock *BB = Phi->getParent(); + int SelfEdgeIdx = Phi->getBasicBlockIndex(BB); + int InEdgeBlockIdx = (SelfEdgeIdx ? 0 : 1); // The other entry. + Value *RdxStart = Phi->getIncomingValue(InEdgeBlockIdx); + + // We must have a constant that starts the reduction. + if (!isReductionConstant(RdxStart, Kind)) + return false; + + // ExitInstruction is the single value which is used outside the loop. + // We only allow for a single reduction value to be used outside the loop. + // This includes users of the reduction, variables (which form a cycle + // which ends in the phi node). 
+ Instruction *ExitInstruction = 0; + + // Iter is our iterator. We start with the PHI node and scan for all of the + // users of this instruction. All users must be instructions which can be + // used as reduction variables (such as ADD). We may have a single + // out-of-block user. The cycle must end with the original PHI. + // Also, we can't have multiple block-local users. + Instruction *Iter = Phi; + while (true) { + // Any reduction instr must be of one of the allowed kinds. + if (!isReductionInstr(Iter, Kind)) + return false; + + // Did we find a user inside this block ? + bool FoundInBlockUser = false; + // Did we reach the initial PHI node ? + bool FoundStartPHI = false; + // For each of the *users* of iter. + for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end(); + it != e; ++it) { + Instruction *U = cast(*it); + // We already know that the PHI is a user. + if (U == Phi) { + FoundStartPHI = true; + continue; + } + // Check if we found the exit user. + BasicBlock *Parent = U->getParent(); + if (Parent != BB) { + // We must have a single exit instruction. + if (ExitInstruction != 0) + return false; + ExitInstruction = Iter; + } + // We can't have multiple inside users. + if (FoundInBlockUser) + return false; + FoundInBlockUser = true; + Iter = U; + } + + // We found a reduction var if we have reached the original + // phi node and we only have a single instruction with out-of-loop + // users. + if (FoundStartPHI && ExitInstruction) { + // This instruction is allowed to have out-of-loop users. + AllowedExit.insert(ExitInstruction); + // Mark this as a reduction var.
+ Reductions[Phi] = std::make_pair(ExitInstruction, Kind); + return true; + } + } +} + +bool +LoopVectorizationLegality::isReductionConstant(Value *V, ReductionKind Kind) { + ConstantInt *CI = dyn_cast(V); + if (!CI) + return false; + if (Kind == IntegerMult && CI->isOne()) + return true; + if (Kind == IntegerAdd && CI->isZero()) + return true; + return false; +} + +bool +LoopVectorizationLegality::isReductionInstr(Instruction *I, + ReductionKind Kind) { + switch (I->getOpcode()) { + default: + return false; + case Instruction::PHI: + // possibly. + return true; + case Instruction::Add: + case Instruction::Sub: + return Kind == IntegerAdd; + case Instruction::Mul: + case Instruction::UDiv: + case Instruction::SDiv: + return Kind == IntegerMult; + } +} + } // namespace char LoopVectorize::ID = 0; @@ -880,6 +1199,5 @@ namespace llvm { Pass *createLoopVectorizePass() { return new LoopVectorize(); } - } -- cgit v1.1 From 4ade92b6145a0d298b3c5ae1af79aca2598a327d Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Fri, 19 Oct 2012 23:27:19 +0000 Subject: LoopVectorize: Keep the IRBuilder on the stack. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166354 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 75 ++++++++++++++---------------- 1 file changed, 35 insertions(+), 40 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 309b60f..c0b709a 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -71,11 +71,7 @@ public: SingleBlockLoopVectorizer(Loop *OrigLoop, ScalarEvolution *Se, LoopInfo *Li, LPPassManager *Lpm, unsigned VecWidth): Orig(OrigLoop), SE(Se), LI(Li), LPM(Lpm), VF(VecWidth), - Builder(0), Induction(0), OldInduction(0) { } - - ~SingleBlockLoopVectorizer() { - delete Builder; - } + Builder(Se->getContext()), Induction(0), OldInduction(0) { } // Perform the actual loop widening (vectorization). void vectorize(LoopVectorizationLegality *Legal) { @@ -140,7 +136,7 @@ private: unsigned VF; // The builder that we use - IRBuilder<> *Builder; + IRBuilder<> Builder; // --- Vectorization state --- @@ -305,9 +301,9 @@ Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) { Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF)); Value *UndefVal = UndefValue::get(VTy); // Insert the value into a new vector. - Value *SingleElem = Builder->CreateInsertElement(UndefVal, V, Zero); + Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero); // Broadcast the scalar into all locations in the vector. - Value *Shuf = Builder->CreateShuffleVector(SingleElem, UndefVal, Zeros, + Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros, "broadcast"); // We are accessing the induction variable. Make sure to promote the // index for each consecutive SIMD lane. This adds 0,1,2 ... to all lanes. 
@@ -333,7 +329,7 @@ Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) { // Add the consecutive indices to the vector value. Constant *Cv = ConstantVector::get(Indices); assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); - return Builder->CreateAdd(Val, Cv, "induction"); + return Builder.CreateAdd(Val, Cv, "induction"); } @@ -440,18 +436,18 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { Value *Op = Params[op]; // Param is a vector. Need to extract the right lane. if (Op->getType()->isVectorTy()) - Op = Builder->CreateExtractElement(Op, Builder->getInt32(i)); + Op = Builder.CreateExtractElement(Op, Builder.getInt32(i)); Cloned->setOperand(op, Op); } // Place the cloned scalar in the new loop. - Builder->Insert(Cloned); + Builder.Insert(Cloned); // If the original scalar returns a value we need to place it in a vector // so that future users will be able to use it. if (!IsVoidRetTy) - VecResults = Builder->CreateInsertElement(VecResults, Cloned, - Builder->getInt32(i)); + VecResults = Builder.CreateInsertElement(VecResults, Cloned, + Builder.getInt32(i)); } if (!IsVoidRetTy) @@ -504,8 +500,8 @@ void SingleBlockLoopVectorizer::createEmptyLoop() { BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block"); BasicBlock *ScalarPH = - MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), - "scalar.preheader"); + MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), + "scalar.preheader"); // Find the induction variable. BasicBlock *OldBasicBlock = Orig->getHeader(); OldInduction = dyn_cast(OldBasicBlock->begin()); @@ -514,11 +510,10 @@ void SingleBlockLoopVectorizer::createEmptyLoop() { // Use this IR builder to create the loop instructions (Phi, Br, Cmp) // inside the loop. - Builder = new IRBuilder<>(VecBody); - Builder->SetInsertPoint(VecBody->getFirstInsertionPt()); + Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); // Generate the induction variable. 
- Induction = Builder->CreatePHI(IdxTy, 2, "index"); + Induction = Builder.CreatePHI(IdxTy, 2, "index"); Constant *Zero = ConstantInt::get(IdxTy, 0); Constant *Step = ConstantInt::get(IdxTy, VF); @@ -569,12 +564,12 @@ void SingleBlockLoopVectorizer::createEmptyLoop() { MiddleBlock->getTerminator()->eraseFromParent(); // Create i+1 and fill the PHINode. - Value *NextIdx = Builder->CreateAdd(Induction, Step, "index.next"); + Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next"); Induction->addIncoming(Zero, VectorPH); Induction->addIncoming(NextIdx, VecBody); // Create the compare. - Value *ICmp = Builder->CreateICmpEQ(NextIdx, CountRoundDown); - Builder->CreateCondBr(ICmp, MiddleBlock, VecBody); + Value *ICmp = Builder.CreateICmpEQ(NextIdx, CountRoundDown); + Builder.CreateCondBr(ICmp, MiddleBlock, VecBody); // Now we have two terminators. Remove the old one from the block. VecBody->getTerminator()->eraseFromParent(); @@ -584,7 +579,7 @@ void SingleBlockLoopVectorizer::createEmptyLoop() { OldInduction->setIncomingValue(BlockIdx, CountRoundDown); // Get ready to start creating new instructions into the vectorized body. - Builder->SetInsertPoint(VecBody->getFirstInsertionPt()); + Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); // Register the new loop. Loop* Lp = new Loop(); @@ -640,7 +635,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // This has to be a reduction variable. assert(Legal->getReductionVars()->count(P) && "Not a Reduction"); Type *VecTy = VectorType::get(Inst->getType(), VF); - WidenMap[Inst] = Builder->CreatePHI(VecTy, 2, "vec.phi"); + WidenMap[Inst] = Builder.CreatePHI(VecTy, 2, "vec.phi"); PHIsToFix.push_back(P); continue; } @@ -667,7 +662,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { Value *A = getVectorValue(Inst->getOperand(0)); Value *B = getVectorValue(Inst->getOperand(1)); // Use this vector value for all users of the original instruction. 
- WidenMap[Inst] = Builder->CreateBinOp(BinOp->getOpcode(), A, B); + WidenMap[Inst] = Builder.CreateBinOp(BinOp->getOpcode(), A, B); break; } case Instruction::Select: { @@ -677,7 +672,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { Value *A = getVectorValue(Inst->getOperand(0)); Value *B = getVectorValue(Inst->getOperand(1)); Value *C = getVectorValue(Inst->getOperand(2)); - WidenMap[Inst] = Builder->CreateSelect(A, B, C); + WidenMap[Inst] = Builder.CreateSelect(A, B, C); break; } @@ -689,9 +684,9 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { Value *A = getVectorValue(Inst->getOperand(0)); Value *B = getVectorValue(Inst->getOperand(1)); if (FCmp) - WidenMap[Inst] = Builder->CreateFCmp(Cmp->getPredicate(), A, B); + WidenMap[Inst] = Builder.CreateFCmp(Cmp->getPredicate(), A, B); else - WidenMap[Inst] = Builder->CreateICmp(Cmp->getPredicate(), A, B); + WidenMap[Inst] = Builder.CreateICmp(Cmp->getPredicate(), A, B); break; } @@ -712,10 +707,10 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { GetElementPtrInst *Gep2 = cast(Gep->clone()); unsigned NumOperands = Gep->getNumOperands(); Gep2->setOperand(NumOperands - 1, Induction); - Ptr = Builder->Insert(Gep2); - Ptr = Builder->CreateBitCast(Ptr, StTy->getPointerTo()); + Ptr = Builder.Insert(Gep2); + Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo()); Value *Val = getVectorValue(SI->getValueOperand()); - Builder->CreateStore(Val, Ptr)->setAlignment(Alignment); + Builder.CreateStore(Val, Ptr)->setAlignment(Alignment); break; } case Instruction::Load: { @@ -736,9 +731,9 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { GetElementPtrInst *Gep2 = cast(Gep->clone()); unsigned NumOperands = Gep->getNumOperands(); Gep2->setOperand(NumOperands - 1, Induction); - Ptr = Builder->Insert(Gep2); - Ptr = Builder->CreateBitCast(Ptr, RetTy->getPointerTo()); - LI = Builder->CreateLoad(Ptr); + Ptr = 
Builder.Insert(Gep2); + Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo()); + LI = Builder.CreateLoad(Ptr); LI->setAlignment(Alignment); // Use this vector value for all users of the load. WidenMap[Inst] = LI; @@ -760,7 +755,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { CastInst *CI = dyn_cast(Inst); Value *A = getVectorValue(Inst->getOperand(0)); Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF); - WidenMap[Inst] = Builder->CreateCast(CI->getOpcode(), A, DestTy); + WidenMap[Inst] = Builder.CreateCast(CI->getOpcode(), A, DestTy); break; } @@ -817,25 +812,25 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // the PHIs and the values we are going to write. // This allows us to write both PHINodes and the extractelement // instructions. - Builder->SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); + Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); // This PHINode contains the vectorized reduction variable, or // the identity vector, if we bypass the vector loop. - PHINode *NewPhi = Builder->CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); + PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); NewPhi->addIncoming(Identity, LoopBypassBlock); NewPhi->addIncoming(getVectorValue(ReductionVar.first), LoopVectorBody); // Extract the first scalar. Value *Scalar0 = - Builder->CreateExtractElement(NewPhi, Builder->getInt32(0)); + Builder.CreateExtractElement(NewPhi, Builder.getInt32(0)); // Extract and sum the remaining vector elements. 
for (unsigned i=1; i < VF; ++i) { Value *Scalar1 = - Builder->CreateExtractElement(NewPhi, Builder->getInt32(i)); + Builder.CreateExtractElement(NewPhi, Builder.getInt32(i)); if (RdxKind == LoopVectorizationLegality::IntegerAdd) { - Scalar0 = Builder->CreateAdd(Scalar0, Scalar1); + Scalar0 = Builder.CreateAdd(Scalar0, Scalar1); } else { - Scalar0 = Builder->CreateMul(Scalar0, Scalar1); + Scalar0 = Builder.CreateMul(Scalar0, Scalar1); } } -- cgit v1.1 From e6748f91eaa69b91cc5c4081a45e013a251c2726 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Sat, 20 Oct 2012 04:59:06 +0000 Subject: Vectorizer: refactor the memory checks to a new function. No functionality change. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166366 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 84 ++++++++++++++++++------------ 1 file changed, 51 insertions(+), 33 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index c0b709a..7866fcf 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -202,6 +202,12 @@ private: /// and we only need to check individual instructions. bool canVectorizeBlock(BasicBlock &BB); + /// When we vectorize loops we may change the order in which + /// we read and write from memory. This method checks if it is + /// legal to vectorize the code, considering only memory constrains. + /// Returns true if BB is vectorizable + bool canVectorizeMemory(BasicBlock &BB;) + // Check if a pointer value is known to be disjoint. // Example: Alloca, Global, NoAlias. bool isIdentifiedSafeObject(Value* Val); @@ -908,11 +914,7 @@ unsigned LoopVectorizationLegality::getLoopMaxVF() { } bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { - // Holds the read and write pointers that we find. 
- typedef SmallVector ValueVector; - ValueVector Reads; - ValueVector Writes; - + // Scan the instructions in the block and look for hazards. for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { Instruction *I = it; @@ -960,34 +962,6 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { } }// end of PHI handling - // If this is a load, record its pointer. If it is not a load, abort. - // Notice that we don't handle function calls that read or write. - if (I->mayReadFromMemory()) { - LoadInst *Ld = dyn_cast(I); - if (!Ld) return false; - if (!Ld->isSimple()) { - DEBUG(dbgs() << "LV: Found a non-simple load.\n"); - return false; - } - - Value* Ptr = Ld->getPointerOperand(); - GetUnderlyingObjects(Ptr, Reads, DL); - } - - // Record store pointers. Abort on all other instructions that write to - // memory. - if (I->mayWriteToMemory()) { - StoreInst *St = dyn_cast(I); - if (!St) return false; - if (!St->isSimple()) { - DEBUG(dbgs() << "LV: Found a non-simple store.\n"); - return false; - } - - Value* Ptr = St->getPointerOperand(); - GetUnderlyingObjects(Ptr, Writes, DL); - } - // We still don't handle functions. CallInst *CI = dyn_cast(I); if (CI) { @@ -1024,6 +998,50 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { return false; } + // If the memory dependencies do not prevent us from + // vectorizing, then vectorize. + return canVectorizeMemory(BB); +} + +bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { + // Holds the read and write pointers that we find. + typedef SmallVector ValueVector; + ValueVector Reads; + ValueVector Writes; + + for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { + Instruction *I = it; + + // If this is a load, record its pointer. If it is not a load, abort. + // Notice that we don't handle function calls that read or write. 
+ if (I->mayReadFromMemory()) { + LoadInst *Ld = dyn_cast(I); + if (!Ld) return false; + if (!Ld->isSimple()) { + DEBUG(dbgs() << "LV: Found a non-simple load.\n"); + return false; + } + + Value* Ptr = Ld->getPointerOperand(); + GetUnderlyingObjects(Ptr, Reads, DL); + } + + // Record store pointers. Abort on all other instructions that write to + // memory. + if (I->mayWriteToMemory()) { + StoreInst *St = dyn_cast(I); + if (!St) return false; + if (!St->isSimple()) { + DEBUG(dbgs() << "LV: Found a non-simple store.\n"); + return false; + } + + Value* Ptr = St->getPointerOperand(); + GetUnderlyingObjects(Ptr, Writes, DL); + } + } // next instr. + + // Check that the underlying objects of the reads and writes are either // disjoint memory locations, or that they are no-alias arguments. ValueVector::iterator r, re, w, we; -- cgit v1.1 From 71a148223907504c78f90f835131d5e8921011ad Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Sat, 20 Oct 2012 05:03:27 +0000 Subject: Fix a typo git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166367 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 7866fcf..968d471 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -206,7 +206,7 @@ private: /// we read and write from memory. This method checks if it is /// legal to vectorize the code, considering only memory constrains. /// Returns true if BB is vectorizable - bool canVectorizeMemory(BasicBlock &BB;) + bool canVectorizeMemory(BasicBlock &BB); // Check if a pointer value is known to be disjoint. // Example: Alloca, Global, NoAlias. 
-- cgit v1.1 From bf8772ed2cc89a495e2692919331d7a03e76d791 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Sat, 20 Oct 2012 08:26:33 +0000 Subject: Vectorize: teach cavVectorizeMemory to distinguish between A[i]+=x and A[B[i]]+=x. If the pointer is consecutive then it is safe to read and write. If the pointer is non-loop-consecutive then it is unsafe to vectorize it because we may hit an ordering issue. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166371 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 211 +++++++++++++++++++---------- 1 file changed, 137 insertions(+), 74 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 968d471..c11c66f 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -76,7 +76,7 @@ public: // Perform the actual loop widening (vectorization). void vectorize(LoopVectorizationLegality *Legal) { ///Create a new empty loop. Unlink the old loop and connect the new one. - createEmptyLoop(); + createEmptyLoop(Legal); /// Widen each instruction in the old loop to a new one in the new loop. /// Use the Legality module to find the induction and reduction variables. vectorizeLoop(Legal); @@ -86,7 +86,7 @@ public: private: /// Create an empty loop, based on the loop ranges of the old loop. - void createEmptyLoop(); + void createEmptyLoop(LoopVectorizationLegality *Legal); /// Copy and widen the instructions from the old loop. void vectorizeLoop(LoopVectorizationLegality *Legal); /// Insert the new loop to the loop hierarchy and pass manager. @@ -107,10 +107,6 @@ private: /// for each element in the vector. Starting from zero. Value *getConsecutiveVector(Value* Val); - /// Check that the GEP operands are all uniform except for the last index - /// which has to be the induction variable. 
- bool isConsecutiveGep(GetElementPtrInst *Gep); - /// When we go over instructions in the basic block we rely on previous /// values within the current basic block or on loop invariant values. /// When we widen (vectorize) values we place them in the map. If the values @@ -196,6 +192,10 @@ public: /// Returns the reduction variables found in the loop. ReductionList *getReductionVars() { return &Reductions; } + /// Check that the GEP operands are all uniform except for the last index + /// which has to be the induction variable. + bool isConsecutiveGep(Value *Ptr); + private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count @@ -221,6 +221,8 @@ private: /// Returns true if the instruction I can be a reduction variable of type /// 'Kind'. bool isReductionInstr(Instruction *I, ReductionKind Kind); + /// Returns True, if 'Phi' is an induction variable. + bool isInductionVariable(PHINode *Phi); /// The loop that we evaluate. Loop *TheLoop; @@ -338,8 +340,8 @@ Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) { return Builder.CreateAdd(Val, Cv, "induction"); } - -bool SingleBlockLoopVectorizer::isConsecutiveGep(GetElementPtrInst *Gep) { +bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) { + GetElementPtrInst *Gep = dyn_cast(Ptr); if (!Gep) return false; @@ -348,7 +350,7 @@ bool SingleBlockLoopVectorizer::isConsecutiveGep(GetElementPtrInst *Gep) { // Check that all of the gep indices are uniform except for the last. 
for (unsigned i = 0; i < NumOperands - 1; ++i) - if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), Orig)) + if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) return false; // We can emit wide load/stores only of the last index is the induction @@ -460,7 +462,7 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { WidenMap[Instr] = VecResults; } -void SingleBlockLoopVectorizer::createEmptyLoop() { +void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { /* In this function we generate a new loop. The new loop will contain the vectorized instructions while the old loop will continue to run the @@ -510,7 +512,7 @@ void SingleBlockLoopVectorizer::createEmptyLoop() { "scalar.preheader"); // Find the induction variable. BasicBlock *OldBasicBlock = Orig->getHeader(); - OldInduction = dyn_cast(OldBasicBlock->begin()); + OldInduction = Legal->getInduction(); assert(OldInduction && "We must have a single phi node."); Type *IdxTy = OldInduction->getType(); @@ -637,7 +639,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // Special handling for the induction var. if (OldInduction == Inst) continue; - // This is phase I of vectorizing PHIs. + // This is phase one of vectorizing PHIs. // This has to be a reduction variable. assert(Legal->getReductionVars()->count(P) && "Not a Reduction"); Type *VecTy = VectorType::get(Inst->getType(), VF); @@ -704,7 +706,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { unsigned Alignment = SI->getAlignment(); GetElementPtrInst *Gep = dyn_cast(Ptr); // This store does not use GEPs. - if (!isConsecutiveGep(Gep)) { + if (!Legal->isConsecutiveGep(Gep)) { scalarizeInstruction(Inst); break; } @@ -728,7 +730,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { GetElementPtrInst *Gep = dyn_cast(Ptr); // We don't have a gep. Scalarize the load. 
- if (!isConsecutiveGep(Gep)) { + if (!Legal->isConsecutiveGep(Gep)) { scalarizeInstruction(Inst); break; } @@ -930,6 +932,16 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { DEBUG(dbgs() << "LV: Found an non-int PHI.\n"); return false; } + + if (isInductionVariable(Phi)) { + if (Induction) { + DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n"); + return false; + } + DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n"); + Induction = Phi; + continue; + } if (AddReductionVar(Phi, IntegerAdd)) { DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n"); continue; @@ -938,28 +950,6 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { DEBUG(dbgs() << "LV: Found an Mult reduction PHI."<< *Phi <<"\n"); continue; } - if (Induction) { - DEBUG(dbgs() << "LV: Found too many PHIs.\n"); - return false; - } - // Found the induction variable. - Induction = Phi; - - // Check that the PHI is consecutive and starts at zero. - const SCEV *PhiScev = SE->getSCEV(Phi); - const SCEVAddRecExpr *AR = dyn_cast(PhiScev); - if (!AR) { - DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n"); - return false; - } - - const SCEV *Step = AR->getStepRecurrence(*SE); - const SCEV *Start = AR->getStart(); - - if (!Step->isOne() || !Start->isZero()) { - DEBUG(dbgs() << "LV: PHI does not start at zero or steps by one.\n"); - return false; - } }// end of PHI handling // We still don't handle functions. @@ -1004,16 +994,19 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { } bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { - // Holds the read and write pointers that we find. - typedef SmallVector ValueVector; - ValueVector Reads; - ValueVector Writes; + typedef SmallVector ValueVector; + typedef SmallPtrSet ValueSet; + // Holds the Load and Store *instructions*. + ValueVector Loads; + ValueVector Stores; + // Scan the BB and collect legal loads and stores. 
for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { Instruction *I = it; - // If this is a load, record its pointer. If it is not a load, abort. - // Notice that we don't handle function calls that read or write. + // If this is a load, save it. If this instruction can read from memory + // but is not a load, then we quit. Notice that we don't handle function + // calls that read or write. if (I->mayReadFromMemory()) { LoadInst *Ld = dyn_cast(I); if (!Ld) return false; @@ -1021,13 +1014,11 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { DEBUG(dbgs() << "LV: Found a non-simple load.\n"); return false; } - - Value* Ptr = Ld->getPointerOperand(); - GetUnderlyingObjects(Ptr, Reads, DL); + Loads.push_back(Ld); + continue; } - // Record store pointers. Abort on all other instructions that write to - // memory. + // Save store instructions. Abort if other instructions write to memory. if (I->mayWriteToMemory()) { StoreInst *St = dyn_cast(I); if (!St) return false; @@ -1035,45 +1026,99 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { DEBUG(dbgs() << "LV: Found a non-simple store.\n"); return false; } - - Value* Ptr = St->getPointerOperand(); - GetUnderlyingObjects(Ptr, Writes, DL); + Stores.push_back(St); } } // next instr. + // Now we have two lists that hold the loads and the stores. + // Next, we find the pointers that they use. - // Check that the underlying objects of the reads and writes are either - // disjoint memory locations, or that they are no-alias arguments. - ValueVector::iterator r, re, w, we; - for (r = Reads.begin(), re = Reads.end(); r != re; ++r) { - if (!isIdentifiedSafeObject(*r)) { - DEBUG(dbgs() << "LV: Found a bad read Ptr: "<< **r << "\n"); - return false; - } + // Check if we see any stores. If there are no stores, then we don't + // care if the pointers are *restrict*. 
+ if (!Stores.size()) { + DEBUG(dbgs() << "LV: Found a read-only loop!\n"); + return true; } - for (w = Writes.begin(), we = Writes.end(); w != we; ++w) { - if (!isIdentifiedSafeObject(*w)) { - DEBUG(dbgs() << "LV: Found a bad write Ptr: "<< **w << "\n"); - return false; - } + // Holds the read and read-write *pointers* that we find. + ValueVector Reads; + ValueVector ReadWrites; + + // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects + // multiple times on the same object. If the ptr is accessed twice, once + // for read and once for write, it will only appear once (on the write + // list). This is okay, since we are going to check for conflicts between + // writes and between reads and writes, but not between reads and reads. + ValueSet Seen; + + ValueVector::iterator I, IE; + for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) { + StoreInst *ST = dyn_cast(*I); + assert(ST && "Bad StoreInst"); + Value* Ptr = ST->getPointerOperand(); + // If we did *not* see this pointer before, insert it to + // the read-write list. At this phase it is only a 'write' list. + if (Seen.insert(Ptr)) + ReadWrites.push_back(Ptr); } - // Check that there are no multiple write locations to the same pointer. - SmallPtrSet WritePointerSet; - for (w = Writes.begin(), we = Writes.end(); w != we; ++w) { - if (!WritePointerSet.insert(*w)) { - DEBUG(dbgs() << "LV: Multiple writes to the same index :"<< **w << "\n"); - return false; + for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) { + LoadInst *LD = dyn_cast(*I); + assert(LD && "Bad LoadInst"); + Value* Ptr = LD->getPointerOperand(); + // If we did *not* see this pointer before, insert it to the + // read list. If we *did* see it before, then it is already in + // the read-write list. This allows us to vectorize expressions + // such as A[i] += x; Because the address of A[i] is a read-write + // pointer. This only works if the index of A[i] is consecutive. 
+ // If the address of i is unknown (for example A[B[i]]) then we may + // read a few words, modify, and write a few words, and some of the + // words may be written to the same address. + if (Seen.insert(Ptr) || !isConsecutiveGep(Ptr)) + Reads.push_back(Ptr); + } + + // Now that the pointers are in two lists (Reads and ReadWrites), we + // can check that there are no conflicts between each of the writes and + // between the writes to the reads. + ValueSet WriteObjects; + ValueVector TempObjects; + + // Check that the read-writes do not conflict with other read-write + // pointers. + for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) { + GetUnderlyingObjects(*I, TempObjects, DL); + for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end(); + it != e; ++it) { + if (!isIdentifiedSafeObject(*it)) { + DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n"); + return false; + } + if (!WriteObjects.insert(*it)) { + DEBUG(dbgs() << "LV: Found a possible write-write reorder:" + << **it <<"\n"); + return false; + } } + TempObjects.clear(); } - // Check that the reads and the writes are disjoint. - for (r = Reads.begin(), re = Reads.end(); r != re; ++r) { - if (WritePointerSet.count(*r)) { - DEBUG(dbgs() << "Vectorizer: Found a read/write ptr:"<< **r << "\n"); - return false; + /// Check that the reads don't conflict with the read-writes. + for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) { + GetUnderlyingObjects(*I, TempObjects, DL); + for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end(); + it != e; ++it) { + if (!isIdentifiedSafeObject(*it)) { + DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n"); + return false; + } + if (WriteObjects.count(*it)) { + DEBUG(dbgs() << "LV: Found a possible read/write reorder:" + << **it <<"\n"); + return false; + } } + TempObjects.clear(); } // All is okay. 
@@ -1198,6 +1243,24 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I, } } +bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { + // Check that the PHI is consecutive and starts at zero. + const SCEV *PhiScev = SE->getSCEV(Phi); + const SCEVAddRecExpr *AR = dyn_cast(PhiScev); + if (!AR) { + DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n"); + return false; + } + const SCEV *Step = AR->getStepRecurrence(*SE); + const SCEV *Start = AR->getStart(); + + if (!Step->isOne() || !Start->isZero()) { + DEBUG(dbgs() << "LV: PHI does not start at zero or steps by one.\n"); + return false; + } + return true; +} + } // namespace char LoopVectorize::ID = 0; -- cgit v1.1 From ccaccfa8bf070b169847109513610cf41abfd1f3 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Sat, 20 Oct 2012 20:45:01 +0000 Subject: Fix an infinite loop in the loop-vectorizer. PR14134. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166379 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index c11c66f..027fe06 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1175,6 +1175,12 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, bool FoundInBlockUser = false; // Did we reach the initial PHI node ? bool FoundStartPHI = false; + + // If the instruction has no users then this is a broken + // chain and can't be a reduction variable. + if (Iter->use_begin() == Iter->use_end()) + return false; + // For each of the *users* of iter. 
for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end(); it != e; ++it) { -- cgit v1.1 From 5a418ba5f5a6498a25d5eacb0f876d9f358c977b Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Sun, 21 Oct 2012 02:38:01 +0000 Subject: Vectorizer: fix a bug in the classification of induction/reduction phis. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166384 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 3 +++ 1 file changed, 3 insertions(+) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 027fe06..76936d5 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -950,6 +950,9 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { DEBUG(dbgs() << "LV: Found an Mult reduction PHI."<< *Phi <<"\n"); continue; } + + DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n"); + return false; }// end of PHI handling // We still don't handle functions. -- cgit v1.1 From f01cad69c1ab1b025c524d1ac16060b2cc0f4668 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Sun, 21 Oct 2012 04:04:25 +0000 Subject: Document change. Describe the pass and some papers that inspired the design of the pass. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166386 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 76936d5..f32b66d 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7,10 +7,14 @@ // //===----------------------------------------------------------------------===// // -// This is a simple loop vectorizer. 
We currently only support single block -// loops. We have a very simple and restrictive legality check: we need to read -// and write from disjoint memory locations. We still don't have a cost model. -// We do support integer reductions. +// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops +// and generates target-independent LLVM-IR. Legalization of the IR is done +// in the codegen. However, the vectorizes uses (will use) the codegen +// interfaces to generate IR that is likely to result in an optimal binary. +// +// The loop vectorizer combines consecutive loop iteration into a single +// 'wide' iteration. After this transformation the index is incremented +// by the SIMD vector width, and not by one. // // This pass has three parts: // 1. The main loop pass that drives the different parts. @@ -18,6 +22,16 @@ // of the vectorization. // 3. SingleBlockLoopVectorizer - A helper class that performs the actual // widening of instructions. +//===----------------------------------------------------------------------===// +// +// The reduction-variable vectorization is based on the paper: +// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. +// +// Variable uniformity checks are inspired by: +// Karrenberg, R. and Hack, S. Whole Function Vectorization. +// +// Other ideas/concepts are from: +// A. Zaks and D. Nuzman. Autovectorization in GCC—two years later. // //===----------------------------------------------------------------------===// #define LV_NAME "loop-vectorize" -- cgit v1.1 From c84787262942a400a8de6539ee79f4de72d3f872 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Sun, 21 Oct 2012 05:52:51 +0000 Subject: Add support for reduction variables that do not start at zero. This is important for nested-loop reductions such as : In the innermost loop, the induction variable does not start with zero: for (i = 0 .. n) for (j = 0 .. m) sum += ... 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166387 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 118 ++++++++++++++++------------- 1 file changed, 67 insertions(+), 51 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index f32b66d..5a79c33 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -179,20 +179,36 @@ public: TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { } /// This represents the kinds of reductions that we support. + /// We use the enum values to hold the 'identity' value for + /// each operand. This value does not change the result if applied. enum ReductionKind { - IntegerAdd, /// Sum of numbers. - IntegerMult, /// Product of numbers. - NoReduction /// Not a reduction. + NoReduction = -1, /// Not a reduction. + IntegerAdd = 0, /// Sum of numbers. + IntegerMult = 1 /// Product of numbers. }; - // Holds a pairing of reduction instruction and the reduction kind. - typedef std::pair ReductionPair; + /// This POD struct holds information about reduction variables. + struct ReductionDescriptor { + // Default C'tor + ReductionDescriptor(): + StartValue(0), LoopExitInstr(0), Kind(NoReduction) {} + + // C'tor. + ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K): + StartValue(Start), LoopExitInstr(Exit), Kind(K) {} + + // The starting value of the reduction. + // It does not have to be zero! + Value *StartValue; + // The instruction who's value is used outside the loop. + Instruction *LoopExitInstr; + // The kind of the reduction. + ReductionKind Kind; + }; - /// ReductionList contains the reduction variables - /// as well as a single EXIT (from the block) value and the kind of - /// reduction variable.. - /// Notice that the EXIT instruction can also be the PHI itself. 
- typedef DenseMap ReductionList; + /// ReductionList contains the reduction descriptors for all + /// of the reductions that were found in the loop. + typedef DenseMap ReductionList; /// Returns the maximum vectorization factor that we *can* use to vectorize /// this loop. This does not mean that it is profitable to vectorize this @@ -229,9 +245,6 @@ private: /// Returns True, if 'Phi' is the kind of reduction variable for type /// 'Kind'. If this is a reduction variable, it adds it to ReductionList. bool AddReductionVar(PHINode *Phi, ReductionKind Kind); - /// Checks if a constant matches the reduction kind. - /// Sums starts with zero. Products start at one. - bool isReductionConstant(Value *V, ReductionKind Kind); /// Returns true if the instruction I can be a reduction variable of type /// 'Kind'. bool isReductionInstr(Instruction *I, ReductionKind Kind); @@ -628,6 +641,8 @@ void SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { typedef SmallVector PhiVector; BasicBlock &BB = *Orig->getHeader(); + Constant *Zero = ConstantInt::get( + IntegerType::getInt32Ty(BB.getContext()), 0); // In order to support reduction variables we need to be able to vectorize // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two @@ -803,29 +818,42 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { PHINode *VecRdxPhi = dyn_cast(WidenMap[RdxPhi]); assert(RdxPhi && "Unable to recover vectorized PHI"); - // Find the reduction variable. + // Find the reduction variable descriptor. assert(Legal->getReductionVars()->count(RdxPhi) && "Unable to find the reduction variable"); - LoopVectorizationLegality::ReductionPair ReductionVar = + LoopVectorizationLegality::ReductionDescriptor RdxDesc = (*Legal->getReductionVars())[RdxPhi]; + // We need to generate a reduction vector from the incoming scalar. 
+ // To do so, we need to generate the 'identity' vector and overide + // one of the elements with the incoming scalar reduction. We need + // to do it in the vector-loop preheader. + Builder.SetInsertPoint(LoopBypassBlock->getTerminator()); + // This is the vector-clone of the value that leaves the loop. - Value *VectorExit = getVectorValue(ReductionVar.first); + Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr); Type *VecTy = VectorExit->getType(); - // This is the kind of reduction. - LoopVectorizationLegality::ReductionKind RdxKind = ReductionVar.second; - // Find the reduction identity variable. - // Zero for addition. One for Multiplication. - unsigned IdentitySclr = - (RdxKind == LoopVectorizationLegality::IntegerAdd ? 0 : 1); - Constant *Identity = getUniformVector(IdentitySclr, VecTy->getScalarType()); + // Find the reduction identity variable. The value of the enum is the + // identity. Zero for addition. One for Multiplication. + unsigned IdentitySclr = RdxDesc.Kind; + Constant *Identity = getUniformVector(IdentitySclr, + VecTy->getScalarType()); + + // This vector is the Identity vector where the first element is the + // incoming scalar reduction. + Value *VectorStart = Builder.CreateInsertElement(Identity, + RdxDesc.StartValue, Zero); + // Fix the vector-loop phi. // We created the induction variable so we know that the // preheader is the first entry. BasicBlock *VecPreheader = Induction->getIncomingBlock(0); - VecRdxPhi->addIncoming(Identity, VecPreheader); + + // Reductions do not have to start at zero. They can start with + // any loop invariant values. 
+ VecRdxPhi->addIncoming(VectorStart, VecPreheader); unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody); Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx)); VecRdxPhi->addIncoming(Val, LoopVectorBody); @@ -837,10 +865,10 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); // This PHINode contains the vectorized reduction variable, or - // the identity vector, if we bypass the vector loop. + // the initial value vector, if we bypass the vector loop. PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); - NewPhi->addIncoming(Identity, LoopBypassBlock); - NewPhi->addIncoming(getVectorValue(ReductionVar.first), LoopVectorBody); + NewPhi->addIncoming(VectorStart, LoopBypassBlock); + NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody); // Extract the first scalar. Value *Scalar0 = @@ -849,7 +877,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { for (unsigned i=1; i < VF; ++i) { Value *Scalar1 = Builder.CreateExtractElement(NewPhi, Builder.getInt32(i)); - if (RdxKind == LoopVectorizationLegality::IntegerAdd) { + if (RdxDesc.Kind == LoopVectorizationLegality::IntegerAdd) { Scalar0 = Builder.CreateAdd(Scalar0, Scalar1); } else { Scalar0 = Builder.CreateMul(Scalar0, Scalar1); @@ -865,11 +893,13 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { PHINode *LCSSAPhi = dyn_cast(LEI); if (!LCSSAPhi) continue; - // All PHINodes need to have a single entry edge, or two if we already fixed them. + // All PHINodes need to have a single entry edge, or two if + // we already fixed them. assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); - // We found our reduction value exit-PHI. Update it with the incoming bypass edge. - if (LCSSAPhi->getIncomingValue(0) == ReductionVar.first) { + // We found our reduction value exit-PHI. 
Update it with the + // incoming bypass edge. + if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) { // Add an edge coming from the bypass. LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock); break; @@ -881,7 +911,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody); int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block. (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0); - (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, ReductionVar.first); + (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr); }// end of for each redux variable. } @@ -1157,7 +1187,7 @@ bool LoopVectorizationLegality::isIdentifiedSafeObject(Value* Val) { } bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, - ReductionKind Kind) { + ReductionKind Kind) { if (Phi->getNumIncomingValues() != 2) return false; @@ -1167,10 +1197,6 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, int InEdgeBlockIdx = (SelfEdgeIdx ? 0 : 1); // The other entry. Value *RdxStart = Phi->getIncomingValue(InEdgeBlockIdx); - // We must have a constant that starts the reduction. - if (!isReductionConstant(RdxStart, Kind)) - return false; - // ExitInstruction is the single value which is used outside the loop. // We only allow for a single reduction value to be used outside the loop. // This includes users of the reduction, variables (which form a cycle @@ -1228,26 +1254,16 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, if (FoundStartPHI && ExitInstruction) { // This instruction is allowed to have out-of-loop users. AllowedExit.insert(ExitInstruction); - // Mark this as a reduction var. - Reductions[Phi] = std::make_pair(ExitInstruction, Kind); + + // Save the description of this reduction variable. 
+ ReductionDescriptor RD(RdxStart, ExitInstruction, Kind); + Reductions[Phi] = RD; return true; } } } bool -LoopVectorizationLegality::isReductionConstant(Value *V, ReductionKind Kind) { - ConstantInt *CI = dyn_cast(V); - if (!CI) - return false; - if (Kind == IntegerMult && CI->isOne()) - return true; - if (Kind == IntegerAdd && CI->isZero()) - return true; - return false; -} - -bool LoopVectorizationLegality::isReductionInstr(Instruction *I, ReductionKind Kind) { switch (I->getOpcode()) { -- cgit v1.1 From bb950854acbb5966875763eaae7ab58e48e4f5a9 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Sun, 21 Oct 2012 06:49:10 +0000 Subject: Fix a bug in the vectorization of wide load/store operations. We used a SCEV to detect that A[X] is consecutive. We assumed that X was the induction variable. But X can be any expression that uses the induction for example: X = i + 2; git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166388 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 5a79c33..1b6011b 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -740,10 +740,15 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { break; } + // The last index does not have to be the induction. It can be + // consecutive and be a function of the index. For example A[I+1]; + unsigned NumOperands = Gep->getNumOperands(); + Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1)); + LastIndex = Builder.CreateExtractElement(LastIndex, Builder.getInt32(0)); + // Create the new GEP with the new induction variable. 
GetElementPtrInst *Gep2 = cast(Gep->clone()); - unsigned NumOperands = Gep->getNumOperands(); - Gep2->setOperand(NumOperands - 1, Induction); + Gep2->setOperand(NumOperands - 1, LastIndex); Ptr = Builder.Insert(Gep2); Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo()); Value *Val = getVectorValue(SI->getValueOperand()); @@ -764,10 +769,15 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { break; } + // The last index does not have to be the induction. It can be + // consecutive and be a function of the index. For example A[I+1]; + unsigned NumOperands = Gep->getNumOperands(); + Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1)); + LastIndex = Builder.CreateExtractElement(LastIndex, Builder.getInt32(0)); + // Create the new GEP with the new induction variable. GetElementPtrInst *Gep2 = cast(Gep->clone()); - unsigned NumOperands = Gep->getNumOperands(); - Gep2->setOperand(NumOperands - 1, Induction); + Gep2->setOperand(NumOperands - 1, LastIndex); Ptr = Builder.Insert(Gep2); Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo()); LI = Builder.CreateLoad(Ptr); -- cgit v1.1 From b75209cb66ab5d7a155af3cffe0e552e69820e2c Mon Sep 17 00:00:00 2001 From: Jakub Staszak Date: Sun, 21 Oct 2012 15:29:19 +0000 Subject: Simplify code. No functionality change. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166392 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 1b6011b..7efb003 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1186,14 +1186,13 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { /// marked with the NoAlias attribute. 
bool LoopVectorizationLegality::isIdentifiedSafeObject(Value* Val) { assert(Val && "Invalid value"); - if (dyn_cast(Val)) + if (isa(Val)) return true; - if (dyn_cast(Val)) + if (isa(Val)) return true; - Argument *A = dyn_cast(Val); - if (!A) - return false; - return A->hasNoAliasAttr(); + if (Argument *A = dyn_cast(Val)) + return A->hasNoAliasAttr(); + return false; } bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, -- cgit v1.1 From ffe5008fd3013451fd3cfe8dd878c9a1d4abad21 Mon Sep 17 00:00:00 2001 From: Jakub Staszak Date: Sun, 21 Oct 2012 15:36:03 +0000 Subject: Simplify code. No functionality change. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166393 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 7efb003..dfa2909 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1230,7 +1230,7 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, // If the instruction has no users then this is a broken // chain and can't be a reduction variable. - if (Iter->use_begin() == Iter->use_end()) + if (Iter->use_empty()) return false; // For each of the *users* of iter. -- cgit v1.1 From 5bb5a75b1e47443ba9b07a53e6e967d29108bab3 Mon Sep 17 00:00:00 2001 From: Anders Carlsson Date: Sun, 21 Oct 2012 16:26:35 +0000 Subject: Avoid an extra hash lookup when inserting a value into the widen map. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166395 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index dfa2909..a041169 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -398,13 +398,13 @@ bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) { Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) { assert(!V->getType()->isVectorTy() && "Can't widen a vector"); // If we saved a vectorized copy of V, use it. - ValueMap::iterator it = WidenMap.find(V); - if (it != WidenMap.end()) - return it->second; + Value *&MapEntry = WidenMap[V]; + if (MapEntry) + return MapEntry; // Broadcast V and save the value for future uses. Value *B = getBroadcastInstrs(V); - WidenMap[V] = B; + MapEntry = B; return B; } -- cgit v1.1 From e0fa403e23ab5610471c2dbe5c032327ee1ed091 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Mon, 22 Oct 2012 03:52:53 +0000 Subject: Update the loop vectorizer docs. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166408 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 55 +++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 17 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index a041169..0fbf3e0 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -31,7 +31,7 @@ // Karrenberg, R. and Hack, S. Whole Function Vectorization. // // Other ideas/concepts are from: -// A. Zaks and D. Nuzman. Autovectorization in GCC—two years later. +// A. Zaks and D. Nuzman. 
Autovectorization in GCC-two years later. // //===----------------------------------------------------------------------===// #define LV_NAME "loop-vectorize" @@ -75,10 +75,20 @@ namespace { // Forward declaration. class LoopVectorizationLegality; -/// Vectorize a simple loop. This class performs the widening of simple single -/// basic block loops into vectors. It does not perform any -/// vectorization-legality checks, and just does it. It widens the vectors -/// to a given vectorization factor (VF). +/// SingleBlockLoopVectorizer vectorizes loops which contain only one basic +/// block to a specified vectorization factor (VF). +/// This class performs the widening of scalars into vectors, or multiple +/// scalars. This class also implements the following features: +/// * It inserts an epilogue loop for handling loops that don't have iteration +/// counts that are known to be a multiple of the vectorization factor. +/// * It handles the code generation for reduction variables. +/// * Scalarization (implementation using scalars) of un-vectorizable +/// instructions. +/// SingleBlockLoopVectorizer does not perform any vectorization-legality +/// checks, and relies on the caller to check for the different legality +/// aspects. The SingleBlockLoopVectorizer relies on the +/// LoopVectorizationLegality class to provide information about the induction +/// and reduction variables that were found to a given vectorization factor. class SingleBlockLoopVectorizer { public: /// Ctor. @@ -169,10 +179,19 @@ private: ValueMap WidenMap; }; -/// Perform the vectorization legality check. This class does not look at the -/// profitability of vectorization, only the legality. At the moment the checks -/// are very simple and focus on single basic block loops with a constant -/// iteration count and no reductions. +/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and +/// to what vectorization factor. 
+/// This class does not look at the profitability of vectorization, only the +/// legality. This class has two main kinds of checks: +/// * Memory checks - The code in canVectorizeMemory checks if vectorization +/// will change the order of memory accesses in a way that will change the +/// correctness of the program. +/// * Scalars checks - The code in canVectorizeBlock checks for a number +/// of different conditions, such as the availability of a single induction +/// variable, that all types are supported and vectorize-able, etc. +/// This code reflects the capabilities of SingleBlockLoopVectorizer. +/// This class is also used by SingleBlockLoopVectorizer for identifying +/// induction variable and the different reduction variables. class LoopVectorizationLegality { public: LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl): @@ -222,8 +241,10 @@ public: /// Returns the reduction variables found in the loop. ReductionList *getReductionVars() { return &Reductions; } - /// Check that the GEP operands are all uniform except for the last index - /// which has to be the induction variable. + /// Check if the pointer returned by this GEP is consecutive + /// when the index is vectorized. This happens when the last + /// index of the GEP is consecutive, like the induction variable. + /// This check allows us to vectorize A[idx] into a wide load/store. bool isConsecutiveGep(Value *Ptr); private: @@ -281,8 +302,7 @@ struct LoopVectorize : public LoopPass { LoopInfo *LI; virtual bool runOnLoop(Loop *L, LPPassManager &LPM) { - - // Only vectorize innermost loops. + // We only vectorize innermost loops. if (!L->empty()) return false; @@ -297,7 +317,8 @@ struct LoopVectorize : public LoopPass { LoopVectorizationLegality LVL(L, SE, DL); unsigned MaxVF = LVL.getLoopMaxVF(); - // Check that we can vectorize using the chosen vectorization width. + // Check that we can vectorize this loop using the chosen vectorization + // width. 
if (MaxVF < DefaultVectorizationFactor) { DEBUG(dbgs() << "LV: non-vectorizable MaxVF ("<< MaxVF << ").\n"); return false; @@ -305,7 +326,7 @@ struct LoopVectorize : public LoopPass { DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< MaxVF << ").\n"); - // If we decided that is is *legal* to vectorizer the loop. Do it. + // If we decided that it is *legal* to vectorizer the loop then do it. SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, DefaultVectorizationFactor); LB.vectorize(&LVL); @@ -461,7 +482,7 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { if (!IsVoidRetTy) VecResults = UndefValue::get(VectorType::get(Instr->getType(), VF)); - // For each scalar that we create. + // For each scalar that we create: for (unsigned i = 0; i < VF; ++i) { Instruction *Cloned = Instr->clone(); if (!IsVoidRetTy) @@ -495,7 +516,7 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal the vectorized instructions while the old loop will continue to run the scalar remainder. - [ ] <-- vector loop bypass. + [ ] <-- vector loop bypass. / | / v | [ ] <-- vector pre header. -- cgit v1.1 From 565048e78a152aa69b66422037b67c0f627bbfec Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Mon, 22 Oct 2012 04:38:00 +0000 Subject: Vectorizer: optimize the generation of selects. If the condition is uniform, generate a scalar-cond select (i1 as selector). 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166409 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 0fbf3e0..59e8e5e 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -725,12 +725,22 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { } case Instruction::Select: { // Widen selects. - // TODO: If the selector is loop invariant we can issue a select - // instruction with a scalar condition. - Value *A = getVectorValue(Inst->getOperand(0)); - Value *B = getVectorValue(Inst->getOperand(1)); - Value *C = getVectorValue(Inst->getOperand(2)); - WidenMap[Inst] = Builder.CreateSelect(A, B, C); + // If the selector is loop invariant we can create a select + // instruction with a scalar condition. Otherwise, use vector-select. + Value *Cond = Inst->getOperand(0); + bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), Orig); + + // The condition can be loop invariant but still defined inside the + // loop. This means that we can't just use the original 'cond' value. + // We have to take the 'vectorized' value and pick the first lane. + // Instcombine will make this a no-op. + Cond = getVectorValue(Cond); + if (InvariantCond) + Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0)); + + Value *Op0 = getVectorValue(Inst->getOperand(1)); + Value *Op1 = getVectorValue(Inst->getOperand(2)); + WidenMap[Inst] = Builder.CreateSelect(Cond, Op0, Op1); break; } -- cgit v1.1 From 5f7d81022398f332b222552f5d980c4e3f1c542c Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Mon, 22 Oct 2012 04:53:05 +0000 Subject: Rename a variable. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166410 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 59e8e5e..6fbf342 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -92,9 +92,9 @@ class LoopVectorizationLegality; class SingleBlockLoopVectorizer { public: /// Ctor. - SingleBlockLoopVectorizer(Loop *OrigLoop, ScalarEvolution *Se, LoopInfo *Li, + SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li, LPPassManager *Lpm, unsigned VecWidth): - Orig(OrigLoop), SE(Se), LI(Li), LPM(Lpm), VF(VecWidth), + OrigLoop(Orig), SE(Se), LI(Li), LPM(Lpm), VF(VecWidth), Builder(Se->getContext()), Induction(0), OldInduction(0) { } // Perform the actual loop widening (vectorization). @@ -145,7 +145,7 @@ private: typedef DenseMap ValueMap; /// The original loop. - Loop *Orig; + Loop *OrigLoop; // Scev analysis to use. ScalarEvolution *SE; // Loop Info. @@ -541,11 +541,11 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal */ // This is the original scalar-loop preheader. 
- BasicBlock *BypassBlock = Orig->getLoopPreheader(); - BasicBlock *ExitBlock = Orig->getExitBlock(); + BasicBlock *BypassBlock = OrigLoop->getLoopPreheader(); + BasicBlock *ExitBlock = OrigLoop->getExitBlock(); assert(ExitBlock && "Must have an exit block"); - assert(Orig->getNumBlocks() == 1 && "Invalid loop"); + assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop"); assert(BypassBlock && "Invalid loop structure"); BasicBlock *VectorPH = @@ -559,7 +559,7 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.preheader"); // Find the induction variable. - BasicBlock *OldBasicBlock = Orig->getHeader(); + BasicBlock *OldBasicBlock = OrigLoop->getHeader(); OldInduction = Legal->getInduction(); assert(OldInduction && "We must have a single phi node."); Type *IdxTy = OldInduction->getType(); @@ -574,7 +574,7 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal Constant *Step = ConstantInt::get(IdxTy, VF); // Find the loop boundaries. - const SCEV *ExitCount = SE->getExitCount(Orig, Orig->getHeader()); + const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader()); assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); // Get the total trip count from the count by adding 1. @@ -639,11 +639,11 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal // Register the new loop. 
Loop* Lp = new Loop(); - LPM->insertLoop(Lp, Orig->getParentLoop()); + LPM->insertLoop(Lp, OrigLoop->getParentLoop()); Lp->addBasicBlockToLoop(VecBody, LI->getBase()); - Loop *ParentLoop = Orig->getParentLoop(); + Loop *ParentLoop = OrigLoop->getParentLoop(); if (ParentLoop) { ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase()); ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); @@ -661,7 +661,7 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal void SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { typedef SmallVector PhiVector; - BasicBlock &BB = *Orig->getHeader(); + BasicBlock &BB = *OrigLoop->getHeader(); Constant *Zero = ConstantInt::get( IntegerType::getInt32Ty(BB.getContext()), 0); @@ -728,7 +728,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // If the selector is loop invariant we can create a select // instruction with a scalar condition. Otherwise, use vector-select. Value *Cond = Inst->getOperand(0); - bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), Orig); + bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop); // The condition can be loop invariant but still defined inside the // loop. This means that we can't just use the original 'cond' value. @@ -958,7 +958,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { void SingleBlockLoopVectorizer::cleanup() { // The original basic block. - SE->forgetLoop(Orig); + SE->forgetLoop(OrigLoop); } unsigned LoopVectorizationLegality::getLoopMaxVF() { -- cgit v1.1 From 782090aa02484d7d1d335e3a7e209f1132aaa1a7 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Mon, 22 Oct 2012 18:27:56 +0000 Subject: Don't crash if the load/store pointer is not a GEP. 
Fix by Shivarama Rao git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166427 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 6fbf342..f944d9b 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -389,7 +389,7 @@ Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) { } bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) { - GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); + GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr); if (!Gep) return false; -- cgit v1.1 From 6457001f31713ff26a707ddef616341052b1b296 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Tue, 23 Oct 2012 18:44:18 +0000 Subject: Use the AliasAnalysis isIdentifiedObj because it also understands mallocs and c++ news. PR14158. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166491 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index f944d9b..bead392 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -259,10 +259,6 @@ private: /// Returns true if BB is vectorizable bool canVectorizeMemory(BasicBlock &BB); - // Check if a pointer value is known to be disjoint. - // Example: Alloca, Global, NoAlias. - bool isIdentifiedSafeObject(Value* Val); - /// Returns True, if 'Phi' is the kind of reduction variable for type /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
bool AddReductionVar(PHINode *Phi, ReductionKind Kind); @@ -1178,7 +1174,7 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { GetUnderlyingObjects(*I, TempObjects, DL); for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end(); it != e; ++it) { - if (!isIdentifiedSafeObject(*it)) { + if (!isIdentifiedObject(*it)) { DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n"); return false; } @@ -1196,7 +1192,7 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { GetUnderlyingObjects(*I, TempObjects, DL); for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end(); it != e; ++it) { - if (!isIdentifiedSafeObject(*it)) { + if (!isIdentifiedObject(*it)) { DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n"); return false; } @@ -1213,19 +1209,6 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { return true; } -/// Checks if the value is a Global variable or if it is an Arguments -/// marked with the NoAlias attribute. -bool LoopVectorizationLegality::isIdentifiedSafeObject(Value* Val) { - assert(Val && "Invalid value"); - if (isa(Val)) - return true; - if (isa(Val)) - return true; - if (Argument *A = dyn_cast(Val)) - return A->hasNoAliasAttr(); - return false; -} - bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, ReductionKind Kind) { if (Phi->getNumIncomingValues() != 2) -- cgit v1.1 From 2f87640b86315beab8a5671cc23f524e59c58bd3 Mon Sep 17 00:00:00 2001 From: Micah Villmow Date: Wed, 24 Oct 2012 17:20:04 +0000 Subject: Delete a directory that wasn't supposed to be checked in yet. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166591 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index bead392..4cd171a 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -777,7 +777,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); Gep2->setOperand(NumOperands - 1, LastIndex); Ptr = Builder.Insert(Gep2); - Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo()); + Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo(Ptr->getType())); Value *Val = getVectorValue(SI->getValueOperand()); Builder.CreateStore(Val, Ptr)->setAlignment(Alignment); break; @@ -806,7 +806,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); Gep2->setOperand(NumOperands - 1, LastIndex); Ptr = Builder.Insert(Gep2); - Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo()); + Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo(Ptr->getType())); LI = Builder.CreateLoad(Ptr); LI->setAlignment(Alignment); // Use this vector value for all users of the load. -- cgit v1.1 From b8bce928f4ffdf50eff69334f3e25b27848536b6 Mon Sep 17 00:00:00 2001 From: Micah Villmow Date: Wed, 24 Oct 2012 17:25:11 +0000 Subject: Back out r166591, not sure why this made it through since I cancelled the command. Bleh, sorry about this!
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166596 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 4cd171a..bead392 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -777,7 +777,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); Gep2->setOperand(NumOperands - 1, LastIndex); Ptr = Builder.Insert(Gep2); - Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo(Ptr->getType())); + Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo()); Value *Val = getVectorValue(SI->getValueOperand()); Builder.CreateStore(Val, Ptr)->setAlignment(Alignment); break; @@ -806,7 +806,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); Gep2->setOperand(NumOperands - 1, LastIndex); Ptr = Builder.Insert(Gep2); - Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo(Ptr->getType())); + Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo()); LI = Builder.CreateLoad(Ptr); LI->setAlignment(Alignment); // Use this vector value for all users of the load. -- cgit v1.1 From 50bec6f8c494957b00dd225ddf580d3e0b97b871 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Wed, 24 Oct 2012 20:36:32 +0000 Subject: LoopVectorizer: Add a basic cost model which uses the VTTI interface.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166620 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 303 ++++++++++++++++++++++++++--- 1 file changed, 273 insertions(+), 30 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index bead392..6f6685b 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -18,10 +18,13 @@ // // This pass has three parts: // 1. The main loop pass that drives the different parts. -// 2. LoopVectorizationLegality - A helper class that checks for the legality +// 2. LoopVectorizationLegality - A unit that checks for the legality // of the vectorization. -// 3. SingleBlockLoopVectorizer - A helper class that performs the actual +// 3. SingleBlockLoopVectorizer - A unit that performs the actual // widening of instructions. +// 4. LoopVectorizationCostModel - A unit that checks for the profitability +// of vectorization. It decides on the optimal vector width, which +// can be one, if vectorization is not profitable. 
//===----------------------------------------------------------------------===// // // The reduction-variable vectorization is based on the paper: @@ -51,13 +54,14 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" -#include "llvm/Transforms/Scalar.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/TargetTransformInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -67,13 +71,14 @@ using namespace llvm; static cl::opt -DefaultVectorizationFactor("default-loop-vectorize-width", - cl::init(4), cl::Hidden, - cl::desc("Set the default loop vectorization width")); +VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, + cl::desc("Set the default vectorization width. Zero is autoselect.")); + namespace { -// Forward declaration. +// Forward declarations. class LoopVectorizationLegality; +class LoopVectorizationCostModel; /// SingleBlockLoopVectorizer vectorizes loops which contain only one basic /// block to a specified vectorization factor (VF). @@ -229,11 +234,10 @@ public: /// of the reductions that were found in the loop. typedef DenseMap ReductionList; - /// Returns the maximum vectorization factor that we *can* use to vectorize - /// this loop. This does not mean that it is profitable to vectorize this - /// loop, only that it is legal to do so. This may be a large number. We - /// can vectorize to any SIMD width below this number. - unsigned getLoopMaxVF(); + /// Returns true if it is legal to vectorize this loop. 
+ /// This does not mean that it is profitable to vectorize this + /// loop, only that it is legal to do so. + bool canVectorize(); /// Returns the Induction variable. PHINode *getInduction() {return Induction;} @@ -286,6 +290,49 @@ private: SmallPtrSet AllowedExit; }; +/// LoopVectorizationCostModel - estimates the expected speedups due to +/// vectorization. +/// In many cases vectorization is not profitable. This can happen because +/// of a number of reasons. In this class we mainly attempt to predict +/// the expected speedup/slowdowns due to the supported instruction set. +/// We use the VectorTargetTransformInfo to query the different backends +/// for the cost of different operations. +class LoopVectorizationCostModel { +public: + /// C'tor. + LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl, + LoopVectorizationLegality *Leg, + const VectorTargetTransformInfo *Vtti): + TheLoop(Lp), SE(Se), DL(Dl), Legal(Leg), VTTI(Vtti) { } + + /// Returns the most profitable vectorization factor for the loop that is + /// smaller or equal to the VF argument. This method checks every power + /// of two up to VF. + unsigned findBestVectorizationFactor(unsigned VF = 4); + +private: + /// Returns the expected execution cost. The unit of the cost does + /// not matter because we use the 'cost' units to compare different + /// vector widths. The cost that is returned is *not* normalized by + /// the factor width. + unsigned expectedCost(unsigned VF); + + /// Returns the execution time cost of an instruction for a given vector + /// width. Vector width of one means scalar. + unsigned getInstructionCost(Instruction *I, unsigned VF); + + /// The loop that we evaluate. + Loop *TheLoop; + /// Scev analysis. + ScalarEvolution *SE; + /// DataLayout analysis. + DataLayout *DL; + /// Vectorization legality. + LoopVectorizationLegality *Legal; + /// Vector target information. 
+ const VectorTargetTransformInfo *VTTI; +}; + struct LoopVectorize : public LoopPass { static char ID; // Pass identification, replacement for typeid @@ -296,6 +343,7 @@ struct LoopVectorize : public LoopPass { ScalarEvolution *SE; DataLayout *DL; LoopInfo *LI; + TargetTransformInfo *TTI; virtual bool runOnLoop(Loop *L, LPPassManager &LPM) { // We only vectorize innermost loops. @@ -305,25 +353,42 @@ struct LoopVectorize : public LoopPass { SE = &getAnalysis<ScalarEvolution>(); DL = getAnalysisIfAvailable<DataLayout>(); LI = &getAnalysis<LoopInfo>(); + TTI = getAnalysisIfAvailable<TargetTransformInfo>(); DEBUG(dbgs() << "LV: Checking a loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); // Check if it is legal to vectorize the loop. LoopVectorizationLegality LVL(L, SE, DL); - unsigned MaxVF = LVL.getLoopMaxVF(); - - // Check that we can vectorize this loop using the chosen vectorization - // width. - if (MaxVF < DefaultVectorizationFactor) { - DEBUG(dbgs() << "LV: non-vectorizable MaxVF ("<< MaxVF << ").\n"); + if (!LVL.canVectorize()) { + DEBUG(dbgs() << "LV: Not vectorizing.\n"); return false; } - DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< MaxVF << ").\n"); + // Select the preferred vectorization factor. + unsigned VF = 1; + if (VectorizationFactor == 0) { + const VectorTargetTransformInfo *VTTI = 0; + if (TTI) + VTTI = TTI->getVectorTargetTransformInfo(); + // Use the cost model. + LoopVectorizationCostModel CM(L, SE, DL, &LVL, VTTI); + VF = CM.findBestVectorizationFactor(); + + if (VF == 1) { + DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); + return false; + } + + } else { + // Use the user command flag. + VF = VectorizationFactor; + } + + DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ").\n"); // If we decided that it is *legal* to vectorize the loop then do it.
- SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, DefaultVectorizationFactor); + SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, VF); LB.vectorize(&LVL); DEBUG(verifyFunction(*L->getHeader()->getParent())); @@ -656,6 +721,13 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal void SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { + //===------------------------------------------------===// + // + // Notice: any optimization or new instruction that go + // into the code below should be also be implemented in + // the cost-model. + // + //===------------------------------------------------===// typedef SmallVector PhiVector; BasicBlock &BB = *OrigLoop->getHeader(); Constant *Zero = ConstantInt::get( @@ -957,18 +1029,18 @@ void SingleBlockLoopVectorizer::cleanup() { SE->forgetLoop(OrigLoop); } -unsigned LoopVectorizationLegality::getLoopMaxVF() { +bool LoopVectorizationLegality::canVectorize() { if (!TheLoop->getLoopPreheader()) { assert(false && "No preheader!!"); DEBUG(dbgs() << "LV: Loop not normalized." << "\n"); - return 1; + return false; } // We can only vectorize single basic block loops. unsigned NumBlocks = TheLoop->getNumBlocks(); if (NumBlocks != 1) { DEBUG(dbgs() << "LV: Too many blocks:" << NumBlocks << "\n"); - return 1; + return false; } // We need to have a loop header. @@ -978,22 +1050,22 @@ unsigned LoopVectorizationLegality::getLoopMaxVF() { // Go over each instruction and look at memory deps. if (!canVectorizeBlock(*BB)) { DEBUG(dbgs() << "LV: Can't vectorize this loop header\n"); - return 1; + return false; } // ScalarEvolution needs to be able to find the exit count. const SCEV *ExitCount = SE->getExitCount(TheLoop, BB); if (ExitCount == SE->getCouldNotCompute()) { DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); - return 1; + return false; } DEBUG(dbgs() << "LV: We can vectorize this loop!\n"); // Okay! We can vectorize. 
At this point we don't have any other mem analysis - // which may limit our maximum vectorization factor, so just return the - // maximum SIMD size. - return DefaultVectorizationFactor; + // which may limit our maximum vectorization factor, so just return true with + // no restrictions. + return true; } bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { @@ -1323,6 +1395,177 @@ bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { return true; } +unsigned +LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) { + if (!VTTI) { + DEBUG(dbgs() << "LV: No vector target information. Not vectorizing. \n"); + return 1; + } + + float Cost = expectedCost(1); + unsigned Width = 1; + DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n"); + for (unsigned i=2; i <= VF; i*=2) { + // Notice that the vector loop needs to be executed less times, so + // we need to divide the cost of the vector loops by the width of + // the vector elements. + float VectorCost = expectedCost(i) / (float)i; + DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " << + (int)VectorCost << ".\n"); + if (VectorCost < Cost) { + Cost = VectorCost; + Width = i; + } + } + + DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n"); + return Width; +} + +unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { + // We can only estimate the cost of single basic block loops. + assert(1 == TheLoop->getNumBlocks() && "Too many blocks in loop"); + + BasicBlock *BB = TheLoop->getHeader(); + unsigned Cost = 0; + + // For each instruction in the old loop. + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + Instruction *Inst = it; + Cost += getInstructionCost(Inst, VF); + } + + // Return the cost divided by VF, because we will be executing + // less iterations of the vector form. 
+ return Cost; +} + +unsigned +LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { + assert(VTTI && "Invalid vector target transformation info"); + switch (I->getOpcode()) { + case Instruction::Br: { + return VTTI->getInstrCost(I->getOpcode()); + } + case Instruction::PHI: + // PHIs are handled the same as the binary instructions below. + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + Type *VTy = VectorType::get(I->getType(), VF); + return VTTI->getInstrCost(I->getOpcode(), VTy); + } + case Instruction::Select: { + SelectInst *SI = cast<SelectInst>(I); + Type *VTy = VectorType::get(I->getType(), VF); + const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); + bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); + Type *CondTy = SI->getCondition()->getType(); + if (ScalarCond) + CondTy = VectorType::get(CondTy, VF); + + return VTTI->getInstrCost(I->getOpcode(), VTy, CondTy); + } + case Instruction::ICmp: + case Instruction::FCmp: { + Type *VTy = VectorType::get(I->getOperand(0)->getType(), VF); + return VTTI->getInstrCost(I->getOpcode(), VTy); + } + case Instruction::Store: { + StoreInst *SI = cast<StoreInst>(I); + Type *VTy = VectorType::get(SI->getValueOperand()->getType(), VF); + + // Scalarized stores. + if (!Legal->isConsecutiveGep(SI->getPointerOperand())) { + unsigned Cost = 0; + unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, VTy); + // The cost of extracting from the vector value. + Cost += VF * ExtCost; + // The cost of the scalar stores.
+ Cost += VF * VTTI->getInstrCost(I->getOpcode(), VTy->getScalarType()); + return Cost; + } + + // Wide stores. + return VTTI->getMemoryOpCost(I->getOpcode(), VTy, SI->getAlignment(), + SI->getPointerAddressSpace()); + } + case Instruction::Load: { + LoadInst *LI = cast<LoadInst>(I); + Type *VTy = VectorType::get(I->getType(), VF); + + // Scalarized loads. + if (!Legal->isConsecutiveGep(LI->getPointerOperand())) { + unsigned Cost = 0; + unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, VTy); + // The cost of inserting the loaded value into the result vector. + Cost += VF * InCost; + // The cost of the scalar stores. + Cost += VF * VTTI->getInstrCost(I->getOpcode(), VTy->getScalarType()); + return Cost; + } + + // Wide loads. + return VTTI->getMemoryOpCost(I->getOpcode(), VTy, LI->getAlignment(), + LI->getPointerAddressSpace()); + } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + Type *SrcTy = VectorType::get(I->getOperand(0)->getType(), VF); + Type *DstTy = VectorType::get(I->getType(), VF); + return VTTI->getInstrCost(I->getOpcode(), DstTy, SrcTy); + } + default: { + // We are scalarizing the instruction. Return the cost of the scalar + // instruction, plus the cost of insert and extract into vector + // elements, times the vector width. + unsigned Cost = 0; + Type *Ty = I->getType(); + + if (!Ty->isVoidTy()) { + Type *VTy = VectorType::get(Ty, VF); + unsigned InsCost = VTTI->getInstrCost(Instruction::InsertElement, VTy); + unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, VTy); + Cost += VF * (InsCost + ExtCost); + } + + /// We don't have any information on the scalar instruction, but maybe + /// the target has.
+ /// TODO: This may be a target-specific intrinsic. + /// Need to add API for that. + Cost += VF * VTTI->getInstrCost(I->getOpcode(), Ty); + + return Cost; + } + }// end of switch. +} + + } // namespace char LoopVectorize::ID = 0; -- cgit v1.1 From 7c6de1d6e53c47f2943874ab9d63dddfd19b503d Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Wed, 24 Oct 2012 20:58:40 +0000 Subject: whitespace git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166622 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 6f6685b..35f49e4 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1468,7 +1468,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { case Instruction::Or: case Instruction::Xor: { Type *VTy = VectorType::get(I->getType(), VF); - return VTTI->getInstrCost(I->getOpcode(), VTy); + return VTTI->getInstrCost(I->getOpcode(), VTy); } case Instruction::Select: { SelectInst *SI = cast<SelectInst>(I); @@ -1476,8 +1476,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); Type *CondTy = SI->getCondition()->getType(); - if (ScalarCond) - CondTy = VectorType::get(CondTy, VF); + if (ScalarCond) + CondTy = VectorType::get(CondTy, VF); return VTTI->getInstrCost(I->getOpcode(), VTy, CondTy); } -- cgit v1.1 From 2652c50f74bc4a874c6a2e4b34ff2d52d479183f Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Wed, 24 Oct 2012 23:47:38 +0000 Subject: Implement a basic cost model for vector and scalar instructions.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166642 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 49 +++++++++++++++++++----------- 1 file changed, 32 insertions(+), 17 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 35f49e4..483b9fc 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -300,10 +300,10 @@ private: class LoopVectorizationCostModel { public: /// C'tor. - LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl, + LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se, LoopVectorizationLegality *Leg, const VectorTargetTransformInfo *Vtti): - TheLoop(Lp), SE(Se), DL(Dl), Legal(Leg), VTTI(Vtti) { } + TheLoop(Lp), SE(Se), Legal(Leg), VTTI(Vtti) { } /// Returns the most profitable vectorization factor for the loop that is /// smaller or equal to the VF argument. This method checks every power @@ -325,8 +325,7 @@ private: Loop *TheLoop; /// Scev analysis. ScalarEvolution *SE; - /// DataLayout analysis. - DataLayout *DL; + /// Vectorization legality. LoopVectorizationLegality *Legal; /// Vector target information. @@ -372,7 +371,7 @@ struct LoopVectorize : public LoopPass { if (TTI) VTTI = TTI->getVectorTargetTransformInfo(); // Use the cost model. - LoopVectorizationCostModel CM(L, SE, DL, &LVL, VTTI); + LoopVectorizationCostModel CM(L, SE, &LVL, VTTI); VF = CM.findBestVectorizationFactor(); if (VF == 1) { @@ -1432,11 +1431,12 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { // For each instruction in the old loop. 
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { Instruction *Inst = it; - Cost += getInstructionCost(Inst, VF); + unsigned C = getInstructionCost(Inst, VF); + Cost += C; + DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF "<< VF << + " For instruction: "<< *Inst << "\n"); } - // Return the cost divided by VF, because we will be executing - // less iterations of the vector form. return Cost; } @@ -1444,11 +1444,13 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { assert(VTTI && "Invalid vector target transformation info"); switch (I->getOpcode()) { + case Instruction::GetElementPtr: + return 0; case Instruction::Br: { return VTTI->getInstrCost(I->getOpcode()); } case Instruction::PHI: - // PHIs are handled the same as the binary instructions below. + return 0; case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: @@ -1493,11 +1495,17 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { // Scalarized stores. if (!Legal->isConsecutiveGep(SI->getPointerOperand())) { unsigned Cost = 0; - unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, VTy); - // The cost of extracting from the vector value. - Cost += VF * ExtCost; + if (VF != 1) { + unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, + VTy); + // The cost of extracting from the value vector and pointer vector. + Cost += VF * (ExtCost * 2); + } // The cost of the scalar stores. - Cost += VF * VTTI->getInstrCost(I->getOpcode(), VTy->getScalarType()); + Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), + VTy->getScalarType(), + SI->getAlignment(), + SI->getPointerAddressSpace()); return Cost; } @@ -1512,11 +1520,18 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { // Scalarized loads. 
if (!Legal->isConsecutiveGep(LI->getPointerOperand())) { unsigned Cost = 0; - unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, VTy); - // The cost of inserting the loaded value into the result vector. - Cost += VF * InCost; + if (VF != 1) { + unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, VTy); + unsigned ExCost = VTTI->getInstrCost(Instruction::ExtractValue, VTy); + + // The cost of inserting the loaded value into the result vector, and + // extracting from a vector of pointers. + Cost += VF * (InCost + ExCost); + } // The cost of the scalar stores. - Cost += VF * VTTI->getInstrCost(I->getOpcode(), VTy->getScalarType()); + Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), VTy->getScalarType(), + LI->getAlignment(), + LI->getPointerAddressSpace()); return Cost; } -- cgit v1.1 From 8dbac7b529cfb73bcd0ceef514e5c1d247cf3baa Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Thu, 25 Oct 2012 00:08:41 +0000 Subject: Add support for additional reduction variables: AND, OR, XOR. Patch by Paul Redmond . git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166649 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 49 +++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 7 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 483b9fc..423c7a4 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -208,7 +208,10 @@ public: enum ReductionKind { NoReduction = -1, /// Not a reduction. IntegerAdd = 0, /// Sum of numbers. - IntegerMult = 1 /// Product of numbers. + IntegerMult = 1, /// Product of numbers. + IntegerOr = 2, /// Bitwise or logical OR of numbers. + IntegerAnd = 3, /// Bitwise or logical AND of numbers. + IntegerXor = 4 /// Bitwise or logical XOR of numbers. 
}; /// This POD struct holds information about reduction variables. @@ -981,14 +984,28 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // Extract the first scalar. Value *Scalar0 = Builder.CreateExtractElement(NewPhi, Builder.getInt32(0)); - // Extract and sum the remaining vector elements. + // Extract and reduce the remaining vector elements. for (unsigned i=1; i < VF; ++i) { Value *Scalar1 = Builder.CreateExtractElement(NewPhi, Builder.getInt32(i)); - if (RdxDesc.Kind == LoopVectorizationLegality::IntegerAdd) { - Scalar0 = Builder.CreateAdd(Scalar0, Scalar1); - } else { - Scalar0 = Builder.CreateMul(Scalar0, Scalar1); + switch (RdxDesc.Kind) { + case LoopVectorizationLegality::IntegerAdd: + Scalar0 = Builder.CreateAdd(Scalar0, Scalar1); + break; + case LoopVectorizationLegality::IntegerMult: + Scalar0 = Builder.CreateMul(Scalar0, Scalar1); + break; + case LoopVectorizationLegality::IntegerOr: + Scalar0 = Builder.CreateOr(Scalar0, Scalar1); + break; + case LoopVectorizationLegality::IntegerAnd: + Scalar0 = Builder.CreateAnd(Scalar0, Scalar1); + break; + case LoopVectorizationLegality::IntegerXor: + Scalar0 = Builder.CreateXor(Scalar0, Scalar1); + break; + default: + llvm_unreachable("Unknown reduction operation"); } } @@ -1099,7 +1116,19 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { continue; } if (AddReductionVar(Phi, IntegerMult)) { - DEBUG(dbgs() << "LV: Found an Mult reduction PHI."<< *Phi <<"\n"); + DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n"); + continue; + } + if (AddReductionVar(Phi, IntegerOr)) { + DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n"); + continue; + } + if (AddReductionVar(Phi, IntegerAnd)) { + DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n"); + continue; + } + if (AddReductionVar(Phi, IntegerXor)) { + DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n"); continue; } @@ -1373,6 +1402,12 @@ 
LoopVectorizationLegality::isReductionInstr(Instruction *I, case Instruction::UDiv: case Instruction::SDiv: return Kind == IntegerMult; + case Instruction::And: + return Kind == IntegerAnd; + case Instruction::Or: + return Kind == IntegerOr; + case Instruction::Xor: + return Kind == IntegerXor; } } -- cgit v1.1 From 3ef9dfa6858e25015c3e36b2f1a0ba5ebdea80d2 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Thu, 25 Oct 2012 21:03:48 +0000 Subject: LoopVectorize: Teach the cost model to query scalar costs as scalar types and not vectors of 1. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166715 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 102 +++++++++++++++++------------ 1 file changed, 61 insertions(+), 41 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 423c7a4..e47baf8 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -324,6 +324,11 @@ private: /// width. Vector width of one means scalar. unsigned getInstructionCost(Instruction *I, unsigned VF); + /// A helper function for converting Scalar types to vector types. + /// If the incoming type is void, we return void. If the VF is 1, we return + /// the scalar type. + static Type* ToVectorTy(Type *Scalar, unsigned VF); + /// The loop that we evaluate. Loop *TheLoop; /// Scev analysis. @@ -1478,8 +1483,16 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { assert(VTTI && "Invalid vector target transformation info"); + + Type *RetTy = I->getType(); + Type *VectorTy = ToVectorTy(RetTy, VF); + + // TODO: We need to estimate the cost of intrinsic calls. 
switch (I->getOpcode()) { case Instruction::GetElementPtr: + // We mark this instruction as zero-cost because scalar GEPs are usually + // lowered to the intruction addressing mode. At the moment we don't + // generate vector geps. return 0; case Instruction::Br: { return VTTI->getInstrCost(I->getOpcode()); @@ -1504,74 +1517,76 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { case Instruction::And: case Instruction::Or: case Instruction::Xor: { - Type *VTy = VectorType::get(I->getType(), VF); - return VTTI->getInstrCost(I->getOpcode(), VTy); + return VTTI->getInstrCost(I->getOpcode(), VectorTy); } case Instruction::Select: { SelectInst *SI = cast(I); - Type *VTy = VectorType::get(I->getType(), VF); const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); Type *CondTy = SI->getCondition()->getType(); if (ScalarCond) CondTy = VectorType::get(CondTy, VF); - return VTTI->getInstrCost(I->getOpcode(), VTy, CondTy); + return VTTI->getInstrCost(I->getOpcode(), VectorTy, CondTy); } case Instruction::ICmp: case Instruction::FCmp: { - Type *VTy = VectorType::get(I->getOperand(0)->getType(), VF); - return VTTI->getInstrCost(I->getOpcode(), VTy); + Type *ValTy = I->getOperand(0)->getType(); + VectorTy = ToVectorTy(ValTy, VF); + return VTTI->getInstrCost(I->getOpcode(), VectorTy); } case Instruction::Store: { StoreInst *SI = cast(I); - Type *VTy = VectorType::get(SI->getValueOperand()->getType(), VF); + Type *ValTy = SI->getValueOperand()->getType(); + VectorTy = ToVectorTy(ValTy, VF); + + if (VF == 1) + return VTTI->getMemoryOpCost(I->getOpcode(), ValTy, + SI->getAlignment(), SI->getPointerAddressSpace()); // Scalarized stores. if (!Legal->isConsecutiveGep(SI->getPointerOperand())) { unsigned Cost = 0; - if (VF != 1) { - unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, - VTy); - // The cost of extracting from the value vector and pointer vector. 
- Cost += VF * (ExtCost * 2); - } + unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, + ValTy); + // The cost of extracting from the value vector. + Cost += VF * (ExtCost); // The cost of the scalar stores. Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), - VTy->getScalarType(), + ValTy->getScalarType(), SI->getAlignment(), SI->getPointerAddressSpace()); return Cost; } // Wide stores. - return VTTI->getMemoryOpCost(I->getOpcode(), VTy, SI->getAlignment(), + return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(), SI->getPointerAddressSpace()); } case Instruction::Load: { LoadInst *LI = cast(I); - Type *VTy = VectorType::get(I->getType(), VF); + + if (VF == 1) + return VTTI->getMemoryOpCost(I->getOpcode(), RetTy, + LI->getAlignment(), + LI->getPointerAddressSpace()); // Scalarized loads. if (!Legal->isConsecutiveGep(LI->getPointerOperand())) { unsigned Cost = 0; - if (VF != 1) { - unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, VTy); - unsigned ExCost = VTTI->getInstrCost(Instruction::ExtractValue, VTy); - - // The cost of inserting the loaded value into the result vector, and - // extracting from a vector of pointers. - Cost += VF * (InCost + ExCost); - } + unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy); + // The cost of inserting the loaded value into the result vector. + Cost += VF * (InCost); // The cost of the scalar stores. - Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), VTy->getScalarType(), + Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), + RetTy->getScalarType(), LI->getAlignment(), LI->getPointerAddressSpace()); return Cost; } // Wide loads. 
- return VTTI->getMemoryOpCost(I->getOpcode(), VTy, LI->getAlignment(), + return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(), LI->getPointerAddressSpace()); } case Instruction::ZExt: @@ -1586,35 +1601,40 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { - Type *SrcTy = VectorType::get(I->getOperand(0)->getType(), VF); - Type *DstTy = VectorType::get(I->getType(), VF); - return VTTI->getInstrCost(I->getOpcode(), DstTy, SrcTy); + Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF); + return VTTI->getInstrCost(I->getOpcode(), VectorTy, SrcVecTy); } default: { // We are scalarizing the instruction. Return the cost of the scalar // instruction, plus the cost of insert and extract into vector // elements, times the vector width. unsigned Cost = 0; - Type *Ty = I->getType(); - if (!Ty->isVoidTy()) { - Type *VTy = VectorType::get(Ty, VF); - unsigned InsCost = VTTI->getInstrCost(Instruction::InsertElement, VTy); - unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, VTy); - Cost += VF * (InsCost + ExtCost); - } + bool IsVoid = RetTy->isVoidTy(); - /// We don't have any information on the scalar instruction, but maybe - /// the target has. - /// TODO: This may be a target-specific intrinsic. - /// Need to add API for that. - Cost += VF * VTTI->getInstrCost(I->getOpcode(), Ty); + unsigned InsCost = (IsVoid ? 0 : + VTTI->getInstrCost(Instruction::InsertElement, + VectorTy)); + unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, + VectorTy); + + // The cost of inserting the results plus extracting each one of the + // operands. + Cost += VF * (InsCost + ExtCost * I->getNumOperands()); + + // The cost of executing VF copies of the scalar instruction. + Cost += VF * VTTI->getInstrCost(I->getOpcode(), RetTy); return Cost; } }// end of switch. 
} +Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) { + if (Scalar->isVoidTy() || VF == 1) + return Scalar; + return VectorType::get(Scalar, VF); +} } // namespace -- cgit v1.1 From a5a3a61c5fdcee972791d4e08441ba6edf131b88 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Fri, 26 Oct 2012 23:49:28 +0000 Subject: Refactor the VectorTargetTransformInfo interface. Add getCostXXX calls for different families of opcodes, such as casts, arithmetic, cmp, etc. Port the LoopVectorizer to the new API. The LoopVectorizer now finds instructions which will remain uniform after vectorization. It uses this information when calculating the cost of these instructions. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166836 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 61 +++++++++++++++++++++++++----- 1 file changed, 52 insertions(+), 9 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index e47baf8..1773812 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -108,7 +108,7 @@ public: createEmptyLoop(Legal); /// Widen each instruction in the old loop to a new one in the new loop. /// Use the Legality module to find the induction and reduction variables. - vectorizeLoop(Legal); + vectorizeLoop(Legal); // register the new loop. cleanup(); } @@ -254,6 +254,9 @@ public: /// This check allows us to vectorize A[idx] into a wide load/store. bool isConsecutiveGep(Value *Ptr); + /// Returns true if this instruction will remain scalar after vectorization. + bool isUniformAfterVectorization(Instruction* I) {return Uniforms.count(I);} + private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count @@ -291,6 +294,9 @@ private: /// Allowed outside users. 
This holds the reduction /// vars which can be accessed from outside the loop. SmallPtrSet AllowedExit; + /// This set holds the variables which are known to be uniform after + /// vectorization. + SmallPtrSet Uniforms; }; /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -1177,9 +1183,40 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { return false; } - // If the memory dependencies do not prevent us from - // vectorizing, then vectorize. - return canVectorizeMemory(BB); + // Don't vectorize if the memory dependencies do not allow vectorization. + if (!canVectorizeMemory(BB)) + return false; + + // We now know that the loop is vectorizable! + // Collect variables that will remain uniform after vectorization. + std::vector Worklist; + + // Start with the conditional branch and walk up the block. + Worklist.push_back(BB.getTerminator()->getOperand(0)); + + while (Worklist.size()) { + Instruction *I = dyn_cast(Worklist.back()); + Worklist.pop_back(); + // Look at instructions inside this block. + if (!I) continue; + if (I->getParent() != &BB) continue; + + // Stop when reaching PHI nodes. + if (isa(I)) { + assert(I == Induction && "Found a uniform PHI that is not the induction"); + break; + } + + // This is a known uniform. + Uniforms.insert(I); + + // Insert all operands. + for (int i=0, Op = I->getNumOperands(); i < Op; ++i) { + Worklist.push_back(I->getOperand(i)); + } + } + + return true; } bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { @@ -1484,9 +1521,15 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { assert(VTTI && "Invalid vector target transformation info"); + // If we know that this instruction will remain uniform, check the cost of + // the scalar version. + if (Legal->isUniformAfterVectorization(I)) + VF = 1; + Type *RetTy = I->getType(); Type *VectorTy = ToVectorTy(RetTy, VF); + // TODO: We need to estimate the cost of intrinsic calls. 
switch (I->getOpcode()) { case Instruction::GetElementPtr: @@ -1495,7 +1538,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { // generate vector geps. return 0; case Instruction::Br: { - return VTTI->getInstrCost(I->getOpcode()); + return VTTI->getCFInstrCost(I->getOpcode()); } case Instruction::PHI: return 0; @@ -1517,7 +1560,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { case Instruction::And: case Instruction::Or: case Instruction::Xor: { - return VTTI->getInstrCost(I->getOpcode(), VectorTy); + return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy); } case Instruction::Select: { SelectInst *SI = cast(I); @@ -1527,13 +1570,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { if (ScalarCond) CondTy = VectorType::get(CondTy, VF); - return VTTI->getInstrCost(I->getOpcode(), VectorTy, CondTy); + return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy); } case Instruction::ICmp: case Instruction::FCmp: { Type *ValTy = I->getOperand(0)->getType(); VectorTy = ToVectorTy(ValTy, VF); - return VTTI->getInstrCost(I->getOpcode(), VectorTy); + return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy); } case Instruction::Store: { StoreInst *SI = cast(I); @@ -1602,7 +1645,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { case Instruction::FPTrunc: case Instruction::BitCast: { Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF); - return VTTI->getInstrCost(I->getOpcode(), VectorTy, SrcVecTy); + return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); } default: { // We are scalarizing the instruction. Return the cost of the scalar -- cgit v1.1 From f065a8467785015336432e3e6e584798d8b48d8e Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Sat, 27 Oct 2012 04:11:32 +0000 Subject: 1. Fix a bug in getTypeConversion. When a *simple* type is split, we need to return the type of the split result. 2. 
Change the maximum vectorization width from 4 to 8. 3. A test for both. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166864 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 1773812..be197db 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -317,7 +317,7 @@ public: /// Returns the most profitable vectorization factor for the loop that is /// smaller or equal to the VF argument. This method checks every power /// of two up to VF. - unsigned findBestVectorizationFactor(unsigned VF = 4); + unsigned findBestVectorizationFactor(unsigned VF = 8); private: /// Returns the expected execution cost. The unit of the cost does -- cgit v1.1 From 369ff7b740c154dc4357c6d063f4957498a1e7b9 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Mon, 29 Oct 2012 21:52:38 +0000 Subject: LoopVectorize: Update and preserve the dominator tree info. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166970 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 46 ++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 9 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index be197db..dcdf0a3 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -55,6 +55,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/LoopInfo.h" @@ -98,8 +99,9 @@ class SingleBlockLoopVectorizer { public: /// Ctor. SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li, - LPPassManager *Lpm, unsigned VecWidth): - OrigLoop(Orig), SE(Se), LI(Li), LPM(Lpm), VF(VecWidth), + DominatorTree *dt, LPPassManager *Lpm, + unsigned VecWidth): + OrigLoop(Orig), SE(Se), LI(Li), DT(dt), LPM(Lpm), VF(VecWidth), Builder(Se->getContext()), Induction(0), OldInduction(0) { } // Perform the actual loop widening (vectorization). @@ -110,7 +112,7 @@ public: /// Use the Legality module to find the induction and reduction variables. vectorizeLoop(Legal); // register the new loop. - cleanup(); + updateAnalysis(); } private: @@ -119,7 +121,7 @@ private: /// Copy and widen the instructions from the old loop. void vectorizeLoop(LoopVectorizationLegality *Legal); /// Insert the new loop to the loop hierarchy and pass manager. - void cleanup(); + void updateAnalysis(); /// This instruction is un-vectorizable. Implement it as a sequence /// of scalars. @@ -155,6 +157,8 @@ private: ScalarEvolution *SE; // Loop Info. LoopInfo *LI; + // Dominator Tree. 
+ DominatorTree *DT; // Loop Pass Manager; LPPassManager *LPM; // The vectorization factor to use. @@ -165,6 +169,10 @@ private: // --- Vectorization state --- + /// The vector-loop preheader. + BasicBlock *LoopVectorPreHeader; + /// The scalar-loop preheader. + BasicBlock *LoopScalarPreHeader; /// Middle Block between the vector and the scalar. BasicBlock *LoopMiddleBlock; ///The ExitBlock of the scalar loop. @@ -357,6 +365,7 @@ struct LoopVectorize : public LoopPass { DataLayout *DL; LoopInfo *LI; TargetTransformInfo *TTI; + DominatorTree *DT; virtual bool runOnLoop(Loop *L, LPPassManager &LPM) { // We only vectorize innermost loops. @@ -367,6 +376,7 @@ struct LoopVectorize : public LoopPass { DL = getAnalysisIfAvailable(); LI = &getAnalysis(); TTI = getAnalysisIfAvailable(); + DT = &getAnalysis(); DEBUG(dbgs() << "LV: Checking a loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); @@ -401,7 +411,7 @@ struct LoopVectorize : public LoopPass { DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ").\n"); // If we decided that it is *legal* to vectorizer the loop then do it. - SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, VF); + SingleBlockLoopVectorizer LB(L, SE, LI, DT, &LPM, VF); LB.vectorize(&LVL); DEBUG(verifyFunction(*L->getHeader()->getParent())); @@ -414,6 +424,9 @@ struct LoopVectorize : public LoopPass { AU.addRequiredID(LCSSAID); AU.addRequired(); AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + AU.addPreserved(); } }; @@ -725,6 +738,8 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal } // Save the state. + LoopVectorPreHeader = VectorPH; + LoopScalarPreHeader = ScalarPH; LoopMiddleBlock = MiddleBlock; LoopExitBlock = ExitBlock; LoopVectorBody = VecBody; @@ -855,8 +870,8 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // The last index does not have to be the induction. It can be // consecutive and be a function of the index. 
For example A[I+1]; unsigned NumOperands = Gep->getNumOperands(); - Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1)); - LastIndex = Builder.CreateExtractElement(LastIndex, Builder.getInt32(0)); + Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1)); + LastIndex = Builder.CreateExtractElement(LastIndex, Zero); // Create the new GEP with the new induction variable. GetElementPtrInst *Gep2 = cast(Gep->clone()); @@ -885,7 +900,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // consecutive and be a function of the index. For example A[I+1]; unsigned NumOperands = Gep->getNumOperands(); Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1)); - LastIndex = Builder.CreateExtractElement(LastIndex, Builder.getInt32(0)); + LastIndex = Builder.CreateExtractElement(LastIndex, Zero); // Create the new GEP with the new induction variable. GetElementPtrInst *Gep2 = cast(Gep->clone()); @@ -1051,9 +1066,22 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { }// end of for each redux variable. } -void SingleBlockLoopVectorizer::cleanup() { +void SingleBlockLoopVectorizer::updateAnalysis() { // The original basic block. SE->forgetLoop(OrigLoop); + + // Update the dominator tree information. 
+ assert(DT->properlyDominates(LoopBypassBlock, LoopExitBlock) && + "Entry does not dominate exit."); + + DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlock); + DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader); + DT->addNewBlock(LoopMiddleBlock, LoopBypassBlock); + DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock); + DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); + DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); + + DEBUG(DT->verifyAnalysis()); } bool LoopVectorizationLegality::canVectorize() { -- cgit v1.1 From 803ea1c40aa059adf5b1baef0bf4510483565d07 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Tue, 30 Oct 2012 00:40:39 +0000 Subject: LoopVectorizer: change debug prints: Print the module identifier when deciding to vectorize. When deciding not to vectorize do not print the called function name because it can be null. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166989 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index dcdf0a3..0e6103e 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -408,7 +408,9 @@ struct LoopVectorize : public LoopPass { VF = VectorizationFactor; } - DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ").\n"); + DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<< + L->getHeader()->getParent()->getParent()->getModuleIdentifier()<< + "\n"); // If we decided that it is *legal* to vectorizer the loop then do it. 
SingleBlockLoopVectorizer LB(L, SE, LI, DT, &LPM, VF); @@ -597,7 +599,8 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { WidenMap[Instr] = VecResults; } -void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { +void +SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { /* In this function we generate a new loop. The new loop will contain the vectorized instructions while the old loop will continue to run the @@ -1178,8 +1181,7 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { // We still don't handle functions. CallInst *CI = dyn_cast(I); if (CI) { - DEBUG(dbgs() << "LV: Found a call site:"<< - CI->getCalledFunction()->getName() << "\n"); + DEBUG(dbgs() << "LV: Found a call site.\n"); return false; } -- cgit v1.1 From cc03331caaeee00b1238654feda8c5a517e48c3a Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Tue, 30 Oct 2012 18:12:36 +0000 Subject: LoopVectorize: Fix a bug in the initialization of reduction variables. AND needs to start at all-one while XOR, and OR need to start at zero. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167032 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 0e6103e..ac82a66 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -211,8 +211,6 @@ public: TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { } /// This represents the kinds of reductions that we support. - /// We use the enum values to hold the 'identity' value for - /// each operand. This value does not change the result if applied. enum ReductionKind { NoReduction = -1, /// Not a reduction. IntegerAdd = 0, /// Sum of numbers. 
@@ -523,7 +521,7 @@ SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) { SmallVector Indices; // Create a vector of consecutive numbers from zero to VF. for (unsigned i = 0; i < VF; ++i) - Indices.push_back(ConstantInt::get(ScalarTy, Val)); + Indices.push_back(ConstantInt::get(ScalarTy, Val, true)); // Add the consecutive indices to the vector value. return ConstantVector::get(Indices); @@ -750,6 +748,23 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { LoopBypassBlock = BypassBlock; } + +static unsigned +getReductionIdentity(LoopVectorizationLegality::ReductionKind K) { + switch (K) { + case LoopVectorizationLegality::IntegerXor: + case LoopVectorizationLegality::IntegerAdd: + case LoopVectorizationLegality::IntegerOr: + return 0; + case LoopVectorizationLegality::IntegerMult: + return 1; + case LoopVectorizationLegality::IntegerAnd: + return -1; + default: + llvm_unreachable("Unknown reduction kind"); + } +} + void SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { //===------------------------------------------------===// @@ -974,10 +989,9 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr); Type *VecTy = VectorExit->getType(); - // Find the reduction identity variable. The value of the enum is the - // identity. Zero for addition. One for Multiplication. - unsigned IdentitySclr = RdxDesc.Kind; - Constant *Identity = getUniformVector(IdentitySclr, + // Find the reduction identity variable. Zero for addition, or, xor, + // one for multiplication, -1 for And. 
+ Constant *Identity = getUniformVector(getReductionIdentity(RdxDesc.Kind), VecTy->getScalarType()); // This vector is the Identity vector where the first element is the -- cgit v1.1 From e709f5b600fd630c4f58b5dba14c8069a03093ea Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Tue, 30 Oct 2012 18:36:45 +0000 Subject: LoopVectorize: Add support for write-only loops when the write destination is a single pointer. Speedup SciMark by 1% git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167035 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index ac82a66..9e05cac 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1348,6 +1348,13 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { Reads.push_back(Ptr); } + // If we write (or read-write) to a single destination and there are no + // other reads in this loop then it is safe to vectorize. + if (ReadWrites.size() == 1 && Reads.size() == 0) { + DEBUG(dbgs() << "LV: Found a write-only loop!\n"); + return true; + } + // Now that the pointers are in two lists (Reads and ReadWrites), we // can check that there are no conflicts between each of the writes and // between the writes to the reads. -- cgit v1.1 From a368b89f2842530d07c0ac8e3b533882e165f197 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Tue, 30 Oct 2012 22:06:26 +0000 Subject: Add documentation.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167055 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 9e05cac..431a847 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -749,16 +749,21 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { } +/// This function returns the identity element (or neutral element) for +/// the operation K. static unsigned getReductionIdentity(LoopVectorizationLegality::ReductionKind K) { switch (K) { case LoopVectorizationLegality::IntegerXor: case LoopVectorizationLegality::IntegerAdd: case LoopVectorizationLegality::IntegerOr: + // Adding, Xoring, Oring zero to a number does not change it. return 0; case LoopVectorizationLegality::IntegerMult: + // Multiplying a number by 1 does not change it. return 1; case LoopVectorizationLegality::IntegerAnd: + // AND-ing a number with an all-1 value does not change it. return -1; default: llvm_unreachable("Unknown reduction kind"); -- cgit v1.1 From 462d1ca42831df53a3c6435d247776f0b85bd594 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Wed, 31 Oct 2012 00:45:26 +0000 Subject: Add support for loops that don't start at zero. This is important for loops in the LAPACK test-suite. These loops start at 1 because they are auto-converted from Fortran.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167084 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 31 ++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 431a847..e82dfa2 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -633,6 +633,10 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { BasicBlock *ExitBlock = OrigLoop->getExitBlock(); assert(ExitBlock && "Must have an exit block"); + // The loop index does not have to start at Zero. It starts with this value. + OldInduction = Legal->getInduction(); + Value *StartIdx = OldInduction->getIncomingValueForBlock(BypassBlock); + assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop"); assert(BypassBlock && "Invalid loop structure"); @@ -648,7 +652,6 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { "scalar.preheader"); // Find the induction variable. BasicBlock *OldBasicBlock = OrigLoop->getHeader(); - OldInduction = Legal->getInduction(); assert(OldInduction && "We must have a single phi node."); Type *IdxTy = OldInduction->getType(); @@ -658,7 +661,6 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // Generate the induction variable. Induction = Builder.CreatePHI(IdxTy, 2, "index"); - Constant *Zero = ConstantInt::get(IdxTy, 0); Constant *Step = ConstantInt::get(IdxTy, VF); // Find the loop boundaries. @@ -682,15 +684,22 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // Count holds the overall loop count (N). Value *Count = Exp.expandCodeFor(ExitCount, Induction->getType(), Loc); + + // Add the start index to the loop count to get the new end index. 
+ Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc); + // Now we need to generate the expression for N - (N % VF), which is // the part that the vectorized body will execute. Constant *CIVF = ConstantInt::get(IdxTy, VF); Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc); Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc); + Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx, + "end.idx.rnd.down", Loc); // Now, compare the new count to zero. If it is zero, jump to the scalar part. Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, - CountRoundDown, ConstantInt::getNullValue(IdxTy), + IdxEndRoundDown, + StartIdx, "cmp.zero", Loc); BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc); // Remove the old terminator. @@ -699,8 +708,8 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // Add a check in the middle block to see if we have completed // all of the iterations in the first vector loop. // If (N - N%VF) == N, then we *don't* need to run the remainder. - Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, - CountRoundDown, "cmp.n", + Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd, + IdxEndRoundDown, "cmp.n", MiddleBlock->getTerminator()); BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator()); @@ -709,10 +718,10 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // Create i+1 and fill the PHINode. Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next"); - Induction->addIncoming(Zero, VectorPH); + Induction->addIncoming(StartIdx, VectorPH); Induction->addIncoming(NextIdx, VecBody); // Create the compare. - Value *ICmp = Builder.CreateICmpEQ(NextIdx, CountRoundDown); + Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown); Builder.CreateCondBr(ICmp, MiddleBlock, VecBody); // Now we have two terminators. 
Remove the old one from the block. @@ -720,7 +729,7 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // Fix the scalar body iteration count. unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH); - OldInduction->setIncomingValue(BlockIdx, CountRoundDown); + OldInduction->setIncomingValue(BlockIdx, IdxEndRoundDown); // Get ready to start creating new instructions into the vectorized body. Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); @@ -748,7 +757,6 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { LoopBypassBlock = BypassBlock; } - /// This function returns the identity element (or neutral element) for /// the operation K. static unsigned @@ -1518,10 +1526,9 @@ bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { return false; } const SCEV *Step = AR->getStepRecurrence(*SE); - const SCEV *Start = AR->getStart(); - if (!Step->isOne() || !Start->isZero()) { - DEBUG(dbgs() << "LV: PHI does not start at zero or steps by one.\n"); + if (!Step->isOne()) { + DEBUG(dbgs() << "LV: PHI stride does not equal one.\n"); return false; } return true; -- cgit v1.1 From 5b2c4dc5f8f70352f029d595797089821eb39b3c Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Wed, 31 Oct 2012 03:31:07 +0000 Subject: LoopVectorize: Do not vectorize loops with tiny constant trip counts. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167101 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index e82dfa2..fd728bd 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1145,6 +1145,14 @@ bool LoopVectorizationLegality::canVectorize() { return false; } + // Do not loop-vectorize loops with a tiny trip count. 
+ unsigned TC = SE->getSmallConstantTripCount(TheLoop, BB); + if (TC > 0 && TC < 16) { + DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " << + "This loop is not worth vectorizing.\n"); + return false; + } + DEBUG(dbgs() << "LV: We can vectorize this loop!\n"); // Okay! We can vectorize. At this point we don't have any other mem analysis -- cgit v1.1 From 2a5401cc75acf38092a7224cadaf895281d41b29 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Wed, 31 Oct 2012 16:14:06 +0000 Subject: Remove enum values since they are not used anymore. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167131 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index fd728bd..40235ef 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -212,12 +212,12 @@ public: /// This represents the kinds of reductions that we support. enum ReductionKind { - NoReduction = -1, /// Not a reduction. - IntegerAdd = 0, /// Sum of numbers. - IntegerMult = 1, /// Product of numbers. - IntegerOr = 2, /// Bitwise or logical OR of numbers. - IntegerAnd = 3, /// Bitwise or logical AND of numbers. - IntegerXor = 4 /// Bitwise or logical XOR of numbers. + NoReduction, /// Not a reduction. + IntegerAdd, /// Sum of numbers. + IntegerMult, /// Product of numbers. + IntegerOr, /// Bitwise or logical OR of numbers. + IntegerAnd, /// Bitwise or logical AND of numbers. + IntegerXor /// Bitwise or logical XOR of numbers. }; /// This POD struct holds information about reduction variables. -- cgit v1.1 From 4c1b4b1fe794437cbb245b11650d9e4001c9605e Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Wed, 31 Oct 2012 16:22:16 +0000 Subject: Put the threshold magic number in a variable. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167134 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 40235ef..94e56a1 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -75,6 +75,9 @@ static cl::opt VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, cl::desc("Set the default vectorization width. Zero is autoselect.")); +/// We don't vectorize loops with a known constant trip count below this number. +const int TinyTripCountThreshold = 16; + namespace { // Forward declarations. @@ -1147,7 +1150,7 @@ bool LoopVectorizationLegality::canVectorize() { // Do not loop-vectorize loops with a tiny trip count. unsigned TC = SE->getSmallConstantTripCount(TheLoop, BB); - if (TC > 0 && TC < 16) { + if (TC > 0 && TC < TinyTripCountThreshold) { DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " << "This loop is not worth vectorizing.\n"); return false; -- cgit v1.1 From e57b2cbce652d45ed5516e52ad82991bfa03cfd7 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Wed, 31 Oct 2012 21:40:39 +0000 Subject: LoopVectorize: Preserve NSW, NUW and IsExact flags. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167174 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 94e56a1..c9871e2 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -849,8 +849,19 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { BinaryOperator *BinOp = dyn_cast(Inst); Value *A = getVectorValue(Inst->getOperand(0)); Value *B = getVectorValue(Inst->getOperand(1)); + // Use this vector value for all users of the original instruction. - WidenMap[Inst] = Builder.CreateBinOp(BinOp->getOpcode(), A, B); + Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B); + WidenMap[Inst] = V; + + // Update the NSW, NUW and Exact flags. + BinaryOperator *VecOp = cast(V); + if (isa(BinOp)) { + VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap()); + VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap()); + } + if (isa(VecOp)) + VecOp->setIsExact(BinOp->isExact()); break; } case Instruction::Select: { -- cgit v1.1 From 6e48f0307758096d06d0e87875294c76df81dec1 Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Fri, 2 Nov 2012 05:24:00 +0000 Subject: Fix sign compare warning. Patch by Mahesha HS. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167282 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index c9871e2..8928087 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -76,7 +76,7 @@ VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, cl::desc("Set the default vectorization width. Zero is autoselect.")); /// We don't vectorize loops with a known constant trip count below this number. -const int TinyTripCountThreshold = 16; +const unsigned TinyTripCountThreshold = 16; namespace { @@ -1161,7 +1161,7 @@ bool LoopVectorizationLegality::canVectorize() { // Do not loop-vectorize loops with a tiny trip count. unsigned TC = SE->getSmallConstantTripCount(TheLoop, BB); - if (TC > 0 && TC < TinyTripCountThreshold) { + if (TC > 0u && TC < TinyTripCountThreshold) { DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " << "This loop is not worth vectorizing.\n"); return false; -- cgit v1.1 From b1bf1eeede72b8c93505dd80fdf21aed0e205c7d Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Fri, 9 Nov 2012 07:09:44 +0000 Subject: Add support for memory runtime check. When we can, we calculate array bounds. If the arrays are found to be disjoint then we run the vectorized version of the loop. If they are not, we run the scalar code. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167608 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 224 ++++++++++++++++++++++++----- 1 file changed, 191 insertions(+), 33 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 8928087..b657993 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -78,6 +78,10 @@ VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, /// We don't vectorize loops with a known constant trip count below this number. const unsigned TinyTripCountThreshold = 16; +/// When performing a runtime memory check, do not check more than this +/// numner of pointers. Notice that the check is quadratic! +const unsigned RuntimeMemoryCheckThreshold = 2; + namespace { // Forward declarations. @@ -242,6 +246,15 @@ public: ReductionKind Kind; }; + // This POD struct holds information about the memory runtime legality + // check that a group of pointers do not overlap. + struct RuntimePointerCheck { + /// This flag indicates if we need to add the runtime check. + bool Need; + /// Holds the pointers that we need to check. + SmallVector Pointers; + }; + /// ReductionList contains the reduction descriptors for all /// of the reductions that were found in the loop. typedef DenseMap ReductionList; @@ -263,9 +276,14 @@ public: /// This check allows us to vectorize A[idx] into a wide load/store. bool isConsecutiveGep(Value *Ptr); + /// Returns true if the value V is uniform within the loop. + bool isUniform(Value *V); + /// Returns true if this instruction will remain scalar after vectorization. bool isUniformAfterVectorization(Instruction* I) {return Uniforms.count(I);} + /// Returns the information that we collected about runtime memory check. 
+ RuntimePointerCheck *getRuntimePointerCheck() {return &PtrRtCheck; } private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count @@ -286,6 +304,8 @@ private: bool isReductionInstr(Instruction *I, ReductionKind Kind); /// Returns True, if 'Phi' is an induction variable. bool isInductionVariable(PHINode *Phi); + /// Return true if we + bool hasComputableBounds(Value *Ptr); /// The loop that we evaluate. Loop *TheLoop; @@ -306,6 +326,9 @@ private: /// This set holds the variables which are known to be uniform after /// vectorization. SmallPtrSet Uniforms; + /// We need to check that all of the pointers in this list are disjoint + /// at runtime. + RuntimePointerCheck PtrRtCheck; }; /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -506,6 +529,10 @@ bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) { return false; } +bool LoopVectorizationLegality::isUniform(Value *V) { + return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop)); +} + Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) { assert(!V->getType()->isVectorTy() && "Can't widen a vector"); // If we saved a vectorized copy of V, use it. @@ -631,13 +658,29 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { ... */ + OldInduction = Legal->getInduction(); + assert(OldInduction && "We must have a single phi node."); + Type *IdxTy = OldInduction->getType(); + + // Find the loop boundaries. + const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader()); + assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); + + // Get the total trip count from the count by adding 1. + ExitCount = SE->getAddExpr(ExitCount, + SE->getConstant(ExitCount->getType(), 1)); + // We may need to extend the index in case there is a type mismatch. + // We know that the count starts at zero and does not overflow. 
+ // We are using Zext because it should be less expensive. + if (ExitCount->getType() != IdxTy) + ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy); + // This is the original scalar-loop preheader. BasicBlock *BypassBlock = OrigLoop->getLoopPreheader(); BasicBlock *ExitBlock = OrigLoop->getExitBlock(); assert(ExitBlock && "Must have an exit block"); // The loop index does not have to start at Zero. It starts with this value. - OldInduction = Legal->getInduction(); Value *StartIdx = OldInduction->getIncomingValueForBlock(BypassBlock); assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop"); @@ -655,8 +698,6 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { "scalar.preheader"); // Find the induction variable. BasicBlock *OldBasicBlock = OrigLoop->getHeader(); - assert(OldInduction && "We must have a single phi node."); - Type *IdxTy = OldInduction->getType(); // Use this IR builder to create the loop instructions (Phi, Br, Cmp) // inside the loop. @@ -666,25 +707,11 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { Induction = Builder.CreatePHI(IdxTy, 2, "index"); Constant *Step = ConstantInt::get(IdxTy, VF); - // Find the loop boundaries. - const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader()); - assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); - - // Get the total trip count from the count by adding 1. - ExitCount = SE->getAddExpr(ExitCount, - SE->getConstant(ExitCount->getType(), 1)); - // Expand the trip count and place the new instructions in the preheader. // Notice that the pre-header does not change, only the loop body. SCEVExpander Exp(*SE, "induction"); Instruction *Loc = BypassBlock->getTerminator(); - // We may need to extend the index in case there is a type mismatch. - // We know that the count starts at zero and does not overflow. - // We are using Zext because it should be less expensive. 
- if (ExitCount->getType() != Induction->getType()) - ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy); - // Count holds the overall loop count (N). Value *Count = Exp.expandCodeFor(ExitCount, Induction->getType(), Loc); @@ -704,15 +731,85 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { IdxEndRoundDown, StartIdx, "cmp.zero", Loc); + + LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck = + Legal->getRuntimePointerCheck(); + Value *MemoryRuntimeCheck = 0; + if (PtrRtCheck->Need) { + unsigned NumPointers = PtrRtCheck->Pointers.size(); + SmallVector Starts; + SmallVector Ends; + + // Use this type for pointer arithmetic. + Type* PtrArithTy = PtrRtCheck->Pointers[0]->getType(); + + for (unsigned i=0; i < NumPointers; ++i) { + Value *Ptr = PtrRtCheck->Pointers[i]; + const SCEV *Sc = SE->getSCEV(Ptr); + + if (SE->isLoopInvariant(Sc, OrigLoop)) { + DEBUG(dbgs() << "LV1: Adding RT check for a loop invariant ptr:" << + *Ptr <<"\n"); + Starts.push_back(Ptr); + Ends.push_back(Ptr); + } else { + DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n"); + const SCEVAddRecExpr *AR = dyn_cast(Sc); + Value *Start = Exp.expandCodeFor(AR->getStart(), PtrArithTy, Loc); + const SCEV *Ex = SE->getExitCount(OrigLoop, OrigLoop->getHeader()); + const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE); + assert(!isa(ScEnd) && "Invalid scev range."); + Value *End = Exp.expandCodeFor(ScEnd, PtrArithTy, Loc); + Starts.push_back(Start); + Ends.push_back(End); + } + } + + for (unsigned i=0; i < NumPointers; ++i) { + for (unsigned j=i+1; j < NumPointers; ++j) { + Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, + Starts[0], Ends[1], "bound0", Loc); + Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, + Starts[1], Ends[0], "bound1", Loc); + Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1, + "found.conflict", Loc); + if (MemoryRuntimeCheck) { + MemoryRuntimeCheck = 
BinaryOperator::Create(Instruction::Or, + MemoryRuntimeCheck, + IsConflict, + "conflict.rdx", Loc); + } else { + MemoryRuntimeCheck = IsConflict; + } + } + } + }// end of need-runtime-check code. + + // If we are using memory runtime checks, include them in. + if (MemoryRuntimeCheck) { + Cmp = BinaryOperator::Create(Instruction::Or, Cmp, MemoryRuntimeCheck, + "CntOrMem", Loc); + } + BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc); // Remove the old terminator. Loc->eraseFromParent(); + // We are going to resume the execution of the scalar loop. + // This PHI decides on what number to start. If we come from the + // vector loop then we need to start with the end index minus the + // index modulo VF. If we come from a bypass edge then we need to start + // from the real start. + PHINode* ResumeIndex = PHINode::Create(IdxTy, 2, "resume.idx", + MiddleBlock->getTerminator()); + ResumeIndex->addIncoming(StartIdx, BypassBlock); + ResumeIndex->addIncoming(IdxEndRoundDown, VecBody); + // Add a check in the middle block to see if we have completed // all of the iterations in the first vector loop. // If (N - N%VF) == N, then we *don't* need to run the remainder. Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd, - IdxEndRoundDown, "cmp.n", + ResumeIndex, "cmp.n", MiddleBlock->getTerminator()); BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator()); @@ -732,7 +829,7 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // Fix the scalar body iteration count. unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH); - OldInduction->setIncomingValue(BlockIdx, IdxEndRoundDown); + OldInduction->setIncomingValue(BlockIdx, ResumeIndex); // Get ready to start creating new instructions into the vectorized body. 
Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); @@ -905,7 +1002,12 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF); Value *Ptr = SI->getPointerOperand(); unsigned Alignment = SI->getAlignment(); + + assert(!Legal->isUniform(Ptr) && + "We do not allow storing to uniform addresses"); + GetElementPtrInst *Gep = dyn_cast(Ptr); + // This store does not use GEPs. if (!Legal->isConsecutiveGep(Gep)) { scalarizeInstruction(Inst); @@ -935,8 +1037,9 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { unsigned Alignment = LI->getAlignment(); GetElementPtrInst *Gep = dyn_cast(Ptr); - // We don't have a gep. Scalarize the load. - if (!Legal->isConsecutiveGep(Gep)) { + // If we don't have a gep, or that the pointer is loop invariant, + // scalarize the load. + if (!Gep || Legal->isUniform(Gep) || !Legal->isConsecutiveGep(Gep)) { scalarizeInstruction(Inst); break; } @@ -1146,12 +1249,6 @@ bool LoopVectorizationLegality::canVectorize() { BasicBlock *BB = TheLoop->getHeader(); DEBUG(dbgs() << "LV: Found a loop: " << BB->getName() << "\n"); - // Go over each instruction and look at memory deps. - if (!canVectorizeBlock(*BB)) { - DEBUG(dbgs() << "LV: Can't vectorize this loop header\n"); - return false; - } - // ScalarEvolution needs to be able to find the exit count. const SCEV *ExitCount = SE->getExitCount(TheLoop, BB); if (ExitCount == SE->getCouldNotCompute()) { @@ -1167,7 +1264,15 @@ bool LoopVectorizationLegality::canVectorize() { return false; } - DEBUG(dbgs() << "LV: We can vectorize this loop!\n"); + // Go over each instruction and look at memory deps. + if (!canVectorizeBlock(*BB)) { + DEBUG(dbgs() << "LV: Can't vectorize this loop header\n"); + return false; + } + + DEBUG(dbgs() << "LV: We can vectorize this loop" << + (PtrRtCheck.Need ? " (with a runtime bound check)" : "") + <<"!\n"); // Okay! We can vectorize. 
At this point we don't have any other mem analysis // which may limit our maximum vectorization factor, so just return true with @@ -1304,6 +1409,8 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { // Holds the Load and Store *instructions*. ValueVector Loads; ValueVector Stores; + PtrRtCheck.Pointers.clear(); + PtrRtCheck.Need = false; // Scan the BB and collect legal loads and stores. for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { @@ -1361,6 +1468,12 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { StoreInst *ST = dyn_cast(*I); assert(ST && "Bad StoreInst"); Value* Ptr = ST->getPointerOperand(); + + if (isUniform(Ptr)) { + DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n"); + return false; + } + // If we did *not* see this pointer before, insert it to // the read-write list. At this phase it is only a 'write' list. if (Seen.insert(Ptr)) @@ -1390,6 +1503,39 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { return true; } + // Find pointers with computable bounds. We are going to use this information + // to place a runtime bound check. + bool RT = true; + for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) + if (hasComputableBounds(*I)) { + PtrRtCheck.Pointers.push_back(*I); + DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n"); + } else { + RT = false; + break; + } + for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) + if (hasComputableBounds(*I)) { + PtrRtCheck.Pointers.push_back(*I); + DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n"); + } else { + RT = false; + break; + } + + // Check that we did not collect too many pointers or found a + // unsizeable pointer. 
+ if (!RT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) { + PtrRtCheck.Pointers.clear(); + RT = false; + } + + PtrRtCheck.Need = RT; + + if (RT) { + DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n"); + } + // Now that the pointers are in two lists (Reads and ReadWrites), we // can check that there are no conflicts between each of the writes and // between the writes to the reads. @@ -1404,12 +1550,12 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { it != e; ++it) { if (!isIdentifiedObject(*it)) { DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n"); - return false; + return RT; } if (!WriteObjects.insert(*it)) { DEBUG(dbgs() << "LV: Found a possible write-write reorder:" << **it <<"\n"); - return false; + return RT; } } TempObjects.clear(); @@ -1422,18 +1568,21 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { it != e; ++it) { if (!isIdentifiedObject(*it)) { DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n"); - return false; + return RT; } if (WriteObjects.count(*it)) { DEBUG(dbgs() << "LV: Found a possible read/write reorder:" << **it <<"\n"); - return false; + return RT; } } TempObjects.clear(); } - // All is okay. + // It is safe to vectorize and we don't need any runtime checks. 
+ DEBUG(dbgs() << "LV: We don't need a runtime memory check.\n"); + PtrRtCheck.Pointers.clear(); + PtrRtCheck.Need = false; return true; } @@ -1556,6 +1705,15 @@ bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { return true; } +bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) { + const SCEV *PhiScev = SE->getSCEV(Ptr); + const SCEVAddRecExpr *AR = dyn_cast(PhiScev); + if (!AR) + return false; + + return AR->isAffine(); +} + unsigned LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) { if (!VTTI) { -- cgit v1.1 From 9005b8d27d68caeb893d74aa75cd679972d86cef Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Sun, 11 Nov 2012 05:15:00 +0000 Subject: Fix a comment typo and add comments. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167684 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index b657993..a7ef248 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -79,7 +79,7 @@ VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, const unsigned TinyTripCountThreshold = 16; /// When performing a runtime memory check, do not check more than this -/// numner of pointers. Notice that the check is quadratic! +/// number of pointers. Notice that the check is quadratic! const unsigned RuntimeMemoryCheckThreshold = 2; namespace { @@ -118,7 +118,7 @@ public: /// Widen each instruction in the old loop to a new one in the new loop. /// Use the Legality module to find the induction and reduction variables. vectorizeLoop(Legal); - // register the new loop. + // Register the new loop and update the analysis passes. 
updateAnalysis(); } @@ -127,7 +127,8 @@ private: void createEmptyLoop(LoopVectorizationLegality *Legal); /// Copy and widen the instructions from the old loop. void vectorizeLoop(LoopVectorizationLegality *Legal); - /// Insert the new loop to the loop hierarchy and pass manager. + /// Insert the new loop to the loop hierarchy and pass manager + /// and update the analysis passes. void updateAnalysis(); /// This instruction is un-vectorizable. Implement it as a sequence @@ -304,7 +305,7 @@ private: bool isReductionInstr(Instruction *I, ReductionKind Kind); /// Returns True, if 'Phi' is an induction variable. bool isInductionVariable(PHINode *Phi); - /// Return true if we + /// Return true if can compute the address bounds of Ptr within the loop. bool hasComputableBounds(Value *Ptr); /// The loop that we evaluate. -- cgit v1.1 From e123fd9c5956962d414693ee8ad83498413af6ce Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Wed, 14 Nov 2012 00:02:13 +0000 Subject: use the getSplat API. Patch by Paul Redmond. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167892 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index a7ef248..12e4db6 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -549,13 +549,7 @@ Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) { Constant* SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) { - SmallVector Indices; - // Create a vector of consecutive numbers from zero to VF. - for (unsigned i = 0; i < VF; ++i) - Indices.push_back(ConstantInt::get(ScalarTy, Val, true)); - - // Add the consecutive indices to the vector value. 
- return ConstantVector::get(Indices); + return ConstantVector::getSplat(VF, ConstantInt::get(ScalarTy, Val, true)); } void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { -- cgit v1.1 From 77d6ae1b01f86cc82593c8f67118b7c2d6fd17c2 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Fri, 16 Nov 2012 06:51:17 +0000 Subject: LoopVectorize: Division reductions generate incorrect code. Remove the part of the code that deals with divs. Thanks to Paul Redmond for catching this while reviewing the code. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168142 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 2 -- 1 file changed, 2 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 12e4db6..31e0e86 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1671,8 +1671,6 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I, case Instruction::Sub: return Kind == IntegerAdd; case Instruction::Mul: - case Instruction::UDiv: - case Instruction::SDiv: return Kind == IntegerMult; case Instruction::And: return Kind == IntegerAnd; -- cgit v1.1 From 9a6823516ffd6ed1787d923459c80a6fa4833914 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Sat, 17 Nov 2012 00:27:03 +0000 Subject: LoopVectorizer: Add initial support for pointer induction variables (for example: *dst++ = *src++). At the moment we still require to have an integer induction variable (for example: i++). 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168231 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 159 +++++++++++++++++++++++------ 1 file changed, 126 insertions(+), 33 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 31e0e86..3f1d82c 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -260,6 +260,10 @@ public: /// of the reductions that were found in the loop. typedef DenseMap ReductionList; + /// InductionList saves induction variables and maps them to the initial + /// value entring the loop. + typedef DenseMap InductionList; + /// Returns true if it is legal to vectorize this loop. /// This does not mean that it is profitable to vectorize this /// loop, only that it is legal to do so. @@ -271,6 +275,9 @@ public: /// Returns the reduction variables found in the loop. ReductionList *getReductionVars() { return &Reductions; } + /// Returns the induction variables found in the loop. + InductionList *getInductionVars() { return &Inductions; } + /// Check if the pointer returned by this GEP is consecutive /// when the index is vectorized. This happens when the last /// index of the GEP is consecutive, like the induction variable. @@ -317,10 +324,16 @@ private: // --- vectorization state --- // - /// Holds the induction variable. + /// Holds the integer induction variable. This is the counter of the + /// loop. PHINode *Induction; /// Holds the reduction variables. ReductionList Reductions; + /// Holds all of the induction variables that we found in the loop. + /// Notice that inductions don't need to start at zero and that induction + /// variables can be pointers. + InductionList Inductions; + /// Allowed outside users. This holds the reduction /// vars which can be accessed from outside the loop. 
SmallPtrSet AllowedExit; @@ -791,14 +804,50 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { Loc->eraseFromParent(); // We are going to resume the execution of the scalar loop. - // This PHI decides on what number to start. If we come from the - // vector loop then we need to start with the end index minus the - // index modulo VF. If we come from a bypass edge then we need to start - // from the real start. - PHINode* ResumeIndex = PHINode::Create(IdxTy, 2, "resume.idx", - MiddleBlock->getTerminator()); - ResumeIndex->addIncoming(StartIdx, BypassBlock); - ResumeIndex->addIncoming(IdxEndRoundDown, VecBody); + // Go over all of the induction variables that we found and fix the + // PHIs that are left in the scalar version of the loop. + // The starting values of PHI nodes depend on the counter of the last + // iteration in the vectorized loop. + // If we come from a bypass edge then we need to start from the original start + // value. + + // This variable saves the new starting index for the scalar loop. + Value *ResumeIndex = 0; + LoopVectorizationLegality::InductionList::iterator I, E; + LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); + for (I = List->begin(), E = List->end(); I != E; ++I) { + PHINode *OrigPhi = I->first; + PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val", + MiddleBlock->getTerminator()); + Value *EndValue = 0; + if (OrigPhi->getType()->isIntegerTy()) { + // Handle the integer induction counter: + assert(OrigPhi == OldInduction && "Unknown integer PHI"); + // We know what the end value is. + EndValue = IdxEndRoundDown; + // We also know which PHI node holds it. + ResumeIndex = ResumeVal; + } else { + // For pointer induction variables, calculate the offset using + // the end index. 
+ EndValue = GetElementPtrInst::Create(I->second, IdxEndRoundDown, + "ptr.ind.end", + BypassBlock->getTerminator()); + } + + // The new PHI merges the original incoming value, in case of a bypass, + // or the value at the end of the vectorized loop. + ResumeVal->addIncoming(I->second, BypassBlock); + ResumeVal->addIncoming(EndValue, VecBody); + + // Fix the scalar body counter (PHI node). + unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH); + OrigPhi->setIncomingValue(BlockIdx, ResumeVal); + } + + // Make sure that we found the index where scalar loop needs to continue. + assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() && + "Invalid resume Index"); // Add a check in the middle block to see if we have completed // all of the iterations in the first vector loop. @@ -822,10 +871,6 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // Now we have two terminators. Remove the old one from the block. VecBody->getTerminator()->eraseFromParent(); - // Fix the scalar body iteration count. - unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH); - OldInduction->setIncomingValue(BlockIdx, ResumeIndex); - // Get ready to start creating new instructions into the vectorized body. Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); @@ -895,7 +940,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // add the new incoming edges to the PHI. At this point all of the // instructions in the basic block are vectorized, so we can use them to // construct the PHI. - PhiVector PHIsToFix; + PhiVector RdxPHIsToFix; // For each instruction in the old loop. for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { @@ -911,13 +956,40 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // Special handling for the induction var. if (OldInduction == Inst) continue; - // This is phase one of vectorizing PHIs. - // This has to be a reduction variable. 
- assert(Legal->getReductionVars()->count(P) && "Not a Reduction"); - Type *VecTy = VectorType::get(Inst->getType(), VF); - WidenMap[Inst] = Builder.CreatePHI(VecTy, 2, "vec.phi"); - PHIsToFix.push_back(P); - continue; + + // Handle reduction variables: + if (Legal->getReductionVars()->count(P)) { + // This is phase one of vectorizing PHIs. + Type *VecTy = VectorType::get(Inst->getType(), VF); + WidenMap[Inst] = Builder.CreatePHI(VecTy, 2, "vec.phi"); + RdxPHIsToFix.push_back(P); + continue; + } + + // Handle pointer inductions: + if (Legal->getInductionVars()->count(P)) { + Value *StartIdx = Legal->getInductionVars()->lookup(OldInduction); + Value *StartPtr = Legal->getInductionVars()->lookup(P); + // This is the normalized GEP that starts counting at zero. + Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx, + "normalized.idx"); + // This is the first GEP in the sequence. + Value *FirstGep = Builder.CreateGEP(StartPtr, NormalizedIdx, + "induc.ptr"); + // This is the vector of results. Notice that we don't generate vector + // geps because scalar geps result in better code. + Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); + for (unsigned int i = 0; i < VF; ++i) { + Value *SclrGep = Builder.CreateGEP(FirstGep, Builder.getInt32(i), + "next.gep"); + VecVal = Builder.CreateInsertElement(VecVal, SclrGep, + Builder.getInt32(i), + "insert.gep"); + } + + WidenMap[Inst] = VecVal; + continue; + } } case Instruction::Add: case Instruction::FAdd: @@ -1092,7 +1164,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // Create the 'reduced' values for each of the induction vars. // The reduced values are the vector values that we scalarize and combine // after the loop is finished. 
- for (PhiVector::iterator it = PHIsToFix.begin(), e = PHIsToFix.end(); + for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end(); it != e; ++it) { PHINode *RdxPhi = *it; PHINode *VecRdxPhi = dyn_cast(WidenMap[RdxPhi]); @@ -1124,7 +1196,6 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { Value *VectorStart = Builder.CreateInsertElement(Identity, RdxDesc.StartValue, Zero); - // Fix the vector-loop phi. // We created the induction variable so we know that the // preheader is the first entry. @@ -1276,23 +1347,33 @@ bool LoopVectorizationLegality::canVectorize() { } bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { + BasicBlock *PreHeader = TheLoop->getLoopPreheader(); + // Scan the instructions in the block and look for hazards. for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { Instruction *I = it; - PHINode *Phi = dyn_cast(I); - if (Phi) { + if (PHINode *Phi = dyn_cast(I)) { // This should not happen because the loop should be normalized. if (Phi->getNumIncomingValues() != 2) { DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); return false; } - // We only look at integer phi nodes. - if (!Phi->getType()->isIntegerTy()) { - DEBUG(dbgs() << "LV: Found an non-int PHI.\n"); + + // This is the value coming from the preheader. + Value *StartValue = Phi->getIncomingValueForBlock(PreHeader); + + // We only look at integer and pointer phi nodes. 
+ if (Phi->getType()->isPointerTy() && isInductionVariable(Phi)) { + DEBUG(dbgs() << "LV: Found a pointer induction variable.\n"); + Inductions[Phi] = StartValue; + continue; + } else if (!Phi->getType()->isIntegerTy()) { + DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); return false; } + // Handle integer PHIs: if (isInductionVariable(Phi)) { if (Induction) { DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n"); @@ -1300,6 +1381,7 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { } DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n"); Induction = Phi; + Inductions[Phi] = StartValue; continue; } if (AddReductionVar(Phi, IntegerAdd)) { @@ -1682,6 +1764,11 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I, } bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { + Type *PhiTy = Phi->getType(); + // We only handle integer and pointer inductions variables. + if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy()) + return false; + // Check that the PHI is consecutive and starts at zero. const SCEV *PhiScev = SE->getSCEV(Phi); const SCEVAddRecExpr *AR = dyn_cast(PhiScev); @@ -1691,11 +1778,17 @@ bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { } const SCEV *Step = AR->getStepRecurrence(*SE); - if (!Step->isOne()) { - DEBUG(dbgs() << "LV: PHI stride does not equal one.\n"); - return false; - } - return true; + // Integer inductions need to have a stride of one. + if (PhiTy->isIntegerTy()) + return Step->isOne(); + + // Calculate the pointer stride and check if it is consecutive. 
+ const SCEVConstant *C = dyn_cast(Step); + if (!C) return false; + + assert(PhiTy->isPointerTy() && "The PHI must be a pointer"); + uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType()); + return (C->getValue()->equalsInt(Size)); } bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) { -- cgit v1.1 From 0af63ac245eeb0cce206ed4dbc9abdc0d86742cb Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Sun, 25 Nov 2012 08:41:35 +0000 Subject: Add support for pointer induction variables even when there is no integer induction variable. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168558 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 287 ++++++++++++++++++----------- 1 file changed, 182 insertions(+), 105 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 3f1d82c..f906432 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -106,9 +106,10 @@ class SingleBlockLoopVectorizer { public: /// Ctor. SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li, - DominatorTree *dt, LPPassManager *Lpm, + DominatorTree *dt, DataLayout *dl, + LPPassManager *Lpm, unsigned VecWidth): - OrigLoop(Orig), SE(Se), LI(Li), DT(dt), LPM(Lpm), VF(VecWidth), + OrigLoop(Orig), SE(Se), LI(Li), DT(dt), DL(dl), LPM(Lpm), VF(VecWidth), Builder(Se->getContext()), Induction(0), OldInduction(0) { } // Perform the actual loop widening (vectorization). @@ -167,6 +168,8 @@ private: LoopInfo *LI; // Dominator Tree. DominatorTree *DT; + // Data Layout; + DataLayout *DL; // Loop Pass Manager; LPPassManager *LPM; // The vectorization factor to use. @@ -250,10 +253,36 @@ public: // This POD struct holds information about the memory runtime legality // check that a group of pointers do not overlap. 
struct RuntimePointerCheck { + RuntimePointerCheck(): Need(false) {} + + /// Reset the state of the pointer runtime information. + void reset() { + Need = false; + Pointers.clear(); + Starts.clear(); + Ends.clear(); + } + + /// Insert a pointer and calculate the start and end SCEVs. + void insert_pointer(ScalarEvolution *SE, Loop *Lp, Value *Ptr) { + const SCEV *Sc = SE->getSCEV(Ptr); + const SCEVAddRecExpr *AR = dyn_cast(Sc); + assert(AR && "Invalid addrec expression"); + const SCEV *Ex = SE->getExitCount(Lp, Lp->getHeader()); + const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE); + Pointers.push_back(Ptr); + Starts.push_back(AR->getStart()); + Ends.push_back(ScEnd); + } + /// This flag indicates if we need to add the runtime check. bool Need; /// Holds the pointers that we need to check. SmallVector Pointers; + /// Holds the pointer value at the beginning of the loop. + SmallVector Starts; + /// Holds the pointer value at the end of the loop. + SmallVector Ends; }; /// ReductionList contains the reduction descriptors for all @@ -278,11 +307,11 @@ public: /// Returns the induction variables found in the loop. InductionList *getInductionVars() { return &Inductions; } - /// Check if the pointer returned by this GEP is consecutive - /// when the index is vectorized. This happens when the last - /// index of the GEP is consecutive, like the induction variable. + /// Check if this pointer is consecutive when vectorizing. This happens + /// when the last index of the GEP is the induction variable, or that the + /// pointer itself is an induction variable. /// This check allows us to vectorize A[idx] into a wide load/store. - bool isConsecutiveGep(Value *Ptr); + bool isConsecutivePtr(Value *Ptr); /// Returns true if the value V is uniform within the loop. bool isUniform(Value *V); @@ -451,7 +480,7 @@ struct LoopVectorize : public LoopPass { "\n"); // If we decided that it is *legal* to vectorizer the loop then do it. 
- SingleBlockLoopVectorizer LB(L, SE, LI, DT, &LPM, VF); + SingleBlockLoopVectorizer LB(L, SE, LI, DT, DL, &LPM, VF); LB.vectorize(&LVL); DEBUG(verifyFunction(*L->getHeader()->getParent())); @@ -472,10 +501,6 @@ struct LoopVectorize : public LoopPass { }; Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) { - // Instructions that access the old induction variable - // actually want to get the new one. - if (V == OldInduction) - V = Induction; // Create the types. LLVMContext &C = V->getContext(); Type *VTy = VectorType::get(V->getType(), VF); @@ -515,7 +540,14 @@ Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) { return Builder.CreateAdd(Val, Cv, "induction"); } -bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) { +bool LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { + assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr"); + + // If this pointer is an induction variable, return it. + PHINode *Phi = dyn_cast_or_null(Ptr); + if (Phi && getInductionVars()->count(Phi)) + return true; + GetElementPtrInst *Gep = dyn_cast_or_null(Ptr); if (!Gep) return false; @@ -576,7 +608,7 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { // If we are accessing the old induction variable, use the new one. if (SrcOp == OldInduction) { - Params.push_back(getBroadcastInstrs(Induction)); + Params.push_back(getVectorValue(Induction)); continue; } @@ -666,9 +698,13 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { ... */ + // Some loops have a single integer induction variable, while other loops + // don't. One example is c++ iterators that often have multiple pointer + // induction variables. In the code below we also support a case where we + // don't have a single induction variable. OldInduction = Legal->getInduction(); - assert(OldInduction && "We must have a single phi node."); - Type *IdxTy = OldInduction->getType(); + Type *IdxTy = OldInduction ? 
OldInduction->getType() : + DL->getIntPtrType(SE->getContext()); // Find the loop boundaries. const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader()); @@ -677,19 +713,18 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // Get the total trip count from the count by adding 1. ExitCount = SE->getAddExpr(ExitCount, SE->getConstant(ExitCount->getType(), 1)); - // We may need to extend the index in case there is a type mismatch. - // We know that the count starts at zero and does not overflow. - // We are using Zext because it should be less expensive. - if (ExitCount->getType() != IdxTy) - ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy); // This is the original scalar-loop preheader. BasicBlock *BypassBlock = OrigLoop->getLoopPreheader(); BasicBlock *ExitBlock = OrigLoop->getExitBlock(); assert(ExitBlock && "Must have an exit block"); - // The loop index does not have to start at Zero. It starts with this value. - Value *StartIdx = OldInduction->getIncomingValueForBlock(BypassBlock); + // The loop index does not have to start at Zero. Find the original start + // value from the induction PHI node. If we don't have an induction variable + // then we know that it starts at zero. + Value *StartIdx = OldInduction ? + OldInduction->getIncomingValueForBlock(BypassBlock): + ConstantInt::get(IdxTy, 0); assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop"); assert(BypassBlock && "Invalid loop structure"); @@ -721,7 +756,18 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { Instruction *Loc = BypassBlock->getTerminator(); // Count holds the overall loop count (N). - Value *Count = Exp.expandCodeFor(ExitCount, Induction->getType(), Loc); + Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(), Loc); + + // We may need to extend the index in case there is a type mismatch. + // We know that the count starts at zero and does not overflow. 
+ if (Count->getType() != IdxTy) { + // The exit count can be of pointer type. Convert it to the correct + // integer type. + if (ExitCount->getType()->isPointerTy()) + Count = CastInst::CreatePointerCast(Count, IdxTy, "ptrcnt.to.int", Loc); + else + Count = CastInst::CreateZExtOrBitCast(Count, IdxTy, "zext.cnt", Loc); + } // Add the start index to the loop count to get the new end index. Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc); @@ -734,7 +780,8 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx, "end.idx.rnd.down", Loc); - // Now, compare the new count to zero. If it is zero, jump to the scalar part. + // Now, compare the new count to zero. If it is zero skip the vector loop and + // jump to the scalar loop. Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEndRoundDown, StartIdx, @@ -762,23 +809,21 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { Ends.push_back(Ptr); } else { DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n"); - const SCEVAddRecExpr *AR = dyn_cast(Sc); - Value *Start = Exp.expandCodeFor(AR->getStart(), PtrArithTy, Loc); - const SCEV *Ex = SE->getExitCount(OrigLoop, OrigLoop->getHeader()); - const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE); - assert(!isa(ScEnd) && "Invalid scev range."); - Value *End = Exp.expandCodeFor(ScEnd, PtrArithTy, Loc); + + Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], + PtrArithTy, Loc); + Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc); Starts.push_back(Start); Ends.push_back(End); } } - for (unsigned i=0; i < NumPointers; ++i) { - for (unsigned j=i+1; j < NumPointers; ++j) { + for (unsigned i = 0; i < NumPointers; ++i) { + for (unsigned j = i+1; j < NumPointers; ++j) { Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, - Starts[0], Ends[1], "bound0", Loc); + 
Starts[i], Ends[j], "bound0", Loc); Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, - Starts[1], Ends[0], "bound1", Loc); + Starts[j], Ends[i], "bound1", Loc); Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1, "found.conflict", Loc); if (MemoryRuntimeCheck) { @@ -812,7 +857,7 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // value. // This variable saves the new starting index for the scalar loop. - Value *ResumeIndex = 0; + PHINode *ResumeIndex = 0; LoopVectorizationLegality::InductionList::iterator I, E; LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); for (I = List->begin(), E = List->end(); I != E; ++I) { @@ -830,7 +875,7 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { } else { // For pointer induction variables, calculate the offset using // the end index. - EndValue = GetElementPtrInst::Create(I->second, IdxEndRoundDown, + EndValue = GetElementPtrInst::Create(I->second, CountRoundDown, "ptr.ind.end", BypassBlock->getTerminator()); } @@ -841,10 +886,22 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { ResumeVal->addIncoming(EndValue, VecBody); // Fix the scalar body counter (PHI node). - unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH); + unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH); OrigPhi->setIncomingValue(BlockIdx, ResumeVal); } + // If we are generating a new induction variable then we also need to + // generate the code that calculates the exit value. This value is not + // simply the end of the counter because we may skip the vectorized body + // in case of a runtime check. 
+ if (!OldInduction){ + assert(!ResumeIndex && "Unexpected resume value found"); + ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val", + MiddleBlock->getTerminator()); + ResumeIndex->addIncoming(StartIdx, BypassBlock); + ResumeIndex->addIncoming(IdxEndRoundDown, VecBody); + } + // Make sure that we found the index where scalar loop needs to continue. assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() && "Invalid resume Index"); @@ -953,43 +1010,54 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { continue; case Instruction::PHI:{ PHINode* P = cast(Inst); - // Special handling for the induction var. - if (OldInduction == Inst) - continue; - // Handle reduction variables: if (Legal->getReductionVars()->count(P)) { // This is phase one of vectorizing PHIs. Type *VecTy = VectorType::get(Inst->getType(), VF); - WidenMap[Inst] = Builder.CreatePHI(VecTy, 2, "vec.phi"); + WidenMap[Inst] = PHINode::Create(VecTy, 2, "vec.phi", + LoopVectorBody->getFirstInsertionPt()); RdxPHIsToFix.push_back(P); continue; } - // Handle pointer inductions: - if (Legal->getInductionVars()->count(P)) { - Value *StartIdx = Legal->getInductionVars()->lookup(OldInduction); - Value *StartPtr = Legal->getInductionVars()->lookup(P); - // This is the normalized GEP that starts counting at zero. - Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx, - "normalized.idx"); - // This is the first GEP in the sequence. - Value *FirstGep = Builder.CreateGEP(StartPtr, NormalizedIdx, - "induc.ptr"); - // This is the vector of results. Notice that we don't generate vector - // geps because scalar geps result in better code. 
- Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); - for (unsigned int i = 0; i < VF; ++i) { - Value *SclrGep = Builder.CreateGEP(FirstGep, Builder.getInt32(i), - "next.gep"); - VecVal = Builder.CreateInsertElement(VecVal, SclrGep, - Builder.getInt32(i), - "insert.gep"); - } - - WidenMap[Inst] = VecVal; + // This PHINode must be an induction variable. + // Make sure that we know about it. + assert(Legal->getInductionVars()->count(P) && + "Not an induction variable"); + + if (P->getType()->isIntegerTy()) { + assert(P == OldInduction && "Unexpected PHI"); + WidenMap[Inst] = getBroadcastInstrs(Induction); continue; } + + // Handle pointer inductions: + assert(P->getType()->isPointerTy() && "Unexpected type."); + Value *StartIdx = OldInduction ? + Legal->getInductionVars()->lookup(OldInduction) : + ConstantInt::get(Induction->getType(), 0); + + // This is the pointer value coming into the loop. + Value *StartPtr = Legal->getInductionVars()->lookup(P); + + // This is the normalized GEP that starts counting at zero. + Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx, + "normalized.idx"); + + // This is the vector of results. Notice that we don't generate vector + // geps because scalar geps result in better code. + Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); + for (unsigned int i = 0; i < VF; ++i) { + Constant *Idx = ConstantInt::get(Induction->getType(), i); + Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx"); + Value *SclrGep = Builder.CreateGEP(StartPtr, GlobalIdx, "next.gep"); + VecVal = Builder.CreateInsertElement(VecVal, SclrGep, + Builder.getInt32(i), + "insert.gep"); + } + + WidenMap[Inst] = VecVal; + continue; } case Instruction::Add: case Instruction::FAdd: @@ -1076,21 +1144,27 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { GetElementPtrInst *Gep = dyn_cast(Ptr); // This store does not use GEPs. 
- if (!Legal->isConsecutiveGep(Gep)) { + if (!Legal->isConsecutivePtr(Ptr)) { scalarizeInstruction(Inst); break; } - // The last index does not have to be the induction. It can be - // consecutive and be a function of the index. For example A[I+1]; - unsigned NumOperands = Gep->getNumOperands(); - Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1)); - LastIndex = Builder.CreateExtractElement(LastIndex, Zero); - - // Create the new GEP with the new induction variable. - GetElementPtrInst *Gep2 = cast(Gep->clone()); - Gep2->setOperand(NumOperands - 1, LastIndex); - Ptr = Builder.Insert(Gep2); + if (Gep) { + // The last index does not have to be the induction. It can be + // consecutive and be a function of the index. For example A[I+1]; + unsigned NumOperands = Gep->getNumOperands(); + Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1)); + LastIndex = Builder.CreateExtractElement(LastIndex, Zero); + + // Create the new GEP with the new induction variable. + GetElementPtrInst *Gep2 = cast(Gep->clone()); + Gep2->setOperand(NumOperands - 1, LastIndex); + Ptr = Builder.Insert(Gep2); + } else { + // Use the induction element ptr. + assert(isa(Ptr) && "Invalid induction ptr"); + Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero); + } Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo()); Value *Val = getVectorValue(SI->getValueOperand()); Builder.CreateStore(Val, Ptr)->setAlignment(Alignment); @@ -1104,23 +1178,31 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { unsigned Alignment = LI->getAlignment(); GetElementPtrInst *Gep = dyn_cast(Ptr); - // If we don't have a gep, or that the pointer is loop invariant, + // If the pointer is loop invariant or if it is non consecutive, // scalarize the load. 
- if (!Gep || Legal->isUniform(Gep) || !Legal->isConsecutiveGep(Gep)) { + bool Con = Legal->isConsecutivePtr(Ptr); + if (Legal->isUniform(Ptr) || !Con) { scalarizeInstruction(Inst); break; } - // The last index does not have to be the induction. It can be - // consecutive and be a function of the index. For example A[I+1]; - unsigned NumOperands = Gep->getNumOperands(); - Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1)); - LastIndex = Builder.CreateExtractElement(LastIndex, Zero); + if (Gep) { + // The last index does not have to be the induction. It can be + // consecutive and be a function of the index. For example A[I+1]; + unsigned NumOperands = Gep->getNumOperands(); + Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1)); + LastIndex = Builder.CreateExtractElement(LastIndex, Zero); + + // Create the new GEP with the new induction variable. + GetElementPtrInst *Gep2 = cast(Gep->clone()); + Gep2->setOperand(NumOperands - 1, LastIndex); + Ptr = Builder.Insert(Gep2); + } else { + // Use the induction element ptr. + assert(isa(Ptr) && "Invalid induction ptr"); + Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero); + } - // Create the new GEP with the new induction variable. - GetElementPtrInst *Gep2 = cast(Gep->clone()); - Gep2->setOperand(NumOperands - 1, LastIndex); - Ptr = Builder.Insert(Gep2); Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo()); LI = Builder.CreateLoad(Ptr); LI->setAlignment(Alignment); @@ -1301,7 +1383,7 @@ bool LoopVectorizationLegality::canVectorize() { if (!TheLoop->getLoopPreheader()) { assert(false && "No preheader!!"); DEBUG(dbgs() << "LV: Loop not normalized." << "\n"); - return false; + return false; } // We can only vectorize single basic block loops. 
@@ -1347,6 +1429,7 @@ bool LoopVectorizationLegality::canVectorize() { } bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { + BasicBlock *PreHeader = TheLoop->getLoopPreheader(); // Scan the instructions in the block and look for hazards. @@ -1440,8 +1523,8 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { } // next instr. if (!Induction) { - DEBUG(dbgs() << "LV: Did not find an induction var.\n"); - return false; + DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); + assert(getInductionVars()->size() && "No induction variables"); } // Don't vectorize if the memory dependencies do not allow vectorization. @@ -1458,15 +1541,10 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { while (Worklist.size()) { Instruction *I = dyn_cast(Worklist.back()); Worklist.pop_back(); - // Look at instructions inside this block. - if (!I) continue; - if (I->getParent() != &BB) continue; - // Stop when reaching PHI nodes. - if (isa(I)) { - assert(I == Induction && "Found a uniform PHI that is not the induction"); - break; - } + // Look at instructions inside this block. Stop when reaching PHI nodes. + if (!I || I->getParent() != &BB || isa(I)) + continue; // This is a known uniform. Uniforms.insert(I); @@ -1569,7 +1647,7 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { // If the address of i is unknown (for example A[B[i]]) then we may // read a few words, modify, and write a few words, and some of the // words may be written to the same address. 
- if (Seen.insert(Ptr) || !isConsecutiveGep(Ptr)) + if (Seen.insert(Ptr) || !isConsecutivePtr(Ptr)) Reads.push_back(Ptr); } @@ -1585,7 +1663,7 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { bool RT = true; for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) if (hasComputableBounds(*I)) { - PtrRtCheck.Pointers.push_back(*I); + PtrRtCheck.insert_pointer(SE, TheLoop, *I); DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n"); } else { RT = false; @@ -1593,7 +1671,7 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { } for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) if (hasComputableBounds(*I)) { - PtrRtCheck.Pointers.push_back(*I); + PtrRtCheck.insert_pointer(SE, TheLoop, *I); DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n"); } else { RT = false; @@ -1603,7 +1681,7 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { // Check that we did not collect too many pointers or found a // unsizeable pointer. if (!RT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) { - PtrRtCheck.Pointers.clear(); + PtrRtCheck.reset(); RT = false; } @@ -1658,8 +1736,7 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { // It is safe to vectorize and we don't need any runtime checks. DEBUG(dbgs() << "LV: We don't need a runtime memory check.\n"); - PtrRtCheck.Pointers.clear(); - PtrRtCheck.Need = false; + PtrRtCheck.reset(); return true; } @@ -1917,7 +1994,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { SI->getAlignment(), SI->getPointerAddressSpace()); // Scalarized stores. 
- if (!Legal->isConsecutiveGep(SI->getPointerOperand())) { + if (!Legal->isConsecutivePtr(SI->getPointerOperand())) { unsigned Cost = 0; unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, ValTy); @@ -1944,7 +2021,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { LI->getPointerAddressSpace()); // Scalarized loads. - if (!Legal->isConsecutiveGep(LI->getPointerOperand())) { + if (!Legal->isConsecutivePtr(LI->getPointerOperand())) { unsigned Cost = 0; unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy); // The cost of inserting the loaded value into the result vector. -- cgit v1.1 From d7e0b7cdc59d04bec6cf5273cde3d2a16b13800e Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Sun, 25 Nov 2012 09:09:26 +0000 Subject: The induction-pointer work is inspired by a research paper. This commit adds a reference. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168559 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index f906432..84516eb 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -25,6 +25,7 @@ // 4. LoopVectorizationCostModel - A unit that checks for the profitability // of vectorization. It decides on the optimal vector width, which // can be one, if vectorization is not profitable. +// //===----------------------------------------------------------------------===// // // The reduction-variable vectorization is based on the paper: @@ -36,6 +37,9 @@ // Other ideas/concepts are from: // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. // +// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of +// Vectorizing Compilers. 
+// //===----------------------------------------------------------------------===// #define LV_NAME "loop-vectorize" #define DEBUG_TYPE LV_NAME -- cgit v1.1 From a8e2b2b68fed9883bd41335f57f1193ffcc22ed2 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Sun, 25 Nov 2012 09:13:57 +0000 Subject: Rename method. No functionality change. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168560 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 84516eb..8ed4caf 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -268,7 +268,7 @@ public: } /// Insert a pointer and calculate the start and end SCEVs. - void insert_pointer(ScalarEvolution *SE, Loop *Lp, Value *Ptr) { + void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr) { const SCEV *Sc = SE->getSCEV(Ptr); const SCEVAddRecExpr *AR = dyn_cast(Sc); assert(AR && "Invalid addrec expression"); @@ -1667,7 +1667,7 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { bool RT = true; for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) if (hasComputableBounds(*I)) { - PtrRtCheck.insert_pointer(SE, TheLoop, *I); + PtrRtCheck.insert(SE, TheLoop, *I); DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n"); } else { RT = false; @@ -1675,7 +1675,7 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { } for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) if (hasComputableBounds(*I)) { - PtrRtCheck.insert_pointer(SE, TheLoop, *I); + PtrRtCheck.insert(SE, TheLoop, *I); DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n"); } else { RT = false; -- cgit v1.1 From 8c6b73666bdd08f15b31c00bd2fd663b632a1d65 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Sun, 
25 Nov 2012 16:27:16 +0000 Subject: Refactor the ptr runtime check generation code. No functionality change. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168568 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 116 ++++++++++++++++------------- 1 file changed, 66 insertions(+), 50 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 8ed4caf..2ca5fea 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -128,6 +128,10 @@ public: } private: + /// Add code that checks at runtime if the accessed arrays overlap. + /// Returns the comperator value or NULL if no check is needed. + Value* addRuntimeCheck(LoopVectorizationLegality *Legal, + Instruction *Loc); /// Create an empty loop, based on the loop ranges of the old loop. void createEmptyLoop(LoopVectorizationLegality *Legal); /// Copy and widen the instructions from the old loop. @@ -671,6 +675,67 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { WidenMap[Instr] = VecResults; } +Value* +SingleBlockLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, + Instruction *Loc) { + LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck = + Legal->getRuntimePointerCheck(); + + if (!PtrRtCheck->Need) + return NULL; + + Value *MemoryRuntimeCheck = 0; + unsigned NumPointers = PtrRtCheck->Pointers.size(); + SmallVector Starts; + SmallVector Ends; + + SCEVExpander Exp(*SE, "induction"); + + // Use this type for pointer arithmetic. 
+ Type* PtrArithTy = PtrRtCheck->Pointers[0]->getType(); + + for (unsigned i=0; i < NumPointers; ++i) { + Value *Ptr = PtrRtCheck->Pointers[i]; + const SCEV *Sc = SE->getSCEV(Ptr); + + if (SE->isLoopInvariant(Sc, OrigLoop)) { + DEBUG(dbgs() << "LV1: Adding RT check for a loop invariant ptr:" << + *Ptr <<"\n"); + Starts.push_back(Ptr); + Ends.push_back(Ptr); + } else { + DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n"); + + Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], + PtrArithTy, Loc); + Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc); + Starts.push_back(Start); + Ends.push_back(End); + } + } + + for (unsigned i = 0; i < NumPointers; ++i) { + for (unsigned j = i+1; j < NumPointers; ++j) { + Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, + Starts[i], Ends[j], "bound0", Loc); + Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, + Starts[j], Ends[i], "bound1", Loc); + Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1, + "found.conflict", Loc); + if (MemoryRuntimeCheck) { + MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or, + MemoryRuntimeCheck, + IsConflict, + "conflict.rdx", Loc); + } else { + MemoryRuntimeCheck = IsConflict; + } + } + } + + return MemoryRuntimeCheck; +} + void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { /* @@ -791,56 +856,7 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { StartIdx, "cmp.zero", Loc); - LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck = - Legal->getRuntimePointerCheck(); - Value *MemoryRuntimeCheck = 0; - if (PtrRtCheck->Need) { - unsigned NumPointers = PtrRtCheck->Pointers.size(); - SmallVector Starts; - SmallVector Ends; - - // Use this type for pointer arithmetic. 
- Type* PtrArithTy = PtrRtCheck->Pointers[0]->getType(); - - for (unsigned i=0; i < NumPointers; ++i) { - Value *Ptr = PtrRtCheck->Pointers[i]; - const SCEV *Sc = SE->getSCEV(Ptr); - - if (SE->isLoopInvariant(Sc, OrigLoop)) { - DEBUG(dbgs() << "LV1: Adding RT check for a loop invariant ptr:" << - *Ptr <<"\n"); - Starts.push_back(Ptr); - Ends.push_back(Ptr); - } else { - DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n"); - - Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], - PtrArithTy, Loc); - Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc); - Starts.push_back(Start); - Ends.push_back(End); - } - } - - for (unsigned i = 0; i < NumPointers; ++i) { - for (unsigned j = i+1; j < NumPointers; ++j) { - Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, - Starts[i], Ends[j], "bound0", Loc); - Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, - Starts[j], Ends[i], "bound1", Loc); - Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1, - "found.conflict", Loc); - if (MemoryRuntimeCheck) { - MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or, - MemoryRuntimeCheck, - IsConflict, - "conflict.rdx", Loc); - } else { - MemoryRuntimeCheck = IsConflict; - } - } - } - }// end of need-runtime-check code. + Value *MemoryRuntimeCheck = addRuntimeCheck(Legal, Loc); // If we are using memory runtime checks, include them in. if (MemoryRuntimeCheck) { -- cgit v1.1 From 00e8074ee24449e5d0e50b626290b499b8bd78b1 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Sun, 25 Nov 2012 16:39:01 +0000 Subject: Fix the document style. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168569 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 2ca5fea..9cba136 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -118,10 +118,10 @@ public: // Perform the actual loop widening (vectorization). void vectorize(LoopVectorizationLegality *Legal) { - ///Create a new empty loop. Unlink the old loop and connect the new one. + // Create a new empty loop. Unlink the old loop and connect the new one. createEmptyLoop(Legal); - /// Widen each instruction in the old loop to a new one in the new loop. - /// Use the Legality module to find the induction and reduction variables. + // Widen each instruction in the old loop to a new one in the new loop. + // Use the Legality module to find the induction and reduction variables. vectorizeLoop(Legal); // Register the new loop and update the analysis passes. updateAnalysis(); -- cgit v1.1 From 6bfc3481bd8995906af4c15131feeae665a197c6 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Sun, 25 Nov 2012 16:48:08 +0000 Subject: Move the max vector width to a constant parameter. No functionality change. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168570 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 9cba136..e79d526 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -86,6 +86,9 @@ const unsigned TinyTripCountThreshold = 16; /// number of pointers. Notice that the check is quadratic! const unsigned RuntimeMemoryCheckThreshold = 2; +/// This is the highest vector width that we try to generate. +const unsigned MaxVectorSize = 8; + namespace { // Forward declarations. @@ -130,7 +133,7 @@ public: private: /// Add code that checks at runtime if the accessed arrays overlap. /// Returns the comperator value or NULL if no check is needed. - Value* addRuntimeCheck(LoopVectorizationLegality *Legal, + Value* addRuntimeCheck(LoopVectorizationLegality *Legal, Instruction *Loc); /// Create an empty loop, based on the loop ranges of the old loop. void createEmptyLoop(LoopVectorizationLegality *Legal); @@ -400,7 +403,7 @@ public: /// Returns the most profitable vectorization factor for the loop that is /// smaller or equal to the VF argument. This method checks every power /// of two up to VF. - unsigned findBestVectorizationFactor(unsigned VF = 8); + unsigned findBestVectorizationFactor(unsigned VF = MaxVectorSize); private: /// Returns the expected execution cost. The unit of the cost does @@ -856,13 +859,12 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { StartIdx, "cmp.zero", Loc); - Value *MemoryRuntimeCheck = addRuntimeCheck(Legal, Loc); + Value *MemoryRuntimeCheck = addRuntimeCheck(Legal, Loc); // If we are using memory runtime checks, include them in. 
- if (MemoryRuntimeCheck) { + if (MemoryRuntimeCheck) Cmp = BinaryOperator::Create(Instruction::Or, Cmp, MemoryRuntimeCheck, "CntOrMem", Loc); - } BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc); // Remove the old terminator. -- cgit v1.1 From 22689b6034dbde1c08e39b162fd59e1fd5d35b96 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Mon, 26 Nov 2012 19:51:46 +0000 Subject: Move the code that uses SCEVs prior to creating the new loops. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168601 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 67 ++++++++++++++++-------------- 1 file changed, 35 insertions(+), 32 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index e79d526..55733f7 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -113,10 +113,10 @@ class SingleBlockLoopVectorizer { public: /// Ctor. SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li, - DominatorTree *dt, DataLayout *dl, + DominatorTree *Dt, DataLayout *Dl, LPPassManager *Lpm, unsigned VecWidth): - OrigLoop(Orig), SE(Se), LI(Li), DT(dt), DL(dl), LPM(Lpm), VF(VecWidth), + OrigLoop(Orig), SE(Se), LI(Li), DT(Dt), DL(Dl), LPM(Lpm), VF(VecWidth), Builder(Se->getContext()), Induction(0), OldInduction(0) { } // Perform the actual loop widening (vectorization). @@ -133,8 +133,8 @@ public: private: /// Add code that checks at runtime if the accessed arrays overlap. /// Returns the comperator value or NULL if no check is needed. - Value* addRuntimeCheck(LoopVectorizationLegality *Legal, - Instruction *Loc); + Value *addRuntimeCheck(LoopVectorizationLegality *Legal, + Instruction *Loc); /// Create an empty loop, based on the loop ranges of the old loop. void createEmptyLoop(LoopVectorizationLegality *Legal); /// Copy and widen the instructions from the old loop. 
@@ -179,7 +179,7 @@ private: LoopInfo *LI; // Dominator Tree. DominatorTree *DT; - // Data Layout; + // Data Layout. DataLayout *DL; // Loop Pass Manager; LPPassManager *LPM; @@ -725,14 +725,14 @@ SingleBlockLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, Starts[j], Ends[i], "bound1", Loc); Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1, "found.conflict", Loc); - if (MemoryRuntimeCheck) { + if (MemoryRuntimeCheck) MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or, MemoryRuntimeCheck, IsConflict, "conflict.rdx", Loc); - } else { + else MemoryRuntimeCheck = IsConflict; - } + } } @@ -770,6 +770,11 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { ... */ + BasicBlock *OldBasicBlock = OrigLoop->getHeader(); + BasicBlock *BypassBlock = OrigLoop->getLoopPreheader(); + BasicBlock *ExitBlock = OrigLoop->getExitBlock(); + assert(ExitBlock && "Must have an exit block"); + // Some loops have a single integer induction variable, while other loops // don't. One example is c++ iterators that often have multiple pointer // induction variables. In the code below we also support a case where we @@ -786,10 +791,13 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { ExitCount = SE->getAddExpr(ExitCount, SE->getConstant(ExitCount->getType(), 1)); - // This is the original scalar-loop preheader. - BasicBlock *BypassBlock = OrigLoop->getLoopPreheader(); - BasicBlock *ExitBlock = OrigLoop->getExitBlock(); - assert(ExitBlock && "Must have an exit block"); + // Expand the trip count and place the new instructions in the preheader. + // Notice that the pre-header does not change, only the loop body. + SCEVExpander Exp(*SE, "induction"); + + // Count holds the overall loop count (N). + Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(), + BypassBlock->getTerminator()); // The loop index does not have to start at Zero. 
Find the original start // value from the induction PHI node. If we don't have an induction variable @@ -801,18 +809,23 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop"); assert(BypassBlock && "Invalid loop structure"); + // Generate the code that checks in runtime if arrays overlap. + Value *MemoryRuntimeCheck = addRuntimeCheck(Legal, + BypassBlock->getTerminator()); + + // Split the single block loop into the two loop structure described above. BasicBlock *VectorPH = BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph"); - BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(), - "vector.body"); - - BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(), - "middle.block"); + BasicBlock *VecBody = + VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); + BasicBlock *MiddleBlock = + VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block"); BasicBlock *ScalarPH = - MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), - "scalar.preheader"); - // Find the induction variable. - BasicBlock *OldBasicBlock = OrigLoop->getHeader(); + MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); + + // This is the location in which we add all of the logic for bypassing + // the new vector loop. + Instruction *Loc = BypassBlock->getTerminator(); // Use this IR builder to create the loop instructions (Phi, Br, Cmp) // inside the loop. @@ -822,14 +835,6 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { Induction = Builder.CreatePHI(IdxTy, 2, "index"); Constant *Step = ConstantInt::get(IdxTy, VF); - // Expand the trip count and place the new instructions in the preheader. - // Notice that the pre-header does not change, only the loop body. 
- SCEVExpander Exp(*SE, "induction"); - Instruction *Loc = BypassBlock->getTerminator(); - - // Count holds the overall loop count (N). - Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(), Loc); - // We may need to extend the index in case there is a type mismatch. // We know that the count starts at zero and does not overflow. if (Count->getType() != IdxTy) { @@ -859,8 +864,6 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { StartIdx, "cmp.zero", Loc); - Value *MemoryRuntimeCheck = addRuntimeCheck(Legal, Loc); - // If we are using memory runtime checks, include them in. if (MemoryRuntimeCheck) Cmp = BinaryOperator::Create(Instruction::Or, Cmp, MemoryRuntimeCheck, @@ -1053,7 +1056,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { continue; } - // Handle pointer inductions: + // Handle pointer inductions. assert(P->getType()->isPointerTy() && "Unexpected type."); Value *StartIdx = OldInduction ? Legal->getInductionVars()->lookup(OldInduction) : -- cgit v1.1 From dddaad624e388529f1f933939000d6dbc07c107d Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Thu, 29 Nov 2012 19:25:41 +0000 Subject: When broadcasting invariant scalars into vectors, place the broadcast code in the preheader. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168927 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 40 ++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 11 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 55733f7..35e2d05 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -516,6 +516,17 @@ Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) { LLVMContext &C = V->getContext(); Type *VTy = VectorType::get(V->getType(), VF); Type *I32 = IntegerType::getInt32Ty(C); + + // Save the current insertion location. + Instruction *Loc = Builder.GetInsertPoint(); + + // We need to place the broadcast of invariant variables outside the loop. + bool Invariant = (OrigLoop->isLoopInvariant(V) && V != Induction); + + // Place the code for broadcasting invariant variables in the new preheader. + if (Invariant) + Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); + Constant *Zero = ConstantInt::get(I32, 0); Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF)); Value *UndefVal = UndefValue::get(VTy); @@ -524,10 +535,11 @@ Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) { // Broadcast the scalar into all locations in the vector. Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros, "broadcast"); - // We are accessing the induction variable. Make sure to promote the - // index for each consecutive SIMD lane. This adds 0,1,2 ... to all lanes. - if (V == Induction) - return getConsecutiveVector(Shuf); + + // Restore the builder insertion point. 
+ if (Invariant) + Builder.SetInsertPoint(Loc); + return Shuf; } @@ -571,7 +583,7 @@ bool LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) return false; - // We can emit wide load/stores only of the last index is the induction + // We can emit wide load/stores only if the last index is the induction // variable. const SCEV *Last = SE->getSCEV(LastIndex); if (const SCEVAddRecExpr *AR = dyn_cast(Last)) { @@ -591,6 +603,7 @@ bool LoopVectorizationLegality::isUniform(Value *V) { } Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) { + assert(V != Induction && "The new induction variable should not be used."); assert(!V->getType()->isVectorTy() && "Can't widen a vector"); // If we saved a vectorized copy of V, use it. Value *&MapEntry = WidenMap[V]; @@ -619,7 +632,7 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { // If we are accessing the old induction variable, use the new one. if (SrcOp == OldInduction) { - Params.push_back(getVectorValue(Induction)); + Params.push_back(getVectorValue(SrcOp)); continue; } @@ -697,7 +710,7 @@ SingleBlockLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, // Use this type for pointer arithmetic. Type* PtrArithTy = PtrRtCheck->Pointers[0]->getType(); - for (unsigned i=0; i < NumPointers; ++i) { + for (unsigned i = 0; i < NumPointers; ++i) { Value *Ptr = PtrRtCheck->Pointers[i]; const SCEV *Sc = SE->getSCEV(Ptr); @@ -1016,7 +1029,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // In order to support reduction variables we need to be able to vectorize // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two - // steages. First, we create a new vector PHI node with no incoming edges. + // stages. First, we create a new vector PHI node with no incoming edges. // We use this value when we vectorize all of the instructions that use the // PHI. 
Next, after all of the instructions in the block are complete we // add the new incoming edges to the PHI. At this point all of the @@ -1052,7 +1065,12 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { if (P->getType()->isIntegerTy()) { assert(P == OldInduction && "Unexpected PHI"); - WidenMap[Inst] = getBroadcastInstrs(Induction); + Value *Broadcasted = getBroadcastInstrs(Induction); + // After broadcasting the induction variable we need to make the + // vector consecutive by adding 0, 1, 2 ... + Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted); + + WidenMap[OldInduction] = ConsecutiveInduction; continue; } @@ -1387,7 +1405,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { } void SingleBlockLoopVectorizer::updateAnalysis() { - // The original basic block. + // Forget the original basic block. SE->forgetLoop(OrigLoop); // Update the dominator tree information. @@ -1575,7 +1593,7 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { Uniforms.insert(I); // Insert all operands. - for (int i=0, Op = I->getNumOperands(); i < Op; ++i) { + for (int i = 0, Op = I->getNumOperands(); i < Op; ++i) { Worklist.push_back(I->getOperand(i)); } } -- cgit v1.1 From f735a7f88dcb87cca4622ce45effa8134759b7d8 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Fri, 30 Nov 2012 17:27:53 +0000 Subject: Remove the use of LPPassManager. We can remove LPM because we dont need to run any additional loop passes on the new vector loop. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169016 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 35e2d05..d55b7bd 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -114,9 +114,8 @@ public: /// Ctor. SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li, DominatorTree *Dt, DataLayout *Dl, - LPPassManager *Lpm, unsigned VecWidth): - OrigLoop(Orig), SE(Se), LI(Li), DT(Dt), DL(Dl), LPM(Lpm), VF(VecWidth), + OrigLoop(Orig), SE(Se), LI(Li), DT(Dt), DL(Dl), VF(VecWidth), Builder(Se->getContext()), Induction(0), OldInduction(0) { } // Perform the actual loop widening (vectorization). @@ -181,8 +180,6 @@ private: DominatorTree *DT; // Data Layout. DataLayout *DL; - // Loop Pass Manager; - LPPassManager *LPM; // The vectorization factor to use. unsigned VF; @@ -491,7 +488,7 @@ struct LoopVectorize : public LoopPass { "\n"); // If we decided that it is *legal* to vectorizer the loop then do it. - SingleBlockLoopVectorizer LB(L, SE, LI, DT, DL, &LPM, VF); + SingleBlockLoopVectorizer LB(L, SE, LI, DT, DL, VF); LB.vectorize(&LVL); DEBUG(verifyFunction(*L->getHeader()->getParent())); @@ -969,19 +966,22 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // Get ready to start creating new instructions into the vectorized body. Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); - // Register the new loop. + // Create and register the new vector loop. 
Loop* Lp = new Loop(); - LPM->insertLoop(Lp, OrigLoop->getParentLoop()); - - Lp->addBasicBlockToLoop(VecBody, LI->getBase()); - Loop *ParentLoop = OrigLoop->getParentLoop(); + + // Insert the new loop into the loop nest and register the new basic blocks. if (ParentLoop) { + ParentLoop->addChildLoop(Lp); ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase()); ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); + } else { + LI->addTopLevelLoop(Lp); } + Lp->addBasicBlockToLoop(VecBody, LI->getBase()); + // Save the state. LoopVectorPreHeader = VectorPH; LoopScalarPreHeader = ScalarPH; -- cgit v1.1 From d6964741f52d2553517209319fa214afe8b3812b Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Fri, 30 Nov 2012 22:37:11 +0000 Subject: minor cleanups git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169048 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index d55b7bd..8be31dc 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -712,7 +712,7 @@ SingleBlockLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, const SCEV *Sc = SE->getSCEV(Ptr); if (SE->isLoopInvariant(Sc, OrigLoop)) { - DEBUG(dbgs() << "LV1: Adding RT check for a loop invariant ptr:" << + DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" << *Ptr <<"\n"); Starts.push_back(Ptr); Ends.push_back(Ptr); @@ -1423,11 +1423,7 @@ void SingleBlockLoopVectorizer::updateAnalysis() { } bool LoopVectorizationLegality::canVectorize() { - if (!TheLoop->getLoopPreheader()) { - assert(false && "No preheader!!"); - DEBUG(dbgs() << "LV: Loop not normalized." 
<< "\n"); - return false; - } + assert(TheLoop->getLoopPreheader() && "No preheader!!"); // We can only vectorize single basic block loops. unsigned NumBlocks = TheLoop->getNumBlocks(); @@ -2008,9 +2004,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { case Instruction::AShr: case Instruction::And: case Instruction::Or: - case Instruction::Xor: { + case Instruction::Xor: return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy); - } case Instruction::Select: { SelectInst *SI = cast(I); const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); -- cgit v1.1 From d04a8d4b33ff316ca4cf961e06c9e312eff8e64f Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Mon, 3 Dec 2012 16:50:05 +0000 Subject: Use the new script to sort the includes of every file under lib. Sooooo many of these had incorrect or strange main module includes. I have manually inspected all of these, and fixed the main module include to be the nearest plausible thing I could find. If you own or care about any of these source files, I encourage you to take some time and check that these edits were sensible. I can't have broken anything (I strictly added headers, and reordered them, never removed), but they may not be the headers you'd really like to identify as containing the API being implemented. Many forward declarations and missing includes were added to a header files to allow them to parse cleanly when included first. The main module rule does in fact have its merits. 
=] git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169131 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 37 +++++++++++++++--------------- 1 file changed, 19 insertions(+), 18 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 8be31dc..33b76ba 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -43,35 +43,36 @@ //===----------------------------------------------------------------------===// #define LV_NAME "loop-vectorize" #define DEBUG_TYPE LV_NAME -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Instructions.h" -#include "llvm/LLVMContext.h" -#include "llvm/Pass.h" -#include "llvm/Analysis/LoopPass.h" -#include "llvm/Value.h" -#include "llvm/Function.h" -#include "llvm/Analysis/Verifier.h" -#include "llvm/Module.h" -#include "llvm/Type.h" +#include "llvm/Transforms/Vectorize.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" -#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/Dominators.h" -#include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/TargetTransformInfo.h" +#include "llvm/Analysis/Verifier.h" +#include "llvm/Constants.h" +#include "llvm/DataLayout.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/Instructions.h" +#include 
"llvm/LLVMContext.h" +#include "llvm/Module.h" +#include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/DataLayout.h" +#include "llvm/TargetTransformInfo.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Type.h" +#include "llvm/Value.h" #include using namespace llvm; -- cgit v1.1 From dd8b1015c8e3dd8f7f9d6d5d63804d2e5ab89c20 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Mon, 3 Dec 2012 21:06:35 +0000 Subject: Add initial support for IF-conversion. This patch implements the first 1/3, which is the legality of the if-conversion transformation. The next step is to implement the cost-model for the if-converted code as well as the vectorization itself. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169152 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 387 +++++++++++++++++++---------- 1 file changed, 253 insertions(+), 134 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 33b76ba..8e2538f 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -80,6 +80,10 @@ static cl::opt VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, cl::desc("Set the default vectorization width. Zero is autoselect.")); +static cl::opt +EnableIfConversion("enable-if-conversion", cl::init(false), cl::Hidden, + cl::desc("Enable if-conversion during vectorization.")); + /// We don't vectorize loops with a known constant trip count below this number. 
const unsigned TinyTripCountThreshold = 16; @@ -219,16 +223,17 @@ private: /// * Memory checks - The code in canVectorizeMemory checks if vectorization /// will change the order of memory accesses in a way that will change the /// correctness of the program. -/// * Scalars checks - The code in canVectorizeBlock checks for a number -/// of different conditions, such as the availability of a single induction -/// variable, that all types are supported and vectorize-able, etc. -/// This code reflects the capabilities of SingleBlockLoopVectorizer. +/// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory +/// checks for a number of different conditions, such as the availability of a +/// single induction variable, that all types are supported and vectorize-able, +/// etc. This code reflects the capabilities of SingleBlockLoopVectorizer. /// This class is also used by SingleBlockLoopVectorizer for identifying /// induction variable and the different reduction variables. class LoopVectorizationLegality { public: - LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl): - TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { } + LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl, + DominatorTree *Dt): + TheLoop(Lp), SE(Se), DL(Dl), DT(Dt), Induction(0) { } /// This represents the kinds of reductions that we support. enum ReductionKind { @@ -277,7 +282,7 @@ public: const SCEV *Sc = SE->getSCEV(Ptr); const SCEVAddRecExpr *AR = dyn_cast(Sc); assert(AR && "Invalid addrec expression"); - const SCEV *Ex = SE->getExitCount(Lp, Lp->getHeader()); + const SCEV *Ex = SE->getExitCount(Lp, Lp->getLoopLatch()); const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE); Pointers.push_back(Ptr); Starts.push_back(AR->getStart()); @@ -334,13 +339,28 @@ private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count /// and we only need to check individual instructions. 
- bool canVectorizeBlock(BasicBlock &BB); + bool canVectorizeInstrs(BasicBlock &BB); /// When we vectorize loops we may change the order in which /// we read and write from memory. This method checks if it is /// legal to vectorize the code, considering only memory constrains. /// Returns true if BB is vectorizable - bool canVectorizeMemory(BasicBlock &BB); + bool canVectorizeMemory(); + + /// Return true if we can vectorize this loop using the IF-conversion + /// transformation. + bool canVectorizeWithIfConvert(); + + /// Collect the variables that need to stay uniform after vectorization. + void collectLoopUniforms(); + + /// Return true if the block BB needs to be predicated in order for the loop + /// to be vectorized. + bool blockNeedsPredication(BasicBlock *BB); + + /// return true if all of the instructions in the block can be speculatively + /// executed. + bool blockCanBePredicated(BasicBlock *BB); /// Returns True, if 'Phi' is the kind of reduction variable for type /// 'Kind'. If this is a reduction variable, it adds it to ReductionList. @@ -359,6 +379,8 @@ private: ScalarEvolution *SE; /// DataLayout analysis. DataLayout *DL; + // Dominators. + DominatorTree *DT; // --- vectorization state --- // @@ -458,7 +480,7 @@ struct LoopVectorize : public LoopPass { L->getHeader()->getParent()->getName() << "\"\n"); // Check if it is legal to vectorize the loop. - LoopVectorizationLegality LVL(L, SE, DL); + LoopVectorizationLegality LVL(L, SE, DL, DT); if (!LVL.canVectorize()) { DEBUG(dbgs() << "LV: Not vectorizing.\n"); return false; @@ -1423,41 +1445,91 @@ void SingleBlockLoopVectorizer::updateAnalysis() { DEBUG(DT->verifyAnalysis()); } + +bool LoopVectorizationLegality::canVectorizeWithIfConvert() { + if (!EnableIfConversion) + return false; + + assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable"); + std::vector &LoopBlocks = TheLoop->getBlocksVector(); + + // Collect the blocks that need predication. 
+ for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) { + BasicBlock *BB = LoopBlocks[i]; + + // We must have at most two predecessors because we need to convert + // all PHIs to selects. + unsigned Preds = std::distance(pred_begin(BB), pred_end(BB)); + if (Preds > 2) + return false; + + // We must be able to predicate all blocks that needs to be predicated. + if (blockNeedsPredication(BB) && !blockCanBePredicated(BB)) + return false; + } + + // We can if-convert this loop. + return true; +} + bool LoopVectorizationLegality::canVectorize() { assert(TheLoop->getLoopPreheader() && "No preheader!!"); - // We can only vectorize single basic block loops. + // We can only vectorize innermost loops. + if (TheLoop->getSubLoopsVector().size()) + return false; + + // We must have a single backedge. + if (TheLoop->getNumBackEdges() != 1) + return false; + + // We must have a single exiting block. + if (!TheLoop->getExitingBlock()) + return false; + unsigned NumBlocks = TheLoop->getNumBlocks(); - if (NumBlocks != 1) { - DEBUG(dbgs() << "LV: Too many blocks:" << NumBlocks << "\n"); + + // Check if we can if-convert non single-bb loops. + if (NumBlocks != 1 && !canVectorizeWithIfConvert()) { + DEBUG(dbgs() << "LV: Can't if-convert the loop.\n"); return false; } // We need to have a loop header. - BasicBlock *BB = TheLoop->getHeader(); - DEBUG(dbgs() << "LV: Found a loop: " << BB->getName() << "\n"); + BasicBlock *Header = TheLoop->getHeader(); + BasicBlock *Latch = TheLoop->getLoopLatch(); + DEBUG(dbgs() << "LV: Found a loop: " << Header->getName() << "\n"); // ScalarEvolution needs to be able to find the exit count. - const SCEV *ExitCount = SE->getExitCount(TheLoop, BB); + const SCEV *ExitCount = SE->getExitCount(TheLoop, Latch); if (ExitCount == SE->getCouldNotCompute()) { DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); return false; } // Do not loop-vectorize loops with a tiny trip count. 
- unsigned TC = SE->getSmallConstantTripCount(TheLoop, BB); + unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch); if (TC > 0u && TC < TinyTripCountThreshold) { DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " << "This loop is not worth vectorizing.\n"); return false; } + // Check if we can vectorize the instructions and CFG in this loop. + if (!canVectorizeInstrs(*Header)) { + DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n"); + return false; + } + // Go over each instruction and look at memory deps. - if (!canVectorizeBlock(*BB)) { - DEBUG(dbgs() << "LV: Can't vectorize this loop header\n"); + if (!canVectorizeMemory()) { + DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n"); return false; } + // Collect all of the variables that remain uniform after vectorization. + collectLoopUniforms(); + DEBUG(dbgs() << "LV: We can vectorize this loop" << (PtrRtCheck.Need ? " (with a runtime bound check)" : "") <<"!\n"); @@ -1468,122 +1540,138 @@ bool LoopVectorizationLegality::canVectorize() { return true; } -bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { - +bool LoopVectorizationLegality::canVectorizeInstrs(BasicBlock &BB) { BasicBlock *PreHeader = TheLoop->getLoopPreheader(); + BasicBlock *Header = TheLoop->getHeader(); - // Scan the instructions in the block and look for hazards. - for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { - Instruction *I = it; + // For each block in the loop + for (Loop::block_iterator bb = TheLoop->block_begin(), + be = TheLoop->block_end(); bb != be; ++bb) { - if (PHINode *Phi = dyn_cast(I)) { - // This should not happen because the loop should be normalized. - if (Phi->getNumIncomingValues() != 2) { - DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); - return false; - } + // Scan the instructions in the block and look for hazards. 
+ for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { + Instruction *I = it; - // This is the value coming from the preheader. - Value *StartValue = Phi->getIncomingValueForBlock(PreHeader); + if (PHINode *Phi = dyn_cast(I)) { + // This should not happen because the loop should be normalized. + if (Phi->getNumIncomingValues() != 2) { + DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); + return false; + } - // We only look at integer and pointer phi nodes. - if (Phi->getType()->isPointerTy() && isInductionVariable(Phi)) { - DEBUG(dbgs() << "LV: Found a pointer induction variable.\n"); - Inductions[Phi] = StartValue; - continue; - } else if (!Phi->getType()->isIntegerTy()) { - DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); - return false; - } + // If this PHINode is not in the header block, then we know that we + // can convert it to select during if-conversion. + if (*bb != Header) { + continue; + } - // Handle integer PHIs: - if (isInductionVariable(Phi)) { - if (Induction) { - DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n"); + // This is the value coming from the preheader. + Value *StartValue = Phi->getIncomingValueForBlock(PreHeader); + + // We only look at integer and pointer phi nodes. 
+ if (Phi->getType()->isPointerTy() && isInductionVariable(Phi)) { + DEBUG(dbgs() << "LV: Found a pointer induction variable.\n"); + Inductions[Phi] = StartValue; + continue; + } else if (!Phi->getType()->isIntegerTy()) { + DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); return false; } - DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n"); - Induction = Phi; - Inductions[Phi] = StartValue; - continue; - } - if (AddReductionVar(Phi, IntegerAdd)) { - DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n"); - continue; - } - if (AddReductionVar(Phi, IntegerMult)) { - DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n"); - continue; - } - if (AddReductionVar(Phi, IntegerOr)) { - DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n"); - continue; - } - if (AddReductionVar(Phi, IntegerAnd)) { - DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n"); - continue; - } - if (AddReductionVar(Phi, IntegerXor)) { - DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n"); - continue; - } - DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n"); - return false; - }// end of PHI handling + // Handle integer PHIs: + if (isInductionVariable(Phi)) { + if (Induction) { + DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n"); + return false; + } + DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n"); + Induction = Phi; + Inductions[Phi] = StartValue; + continue; + } + if (AddReductionVar(Phi, IntegerAdd)) { + DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n"); + continue; + } + if (AddReductionVar(Phi, IntegerMult)) { + DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n"); + continue; + } + if (AddReductionVar(Phi, IntegerOr)) { + DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n"); + continue; + } + if (AddReductionVar(Phi, IntegerAnd)) { + DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n"); + continue; + } + if (AddReductionVar(Phi, 
IntegerXor)) { + DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n"); + continue; + } - // We still don't handle functions. - CallInst *CI = dyn_cast(I); - if (CI) { - DEBUG(dbgs() << "LV: Found a call site.\n"); - return false; - } + DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n"); + return false; + }// end of PHI handling - // We do not re-vectorize vectors. - if (!VectorType::isValidElementType(I->getType()) && - !I->getType()->isVoidTy()) { - DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n"); - return false; - } + // We still don't handle functions. + CallInst *CI = dyn_cast(I); + if (CI) { + DEBUG(dbgs() << "LV: Found a call site.\n"); + return false; + } - // Reduction instructions are allowed to have exit users. - // All other instructions must not have external users. - if (!AllowedExit.count(I)) - //Check that all of the users of the loop are inside the BB. - for (Value::use_iterator it = I->use_begin(), e = I->use_end(); - it != e; ++it) { - Instruction *U = cast(*it); - // This user may be a reduction exit value. - BasicBlock *Parent = U->getParent(); - if (Parent != &BB) { - DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n"); - return false; + // We do not re-vectorize vectors. + if (!VectorType::isValidElementType(I->getType()) && + !I->getType()->isVoidTy()) { + DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n"); + return false; + } + + // Reduction instructions are allowed to have exit users. + // All other instructions must not have external users. + if (!AllowedExit.count(I)) + //Check that all of the users of the loop are inside the BB. + for (Value::use_iterator it = I->use_begin(), e = I->use_end(); + it != e; ++it) { + Instruction *U = cast(*it); + // This user may be a reduction exit value. + if (!TheLoop->contains(U)) { + DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n"); + return false; + } } - } - } // next instr. + } // next instr. 
+ + } if (!Induction) { DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); assert(getInductionVars()->size() && "No induction variables"); } - // Don't vectorize if the memory dependencies do not allow vectorization. - if (!canVectorizeMemory(BB)) - return false; + return true; +} +void LoopVectorizationLegality::collectLoopUniforms() { // We now know that the loop is vectorizable! // Collect variables that will remain uniform after vectorization. std::vector Worklist; + BasicBlock *Latch = TheLoop->getLoopLatch(); + // Start with the conditional branch and walk up the block. - Worklist.push_back(BB.getTerminator()->getOperand(0)); + Worklist.push_back(Latch->getTerminator()->getOperand(0)); while (Worklist.size()) { Instruction *I = dyn_cast(Worklist.back()); Worklist.pop_back(); - // Look at instructions inside this block. Stop when reaching PHI nodes. - if (!I || I->getParent() != &BB || isa(I)) + // Look at instructions inside this loop. + // Stop when reaching PHI nodes. + // TODO: we need to prevent loops but we do need to follow PHIs inside this + // loop. + if (!I || !TheLoop->contains(I) || isa(I)) continue; // This is a known uniform. @@ -1594,11 +1682,9 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { Worklist.push_back(I->getOperand(i)); } } - - return true; } -bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { +bool LoopVectorizationLegality::canVectorizeMemory() { typedef SmallVector ValueVector; typedef SmallPtrSet ValueSet; // Holds the Load and Store *instructions*. @@ -1607,35 +1693,40 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { PtrRtCheck.Pointers.clear(); PtrRtCheck.Need = false; - // Scan the BB and collect legal loads and stores. - for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { - Instruction *I = it; - - // If this is a load, save it. If this instruction can read from memory - // but is not a load, then we quit. 
Notice that we don't handle function - // calls that read or write. - if (I->mayReadFromMemory()) { - LoadInst *Ld = dyn_cast(I); - if (!Ld) return false; - if (!Ld->isSimple()) { - DEBUG(dbgs() << "LV: Found a non-simple load.\n"); - return false; + // For each block. + for (Loop::block_iterator bb = TheLoop->block_begin(), + be = TheLoop->block_end(); bb != be; ++bb) { + + // Scan the BB and collect legal loads and stores. + for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; + ++it) { + + // If this is a load, save it. If this instruction can read from memory + // but is not a load, then we quit. Notice that we don't handle function + // calls that read or write. + if (it->mayReadFromMemory()) { + LoadInst *Ld = dyn_cast(it); + if (!Ld) return false; + if (!Ld->isSimple()) { + DEBUG(dbgs() << "LV: Found a non-simple load.\n"); + return false; + } + Loads.push_back(Ld); + continue; } - Loads.push_back(Ld); - continue; - } - // Save store instructions. Abort if other instructions write to memory. - if (I->mayWriteToMemory()) { - StoreInst *St = dyn_cast(I); - if (!St) return false; - if (!St->isSimple()) { - DEBUG(dbgs() << "LV: Found a non-simple store.\n"); - return false; + // Save 'store' instructions. Abort if other instructions write to memory. + if (it->mayWriteToMemory()) { + StoreInst *St = dyn_cast(it); + if (!St) return false; + if (!St->isSimple()) { + DEBUG(dbgs() << "LV: Found a non-simple store.\n"); + return false; + } + Stores.push_back(St); } - Stores.push_back(St); - } - } // next instr. + } // next instr. + } // next block. // Now we have two lists that hold the loads and the stores. // Next, we find the pointers that they use. 
@@ -1908,6 +1999,34 @@ bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { return (C->getValue()->equalsInt(Size)); } +bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { + assert(TheLoop->contains(BB) && "Unknown block used"); + + // Blocks that do not dominate the latch need predication. + BasicBlock* Latch = TheLoop->getLoopLatch(); + return !DT->dominates(BB, Latch); +} + +bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) { + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + // We don't predicate loads/stores at the moment. + if (it->mayReadFromMemory() || it->mayWriteToMemory() || it->mayThrow()) + return false; + + // The isntructions below can trap. + switch (it->getOpcode()) { + default: continue; + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: + return false; + } + } + + return true; +} + bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) { const SCEV *PhiScev = SE->getSCEV(Ptr); const SCEVAddRecExpr *AR = dyn_cast(PhiScev); -- cgit v1.1 From 56f1ce22dc80cde198afb3ed153c109b951217fa Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Mon, 3 Dec 2012 21:33:08 +0000 Subject: Now that we have a basic if-conversion infrastructure we can rename the "single basic block loop vectorizer" to "innermost loop vectorizer". git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169158 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 36 +++++++++++++++--------------- 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 8e2538f..ecf19b7 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -20,7 +20,7 @@ // 1. The main loop pass that drives the different parts. // 2. 
LoopVectorizationLegality - A unit that checks for the legality // of the vectorization. -// 3. SingleBlockLoopVectorizer - A unit that performs the actual +// 3. InnerLoopVectorizer - A unit that performs the actual // widening of instructions. // 4. LoopVectorizationCostModel - A unit that checks for the profitability // of vectorization. It decides on the optimal vector width, which @@ -100,7 +100,7 @@ namespace { class LoopVectorizationLegality; class LoopVectorizationCostModel; -/// SingleBlockLoopVectorizer vectorizes loops which contain only one basic +/// InnerLoopVectorizer vectorizes loops which contain only one basic /// block to a specified vectorization factor (VF). /// This class performs the widening of scalars into vectors, or multiple /// scalars. This class also implements the following features: @@ -109,15 +109,15 @@ class LoopVectorizationCostModel; /// * It handles the code generation for reduction variables. /// * Scalarization (implementation using scalars) of un-vectorizable /// instructions. -/// SingleBlockLoopVectorizer does not perform any vectorization-legality +/// InnerLoopVectorizer does not perform any vectorization-legality /// checks, and relies on the caller to check for the different legality -/// aspects. The SingleBlockLoopVectorizer relies on the +/// aspects. The InnerLoopVectorizer relies on the /// LoopVectorizationLegality class to provide information about the induction /// and reduction variables that were found to a given vectorization factor. -class SingleBlockLoopVectorizer { +class InnerLoopVectorizer { public: /// Ctor. 
- SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li, + InnerLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li, DominatorTree *Dt, DataLayout *Dl, unsigned VecWidth): OrigLoop(Orig), SE(Se), LI(Li), DT(Dt), DL(Dl), VF(VecWidth), @@ -226,8 +226,8 @@ private: /// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory /// checks for a number of different conditions, such as the availability of a /// single induction variable, that all types are supported and vectorize-able, -/// etc. This code reflects the capabilities of SingleBlockLoopVectorizer. -/// This class is also used by SingleBlockLoopVectorizer for identifying +/// etc. This code reflects the capabilities of InnerLoopVectorizer. +/// This class is also used by InnerLoopVectorizer for identifying /// induction variable and the different reduction variables. class LoopVectorizationLegality { public: @@ -511,7 +511,7 @@ struct LoopVectorize : public LoopPass { "\n"); // If we decided that it is *legal* to vectorizer the loop then do it. - SingleBlockLoopVectorizer LB(L, SE, LI, DT, DL, VF); + InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF); LB.vectorize(&LVL); DEBUG(verifyFunction(*L->getHeader()->getParent())); @@ -531,7 +531,7 @@ struct LoopVectorize : public LoopPass { }; -Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) { +Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { // Create the types. 
LLVMContext &C = V->getContext(); Type *VTy = VectorType::get(V->getType(), VF); @@ -563,7 +563,7 @@ Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) { return Shuf; } -Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) { +Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val) { assert(Val->getType()->isVectorTy() && "Must be a vector"); assert(Val->getType()->getScalarType()->isIntegerTy() && "Elem must be an integer"); @@ -622,7 +622,7 @@ bool LoopVectorizationLegality::isUniform(Value *V) { return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop)); } -Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) { +Value *InnerLoopVectorizer::getVectorValue(Value *V) { assert(V != Induction && "The new induction variable should not be used."); assert(!V->getType()->isVectorTy() && "Can't widen a vector"); // If we saved a vectorized copy of V, use it. @@ -637,11 +637,11 @@ Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) { } Constant* -SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) { +InnerLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) { return ConstantVector::getSplat(VF, ConstantInt::get(ScalarTy, Val, true)); } -void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { +void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); // Holds vector parameters or scalars, in case of uniform vals. 
SmallVector Params; @@ -712,7 +712,7 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { } Value* -SingleBlockLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, +InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, Instruction *Loc) { LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck = Legal->getRuntimePointerCheck(); @@ -773,7 +773,7 @@ SingleBlockLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, } void -SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { +InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { /* In this function we generate a new loop. The new loop will contain the vectorized instructions while the old loop will continue to run the @@ -1037,7 +1037,7 @@ getReductionIdentity(LoopVectorizationLegality::ReductionKind K) { } void -SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { +InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { //===------------------------------------------------===// // // Notice: any optimization or new instruction that go @@ -1427,7 +1427,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { }// end of for each redux variable. } -void SingleBlockLoopVectorizer::updateAnalysis() { +void InnerLoopVectorizer::updateAnalysis() { // Forget the original basic block. SE->forgetLoop(OrigLoop); -- cgit v1.1 From fa72ee729a989ff340672034fd77832c1fd76326 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Mon, 3 Dec 2012 22:46:31 +0000 Subject: IF-conversion: teach the cost-model how to grade if-converted loops. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169171 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 41 +++++++++++++++++++----------- 1 file changed, 26 insertions(+), 15 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index ecf19b7..efbf0fd 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -321,6 +321,10 @@ public: /// Returns the induction variables found in the loop. InductionList *getInductionVars() { return &Inductions; } + /// Return true if the block BB needs to be predicated in order for the loop + /// to be vectorized. + bool blockNeedsPredication(BasicBlock *BB); + /// Check if this pointer is consecutive when vectorizing. This happens /// when the last index of the GEP is the induction variable, or that the /// pointer itself is an induction variable. @@ -354,10 +358,6 @@ private: /// Collect the variables that need to stay uniform after vectorization. void collectLoopUniforms(); - /// Return true if the block BB needs to be predicated in order for the loop - /// to be vectorized. - bool blockNeedsPredication(BasicBlock *BB); - /// return true if all of the instructions in the block can be speculatively /// executed. bool blockCanBePredicated(BasicBlock *BB); @@ -2064,19 +2064,29 @@ LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) { } unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { - // We can only estimate the cost of single basic block loops. - assert(1 == TheLoop->getNumBlocks() && "Too many blocks in loop"); - - BasicBlock *BB = TheLoop->getHeader(); unsigned Cost = 0; - // For each instruction in the old loop. 
- for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { - Instruction *Inst = it; - unsigned C = getInstructionCost(Inst, VF); - Cost += C; - DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF "<< VF << - " For instruction: "<< *Inst << "\n"); + // For each block. + for (Loop::block_iterator bb = TheLoop->block_begin(), + be = TheLoop->block_end(); bb != be; ++bb) { + unsigned BlockCost = 0; + BasicBlock *BB = *bb; + + // For each instruction in the old loop. + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + + unsigned C = getInstructionCost(it, VF); + Cost += C; + DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF " << + VF << " For instruction: "<< *it << "\n"); + } + + // TODO: if-converted blocks can have a high-nest level. We need to + // calculate the loop nest level and multiply the cost accordingly. + if (Legal->blockNeedsPredication(*bb)) + BlockCost *= 2; + + Cost += BlockCost; } return Cost; @@ -2106,6 +2116,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { return VTTI->getCFInstrCost(I->getOpcode()); } case Instruction::PHI: + //TODO: IF-converted IFs become selects. return 0; case Instruction::Add: case Instruction::FAdd: -- cgit v1.1 From b8f842dce47e745b37505f156854755d8ad4c929 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Mon, 3 Dec 2012 22:57:09 +0000 Subject: minor renaming, documentation and cleanups. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169175 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 46 ++++++++++++++---------------- 1 file changed, 21 insertions(+), 25 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index efbf0fd..4dfe906 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -118,8 +118,7 @@ class InnerLoopVectorizer { public: /// Ctor. InnerLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li, - DominatorTree *Dt, DataLayout *Dl, - unsigned VecWidth): + DominatorTree *Dt, DataLayout *Dl, unsigned VecWidth): OrigLoop(Orig), SE(Se), LI(Li), DT(Dt), DL(Dl), VF(VecWidth), Builder(Se->getContext()), Induction(0), OldInduction(0) { } @@ -343,12 +342,12 @@ private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count /// and we only need to check individual instructions. - bool canVectorizeInstrs(BasicBlock &BB); + bool canVectorizeInstrs(); /// When we vectorize loops we may change the order in which /// we read and write from memory. This method checks if it is /// legal to vectorize the code, considering only memory constrains. - /// Returns true if BB is vectorizable + /// Returns true if the loop is vectorizable bool canVectorizeMemory(); /// Return true if we can vectorize this loop using the IF-conversion @@ -358,7 +357,7 @@ private: /// Collect the variables that need to stay uniform after vectorization. void collectLoopUniforms(); - /// return true if all of the instructions in the block can be speculatively + /// Return true if all of the instructions in the block can be speculatively /// executed. 
bool blockCanBePredicated(BasicBlock *BB); @@ -1463,7 +1462,7 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { if (Preds > 2) return false; - // We must be able to predicate all blocks that needs to be predicated. + // We must be able to predicate all blocks that need to be predicated. if (blockNeedsPredication(BB) && !blockCanBePredicated(BB)) return false; } @@ -1516,7 +1515,7 @@ bool LoopVectorizationLegality::canVectorize() { } // Check if we can vectorize the instructions and CFG in this loop. - if (!canVectorizeInstrs(*Header)) { + if (!canVectorizeInstrs()) { DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n"); return false; } @@ -1527,7 +1526,7 @@ bool LoopVectorizationLegality::canVectorize() { return false; } - // Collect all of the variables that remain uniform after vectorization. + // Collect all of the variables that remain uniform after vectorization. collectLoopUniforms(); DEBUG(dbgs() << "LV: We can vectorize this loop" << @@ -1540,19 +1539,19 @@ bool LoopVectorizationLegality::canVectorize() { return true; } -bool LoopVectorizationLegality::canVectorizeInstrs(BasicBlock &BB) { +bool LoopVectorizationLegality::canVectorizeInstrs() { BasicBlock *PreHeader = TheLoop->getLoopPreheader(); BasicBlock *Header = TheLoop->getHeader(); - // For each block in the loop + // For each block in the loop. for (Loop::block_iterator bb = TheLoop->block_begin(), be = TheLoop->block_end(); bb != be; ++bb) { // Scan the instructions in the block and look for hazards. - for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { - Instruction *I = it; + for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; + ++it) { - if (PHINode *Phi = dyn_cast(I)) { + if (PHINode *Phi = dyn_cast(it)) { // This should not happen because the loop should be normalized. 
if (Phi->getNumIncomingValues() != 2) { DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); @@ -1561,9 +1560,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs(BasicBlock &BB) { // If this PHINode is not in the header block, then we know that we // can convert it to select during if-conversion. - if (*bb != Header) { + if (*bb != Header) continue; - } // This is the value coming from the preheader. Value *StartValue = Phi->getIncomingValueForBlock(PreHeader); @@ -1615,26 +1613,26 @@ bool LoopVectorizationLegality::canVectorizeInstrs(BasicBlock &BB) { }// end of PHI handling // We still don't handle functions. - CallInst *CI = dyn_cast(I); + CallInst *CI = dyn_cast(it); if (CI) { DEBUG(dbgs() << "LV: Found a call site.\n"); return false; } // We do not re-vectorize vectors. - if (!VectorType::isValidElementType(I->getType()) && - !I->getType()->isVoidTy()) { + if (!VectorType::isValidElementType(it->getType()) && + !it->getType()->isVoidTy()) { DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n"); return false; } // Reduction instructions are allowed to have exit users. // All other instructions must not have external users. - if (!AllowedExit.count(I)) + if (!AllowedExit.count(it)) //Check that all of the users of the loop are inside the BB. - for (Value::use_iterator it = I->use_begin(), e = I->use_end(); - it != e; ++it) { - Instruction *U = cast(*it); + for (Value::use_iterator I = it->use_begin(), E = it->use_end(); + I != E; ++I) { + Instruction *U = cast(*I); // This user may be a reduction exit value. if (!TheLoop->contains(U)) { DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n"); @@ -1657,7 +1655,6 @@ void LoopVectorizationLegality::collectLoopUniforms() { // We now know that the loop is vectorizable! // Collect variables that will remain uniform after vectorization. std::vector Worklist; - BasicBlock *Latch = TheLoop->getLoopLatch(); // Start with the conditional branch and walk up the block. 
@@ -1669,8 +1666,7 @@ void LoopVectorizationLegality::collectLoopUniforms() { // Look at instructions inside this loop. // Stop when reaching PHI nodes. - // TODO: we need to prevent loops but we do need to follow PHIs inside this - // loop. + // TODO: we need to follow values all over the loop, not only in this block. if (!I || !TheLoop->contains(I) || isa(I)) continue; -- cgit v1.1 From 36422d887672dc9a21d9add99c675bdc9e4807dd Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Tue, 4 Dec 2012 00:49:28 +0000 Subject: Fix whitespace. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169194 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 4dfe906..b720c97 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1091,7 +1091,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // After broadcasting the induction variable we need to make the // vector consecutive by adding 0, 1, 2 ... Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted); - + WidenMap[OldInduction] = ConsecutiveInduction; continue; } @@ -2067,7 +2067,7 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { be = TheLoop->block_end(); bb != be; ++bb) { unsigned BlockCost = 0; BasicBlock *BB = *bb; - + // For each instruction in the old loop. for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { @@ -2081,7 +2081,7 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { // calculate the loop nest level and multiply the cost accordingly. 
if (Legal->blockNeedsPredication(*bb)) BlockCost *= 2; - + Cost += BlockCost; } @@ -2265,4 +2265,3 @@ namespace llvm { return new LoopVectorize(); } } - -- cgit v1.1 From 915584ed136c29b8fd9304268967f3f359befbb0 Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Tue, 4 Dec 2012 00:49:34 +0000 Subject: LoopVectorize.cpp: Suppress a warning. [-Wunused-variable] git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169195 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index b720c97..0e33228 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1495,9 +1495,9 @@ bool LoopVectorizationLegality::canVectorize() { } // We need to have a loop header. - BasicBlock *Header = TheLoop->getHeader(); BasicBlock *Latch = TheLoop->getLoopLatch(); - DEBUG(dbgs() << "LV: Found a loop: " << Header->getName() << "\n"); + DEBUG(dbgs() << "LV: Found a loop: " << + TheLoop->getHeader()->getName() << "\n"); // ScalarEvolution needs to be able to find the exit count. const SCEV *ExitCount = SE->getExitCount(TheLoop, Latch); -- cgit v1.1 From 319d594e22c3db55114f233ca398f9760e3f6ed9 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Tue, 4 Dec 2012 06:15:11 +0000 Subject: Add the last part that is needed for vectorization of if-converted code. Added the code that actually performs the if-conversion during vectorization. We can now vectorize this code: for (int i=0; i<n; ++i) { unsigned k = 0; if (a[i] > b[i]) <------ IF inside the loop. k = k * 5 + 3; a[i] = k; <---- K is a phi node that becomes vector-select.
} git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169217 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 413 ++++++++++++++++++----------- 1 file changed, 251 insertions(+), 162 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 0e33228..f538e08 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -50,6 +50,7 @@ #include "llvm/Analysis/AliasSetTracker.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" @@ -134,6 +135,9 @@ public: } private: + /// A small list of PHINodes. + typedef SmallVector PhiVector; + /// Add code that checks at runtime if the accessed arrays overlap. /// Returns the comperator value or NULL if no check is needed. Value *addRuntimeCheck(LoopVectorizationLegality *Legal, @@ -142,6 +146,19 @@ private: void createEmptyLoop(LoopVectorizationLegality *Legal); /// Copy and widen the instructions from the old loop. void vectorizeLoop(LoopVectorizationLegality *Legal); + + /// A helper function that computes the predicate of the block BB, assuming + /// that the header block of the loop is set to True. It returns the *entry* + /// mask for the block BB. + Value *createBlockInMask(BasicBlock *BB); + /// A helper function that computes the predicate of the edge between SRC + /// and DST. + Value *createEdgeMask(BasicBlock *Src, BasicBlock *Dst); + + /// A helper function to vectorize a single BB within the innermost loop. + void vectorizeBlockInLoop(LoopVectorizationLegality *Legal, BasicBlock *BB, + PhiVector *PV); + /// Insert the new loop to the loop hierarchy and pass manager /// and update the analysis passes. 
void updateAnalysis(); @@ -816,7 +833,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { DL->getIntPtrType(SE->getContext()); // Find the loop boundaries. - const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader()); + const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getLoopLatch()); assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); // Get the total trip count from the count by adding 1. @@ -838,7 +855,6 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { OldInduction->getIncomingValueForBlock(BypassBlock): ConstantInt::get(IdxTy, 0); - assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop"); assert(BypassBlock && "Invalid loop structure"); // Generate the code that checks in runtime if arrays overlap. @@ -1044,7 +1060,6 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // the cost-model. // //===------------------------------------------------===// - typedef SmallVector PhiVector; BasicBlock &BB = *OrigLoop->getHeader(); Constant *Zero = ConstantInt::get( IntegerType::getInt32Ty(BB.getContext()), 0); @@ -1059,24 +1074,220 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // construct the PHI. PhiVector RdxPHIsToFix; - // For each instruction in the old loop. - for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { - Instruction *Inst = it; + // Scan the loop in a topological order to ensure that defs are vectorized + // before users. + LoopBlocksDFS DFS(OrigLoop); + DFS.perform(LI); + + // Vectorize all of the blocks in the original loop. + for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), + be = DFS.endRPO(); bb != be; ++bb) + vectorizeBlockInLoop(Legal, *bb, &RdxPHIsToFix); + + // At this point every instruction in the original loop is widened to + // a vector form. We are almost done. Now, we need to fix the PHI nodes + // that we vectorized. 
The PHI nodes are currently empty because we did + // not want to introduce cycles. Notice that the remaining PHI nodes + // that we need to fix are reduction variables. + + // Create the 'reduced' values for each of the induction vars. + // The reduced values are the vector values that we scalarize and combine + // after the loop is finished. + for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end(); + it != e; ++it) { + PHINode *RdxPhi = *it; + PHINode *VecRdxPhi = dyn_cast(WidenMap[RdxPhi]); + assert(RdxPhi && "Unable to recover vectorized PHI"); + + // Find the reduction variable descriptor. + assert(Legal->getReductionVars()->count(RdxPhi) && + "Unable to find the reduction variable"); + LoopVectorizationLegality::ReductionDescriptor RdxDesc = + (*Legal->getReductionVars())[RdxPhi]; + + // We need to generate a reduction vector from the incoming scalar. + // To do so, we need to generate the 'identity' vector and overide + // one of the elements with the incoming scalar reduction. We need + // to do it in the vector-loop preheader. + Builder.SetInsertPoint(LoopBypassBlock->getTerminator()); + + // This is the vector-clone of the value that leaves the loop. + Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr); + Type *VecTy = VectorExit->getType(); + + // Find the reduction identity variable. Zero for addition, or, xor, + // one for multiplication, -1 for And. + Constant *Identity = getUniformVector(getReductionIdentity(RdxDesc.Kind), + VecTy->getScalarType()); + + // This vector is the Identity vector where the first element is the + // incoming scalar reduction. + Value *VectorStart = Builder.CreateInsertElement(Identity, + RdxDesc.StartValue, Zero); + + // Fix the vector-loop phi. + // We created the induction variable so we know that the + // preheader is the first entry. + BasicBlock *VecPreheader = Induction->getIncomingBlock(0); + + // Reductions do not have to start at zero. They can start with + // any loop invariant values. 
+ VecRdxPhi->addIncoming(VectorStart, VecPreheader); + unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody); + Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx)); + VecRdxPhi->addIncoming(Val, LoopVectorBody); + + // Before each round, move the insertion point right between + // the PHIs and the values we are going to write. + // This allows us to write both PHINodes and the extractelement + // instructions. + Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); + + // This PHINode contains the vectorized reduction variable, or + // the initial value vector, if we bypass the vector loop. + PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); + NewPhi->addIncoming(VectorStart, LoopBypassBlock); + NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody); + + // Extract the first scalar. + Value *Scalar0 = + Builder.CreateExtractElement(NewPhi, Builder.getInt32(0)); + // Extract and reduce the remaining vector elements. + for (unsigned i=1; i < VF; ++i) { + Value *Scalar1 = + Builder.CreateExtractElement(NewPhi, Builder.getInt32(i)); + switch (RdxDesc.Kind) { + case LoopVectorizationLegality::IntegerAdd: + Scalar0 = Builder.CreateAdd(Scalar0, Scalar1); + break; + case LoopVectorizationLegality::IntegerMult: + Scalar0 = Builder.CreateMul(Scalar0, Scalar1); + break; + case LoopVectorizationLegality::IntegerOr: + Scalar0 = Builder.CreateOr(Scalar0, Scalar1); + break; + case LoopVectorizationLegality::IntegerAnd: + Scalar0 = Builder.CreateAnd(Scalar0, Scalar1); + break; + case LoopVectorizationLegality::IntegerXor: + Scalar0 = Builder.CreateXor(Scalar0, Scalar1); + break; + default: + llvm_unreachable("Unknown reduction operation"); + } + } + + // Now, we need to fix the users of the reduction variable + // inside and outside of the scalar remainder loop. + // We know that the loop is in LCSSA form. We need to update the + // PHI nodes in the exit blocks. 
+ for (BasicBlock::iterator LEI = LoopExitBlock->begin(), + LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { + PHINode *LCSSAPhi = dyn_cast(LEI); + if (!LCSSAPhi) continue; + + // All PHINodes need to have a single entry edge, or two if + // we already fixed them. + assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); + + // We found our reduction value exit-PHI. Update it with the + // incoming bypass edge. + if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) { + // Add an edge coming from the bypass. + LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock); + break; + } + }// end of the LCSSA phi scan. + + // Fix the scalar loop reduction variable with the incoming reduction sum + // from the vector body and from the backedge value. + int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody); + int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block. + (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0); + (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr); + }// end of for each redux variable. +} + +Value *InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { + assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) && + "Invalid edge"); + + Value *SrcMask = createBlockInMask(Src); + + // The terminator has to be a branch inst! + BranchInst *BI = dyn_cast(Src->getTerminator()); + assert(BI && "Unexpected terminator found"); + + Value *EdgeMask = SrcMask; + if (BI->isConditional()) { + EdgeMask = getVectorValue(BI->getCondition()); + if (BI->getSuccessor(0) != Dst) + EdgeMask = Builder.CreateNot(EdgeMask); + } + + return Builder.CreateAnd(EdgeMask, SrcMask); +} - switch (Inst->getOpcode()) { +Value *InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { + assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); + + // Loop incoming mask is all-one. 
+ if (OrigLoop->getHeader() == BB) + return getVectorValue( + ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1)); + + // This is the block mask. We OR all incoming edges, and with zero. + Value *BlockMask = getVectorValue( + ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0)); + + // For each pred: + for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) + BlockMask = Builder.CreateOr(BlockMask, createEdgeMask(*it, BB)); + + return BlockMask; +} + +void +InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, + BasicBlock *BB, PhiVector *PV) { + Constant *Zero = + ConstantInt::get(IntegerType::getInt32Ty(BB->getContext()), 0); + + // For each instruction in the old loop. + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + switch (it->getOpcode()) { case Instruction::Br: // Nothing to do for PHIs and BR, since we already took care of the // loop control flow instructions. continue; case Instruction::PHI:{ - PHINode* P = cast(Inst); + PHINode* P = cast(it); // Handle reduction variables: if (Legal->getReductionVars()->count(P)) { // This is phase one of vectorizing PHIs. - Type *VecTy = VectorType::get(Inst->getType(), VF); - WidenMap[Inst] = PHINode::Create(VecTy, 2, "vec.phi", - LoopVectorBody->getFirstInsertionPt()); - RdxPHIsToFix.push_back(P); + Type *VecTy = VectorType::get(it->getType(), VF); + WidenMap[it] = + PHINode::Create(VecTy, 2, "vec.phi", + LoopVectorBody->getFirstInsertionPt()); + PV->push_back(P); + continue; + } + + // Check for PHI nodes that are lowered to vector selects. + if (P->getParent() != OrigLoop->getHeader()) { + // We know that all PHIs in non header blocks are converted into + // selects, so we don't have to worry about the insertion order and we + // can just use the builder. + + // At this point we generate the predication tree. There may be + // duplications since this is a simple recursive scan, but future + // optimizations will clean it up. 
+ Value *Cond = createBlockInMask(P->getIncomingBlock(0)); + WidenMap[P] = + Builder.CreateSelect(Cond, + getVectorValue(P->getIncomingValue(0)), + getVectorValue(P->getIncomingValue(1)), + "predphi"); continue; } @@ -1099,8 +1310,8 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // Handle pointer inductions. assert(P->getType()->isPointerTy() && "Unexpected type."); Value *StartIdx = OldInduction ? - Legal->getInductionVars()->lookup(OldInduction) : - ConstantInt::get(Induction->getType(), 0); + Legal->getInductionVars()->lookup(OldInduction) : + ConstantInt::get(Induction->getType(), 0); // This is the pointer value coming into the loop. Value *StartPtr = Legal->getInductionVars()->lookup(P); @@ -1121,7 +1332,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { "insert.gep"); } - WidenMap[Inst] = VecVal; + WidenMap[it] = VecVal; continue; } case Instruction::Add: @@ -1143,13 +1354,13 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { case Instruction::Or: case Instruction::Xor: { // Just widen binops. - BinaryOperator *BinOp = dyn_cast(Inst); - Value *A = getVectorValue(Inst->getOperand(0)); - Value *B = getVectorValue(Inst->getOperand(1)); + BinaryOperator *BinOp = dyn_cast(it); + Value *A = getVectorValue(it->getOperand(0)); + Value *B = getVectorValue(it->getOperand(1)); // Use this vector value for all users of the original instruction. Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B); - WidenMap[Inst] = V; + WidenMap[it] = V; // Update the NSW, NUW and Exact flags. BinaryOperator *VecOp = cast(V); @@ -1165,7 +1376,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // Widen selects. // If the selector is loop invariant we can create a select // instruction with a scalar condition. Otherwise, use vector-select. 
- Value *Cond = Inst->getOperand(0); + Value *Cond = it->getOperand(0); bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop); // The condition can be loop invariant but still defined inside the @@ -1176,29 +1387,29 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { if (InvariantCond) Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0)); - Value *Op0 = getVectorValue(Inst->getOperand(1)); - Value *Op1 = getVectorValue(Inst->getOperand(2)); - WidenMap[Inst] = Builder.CreateSelect(Cond, Op0, Op1); + Value *Op0 = getVectorValue(it->getOperand(1)); + Value *Op1 = getVectorValue(it->getOperand(2)); + WidenMap[it] = Builder.CreateSelect(Cond, Op0, Op1); break; } case Instruction::ICmp: case Instruction::FCmp: { // Widen compares. Generate vector compares. - bool FCmp = (Inst->getOpcode() == Instruction::FCmp); - CmpInst *Cmp = dyn_cast(Inst); - Value *A = getVectorValue(Inst->getOperand(0)); - Value *B = getVectorValue(Inst->getOperand(1)); + bool FCmp = (it->getOpcode() == Instruction::FCmp); + CmpInst *Cmp = dyn_cast(it); + Value *A = getVectorValue(it->getOperand(0)); + Value *B = getVectorValue(it->getOperand(1)); if (FCmp) - WidenMap[Inst] = Builder.CreateFCmp(Cmp->getPredicate(), A, B); + WidenMap[it] = Builder.CreateFCmp(Cmp->getPredicate(), A, B); else - WidenMap[Inst] = Builder.CreateICmp(Cmp->getPredicate(), A, B); + WidenMap[it] = Builder.CreateICmp(Cmp->getPredicate(), A, B); break; } case Instruction::Store: { // Attempt to issue a wide store. - StoreInst *SI = dyn_cast(Inst); + StoreInst *SI = dyn_cast(it); Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF); Value *Ptr = SI->getPointerOperand(); unsigned Alignment = SI->getAlignment(); @@ -1210,7 +1421,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // This store does not use GEPs. 
if (!Legal->isConsecutivePtr(Ptr)) { - scalarizeInstruction(Inst); + scalarizeInstruction(it); break; } @@ -1237,7 +1448,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { } case Instruction::Load: { // Attempt to issue a wide load. - LoadInst *LI = dyn_cast(Inst); + LoadInst *LI = dyn_cast(it); Type *RetTy = VectorType::get(LI->getType(), VF); Value *Ptr = LI->getPointerOperand(); unsigned Alignment = LI->getAlignment(); @@ -1247,7 +1458,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // scalarize the load. bool Con = Legal->isConsecutivePtr(Ptr); if (Legal->isUniform(Ptr) || !Con) { - scalarizeInstruction(Inst); + scalarizeInstruction(it); break; } @@ -1272,7 +1483,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { LI = Builder.CreateLoad(Ptr); LI->setAlignment(Alignment); // Use this vector value for all users of the load. - WidenMap[Inst] = LI; + WidenMap[it] = LI; break; } case Instruction::ZExt: @@ -1288,144 +1499,22 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { case Instruction::FPTrunc: case Instruction::BitCast: { /// Vectorize bitcasts. - CastInst *CI = dyn_cast(Inst); - Value *A = getVectorValue(Inst->getOperand(0)); + CastInst *CI = dyn_cast(it); + Value *A = getVectorValue(it->getOperand(0)); Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF); - WidenMap[Inst] = Builder.CreateCast(CI->getOpcode(), A, DestTy); + WidenMap[it] = Builder.CreateCast(CI->getOpcode(), A, DestTy); break; } - + default: /// All other instructions are unsupported. Scalarize them. - scalarizeInstruction(Inst); + scalarizeInstruction(it); break; }// end of switch. }// end of for_each instr. - - // At this point every instruction in the original loop is widended to - // a vector form. We are almost done. Now, we need to fix the PHI nodes - // that we vectorized. The PHI nodes are currently empty because we did - // not want to introduce cycles. 
Notice that the remaining PHI nodes - // that we need to fix are reduction variables. - - // Create the 'reduced' values for each of the induction vars. - // The reduced values are the vector values that we scalarize and combine - // after the loop is finished. - for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end(); - it != e; ++it) { - PHINode *RdxPhi = *it; - PHINode *VecRdxPhi = dyn_cast(WidenMap[RdxPhi]); - assert(RdxPhi && "Unable to recover vectorized PHI"); - - // Find the reduction variable descriptor. - assert(Legal->getReductionVars()->count(RdxPhi) && - "Unable to find the reduction variable"); - LoopVectorizationLegality::ReductionDescriptor RdxDesc = - (*Legal->getReductionVars())[RdxPhi]; - - // We need to generate a reduction vector from the incoming scalar. - // To do so, we need to generate the 'identity' vector and overide - // one of the elements with the incoming scalar reduction. We need - // to do it in the vector-loop preheader. - Builder.SetInsertPoint(LoopBypassBlock->getTerminator()); - - // This is the vector-clone of the value that leaves the loop. - Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr); - Type *VecTy = VectorExit->getType(); - - // Find the reduction identity variable. Zero for addition, or, xor, - // one for multiplication, -1 for And. - Constant *Identity = getUniformVector(getReductionIdentity(RdxDesc.Kind), - VecTy->getScalarType()); - - // This vector is the Identity vector where the first element is the - // incoming scalar reduction. - Value *VectorStart = Builder.CreateInsertElement(Identity, - RdxDesc.StartValue, Zero); - - // Fix the vector-loop phi. - // We created the induction variable so we know that the - // preheader is the first entry. - BasicBlock *VecPreheader = Induction->getIncomingBlock(0); - - // Reductions do not have to start at zero. They can start with - // any loop invariant values. 
- VecRdxPhi->addIncoming(VectorStart, VecPreheader); - unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody); - Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx)); - VecRdxPhi->addIncoming(Val, LoopVectorBody); - - // Before each round, move the insertion point right between - // the PHIs and the values we are going to write. - // This allows us to write both PHINodes and the extractelement - // instructions. - Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); - - // This PHINode contains the vectorized reduction variable, or - // the initial value vector, if we bypass the vector loop. - PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); - NewPhi->addIncoming(VectorStart, LoopBypassBlock); - NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody); - - // Extract the first scalar. - Value *Scalar0 = - Builder.CreateExtractElement(NewPhi, Builder.getInt32(0)); - // Extract and reduce the remaining vector elements. - for (unsigned i=1; i < VF; ++i) { - Value *Scalar1 = - Builder.CreateExtractElement(NewPhi, Builder.getInt32(i)); - switch (RdxDesc.Kind) { - case LoopVectorizationLegality::IntegerAdd: - Scalar0 = Builder.CreateAdd(Scalar0, Scalar1); - break; - case LoopVectorizationLegality::IntegerMult: - Scalar0 = Builder.CreateMul(Scalar0, Scalar1); - break; - case LoopVectorizationLegality::IntegerOr: - Scalar0 = Builder.CreateOr(Scalar0, Scalar1); - break; - case LoopVectorizationLegality::IntegerAnd: - Scalar0 = Builder.CreateAnd(Scalar0, Scalar1); - break; - case LoopVectorizationLegality::IntegerXor: - Scalar0 = Builder.CreateXor(Scalar0, Scalar1); - break; - default: - llvm_unreachable("Unknown reduction operation"); - } - } - - // Now, we need to fix the users of the reduction variable - // inside and outside of the scalar remainder loop. - // We know that the loop is in LCSSA form. We need to update the - // PHI nodes in the exit blocks. 
- for (BasicBlock::iterator LEI = LoopExitBlock->begin(), - LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { - PHINode *LCSSAPhi = dyn_cast(LEI); - if (!LCSSAPhi) continue; - - // All PHINodes need to have a single entry edge, or two if - // we already fixed them. - assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); - - // We found our reduction value exit-PHI. Update it with the - // incoming bypass edge. - if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) { - // Add an edge coming from the bypass. - LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock); - break; - } - }// end of the LCSSA phi scan. - - // Fix the scalar loop reduction variable with the incoming reduction sum - // from the vector body and from the backedge value. - int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody); - int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block. - (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0); - (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr); - }// end of for each redux variable. } + void InnerLoopVectorizer::updateAnalysis() { // Forget the original basic block. SE->forgetLoop(OrigLoop); -- cgit v1.1 From e6f2df961065e3b723ef0fc78441f07c5577acd2 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Tue, 4 Dec 2012 07:11:52 +0000 Subject: Give scalar if-converted blocks half the score because they are not always executed due to CF. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169223 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index f538e08..1de5b30 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2159,17 +2159,17 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { // For each instruction in the old loop. for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { - unsigned C = getInstructionCost(it, VF); Cost += C; DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF " << VF << " For instruction: "<< *it << "\n"); } - // TODO: if-converted blocks can have a high-nest level. We need to - // calculate the loop nest level and multiply the cost accordingly. - if (Legal->blockNeedsPredication(*bb)) - BlockCost *= 2; + // We assume that if-converted blocks have a 50% chance of being executed. + // When the code is scalar then some of the blocks are avoided due to CF. + // When the code is vectorized we execute all code paths. + if (Legal->blockNeedsPredication(*bb) && VF == 1) + BlockCost /= 2; Cost += BlockCost; } -- cgit v1.1 From f6088d126e9110180f7db231bb58da9230a09537 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Tue, 4 Dec 2012 18:17:33 +0000 Subject: Add support for reduction variables when IF-conversion is enabled. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169288 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 33 +++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 10 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 1de5b30..3502e9e 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1133,8 +1133,8 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // Reductions do not have to start at zero. They can start with // any loop invariant values. VecRdxPhi->addIncoming(VectorStart, VecPreheader); - unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody); - Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx)); + Value *Val = + getVectorValue(RdxPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch())); VecRdxPhi->addIncoming(Val, LoopVectorBody); // Before each round, move the insertion point right between @@ -1201,8 +1201,11 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // Fix the scalar loop reduction variable with the incoming reduction sum // from the vector body and from the backedge value. - int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody); - int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block. + int IncomingEdgeBlockIdx = + (RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch()); + assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); + // Pick the other block. + int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0); (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr); }// end of for each redux variable. 
@@ -1961,11 +1964,13 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, if (Phi->getNumIncomingValues() != 2) return false; - // Find the possible incoming reduction variable. - BasicBlock *BB = Phi->getParent(); - int SelfEdgeIdx = Phi->getBasicBlockIndex(BB); - int InEdgeBlockIdx = (SelfEdgeIdx ? 0 : 1); // The other entry. - Value *RdxStart = Phi->getIncomingValue(InEdgeBlockIdx); + // Reduction variables are only found in the loop header block. + if (Phi->getParent() != TheLoop->getHeader()) + return false; + + // Obtain the reduction start value from the value that comes from the loop + // preheader. + Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader()); // ExitInstruction is the single value which is used outside the loop. // We only allow for a single reduction value to be used outside the loop. @@ -2003,9 +2008,17 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, FoundStartPHI = true; continue; } + + // We allow in-loop PHINodes which are not the original reduction PHI + // node. If this PHI is the only user of Iter (happens in IF w/ no ELSE + // structure) then don't skip this PHI. + if (isa(U) && U->getParent() != TheLoop->getHeader() && + TheLoop->contains(U->getParent()) && Iter->getNumUses() > 1) + continue; + // Check if we found the exit user. BasicBlock *Parent = U->getParent(); - if (Parent != BB) { + if (!TheLoop->contains(Parent)) { // We must have a single exit instruction. if (ExitInstruction != 0) return false; -- cgit v1.1 From e570dee4b03cca54bbf27a7f7a3299c5cdc3d087 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Tue, 4 Dec 2012 22:40:22 +0000 Subject: Fix a bug in vectorization of if-converted reduction variables. If the reduction variable is not used outside the loop then we ran into an endless loop. This change checks if we found the original PHI. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169324 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 34 ++++++++++++++++++------------ 1 file changed, 20 insertions(+), 14 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 3502e9e..ac62b11 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1985,20 +1985,20 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, // Also, we can't have multiple block-local users. Instruction *Iter = Phi; while (true) { + // If the instruction has no users then this is a broken + // chain and can't be a reduction variable. + if (Iter->use_empty()) + return false; + // Any reduction instr must be of one of the allowed kinds. if (!isReductionInstr(Iter, Kind)) return false; - // Did we found a user inside this block ? + // Did we find a user inside this block ? bool FoundInBlockUser = false; // Did we reach the initial PHI node ? bool FoundStartPHI = false; - // If the instruction has no users then this is a broken - // chain and can't be a reduction variable. - if (Iter->use_empty()) - return false; - // For each of the *users* of iter. for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end(); it != e; ++it) { @@ -2009,21 +2009,22 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, continue; } - // We allow in-loop PHINodes which are not the original reduction PHI - // node. If this PHI is the only user of Iter (happens in IF w/ no ELSE - // structure) then don't skip this PHI. - if (isa(U) && U->getParent() != TheLoop->getHeader() && - TheLoop->contains(U->getParent()) && Iter->getNumUses() > 1) - continue; - // Check if we found the exit user. BasicBlock *Parent = U->getParent(); if (!TheLoop->contains(Parent)) { - // We must have a single exit instruction. 
+ // Exit if you find multiple outside users. if (ExitInstruction != 0) return false; ExitInstruction = Iter; } + + // We allow in-loop PHINodes which are not the original reduction PHI + // node. If this PHI is the only user of Iter (happens in IF w/ no ELSE + // structure) then don't skip this PHI. + if (isa(U) && U->getParent() != TheLoop->getHeader() && + TheLoop->contains(U) && Iter->getNumUses() > 1) + continue; + // We can't have multiple inside users. if (FoundInBlockUser) return false; @@ -2043,6 +2044,11 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, Reductions[Phi] = RD; return true; } + + // If we've reached the start PHI but did not find an outside user then + // this is dead code. Abort. + if (FoundStartPHI) + return false; } } -- cgit v1.1 From dc5eff5b98aba0fe0e8104dfa7429c9a759684c2 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Tue, 4 Dec 2012 22:59:52 +0000 Subject: Enable if-conversion during vectorization. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169331 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index ac62b11..20e073b 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -82,7 +82,7 @@ VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, cl::desc("Set the default vectorization width. Zero is autoselect.")); static cl::opt -EnableIfConversion("enable-if-conversion", cl::init(false), cl::Hidden, +EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, cl::desc("Enable if-conversion during vectorization.")); /// We don't vectorize loops with a known constant trip count below this number. 
-- cgit v1.1 From 46c5f79789947b75ff3b0cc107f133fb0c5ffb5e Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Tue, 4 Dec 2012 23:25:24 +0000 Subject: LoopVectorizer: Increase the number of pointers that can be tested at runtime. If we cant prove statically that the pointers are disjoint then we add the runtime check. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169334 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 20e073b..166ad33 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -90,7 +90,7 @@ const unsigned TinyTripCountThreshold = 16; /// When performing a runtime memory check, do not check more than this /// number of pointers. Notice that the check is quadratic! -const unsigned RuntimeMemoryCheckThreshold = 2; +const unsigned RuntimeMemoryCheckThreshold = 4; /// This is the highest vector width that we try to generate. const unsigned MaxVectorSize = 8; -- cgit v1.1 From b48fdbc811b0bf7dec0780b4d03169d8ee835d03 Mon Sep 17 00:00:00 2001 From: Paul Redmond Date: Sun, 9 Dec 2012 19:46:31 +0000 Subject: test commit. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169709 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 166ad33..66ac0b4 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -139,7 +139,7 @@ private: typedef SmallVector PhiVector; /// Add code that checks at runtime if the accessed arrays overlap. 
- /// Returns the comperator value or NULL if no check is needed. + /// Returns the comparator value or NULL if no check is needed. Value *addRuntimeCheck(LoopVectorizationLegality *Legal, Instruction *Loc); /// Create an empty loop, based on the loop ranges of the old loop. -- cgit v1.1 From 880166684e5af0f5b4bfe26870b9f7813e537354 Mon Sep 17 00:00:00 2001 From: Paul Redmond Date: Sun, 9 Dec 2012 20:42:17 +0000 Subject: LoopVectorize: support vectorizing intrinsic calls - added function to VectorTargetTransformInfo to query cost of intrinsics - vectorize trivially vectorizable intrinsic calls such as sin, cos, log, etc. Reviewed by: Nadav git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169711 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 57 ++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 2 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 66ac0b4..c93c2bf 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -62,6 +62,7 @@ #include "llvm/DerivedTypes.h" #include "llvm/Function.h" #include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" #include "llvm/LLVMContext.h" #include "llvm/Module.h" #include "llvm/Pass.h" @@ -1051,6 +1052,35 @@ getReductionIdentity(LoopVectorizationLegality::ReductionKind K) { } } +static bool +isTriviallyVectorizableIntrinsic(Instruction *Inst) { + IntrinsicInst *II = dyn_cast(Inst); + if (!II) + return false; + switch (II->getIntrinsicID()) { + case Intrinsic::sqrt: + case Intrinsic::sin: + case Intrinsic::cos: + case Intrinsic::exp: + case Intrinsic::exp2: + case Intrinsic::log: + case Intrinsic::log10: + case Intrinsic::log2: + case Intrinsic::fabs: + case Intrinsic::floor: + case Intrinsic::ceil: + case Intrinsic::trunc: + case Intrinsic::rint: + case Intrinsic::nearbyint: + case 
Intrinsic::pow: + case Intrinsic::fma: + return true; + default: + return false; + } + return false; +} + void InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { //===------------------------------------------------===// @@ -1509,8 +1539,22 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, break; } + case Instruction::Call: { + assert(isTriviallyVectorizableIntrinsic(it)); + Module *M = BB->getParent()->getParent(); + IntrinsicInst *II = cast(it); + Intrinsic::ID ID = II->getIntrinsicID(); + SmallVector Args; + for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i) + Args.push_back(getVectorValue(II->getArgOperand(i))); + Type *Tys[] = { VectorType::get(II->getType()->getScalarType(), VF) }; + Function *F = Intrinsic::getDeclaration(M, ID, Tys); + WidenMap[it] = Builder.CreateCall(F, Args); + break; + } + default: - /// All other instructions are unsupported. Scalarize them. + // All other instructions are unsupported. Scalarize them. scalarizeInstruction(it); break; }// end of switch. @@ -1706,7 +1750,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // We still don't handle functions. CallInst *CI = dyn_cast(it); - if (CI) { + if (CI && !isTriviallyVectorizableIntrinsic(it)) { DEBUG(dbgs() << "LV: Found a call site.\n"); return false; } @@ -2326,6 +2370,15 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF); return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); } + case Instruction::Call: { + assert(isTriviallyVectorizableIntrinsic(I)); + IntrinsicInst *II = cast(I); + Type *RetTy = ToVectorTy(II->getType(), VF); + SmallVector Tys; + for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i) + Tys.push_back(ToVectorTy(II->getArgOperand(i)->getType(), VF)); + return VTTI->getIntrinsicInstrCost(II->getIntrinsicID(), RetTy, Tys); + } default: { // We are scalarizing the instruction. 
Return the cost of the scalar // instruction, plus the cost of insert and extract into vector -- cgit v1.1 From f0d19bd1291ee1d2ffee4bbe0aef12b814aff789 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Mon, 10 Dec 2012 19:25:06 +0000 Subject: Add support for reverse induction variables. For example: while (i--) sum+=A[i]; git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169752 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 281 ++++++++++++++++++++--------- 1 file changed, 191 insertions(+), 90 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index c93c2bf..593fb79 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -175,9 +175,9 @@ private: /// element. Value *getBroadcastInstrs(Value *V); - /// This is a helper function used by getBroadcastInstrs. It adds 0, 1, 2 .. - /// for each element in the vector. Starting from zero. - Value *getConsecutiveVector(Value* Val); + /// This function adds 0, 1, 2 ... to each vector element, starting at zero. + /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...). + Value *getConsecutiveVector(Value* Val, bool Negate = false); /// When we go over instructions in the basic block we rely on previous /// values within the current basic block or on loop invariant values. @@ -252,7 +252,7 @@ public: DominatorTree *Dt): TheLoop(Lp), SE(Se), DL(Dl), DT(Dt), Induction(0) { } - /// This represents the kinds of reductions that we support. + /// This enum represents the kinds of reductions that we support. enum ReductionKind { NoReduction, /// Not a reduction. IntegerAdd, /// Sum of numbers. @@ -262,6 +262,14 @@ public: IntegerXor /// Bitwise or logical XOR of numbers. }; + /// This enum represents the kinds of inductions that we support. 
+ enum InductionKind { + NoInduction, /// Not an induction variable. + IntInduction, /// Integer induction variable. Step = 1. + ReverseIntInduction, /// Reverse int induction variable. Step = -1. + PtrInduction /// Pointer induction variable. Step = sizeof(elem). + }; + /// This POD struct holds information about reduction variables. struct ReductionDescriptor { // Default C'tor @@ -316,13 +324,25 @@ public: SmallVector Ends; }; + /// A POD for saving information about induction variables. + struct InductionInfo { + /// Ctors. + InductionInfo(Value *Start, InductionKind K): + StartValue(Start), IK(K) {}; + InductionInfo(): StartValue(0), IK(NoInduction) {}; + /// Start value. + Value *StartValue; + /// Induction kind. + InductionKind IK; + }; + /// ReductionList contains the reduction descriptors for all /// of the reductions that were found in the loop. typedef DenseMap ReductionList; - /// InductionList saves induction variables and maps them to the initial - /// value entring the loop. - typedef DenseMap InductionList; + /// InductionList saves induction variables and maps them to the + /// induction descriptor. + typedef DenseMap InductionList; /// Returns true if it is legal to vectorize this loop. /// This does not mean that it is profitable to vectorize this @@ -385,8 +405,9 @@ private: /// Returns true if the instruction I can be a reduction variable of type /// 'Kind'. bool isReductionInstr(Instruction *I, ReductionKind Kind); - /// Returns True, if 'Phi' is an induction variable. - bool isInductionVariable(PHINode *Phi); + /// Returns the induction kind of Phi. This function may return NoInduction + /// if the PHI is not an induction variable. + InductionKind isInductionVariable(PHINode *Phi); /// Return true if can compute the address bounds of Ptr within the loop. 
bool hasComputableBounds(Value *Ptr); @@ -558,7 +579,9 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { Instruction *Loc = Builder.GetInsertPoint(); // We need to place the broadcast of invariant variables outside the loop. - bool Invariant = (OrigLoop->isLoopInvariant(V) && V != Induction); + Instruction *Instr = dyn_cast(V); + bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody); + bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr; // Place the code for broadcasting invariant variables in the new preheader. if (Invariant) @@ -580,19 +603,19 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { return Shuf; } -Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val) { +Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, bool Negate) { assert(Val->getType()->isVectorTy() && "Must be a vector"); assert(Val->getType()->getScalarType()->isIntegerTy() && "Elem must be an integer"); // Create the types. Type *ITy = Val->getType()->getScalarType(); VectorType *Ty = cast(Val->getType()); - unsigned VLen = Ty->getNumElements(); + int VLen = Ty->getNumElements(); SmallVector Indices; // Create a vector of consecutive numbers from zero to VF. - for (unsigned i = 0; i < VLen; ++i) - Indices.push_back(ConstantInt::get(ITy, i)); + for (int i = 0; i < VLen; ++i) + Indices.push_back(ConstantInt::get(ITy, Negate ? (-i): i )); // Add the consecutive indices to the vector value. Constant *Cv = ConstantVector::get(Indices); @@ -603,10 +626,13 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val) { bool LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr"); - // If this pointer is an induction variable, return it. + // If this value is a pointer induction variable we know it is consecutive. 
PHINode *Phi = dyn_cast_or_null(Ptr); - if (Phi && getInductionVars()->count(Phi)) - return true; + if (Phi && Inductions.count(Phi)) { + InductionInfo II = Inductions[Phi]; + if (PtrInduction == II.IK) + return true; + } GetElementPtrInst *Gep = dyn_cast_or_null(Ptr); if (!Gep) @@ -730,7 +756,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) { Value* InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, - Instruction *Loc) { + Instruction *Loc) { LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck = Legal->getRuntimePointerCheck(); @@ -745,7 +771,7 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, SCEVExpander Exp(*SE, "induction"); // Use this type for pointer arithmetic. - Type* PtrArithTy = PtrRtCheck->Pointers[0]->getType(); + Type* PtrArithTy = Type::getInt8PtrTy(Loc->getContext(), 0); for (unsigned i = 0; i < NumPointers; ++i) { Value *Ptr = PtrRtCheck->Pointers[i]; @@ -759,8 +785,7 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, } else { DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n"); - Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], - PtrArithTy, Loc); + Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc); Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc); Starts.push_back(Start); Ends.push_back(End); @@ -769,10 +794,16 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, for (unsigned i = 0; i < NumPointers; ++i) { for (unsigned j = i+1; j < NumPointers; ++j) { + Instruction::CastOps Op = Instruction::BitCast; + Value *Start0 = CastInst::Create(Op, Starts[i], PtrArithTy, "bc", Loc); + Value *Start1 = CastInst::Create(Op, Starts[j], PtrArithTy, "bc", Loc); + Value *End0 = CastInst::Create(Op, Ends[i], PtrArithTy, "bc", Loc); + Value *End1 = CastInst::Create(Op, Ends[j], PtrArithTy, "bc", Loc); + Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, - Starts[i], 
Ends[j], "bound0", Loc); + Start0, End1, "bound0", Loc); Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, - Starts[j], Ends[i], "bound1", Loc); + Start1, End0, "bound1", Loc); Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1, "found.conflict", Loc); if (MemoryRuntimeCheck) @@ -936,27 +967,54 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); for (I = List->begin(), E = List->end(); I != E; ++I) { PHINode *OrigPhi = I->first; + LoopVectorizationLegality::InductionInfo II = I->second; PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val", MiddleBlock->getTerminator()); Value *EndValue = 0; - if (OrigPhi->getType()->isIntegerTy()) { + switch (II.IK) { + case LoopVectorizationLegality::NoInduction: + llvm_unreachable("Unknown induction"); + case LoopVectorizationLegality::IntInduction: { // Handle the integer induction counter: + assert(OrigPhi->getType()->isIntegerTy() && "Invalid type"); assert(OrigPhi == OldInduction && "Unknown integer PHI"); // We know what the end value is. EndValue = IdxEndRoundDown; // We also know which PHI node holds it. ResumeIndex = ResumeVal; - } else { + break; + } + case LoopVectorizationLegality::ReverseIntInduction: { + // Convert the CountRoundDown variable to the PHI size. 
+ unsigned CRDSize = CountRoundDown->getType()->getScalarSizeInBits(); + unsigned IISize = II.StartValue->getType()->getScalarSizeInBits(); + Value *CRD = CountRoundDown; + if (CRDSize > IISize) + CRD = CastInst::Create(Instruction::Trunc, CountRoundDown, + II.StartValue->getType(), + "tr.crd", BypassBlock->getTerminator()); + else if (CRDSize < IISize) + CRD = CastInst::Create(Instruction::SExt, CountRoundDown, + II.StartValue->getType(), + "sext.crd", BypassBlock->getTerminator()); + // Handle reverse integer induction counter: + EndValue = BinaryOperator::CreateSub(II.StartValue, CRD, "rev.ind.end", + BypassBlock->getTerminator()); + break; + } + case LoopVectorizationLegality::PtrInduction: { // For pointer induction variables, calculate the offset using // the end index. - EndValue = GetElementPtrInst::Create(I->second, CountRoundDown, + EndValue = GetElementPtrInst::Create(II.StartValue, CountRoundDown, "ptr.ind.end", BypassBlock->getTerminator()); + break; } + }// end of case // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. - ResumeVal->addIncoming(I->second, BypassBlock); + ResumeVal->addIncoming(II.StartValue, BypassBlock); ResumeVal->addIncoming(EndValue, VecBody); // Fix the scalar body counter (PHI node). 
@@ -1188,19 +1246,19 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { Builder.CreateExtractElement(NewPhi, Builder.getInt32(i)); switch (RdxDesc.Kind) { case LoopVectorizationLegality::IntegerAdd: - Scalar0 = Builder.CreateAdd(Scalar0, Scalar1); + Scalar0 = Builder.CreateAdd(Scalar0, Scalar1, "add.rdx"); break; case LoopVectorizationLegality::IntegerMult: - Scalar0 = Builder.CreateMul(Scalar0, Scalar1); + Scalar0 = Builder.CreateMul(Scalar0, Scalar1, "mul.rdx"); break; case LoopVectorizationLegality::IntegerOr: - Scalar0 = Builder.CreateOr(Scalar0, Scalar1); + Scalar0 = Builder.CreateOr(Scalar0, Scalar1, "or.rdx"); break; case LoopVectorizationLegality::IntegerAnd: - Scalar0 = Builder.CreateAnd(Scalar0, Scalar1); + Scalar0 = Builder.CreateAnd(Scalar0, Scalar1, "and.rdx"); break; case LoopVectorizationLegality::IntegerXor: - Scalar0 = Builder.CreateXor(Scalar0, Scalar1); + Scalar0 = Builder.CreateXor(Scalar0, Scalar1, "xor.rdx"); break; default: llvm_unreachable("Unknown reduction operation"); @@ -1282,7 +1340,7 @@ Value *InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { void InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, - BasicBlock *BB, PhiVector *PV) { + BasicBlock *BB, PhiVector *PV) { Constant *Zero = ConstantInt::get(IntegerType::getInt32Ty(BB->getContext()), 0); @@ -1329,45 +1387,77 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, assert(Legal->getInductionVars()->count(P) && "Not an induction variable"); - if (P->getType()->isIntegerTy()) { + LoopVectorizationLegality::InductionInfo II = + Legal->getInductionVars()->lookup(P); + + switch (II.IK) { + case LoopVectorizationLegality::NoInduction: + llvm_unreachable("Unknown induction"); + case LoopVectorizationLegality::IntInduction: { assert(P == OldInduction && "Unexpected PHI"); Value *Broadcasted = getBroadcastInstrs(Induction); // After broadcasting the induction variable we need to make the // vector consecutive by 
adding 0, 1, 2 ... Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted); - WidenMap[OldInduction] = ConsecutiveInduction; continue; } + case LoopVectorizationLegality::ReverseIntInduction: + case LoopVectorizationLegality::PtrInduction: + // Handle reverse integer and pointer inductions. + Value *StartIdx = 0; + // If we have a single integer induction variable then use it. + // Otherwise, start counting at zero. + if (OldInduction) { + LoopVectorizationLegality::InductionInfo OldII = + Legal->getInductionVars()->lookup(OldInduction); + StartIdx = OldII.StartValue; + } else { + StartIdx = ConstantInt::get(Induction->getType(), 0); + } + // This is the normalized GEP that starts counting at zero. + Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx, + "normalized.idx"); + + // Handle the reverse integer induction variable case. + if (LoopVectorizationLegality::ReverseIntInduction == II.IK) { + IntegerType *DstTy = cast(II.StartValue->getType()); + Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy, + "resize.norm.idx"); + Value *ReverseInd = Builder.CreateSub(II.StartValue, CNI, + "reverse.idx"); + + // This is a new value so do not hoist it out. + Value *Broadcasted = getBroadcastInstrs(ReverseInd); + // After broadcasting the induction variable we need to make the + // vector consecutive by adding ... -3, -2, -1, 0. + Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted, + true); + WidenMap[it] = ConsecutiveInduction; + continue; + } - // Handle pointer inductions. - assert(P->getType()->isPointerTy() && "Unexpected type."); - Value *StartIdx = OldInduction ? - Legal->getInductionVars()->lookup(OldInduction) : - ConstantInt::get(Induction->getType(), 0); - - // This is the pointer value coming into the loop. - Value *StartPtr = Legal->getInductionVars()->lookup(P); - - // This is the normalized GEP that starts counting at zero. 
- Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx, - "normalized.idx"); - - // This is the vector of results. Notice that we don't generate vector - // geps because scalar geps result in better code. - Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); - for (unsigned int i = 0; i < VF; ++i) { - Constant *Idx = ConstantInt::get(Induction->getType(), i); - Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx"); - Value *SclrGep = Builder.CreateGEP(StartPtr, GlobalIdx, "next.gep"); - VecVal = Builder.CreateInsertElement(VecVal, SclrGep, - Builder.getInt32(i), - "insert.gep"); + // Handle the pointer induction variable case. + assert(P->getType()->isPointerTy() && "Unexpected type."); + + // This is the vector of results. Notice that we don't generate vector + // geps because scalar geps result in better code. + Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); + for (unsigned int i = 0; i < VF; ++i) { + Constant *Idx = ConstantInt::get(Induction->getType(), i); + Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx"); + Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, "next.gep"); + VecVal = Builder.CreateInsertElement(VecVal, SclrGep, + Builder.getInt32(i), + "insert.gep"); + } + + WidenMap[it] = VecVal; + continue; } - WidenMap[it] = VecVal; - continue; - } + }// End of PHI. + case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: @@ -1561,7 +1651,6 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, }// end of for_each instr. } - void InnerLoopVectorizer::updateAnalysis() { // Forget the original basic block. 
SE->forgetLoop(OrigLoop); @@ -1580,7 +1669,6 @@ void InnerLoopVectorizer::updateAnalysis() { DEBUG(DT->verifyAnalysis()); } - bool LoopVectorizationLegality::canVectorizeWithIfConvert() { if (!EnableIfConversion) return false; @@ -1694,35 +1782,39 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { return false; } + // Check that this PHI type is allowed. + if (!Phi->getType()->isIntegerTy() && + !Phi->getType()->isPointerTy()) { + DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); + return false; + } + // If this PHINode is not in the header block, then we know that we - // can convert it to select during if-conversion. + // can convert it to select during if-conversion. No need to check if + // the PHIs in this block are induction or reduction variables. if (*bb != Header) continue; // This is the value coming from the preheader. Value *StartValue = Phi->getIncomingValueForBlock(PreHeader); + // Check if this is an induction variable. + InductionKind IK = isInductionVariable(Phi); + + if (NoInduction != IK) { + // Int inductions are special because we only allow one IV. + if (IK == IntInduction) { + if (Induction) { + DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n"); + return false; + } + Induction = Phi; + } - // We only look at integer and pointer phi nodes. 
- if (Phi->getType()->isPointerTy() && isInductionVariable(Phi)) { - DEBUG(dbgs() << "LV: Found a pointer induction variable.\n"); - Inductions[Phi] = StartValue; + DEBUG(dbgs() << "LV: Found an induction variable.\n"); + Inductions[Phi] = InductionInfo(StartValue, IK); continue; - } else if (!Phi->getType()->isIntegerTy()) { - DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); - return false; } - // Handle integer PHIs: - if (isInductionVariable(Phi)) { - if (Induction) { - DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n"); - return false; - } - DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n"); - Induction = Phi; - Inductions[Phi] = StartValue; - continue; - } if (AddReductionVar(Phi, IntegerAdd)) { DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n"); continue; @@ -2119,32 +2211,42 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I, } } -bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { +LoopVectorizationLegality::InductionKind +LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { Type *PhiTy = Phi->getType(); // We only handle integer and pointer inductions variables. if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy()) - return false; + return NoInduction; // Check that the PHI is consecutive and starts at zero. const SCEV *PhiScev = SE->getSCEV(Phi); const SCEVAddRecExpr *AR = dyn_cast(PhiScev); if (!AR) { DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n"); - return false; + return NoInduction; } const SCEV *Step = AR->getStepRecurrence(*SE); // Integer inductions need to have a stride of one. - if (PhiTy->isIntegerTy()) - return Step->isOne(); + if (PhiTy->isIntegerTy()) { + if (Step->isOne()) + return IntInduction; + if (Step->isAllOnesValue()) + return ReverseIntInduction; + return NoInduction; + } // Calculate the pointer stride and check if it is consecutive. 
const SCEVConstant *C = dyn_cast(Step); - if (!C) return false; + if (!C) + return NoInduction; assert(PhiTy->isPointerTy() && "The PHI must be a pointer"); uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType()); - return (C->getValue()->equalsInt(Size)); + if (C->getValue()->equalsInt(Size)) + return PtrInduction; + + return NoInduction; } bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { @@ -2252,7 +2354,6 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { Type *RetTy = I->getType(); Type *VectorTy = ToVectorTy(RetTy, VF); - // TODO: We need to estimate the cost of intrinsic calls. switch (I->getOpcode()) { case Instruction::GetElementPtr: -- cgit v1.1 From d1d92bf953d51560e979337cadcc9d7e62fdd79e Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Mon, 10 Dec 2012 21:39:02 +0000 Subject: Split the LoopVectorizer into H and CPP. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169771 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 1486 ++++++++++------------------ 1 file changed, 535 insertions(+), 951 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 593fb79..feeecec 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6,45 +6,7 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops -// and generates target-independent LLVM-IR. Legalization of the IR is done -// in the codegen. However, the vectorizes uses (will use) the codegen -// interfaces to generate IR that is likely to result in an optimal binary. -// -// The loop vectorizer combines consecutive loop iteration into a single -// 'wide' iteration. 
After this transformation the index is incremented -// by the SIMD vector width, and not by one. -// -// This pass has three parts: -// 1. The main loop pass that drives the different parts. -// 2. LoopVectorizationLegality - A unit that checks for the legality -// of the vectorization. -// 3. InnerLoopVectorizer - A unit that performs the actual -// widening of instructions. -// 4. LoopVectorizationCostModel - A unit that checks for the profitability -// of vectorization. It decides on the optimal vector width, which -// can be one, if vectorization is not profitable. -// -//===----------------------------------------------------------------------===// -// -// The reduction-variable vectorization is based on the paper: -// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. -// -// Variable uniformity checks are inspired by: -// Karrenberg, R. and Hack, S. Whole Function Vectorization. -// -// Other ideas/concepts are from: -// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. -// -// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of -// Vectorizing Compilers. 
-// -//===----------------------------------------------------------------------===// -#define LV_NAME "loop-vectorize" -#define DEBUG_TYPE LV_NAME -#include "llvm/Transforms/Vectorize.h" -#include "llvm/ADT/SmallVector.h" +#include "LoopVectorize.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" @@ -52,7 +14,7 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ValueTracking.h" @@ -73,423 +35,21 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Vectorize.h" #include "llvm/Type.h" #include "llvm/Value.h" -#include -using namespace llvm; static cl::opt VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, - cl::desc("Set the default vectorization width. Zero is autoselect.")); + cl::desc("Sets the SIMD width. Zero is autoselect.")); static cl::opt EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, cl::desc("Enable if-conversion during vectorization.")); -/// We don't vectorize loops with a known constant trip count below this number. -const unsigned TinyTripCountThreshold = 16; - -/// When performing a runtime memory check, do not check more than this -/// number of pointers. Notice that the check is quadratic! -const unsigned RuntimeMemoryCheckThreshold = 4; - -/// This is the highest vector width that we try to generate. -const unsigned MaxVectorSize = 8; - namespace { -// Forward declarations. 
-class LoopVectorizationLegality; -class LoopVectorizationCostModel; - -/// InnerLoopVectorizer vectorizes loops which contain only one basic -/// block to a specified vectorization factor (VF). -/// This class performs the widening of scalars into vectors, or multiple -/// scalars. This class also implements the following features: -/// * It inserts an epilogue loop for handling loops that don't have iteration -/// counts that are known to be a multiple of the vectorization factor. -/// * It handles the code generation for reduction variables. -/// * Scalarization (implementation using scalars) of un-vectorizable -/// instructions. -/// InnerLoopVectorizer does not perform any vectorization-legality -/// checks, and relies on the caller to check for the different legality -/// aspects. The InnerLoopVectorizer relies on the -/// LoopVectorizationLegality class to provide information about the induction -/// and reduction variables that were found to a given vectorization factor. -class InnerLoopVectorizer { -public: - /// Ctor. - InnerLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li, - DominatorTree *Dt, DataLayout *Dl, unsigned VecWidth): - OrigLoop(Orig), SE(Se), LI(Li), DT(Dt), DL(Dl), VF(VecWidth), - Builder(Se->getContext()), Induction(0), OldInduction(0) { } - - // Perform the actual loop widening (vectorization). - void vectorize(LoopVectorizationLegality *Legal) { - // Create a new empty loop. Unlink the old loop and connect the new one. - createEmptyLoop(Legal); - // Widen each instruction in the old loop to a new one in the new loop. - // Use the Legality module to find the induction and reduction variables. - vectorizeLoop(Legal); - // Register the new loop and update the analysis passes. - updateAnalysis(); - } - -private: - /// A small list of PHINodes. - typedef SmallVector PhiVector; - - /// Add code that checks at runtime if the accessed arrays overlap. - /// Returns the comparator value or NULL if no check is needed. 
- Value *addRuntimeCheck(LoopVectorizationLegality *Legal, - Instruction *Loc); - /// Create an empty loop, based on the loop ranges of the old loop. - void createEmptyLoop(LoopVectorizationLegality *Legal); - /// Copy and widen the instructions from the old loop. - void vectorizeLoop(LoopVectorizationLegality *Legal); - - /// A helper function that computes the predicate of the block BB, assuming - /// that the header block of the loop is set to True. It returns the *entry* - /// mask for the block BB. - Value *createBlockInMask(BasicBlock *BB); - /// A helper function that computes the predicate of the edge between SRC - /// and DST. - Value *createEdgeMask(BasicBlock *Src, BasicBlock *Dst); - - /// A helper function to vectorize a single BB within the innermost loop. - void vectorizeBlockInLoop(LoopVectorizationLegality *Legal, BasicBlock *BB, - PhiVector *PV); - - /// Insert the new loop to the loop hierarchy and pass manager - /// and update the analysis passes. - void updateAnalysis(); - - /// This instruction is un-vectorizable. Implement it as a sequence - /// of scalars. - void scalarizeInstruction(Instruction *Instr); - - /// Create a broadcast instruction. This method generates a broadcast - /// instruction (shuffle) for loop invariant values and for the induction - /// value. If this is the induction variable then we extend it to N, N+1, ... - /// this is needed because each iteration in the loop corresponds to a SIMD - /// element. - Value *getBroadcastInstrs(Value *V); - - /// This function adds 0, 1, 2 ... to each vector element, starting at zero. - /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...). - Value *getConsecutiveVector(Value* Val, bool Negate = false); - - /// When we go over instructions in the basic block we rely on previous - /// values within the current basic block or on loop invariant values. - /// When we widen (vectorize) values we place them in the map. 
If the values - /// are not within the map, they have to be loop invariant, so we simply - /// broadcast them into a vector. - Value *getVectorValue(Value *V); - - /// Get a uniform vector of constant integers. We use this to get - /// vectors of ones and zeros for the reduction code. - Constant* getUniformVector(unsigned Val, Type* ScalarTy); - - typedef DenseMap ValueMap; - - /// The original loop. - Loop *OrigLoop; - // Scev analysis to use. - ScalarEvolution *SE; - // Loop Info. - LoopInfo *LI; - // Dominator Tree. - DominatorTree *DT; - // Data Layout. - DataLayout *DL; - // The vectorization factor to use. - unsigned VF; - - // The builder that we use - IRBuilder<> Builder; - - // --- Vectorization state --- - - /// The vector-loop preheader. - BasicBlock *LoopVectorPreHeader; - /// The scalar-loop preheader. - BasicBlock *LoopScalarPreHeader; - /// Middle Block between the vector and the scalar. - BasicBlock *LoopMiddleBlock; - ///The ExitBlock of the scalar loop. - BasicBlock *LoopExitBlock; - ///The vector loop body. - BasicBlock *LoopVectorBody; - ///The scalar loop body. - BasicBlock *LoopScalarBody; - ///The first bypass block. - BasicBlock *LoopBypassBlock; - - /// The new Induction variable which was added to the new block. - PHINode *Induction; - /// The induction variable of the old basic block. - PHINode *OldInduction; - // Maps scalars to widened vectors. - ValueMap WidenMap; -}; - -/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and -/// to what vectorization factor. -/// This class does not look at the profitability of vectorization, only the -/// legality. This class has two main kinds of checks: -/// * Memory checks - The code in canVectorizeMemory checks if vectorization -/// will change the order of memory accesses in a way that will change the -/// correctness of the program. 
-/// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory -/// checks for a number of different conditions, such as the availability of a -/// single induction variable, that all types are supported and vectorize-able, -/// etc. This code reflects the capabilities of InnerLoopVectorizer. -/// This class is also used by InnerLoopVectorizer for identifying -/// induction variable and the different reduction variables. -class LoopVectorizationLegality { -public: - LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl, - DominatorTree *Dt): - TheLoop(Lp), SE(Se), DL(Dl), DT(Dt), Induction(0) { } - - /// This enum represents the kinds of reductions that we support. - enum ReductionKind { - NoReduction, /// Not a reduction. - IntegerAdd, /// Sum of numbers. - IntegerMult, /// Product of numbers. - IntegerOr, /// Bitwise or logical OR of numbers. - IntegerAnd, /// Bitwise or logical AND of numbers. - IntegerXor /// Bitwise or logical XOR of numbers. - }; - - /// This enum represents the kinds of inductions that we support. - enum InductionKind { - NoInduction, /// Not an induction variable. - IntInduction, /// Integer induction variable. Step = 1. - ReverseIntInduction, /// Reverse int induction variable. Step = -1. - PtrInduction /// Pointer induction variable. Step = sizeof(elem). - }; - - /// This POD struct holds information about reduction variables. - struct ReductionDescriptor { - // Default C'tor - ReductionDescriptor(): - StartValue(0), LoopExitInstr(0), Kind(NoReduction) {} - - // C'tor. - ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K): - StartValue(Start), LoopExitInstr(Exit), Kind(K) {} - - // The starting value of the reduction. - // It does not have to be zero! - Value *StartValue; - // The instruction who's value is used outside the loop. - Instruction *LoopExitInstr; - // The kind of the reduction. 
- ReductionKind Kind; - }; - - // This POD struct holds information about the memory runtime legality - // check that a group of pointers do not overlap. - struct RuntimePointerCheck { - RuntimePointerCheck(): Need(false) {} - - /// Reset the state of the pointer runtime information. - void reset() { - Need = false; - Pointers.clear(); - Starts.clear(); - Ends.clear(); - } - - /// Insert a pointer and calculate the start and end SCEVs. - void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr) { - const SCEV *Sc = SE->getSCEV(Ptr); - const SCEVAddRecExpr *AR = dyn_cast(Sc); - assert(AR && "Invalid addrec expression"); - const SCEV *Ex = SE->getExitCount(Lp, Lp->getLoopLatch()); - const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE); - Pointers.push_back(Ptr); - Starts.push_back(AR->getStart()); - Ends.push_back(ScEnd); - } - - /// This flag indicates if we need to add the runtime check. - bool Need; - /// Holds the pointers that we need to check. - SmallVector Pointers; - /// Holds the pointer value at the beginning of the loop. - SmallVector Starts; - /// Holds the pointer value at the end of the loop. - SmallVector Ends; - }; - - /// A POD for saving information about induction variables. - struct InductionInfo { - /// Ctors. - InductionInfo(Value *Start, InductionKind K): - StartValue(Start), IK(K) {}; - InductionInfo(): StartValue(0), IK(NoInduction) {}; - /// Start value. - Value *StartValue; - /// Induction kind. - InductionKind IK; - }; - - /// ReductionList contains the reduction descriptors for all - /// of the reductions that were found in the loop. - typedef DenseMap ReductionList; - - /// InductionList saves induction variables and maps them to the - /// induction descriptor. - typedef DenseMap InductionList; - - /// Returns true if it is legal to vectorize this loop. - /// This does not mean that it is profitable to vectorize this - /// loop, only that it is legal to do so. - bool canVectorize(); - - /// Returns the Induction variable. 
- PHINode *getInduction() {return Induction;} - - /// Returns the reduction variables found in the loop. - ReductionList *getReductionVars() { return &Reductions; } - - /// Returns the induction variables found in the loop. - InductionList *getInductionVars() { return &Inductions; } - - /// Return true if the block BB needs to be predicated in order for the loop - /// to be vectorized. - bool blockNeedsPredication(BasicBlock *BB); - - /// Check if this pointer is consecutive when vectorizing. This happens - /// when the last index of the GEP is the induction variable, or that the - /// pointer itself is an induction variable. - /// This check allows us to vectorize A[idx] into a wide load/store. - bool isConsecutivePtr(Value *Ptr); - - /// Returns true if the value V is uniform within the loop. - bool isUniform(Value *V); - - /// Returns true if this instruction will remain scalar after vectorization. - bool isUniformAfterVectorization(Instruction* I) {return Uniforms.count(I);} - - /// Returns the information that we collected about runtime memory check. - RuntimePointerCheck *getRuntimePointerCheck() {return &PtrRtCheck; } -private: - /// Check if a single basic block loop is vectorizable. - /// At this point we know that this is a loop with a constant trip count - /// and we only need to check individual instructions. - bool canVectorizeInstrs(); - - /// When we vectorize loops we may change the order in which - /// we read and write from memory. This method checks if it is - /// legal to vectorize the code, considering only memory constrains. - /// Returns true if the loop is vectorizable - bool canVectorizeMemory(); - - /// Return true if we can vectorize this loop using the IF-conversion - /// transformation. - bool canVectorizeWithIfConvert(); - - /// Collect the variables that need to stay uniform after vectorization. - void collectLoopUniforms(); - - /// Return true if all of the instructions in the block can be speculatively - /// executed. 
- bool blockCanBePredicated(BasicBlock *BB); - - /// Returns True, if 'Phi' is the kind of reduction variable for type - /// 'Kind'. If this is a reduction variable, it adds it to ReductionList. - bool AddReductionVar(PHINode *Phi, ReductionKind Kind); - /// Returns true if the instruction I can be a reduction variable of type - /// 'Kind'. - bool isReductionInstr(Instruction *I, ReductionKind Kind); - /// Returns the induction kind of Phi. This function may return NoInduction - /// if the PHI is not an induction variable. - InductionKind isInductionVariable(PHINode *Phi); - /// Return true if can compute the address bounds of Ptr within the loop. - bool hasComputableBounds(Value *Ptr); - - /// The loop that we evaluate. - Loop *TheLoop; - /// Scev analysis. - ScalarEvolution *SE; - /// DataLayout analysis. - DataLayout *DL; - // Dominators. - DominatorTree *DT; - - // --- vectorization state --- // - - /// Holds the integer induction variable. This is the counter of the - /// loop. - PHINode *Induction; - /// Holds the reduction variables. - ReductionList Reductions; - /// Holds all of the induction variables that we found in the loop. - /// Notice that inductions don't need to start at zero and that induction - /// variables can be pointers. - InductionList Inductions; - - /// Allowed outside users. This holds the reduction - /// vars which can be accessed from outside the loop. - SmallPtrSet AllowedExit; - /// This set holds the variables which are known to be uniform after - /// vectorization. - SmallPtrSet Uniforms; - /// We need to check that all of the pointers in this list are disjoint - /// at runtime. - RuntimePointerCheck PtrRtCheck; -}; - -/// LoopVectorizationCostModel - estimates the expected speedups due to -/// vectorization. -/// In many cases vectorization is not profitable. This can happen because -/// of a number of reasons. In this class we mainly attempt to predict -/// the expected speedup/slowdowns due to the supported instruction set. 
-/// We use the VectorTargetTransformInfo to query the different backends -/// for the cost of different operations. -class LoopVectorizationCostModel { -public: - /// C'tor. - LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se, - LoopVectorizationLegality *Leg, - const VectorTargetTransformInfo *Vtti): - TheLoop(Lp), SE(Se), Legal(Leg), VTTI(Vtti) { } - - /// Returns the most profitable vectorization factor for the loop that is - /// smaller or equal to the VF argument. This method checks every power - /// of two up to VF. - unsigned findBestVectorizationFactor(unsigned VF = MaxVectorSize); - -private: - /// Returns the expected execution cost. The unit of the cost does - /// not matter because we use the 'cost' units to compare different - /// vector widths. The cost that is returned is *not* normalized by - /// the factor width. - unsigned expectedCost(unsigned VF); - - /// Returns the execution time cost of an instruction for a given vector - /// width. Vector width of one means scalar. - unsigned getInstructionCost(Instruction *I, unsigned VF); - - /// A helper function for converting Scalar types to vector types. - /// If the incoming type is void, we return void. If the VF is 1, we return - /// the scalar type. - static Type* ToVectorTy(Type *Scalar, unsigned VF); - - /// The loop that we evaluate. - Loop *TheLoop; - /// Scev analysis. - ScalarEvolution *SE; - - /// Vectorization legality. - LoopVectorizationLegality *Legal; - /// Vector target information. - const VectorTargetTransformInfo *VTTI; -}; - +/// The LoopVectorize Pass. struct LoopVectorize : public LoopPass { static char ID; // Pass identification, replacement for typeid @@ -569,6 +129,26 @@ struct LoopVectorize : public LoopPass { }; +}// namespace + +//===----------------------------------------------------------------------===// +// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and +// LoopVectorizationCostModel. 
+//===----------------------------------------------------------------------===// + +void +LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE, + Loop *Lp, Value *Ptr) { + const SCEV *Sc = SE->getSCEV(Ptr); + const SCEVAddRecExpr *AR = dyn_cast(Sc); + assert(AR && "Invalid addrec expression"); + const SCEV *Ex = SE->getExitCount(Lp, Lp->getLoopLatch()); + const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE); + Pointers.push_back(Ptr); + Starts.push_back(AR->getStart()); + Ends.push_back(ScEnd); +} + Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { // Create the types. LLVMContext &C = V->getContext(); @@ -594,7 +174,7 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero); // Broadcast the scalar into all locations in the vector. Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros, - "broadcast"); + "broadcast"); // Restore the builder insertion point. if (Invariant) @@ -758,7 +338,7 @@ Value* InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, Instruction *Loc) { LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck = - Legal->getRuntimePointerCheck(); + Legal->getRuntimePointerCheck(); if (!PtrRtCheck->Need) return NULL; @@ -827,26 +407,26 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { the vectorized instructions while the old loop will continue to run the scalar remainder. - [ ] <-- vector loop bypass. - / | - / v -| [ ] <-- vector pre header. -| | -| v -| [ ] \ -| [ ]_| <-- vector loop. -| | - \ v + [ ] <-- vector loop bypass. + / | + / v + | [ ] <-- vector pre header. + | | + | v + | [ ] \ + | [ ]_| <-- vector loop. + | | + \ v >[ ] <--- middle-block. - / | - / v -| [ ] <--- new preheader. -| | -| v -| [ ] \ -| [ ]_| <-- old scalar loop to handle remainder. - \ | - \ v + / | + / v + | [ ] <--- new preheader. + | | + | v + | [ ] \ + | [ ]_| <-- old scalar loop to handle remainder. 
+ \ | + \ v >[ ] <-- exit block. ... */ @@ -862,7 +442,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // don't have a single induction variable. OldInduction = Legal->getInduction(); Type *IdxTy = OldInduction ? OldInduction->getType() : - DL->getIntPtrType(SE->getContext()); + DL->getIntPtrType(SE->getContext()); // Find the loop boundaries. const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getLoopLatch()); @@ -884,8 +464,8 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // value from the induction PHI node. If we don't have an induction variable // then we know that it starts at zero. Value *StartIdx = OldInduction ? - OldInduction->getIncomingValueForBlock(BypassBlock): - ConstantInt::get(IdxTy, 0); + OldInduction->getIncomingValueForBlock(BypassBlock): + ConstantInt::get(IdxTy, 0); assert(BypassBlock && "Invalid loop structure"); @@ -895,13 +475,13 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // Split the single block loop into the two loop structure described above. BasicBlock *VectorPH = - BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph"); + BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph"); BasicBlock *VecBody = - VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); + VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); BasicBlock *MiddleBlock = - VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block"); + VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block"); BasicBlock *ScalarPH = - MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); + MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); // This is the location in which we add all of the logic for bypassing // the new vector loop. 
@@ -958,8 +538,8 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // PHIs that are left in the scalar version of the loop. // The starting values of PHI nodes depend on the counter of the last // iteration in the vectorized loop. - // If we come from a bypass edge then we need to start from the original start - // value. + // If we come from a bypass edge then we need to start from the original + // start value. // This variable saves the new starting index for the scalar loop. PHINode *ResumeIndex = 0; @@ -969,7 +549,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { PHINode *OrigPhi = I->first; LoopVectorizationLegality::InductionInfo II = I->second; PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val", - MiddleBlock->getTerminator()); + MiddleBlock->getTerminator()); Value *EndValue = 0; switch (II.IK) { case LoopVectorizationLegality::NoInduction: @@ -1149,8 +729,8 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // //===------------------------------------------------===// BasicBlock &BB = *OrigLoop->getHeader(); - Constant *Zero = ConstantInt::get( - IntegerType::getInt32Ty(BB.getContext()), 0); + Constant *Zero = + ConstantInt::get(IntegerType::getInt32Ty(BB.getContext()), 0); // In order to support reduction variables we need to be able to vectorize // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two @@ -1191,7 +771,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { assert(Legal->getReductionVars()->count(RdxPhi) && "Unable to find the reduction variable"); LoopVectorizationLegality::ReductionDescriptor RdxDesc = - (*Legal->getReductionVars())[RdxPhi]; + (*Legal->getReductionVars())[RdxPhi]; // We need to generate a reduction vector from the incoming scalar. 
// To do so, we need to generate the 'identity' vector and overide @@ -1211,7 +791,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // This vector is the Identity vector where the first element is the // incoming scalar reduction. Value *VectorStart = Builder.CreateInsertElement(Identity, - RdxDesc.StartValue, Zero); + RdxDesc.StartValue, Zero); // Fix the vector-loop phi. // We created the induction variable so we know that the @@ -1239,29 +819,29 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // Extract the first scalar. Value *Scalar0 = - Builder.CreateExtractElement(NewPhi, Builder.getInt32(0)); + Builder.CreateExtractElement(NewPhi, Builder.getInt32(0)); // Extract and reduce the remaining vector elements. for (unsigned i=1; i < VF; ++i) { Value *Scalar1 = - Builder.CreateExtractElement(NewPhi, Builder.getInt32(i)); + Builder.CreateExtractElement(NewPhi, Builder.getInt32(i)); switch (RdxDesc.Kind) { - case LoopVectorizationLegality::IntegerAdd: - Scalar0 = Builder.CreateAdd(Scalar0, Scalar1, "add.rdx"); - break; - case LoopVectorizationLegality::IntegerMult: - Scalar0 = Builder.CreateMul(Scalar0, Scalar1, "mul.rdx"); - break; - case LoopVectorizationLegality::IntegerOr: - Scalar0 = Builder.CreateOr(Scalar0, Scalar1, "or.rdx"); - break; - case LoopVectorizationLegality::IntegerAnd: - Scalar0 = Builder.CreateAnd(Scalar0, Scalar1, "and.rdx"); - break; - case LoopVectorizationLegality::IntegerXor: - Scalar0 = Builder.CreateXor(Scalar0, Scalar1, "xor.rdx"); - break; - default: - llvm_unreachable("Unknown reduction operation"); + case LoopVectorizationLegality::IntegerAdd: + Scalar0 = Builder.CreateAdd(Scalar0, Scalar1, "add.rdx"); + break; + case LoopVectorizationLegality::IntegerMult: + Scalar0 = Builder.CreateMul(Scalar0, Scalar1, "mul.rdx"); + break; + case LoopVectorizationLegality::IntegerOr: + Scalar0 = Builder.CreateOr(Scalar0, Scalar1, "or.rdx"); + break; + case LoopVectorizationLegality::IntegerAnd: + 
Scalar0 = Builder.CreateAnd(Scalar0, Scalar1, "and.rdx"); + break; + case LoopVectorizationLegality::IntegerXor: + Scalar0 = Builder.CreateXor(Scalar0, Scalar1, "xor.rdx"); + break; + default: + llvm_unreachable("Unknown reduction operation"); } } @@ -1323,13 +903,14 @@ Value *InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); // Loop incoming mask is all-one. - if (OrigLoop->getHeader() == BB) - return getVectorValue( - ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1)); + if (OrigLoop->getHeader() == BB) { + Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1); + return getVectorValue(C); + } // This is the block mask. We OR all incoming edges, and with zero. - Value *BlockMask = getVectorValue( - ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0)); + Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0); + Value *BlockMask = getVectorValue(Zero); // For each pred: for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) @@ -1347,306 +928,308 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, // For each instruction in the old loop. for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { switch (it->getOpcode()) { - case Instruction::Br: - // Nothing to do for PHIs and BR, since we already took care of the - // loop control flow instructions. - continue; - case Instruction::PHI:{ - PHINode* P = cast(it); - // Handle reduction variables: - if (Legal->getReductionVars()->count(P)) { - // This is phase one of vectorizing PHIs. - Type *VecTy = VectorType::get(it->getType(), VF); - WidenMap[it] = + case Instruction::Br: + // Nothing to do for PHIs and BR, since we already took care of the + // loop control flow instructions. 
+ continue; + case Instruction::PHI:{ + PHINode* P = cast(it); + // Handle reduction variables: + if (Legal->getReductionVars()->count(P)) { + // This is phase one of vectorizing PHIs. + Type *VecTy = VectorType::get(it->getType(), VF); + WidenMap[it] = PHINode::Create(VecTy, 2, "vec.phi", LoopVectorBody->getFirstInsertionPt()); - PV->push_back(P); - continue; - } - - // Check for PHI nodes that are lowered to vector selects. - if (P->getParent() != OrigLoop->getHeader()) { - // We know that all PHIs in non header blocks are converted into - // selects, so we don't have to worry about the insertion order and we - // can just use the builder. - - // At this point we generate the predication tree. There may be - // duplications since this is a simple recursive scan, but future - // optimizations will clean it up. - Value *Cond = createBlockInMask(P->getIncomingBlock(0)); - WidenMap[P] = - Builder.CreateSelect(Cond, - getVectorValue(P->getIncomingValue(0)), - getVectorValue(P->getIncomingValue(1)), - "predphi"); - continue; - } - - // This PHINode must be an induction variable. - // Make sure that we know about it. - assert(Legal->getInductionVars()->count(P) && - "Not an induction variable"); + PV->push_back(P); + continue; + } - LoopVectorizationLegality::InductionInfo II = - Legal->getInductionVars()->lookup(P); + // Check for PHI nodes that are lowered to vector selects. + if (P->getParent() != OrigLoop->getHeader()) { + // We know that all PHIs in non header blocks are converted into + // selects, so we don't have to worry about the insertion order and we + // can just use the builder. + + // At this point we generate the predication tree. There may be + // duplications since this is a simple recursive scan, but future + // optimizations will clean it up. 
+ Value *Cond = createBlockInMask(P->getIncomingBlock(0)); + WidenMap[P] = + Builder.CreateSelect(Cond, + getVectorValue(P->getIncomingValue(0)), + getVectorValue(P->getIncomingValue(1)), + "predphi"); + continue; + } - switch (II.IK) { - case LoopVectorizationLegality::NoInduction: - llvm_unreachable("Unknown induction"); - case LoopVectorizationLegality::IntInduction: { - assert(P == OldInduction && "Unexpected PHI"); - Value *Broadcasted = getBroadcastInstrs(Induction); + // This PHINode must be an induction variable. + // Make sure that we know about it. + assert(Legal->getInductionVars()->count(P) && + "Not an induction variable"); + + LoopVectorizationLegality::InductionInfo II = + Legal->getInductionVars()->lookup(P); + + switch (II.IK) { + case LoopVectorizationLegality::NoInduction: + llvm_unreachable("Unknown induction"); + case LoopVectorizationLegality::IntInduction: { + assert(P == OldInduction && "Unexpected PHI"); + Value *Broadcasted = getBroadcastInstrs(Induction); + // After broadcasting the induction variable we need to make the + // vector consecutive by adding 0, 1, 2 ... + Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted); + WidenMap[OldInduction] = ConsecutiveInduction; + continue; + } + case LoopVectorizationLegality::ReverseIntInduction: + case LoopVectorizationLegality::PtrInduction: + // Handle reverse integer and pointer inductions. + Value *StartIdx = 0; + // If we have a single integer induction variable then use it. + // Otherwise, start counting at zero. + if (OldInduction) { + LoopVectorizationLegality::InductionInfo OldII = + Legal->getInductionVars()->lookup(OldInduction); + StartIdx = OldII.StartValue; + } else { + StartIdx = ConstantInt::get(Induction->getType(), 0); + } + // This is the normalized GEP that starts counting at zero. + Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx, + "normalized.idx"); + + // Handle the reverse integer induction variable case. 
+ if (LoopVectorizationLegality::ReverseIntInduction == II.IK) { + IntegerType *DstTy = cast(II.StartValue->getType()); + Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy, + "resize.norm.idx"); + Value *ReverseInd = Builder.CreateSub(II.StartValue, CNI, + "reverse.idx"); + + // This is a new value so do not hoist it out. + Value *Broadcasted = getBroadcastInstrs(ReverseInd); // After broadcasting the induction variable we need to make the - // vector consecutive by adding 0, 1, 2 ... - Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted); - WidenMap[OldInduction] = ConsecutiveInduction; + // vector consecutive by adding ... -3, -2, -1, 0. + Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted, + true); + WidenMap[it] = ConsecutiveInduction; continue; } - case LoopVectorizationLegality::ReverseIntInduction: - case LoopVectorizationLegality::PtrInduction: - // Handle reverse integer and pointer inductions. - Value *StartIdx = 0; - // If we have a single integer induction variable then use it. - // Otherwise, start counting at zero. - if (OldInduction) { - LoopVectorizationLegality::InductionInfo OldII = - Legal->getInductionVars()->lookup(OldInduction); - StartIdx = OldII.StartValue; - } else { - StartIdx = ConstantInt::get(Induction->getType(), 0); - } - // This is the normalized GEP that starts counting at zero. - Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx, - "normalized.idx"); - - // Handle the reverse integer induction variable case. - if (LoopVectorizationLegality::ReverseIntInduction == II.IK) { - IntegerType *DstTy = cast(II.StartValue->getType()); - Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy, - "resize.norm.idx"); - Value *ReverseInd = Builder.CreateSub(II.StartValue, CNI, - "reverse.idx"); - - // This is a new value so do not hoist it out. 
- Value *Broadcasted = getBroadcastInstrs(ReverseInd); - // After broadcasting the induction variable we need to make the - // vector consecutive by adding ... -3, -2, -1, 0. - Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted, - true); - WidenMap[it] = ConsecutiveInduction; - continue; - } - - // Handle the pointer induction variable case. - assert(P->getType()->isPointerTy() && "Unexpected type."); - - // This is the vector of results. Notice that we don't generate vector - // geps because scalar geps result in better code. - Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); - for (unsigned int i = 0; i < VF; ++i) { - Constant *Idx = ConstantInt::get(Induction->getType(), i); - Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx"); - Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, "next.gep"); - VecVal = Builder.CreateInsertElement(VecVal, SclrGep, - Builder.getInt32(i), - "insert.gep"); - } - WidenMap[it] = VecVal; - continue; + // Handle the pointer induction variable case. + assert(P->getType()->isPointerTy() && "Unexpected type."); + + // This is the vector of results. Notice that we don't generate + // vector geps because scalar geps result in better code. + Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); + for (unsigned int i = 0; i < VF; ++i) { + Constant *Idx = ConstantInt::get(Induction->getType(), i); + Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, + "gep.idx"); + Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, + "next.gep"); + VecVal = Builder.CreateInsertElement(VecVal, SclrGep, + Builder.getInt32(i), + "insert.gep"); } - }// End of PHI. 
- - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { - // Just widen binops. - BinaryOperator *BinOp = dyn_cast(it); - Value *A = getVectorValue(it->getOperand(0)); - Value *B = getVectorValue(it->getOperand(1)); - - // Use this vector value for all users of the original instruction. - Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B); - WidenMap[it] = V; - - // Update the NSW, NUW and Exact flags. - BinaryOperator *VecOp = cast(V); - if (isa(BinOp)) { - VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap()); - VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap()); - } - if (isa(VecOp)) - VecOp->setIsExact(BinOp->isExact()); - break; - } - case Instruction::Select: { - // Widen selects. - // If the selector is loop invariant we can create a select - // instruction with a scalar condition. Otherwise, use vector-select. - Value *Cond = it->getOperand(0); - bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop); - - // The condition can be loop invariant but still defined inside the - // loop. This means that we can't just use the original 'cond' value. - // We have to take the 'vectorized' value and pick the first lane. - // Instcombine will make this a no-op. 
- Cond = getVectorValue(Cond); - if (InvariantCond) - Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0)); - - Value *Op0 = getVectorValue(it->getOperand(1)); - Value *Op1 = getVectorValue(it->getOperand(2)); - WidenMap[it] = Builder.CreateSelect(Cond, Op0, Op1); - break; + WidenMap[it] = VecVal; + continue; } - case Instruction::ICmp: - case Instruction::FCmp: { - // Widen compares. Generate vector compares. - bool FCmp = (it->getOpcode() == Instruction::FCmp); - CmpInst *Cmp = dyn_cast(it); - Value *A = getVectorValue(it->getOperand(0)); - Value *B = getVectorValue(it->getOperand(1)); - if (FCmp) - WidenMap[it] = Builder.CreateFCmp(Cmp->getPredicate(), A, B); - else - WidenMap[it] = Builder.CreateICmp(Cmp->getPredicate(), A, B); - break; + }// End of PHI. + + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + // Just widen binops. + BinaryOperator *BinOp = dyn_cast(it); + Value *A = getVectorValue(it->getOperand(0)); + Value *B = getVectorValue(it->getOperand(1)); + + // Use this vector value for all users of the original instruction. + Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B); + WidenMap[it] = V; + + // Update the NSW, NUW and Exact flags. + BinaryOperator *VecOp = cast(V); + if (isa(BinOp)) { + VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap()); + VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap()); } + if (isa(VecOp)) + VecOp->setIsExact(BinOp->isExact()); + break; + } + case Instruction::Select: { + // Widen selects. + // If the selector is loop invariant we can create a select + // instruction with a scalar condition. 
Otherwise, use vector-select. + Value *Cond = it->getOperand(0); + bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop); + + // The condition can be loop invariant but still defined inside the + // loop. This means that we can't just use the original 'cond' value. + // We have to take the 'vectorized' value and pick the first lane. + // Instcombine will make this a no-op. + Cond = getVectorValue(Cond); + if (InvariantCond) + Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0)); + + Value *Op0 = getVectorValue(it->getOperand(1)); + Value *Op1 = getVectorValue(it->getOperand(2)); + WidenMap[it] = Builder.CreateSelect(Cond, Op0, Op1); + break; + } - case Instruction::Store: { - // Attempt to issue a wide store. - StoreInst *SI = dyn_cast(it); - Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF); - Value *Ptr = SI->getPointerOperand(); - unsigned Alignment = SI->getAlignment(); + case Instruction::ICmp: + case Instruction::FCmp: { + // Widen compares. Generate vector compares. + bool FCmp = (it->getOpcode() == Instruction::FCmp); + CmpInst *Cmp = dyn_cast(it); + Value *A = getVectorValue(it->getOperand(0)); + Value *B = getVectorValue(it->getOperand(1)); + if (FCmp) + WidenMap[it] = Builder.CreateFCmp(Cmp->getPredicate(), A, B); + else + WidenMap[it] = Builder.CreateICmp(Cmp->getPredicate(), A, B); + break; + } - assert(!Legal->isUniform(Ptr) && - "We do not allow storing to uniform addresses"); + case Instruction::Store: { + // Attempt to issue a wide store. + StoreInst *SI = dyn_cast(it); + Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF); + Value *Ptr = SI->getPointerOperand(); + unsigned Alignment = SI->getAlignment(); - GetElementPtrInst *Gep = dyn_cast(Ptr); + assert(!Legal->isUniform(Ptr) && + "We do not allow storing to uniform addresses"); - // This store does not use GEPs. 
- if (!Legal->isConsecutivePtr(Ptr)) { - scalarizeInstruction(it); - break; - } + GetElementPtrInst *Gep = dyn_cast(Ptr); - if (Gep) { - // The last index does not have to be the induction. It can be - // consecutive and be a function of the index. For example A[I+1]; - unsigned NumOperands = Gep->getNumOperands(); - Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1)); - LastIndex = Builder.CreateExtractElement(LastIndex, Zero); - - // Create the new GEP with the new induction variable. - GetElementPtrInst *Gep2 = cast(Gep->clone()); - Gep2->setOperand(NumOperands - 1, LastIndex); - Ptr = Builder.Insert(Gep2); - } else { - // Use the induction element ptr. - assert(isa(Ptr) && "Invalid induction ptr"); - Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero); - } - Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo()); - Value *Val = getVectorValue(SI->getValueOperand()); - Builder.CreateStore(Val, Ptr)->setAlignment(Alignment); + // This store does not use GEPs. + if (!Legal->isConsecutivePtr(Ptr)) { + scalarizeInstruction(it); break; } - case Instruction::Load: { - // Attempt to issue a wide load. - LoadInst *LI = dyn_cast(it); - Type *RetTy = VectorType::get(LI->getType(), VF); - Value *Ptr = LI->getPointerOperand(); - unsigned Alignment = LI->getAlignment(); - GetElementPtrInst *Gep = dyn_cast(Ptr); - - // If the pointer is loop invariant or if it is non consecutive, - // scalarize the load. - bool Con = Legal->isConsecutivePtr(Ptr); - if (Legal->isUniform(Ptr) || !Con) { - scalarizeInstruction(it); - break; - } - if (Gep) { - // The last index does not have to be the induction. It can be - // consecutive and be a function of the index. For example A[I+1]; - unsigned NumOperands = Gep->getNumOperands(); - Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1)); - LastIndex = Builder.CreateExtractElement(LastIndex, Zero); - - // Create the new GEP with the new induction variable. 
- GetElementPtrInst *Gep2 = cast(Gep->clone()); - Gep2->setOperand(NumOperands - 1, LastIndex); - Ptr = Builder.Insert(Gep2); - } else { - // Use the induction element ptr. - assert(isa(Ptr) && "Invalid induction ptr"); - Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero); - } - - Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo()); - LI = Builder.CreateLoad(Ptr); - LI->setAlignment(Alignment); - // Use this vector value for all users of the load. - WidenMap[it] = LI; - break; + if (Gep) { + // The last index does not have to be the induction. It can be + // consecutive and be a function of the index. For example A[I+1]; + unsigned NumOperands = Gep->getNumOperands(); + Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1)); + LastIndex = Builder.CreateExtractElement(LastIndex, Zero); + + // Create the new GEP with the new induction variable. + GetElementPtrInst *Gep2 = cast(Gep->clone()); + Gep2->setOperand(NumOperands - 1, LastIndex); + Ptr = Builder.Insert(Gep2); + } else { + // Use the induction element ptr. + assert(isa(Ptr) && "Invalid induction ptr"); + Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero); } - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: { - /// Vectorize bitcasts. - CastInst *CI = dyn_cast(it); - Value *A = getVectorValue(it->getOperand(0)); - Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF); - WidenMap[it] = Builder.CreateCast(CI->getOpcode(), A, DestTy); + Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo()); + Value *Val = getVectorValue(SI->getValueOperand()); + Builder.CreateStore(Val, Ptr)->setAlignment(Alignment); + break; + } + case Instruction::Load: { + // Attempt to issue a wide load. 
+ LoadInst *LI = dyn_cast(it); + Type *RetTy = VectorType::get(LI->getType(), VF); + Value *Ptr = LI->getPointerOperand(); + unsigned Alignment = LI->getAlignment(); + GetElementPtrInst *Gep = dyn_cast(Ptr); + + // If the pointer is loop invariant or if it is non consecutive, + // scalarize the load. + bool Con = Legal->isConsecutivePtr(Ptr); + if (Legal->isUniform(Ptr) || !Con) { + scalarizeInstruction(it); break; } - - case Instruction::Call: { - assert(isTriviallyVectorizableIntrinsic(it)); - Module *M = BB->getParent()->getParent(); - IntrinsicInst *II = cast(it); - Intrinsic::ID ID = II->getIntrinsicID(); - SmallVector Args; - for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i) - Args.push_back(getVectorValue(II->getArgOperand(i))); - Type *Tys[] = { VectorType::get(II->getType()->getScalarType(), VF) }; - Function *F = Intrinsic::getDeclaration(M, ID, Tys); - WidenMap[it] = Builder.CreateCall(F, Args); - break; + + if (Gep) { + // The last index does not have to be the induction. It can be + // consecutive and be a function of the index. For example A[I+1]; + unsigned NumOperands = Gep->getNumOperands(); + Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1)); + LastIndex = Builder.CreateExtractElement(LastIndex, Zero); + + // Create the new GEP with the new induction variable. + GetElementPtrInst *Gep2 = cast(Gep->clone()); + Gep2->setOperand(NumOperands - 1, LastIndex); + Ptr = Builder.Insert(Gep2); + } else { + // Use the induction element ptr. + assert(isa(Ptr) && "Invalid induction ptr"); + Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero); } - default: - // All other instructions are unsupported. Scalarize them. - scalarizeInstruction(it); - break; + Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo()); + LI = Builder.CreateLoad(Ptr); + LI->setAlignment(Alignment); + // Use this vector value for all users of the load. 
+ WidenMap[it] = LI; + break; + } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + /// Vectorize bitcasts. + CastInst *CI = dyn_cast(it); + Value *A = getVectorValue(it->getOperand(0)); + Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF); + WidenMap[it] = Builder.CreateCast(CI->getOpcode(), A, DestTy); + break; + } + + case Instruction::Call: { + assert(isTriviallyVectorizableIntrinsic(it)); + Module *M = BB->getParent()->getParent(); + IntrinsicInst *II = cast(it); + Intrinsic::ID ID = II->getIntrinsicID(); + SmallVector Args; + for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i) + Args.push_back(getVectorValue(II->getArgOperand(i))); + Type *Tys[] = { VectorType::get(II->getType()->getScalarType(), VF) }; + Function *F = Intrinsic::getDeclaration(M, ID, Tys); + WidenMap[it] = Builder.CreateCall(F, Args); + break; + } + + default: + // All other instructions are unsupported. Scalarize them. + scalarizeInstruction(it); + break; }// end of switch. }// end of for_each instr. } @@ -1958,8 +1541,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() { // Check if we see any stores. If there are no stores, then we don't // care if the pointers are *restrict*. if (!Stores.size()) { - DEBUG(dbgs() << "LV: Found a read-only loop!\n"); - return true; + DEBUG(dbgs() << "LV: Found a read-only loop!\n"); + return true; } // Holds the read and read-write *pointers* that we find. @@ -2171,15 +1754,15 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, // We found a reduction var if we have reached the original // phi node and we only have a single instruction with out-of-loop // users. 
- if (FoundStartPHI && ExitInstruction) { - // This instruction is allowed to have out-of-loop users. - AllowedExit.insert(ExitInstruction); + if (FoundStartPHI && ExitInstruction) { + // This instruction is allowed to have out-of-loop users. + AllowedExit.insert(ExitInstruction); - // Save the description of this reduction variable. - ReductionDescriptor RD(RdxStart, ExitInstruction, Kind); - Reductions[Phi] = RD; - return true; - } + // Save the description of this reduction variable. + ReductionDescriptor RD(RdxStart, ExitInstruction, Kind); + Reductions[Phi] = RD; + return true; + } // If we've reached the start PHI but did not find an outside user then // this is dead code. Abort. @@ -2191,24 +1774,24 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, bool LoopVectorizationLegality::isReductionInstr(Instruction *I, ReductionKind Kind) { - switch (I->getOpcode()) { - default: - return false; - case Instruction::PHI: - // possibly. - return true; - case Instruction::Add: - case Instruction::Sub: - return Kind == IntegerAdd; - case Instruction::Mul: - return Kind == IntegerMult; - case Instruction::And: - return Kind == IntegerAnd; - case Instruction::Or: - return Kind == IntegerOr; - case Instruction::Xor: - return Kind == IntegerXor; - } + switch (I->getOpcode()) { + default: + return false; + case Instruction::PHI: + // possibly. + return true; + case Instruction::Add: + case Instruction::Sub: + return Kind == IntegerAdd; + case Instruction::Mul: + return Kind == IntegerMult; + case Instruction::And: + return Kind == IntegerAnd; + case Instruction::Or: + return Kind == IntegerOr; + case Instruction::Xor: + return Kind == IntegerXor; + } } LoopVectorizationLegality::InductionKind @@ -2265,12 +1848,12 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) { // The isntructions below can trap. 
switch (it->getOpcode()) { - default: continue; - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::URem: - case Instruction::SRem: - return false; + default: continue; + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: + return false; } } @@ -2356,153 +1939,154 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { // TODO: We need to estimate the cost of intrinsic calls. switch (I->getOpcode()) { - case Instruction::GetElementPtr: - // We mark this instruction as zero-cost because scalar GEPs are usually - // lowered to the intruction addressing mode. At the moment we don't - // generate vector geps. - return 0; - case Instruction::Br: { - return VTTI->getCFInstrCost(I->getOpcode()); - } - case Instruction::PHI: - //TODO: IF-converted IFs become selects. - return 0; - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: - return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy); - case Instruction::Select: { - SelectInst *SI = cast(I); - const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); - bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); - Type *CondTy = SI->getCondition()->getType(); - if (ScalarCond) - CondTy = VectorType::get(CondTy, VF); - - return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy); - } - case Instruction::ICmp: - case Instruction::FCmp: { - Type *ValTy = I->getOperand(0)->getType(); - VectorTy = ToVectorTy(ValTy, VF); - return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy); - } - case Instruction::Store: { - 
StoreInst *SI = cast(I); - Type *ValTy = SI->getValueOperand()->getType(); - VectorTy = ToVectorTy(ValTy, VF); - - if (VF == 1) - return VTTI->getMemoryOpCost(I->getOpcode(), ValTy, - SI->getAlignment(), SI->getPointerAddressSpace()); - - // Scalarized stores. - if (!Legal->isConsecutivePtr(SI->getPointerOperand())) { - unsigned Cost = 0; - unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, - ValTy); - // The cost of extracting from the value vector. - Cost += VF * (ExtCost); - // The cost of the scalar stores. - Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), - ValTy->getScalarType(), - SI->getAlignment(), - SI->getPointerAddressSpace()); - return Cost; - } - - // Wide stores. - return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(), + case Instruction::GetElementPtr: + // We mark this instruction as zero-cost because scalar GEPs are usually + // lowered to the intruction addressing mode. At the moment we don't + // generate vector geps. + return 0; + case Instruction::Br: { + return VTTI->getCFInstrCost(I->getOpcode()); + } + case Instruction::PHI: + //TODO: IF-converted IFs become selects. 
+ return 0; + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy); + case Instruction::Select: { + SelectInst *SI = cast(I); + const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); + bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); + Type *CondTy = SI->getCondition()->getType(); + if (ScalarCond) + CondTy = VectorType::get(CondTy, VF); + + return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy); + } + case Instruction::ICmp: + case Instruction::FCmp: { + Type *ValTy = I->getOperand(0)->getType(); + VectorTy = ToVectorTy(ValTy, VF); + return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy); + } + case Instruction::Store: { + StoreInst *SI = cast(I); + Type *ValTy = SI->getValueOperand()->getType(); + VectorTy = ToVectorTy(ValTy, VF); + + if (VF == 1) + return VTTI->getMemoryOpCost(I->getOpcode(), ValTy, + SI->getAlignment(), SI->getPointerAddressSpace()); + + // Scalarized stores. + if (!Legal->isConsecutivePtr(SI->getPointerOperand())) { + unsigned Cost = 0; + unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, + ValTy); + // The cost of extracting from the value vector. + Cost += VF * (ExtCost); + // The cost of the scalar stores. 
+ Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), + ValTy->getScalarType(), + SI->getAlignment(), + SI->getPointerAddressSpace()); + return Cost; } - case Instruction::Load: { - LoadInst *LI = cast(I); - - if (VF == 1) - return VTTI->getMemoryOpCost(I->getOpcode(), RetTy, - LI->getAlignment(), - LI->getPointerAddressSpace()); - - // Scalarized loads. - if (!Legal->isConsecutivePtr(LI->getPointerOperand())) { - unsigned Cost = 0; - unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy); - // The cost of inserting the loaded value into the result vector. - Cost += VF * (InCost); - // The cost of the scalar stores. - Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), - RetTy->getScalarType(), - LI->getAlignment(), - LI->getPointerAddressSpace()); - return Cost; - } - // Wide loads. - return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(), + // Wide stores. + return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(), + SI->getPointerAddressSpace()); + } + case Instruction::Load: { + LoadInst *LI = cast(I); + + if (VF == 1) + return VTTI->getMemoryOpCost(I->getOpcode(), RetTy, + LI->getAlignment(), LI->getPointerAddressSpace()); - } - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: { - Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF); - return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); - } - case Instruction::Call: { - assert(isTriviallyVectorizableIntrinsic(I)); - IntrinsicInst *II = cast(I); - Type *RetTy = ToVectorTy(II->getType(), VF); - SmallVector Tys; - for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i) - Tys.push_back(ToVectorTy(II->getArgOperand(i)->getType(), VF)); - return 
VTTI->getIntrinsicInstrCost(II->getIntrinsicID(), RetTy, Tys); - } - default: { - // We are scalarizing the instruction. Return the cost of the scalar - // instruction, plus the cost of insert and extract into vector - // elements, times the vector width. + + // Scalarized loads. + if (!Legal->isConsecutivePtr(LI->getPointerOperand())) { unsigned Cost = 0; + unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy); + // The cost of inserting the loaded value into the result vector. + Cost += VF * (InCost); + // The cost of the scalar stores. + Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), + RetTy->getScalarType(), + LI->getAlignment(), + LI->getPointerAddressSpace()); + return Cost; + } - bool IsVoid = RetTy->isVoidTy(); + // Wide loads. + return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(), + LI->getPointerAddressSpace()); + } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF); + return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); + } + case Instruction::Call: { + assert(isTriviallyVectorizableIntrinsic(I)); + IntrinsicInst *II = cast(I); + Type *RetTy = ToVectorTy(II->getType(), VF); + SmallVector Tys; + for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i) + Tys.push_back(ToVectorTy(II->getArgOperand(i)->getType(), VF)); + return VTTI->getIntrinsicInstrCost(II->getIntrinsicID(), RetTy, Tys); + } + default: { + // We are scalarizing the instruction. Return the cost of the scalar + // instruction, plus the cost of insert and extract into vector + // elements, times the vector width. + unsigned Cost = 0; - unsigned InsCost = (IsVoid ? 
0 : - VTTI->getInstrCost(Instruction::InsertElement, - VectorTy)); + bool IsVoid = RetTy->isVoidTy(); - unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, - VectorTy); + unsigned InsCost = (IsVoid ? 0 : + VTTI->getInstrCost(Instruction::InsertElement, + VectorTy)); - // The cost of inserting the results plus extracting each one of the - // operands. - Cost += VF * (InsCost + ExtCost * I->getNumOperands()); + unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, + VectorTy); - // The cost of executing VF copies of the scalar instruction. - Cost += VF * VTTI->getInstrCost(I->getOpcode(), RetTy); - return Cost; - } + // The cost of inserting the results plus extracting each one of the + // operands. + Cost += VF * (InsCost + ExtCost * I->getNumOperands()); + + // The cost of executing VF copies of the scalar instruction. + Cost += VF * VTTI->getInstrCost(I->getOpcode(), RetTy); + return Cost; + } }// end of switch. } @@ -2512,8 +2096,6 @@ Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) { return VectorType::get(Scalar, VF); } -} // namespace - char LoopVectorize::ID = 0; static const char lv_name[] = "Loop Vectorization"; INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) @@ -2527,3 +2109,5 @@ namespace llvm { return new LoopVectorize(); } } + + -- cgit v1.1 From cfb6285fdb27fba97ddc3ebf6485110d9a157ac5 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Tue, 11 Dec 2012 04:55:10 +0000 Subject: Fix PR14565. Don't if-convert loops that have switch statements in them. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169813 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index feeecec..a691240 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1263,6 +1263,10 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) { BasicBlock *BB = LoopBlocks[i]; + // We don't support switch statements inside loops. + if (!isa<BranchInst>(BB->getTerminator())) + return false; + // We must have at most two predecessors because we need to convert // all PHIs to selects. unsigned Preds = std::distance(pred_begin(BB), pred_end(BB)); -- cgit v1.1 From 5e9efa10fc7eea582bb1ad216364fb2c467e4477 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Tue, 11 Dec 2012 18:58:10 +0000 Subject: Loop Vectorize: optimize the vectorization of trunc(induction_var). The truncation is now done on scalars. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169904 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index a691240..9550e8a 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1204,8 +1204,20 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { - /// Vectorize bitcasts. CastInst *CI = dyn_cast<CastInst>(it); + /// Optimize the special case where the source is the induction + /// variable.
Notice that we can only optimize the 'trunc' case + /// because: a. FP conversions lose precision, b. sext/zext may wrap, + /// c. other casts depend on pointer size. + if (CI->getOperand(0) == OldInduction && + it->getOpcode() == Instruction::Trunc) { + Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction, + CI->getType()); + Value *Broadcasted = getBroadcastInstrs(ScalarCast); + WidenMap[it] = getConsecutiveVector(Broadcasted); + break; + } + /// Vectorize casts. Value *A = getVectorValue(it->getOperand(0)); Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF); WidenMap[it] = Builder.CreateCast(CI->getOpcode(), A, DestTy); -- cgit v1.1 From 655d2c5354fcd44c329d99428c7d9196bc78dbad Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Tue, 11 Dec 2012 21:30:14 +0000 Subject: PR14574. Fix a bug in the code that calculates the mask the converted PHIs in if-conversion. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169916 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 9550e8a..baa655c 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -954,7 +954,7 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, // At this point we generate the predication tree. There may be // duplications since this is a simple recursive scan, but future // optimizations will clean it up. 
- Value *Cond = createBlockInMask(P->getIncomingBlock(0)); + Value *Cond = createEdgeMask(P->getIncomingBlock(0), P->getParent()); WidenMap[P] = Builder.CreateSelect(Cond, getVectorValue(P->getIncomingValue(0)), -- cgit v1.1 From 5da804150d418b8b4956b84013f8f67df553c543 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Wed, 12 Dec 2012 01:11:46 +0000 Subject: LoopVectorizer: When -Os is used, vectorize only loops that dont require a tail loop. There is no testcase because I dont know of a way to initialize the loop vectorizer pass without adding an additional hidden flag. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169950 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 82 ++++++++++++++++++++++-------- 1 file changed, 60 insertions(+), 22 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index baa655c..d796716 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -51,9 +51,12 @@ namespace { /// The LoopVectorize Pass. struct LoopVectorize : public LoopPass { - static char ID; // Pass identification, replacement for typeid + /// Pass identification, replacement for typeid + static char ID; + /// Optimize for size. Do not generate tail loops. + bool OptForSize; - LoopVectorize() : LoopPass(ID) { + explicit LoopVectorize(bool OptSz = false) : LoopPass(ID), OptForSize(OptSz) { initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); } @@ -85,23 +88,17 @@ struct LoopVectorize : public LoopPass { } // Select the preffered vectorization factor. - unsigned VF = 1; - if (VectorizationFactor == 0) { - const VectorTargetTransformInfo *VTTI = 0; - if (TTI) - VTTI = TTI->getVectorTargetTransformInfo(); - // Use the cost model. 
- LoopVectorizationCostModel CM(L, SE, &LVL, VTTI); - VF = CM.findBestVectorizationFactor(); - - if (VF == 1) { - DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); - return false; - } - - } else { - // Use the user command flag. - VF = VectorizationFactor; + const VectorTargetTransformInfo *VTTI = 0; + if (TTI) + VTTI = TTI->getVectorTargetTransformInfo(); + // Use the cost model. + LoopVectorizationCostModel CM(L, SE, &LVL, VTTI); + unsigned VF = CM.selectVectorizationFactor(OptForSize, + VectorizationFactor); + + if (VF == 1) { + DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); + return false; } DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<< @@ -1886,7 +1883,48 @@ bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) { } unsigned -LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) { +LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, + unsigned UserVF) { + if (OptForSize && Legal->getRuntimePointerCheck()->Need) { + DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n"); + return 1; + } + + // Find the trip count. + unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch()); + DEBUG(dbgs() << "LV: Found trip count:"< Date: Wed, 12 Dec 2012 01:31:10 +0000 Subject: fix a typo. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169953 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index d796716..5c5e9af 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1859,7 +1859,7 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) { if (it->mayReadFromMemory() || it->mayWriteToMemory() || it->mayThrow()) return false; - // The isntructions below can trap. + // The instructions below can trap. switch (it->getOpcode()) { default: continue; case Instruction::UDiv: -- cgit v1.1 From db65ff39faee4cab994d3b7ece1a31dfd9343818 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Wed, 12 Dec 2012 01:33:47 +0000 Subject: Fix the ascii drawing that was ruined when I split the H and CPP git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169955 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 5c5e9af..da073c5 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -404,27 +404,27 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { the vectorized instructions while the old loop will continue to run the scalar remainder. - [ ] <-- vector loop bypass. - / | - / v + [ ] <-- vector loop bypass. + / | + / v | [ ] <-- vector pre header. | | | v | [ ] \ | [ ]_| <-- vector loop. | | - \ v - >[ ] <--- middle-block. - / | - / v + \ v + >[ ] <--- middle-block. + / | + / v | [ ] <--- new preheader. 
| | | v | [ ] \ | [ ]_| <-- old scalar loop to handle remainder. - \ | - \ v - >[ ] <-- exit block. + \ | + \ v + >[ ] <-- exit block. ... */ -- cgit v1.1 From ae3b652f5cc19d83b6466d4fa70a7d1c7fb6d06c Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Wed, 12 Dec 2012 19:29:45 +0000 Subject: LoopVectorizer: Use the "optsize" attribute to decide if we are allowed to increase the function size. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170004 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index da073c5..749b664 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -53,10 +53,8 @@ namespace { struct LoopVectorize : public LoopPass { /// Pass identification, replacement for typeid static char ID; - /// Optimize for size. Do not generate tail loops. - bool OptForSize; - explicit LoopVectorize(bool OptSz = false) : LoopPass(ID), OptForSize(OptSz) { + explicit LoopVectorize() : LoopPass(ID) { initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); } @@ -93,8 +91,15 @@ struct LoopVectorize : public LoopPass { VTTI = TTI->getVectorTargetTransformInfo(); // Use the cost model. LoopVectorizationCostModel CM(L, SE, &LVL, VTTI); + + // Check the function attribues to find out if this function should be + // optimized for size. 
+ Function *F = L->getHeader()->getParent(); + bool OptForSize = + F->getFnAttributes().hasAttribute(Attributes::OptimizeForSize); + unsigned VF = CM.selectVectorizationFactor(OptForSize, - VectorizationFactor); + VectorizationFactor); if (VF == 1) { DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); @@ -2159,8 +2164,8 @@ INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) namespace llvm { - Pass *createLoopVectorizePass(bool OptForSize = false) { - return new LoopVectorize(OptForSize); + Pass *createLoopVectorizePass() { + return new LoopVectorize(); } } -- cgit v1.1 From d0b144c04c10c62ad51d0e171680d9ce95af4078 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Wed, 12 Dec 2012 19:39:36 +0000 Subject: Fix indentation. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170005 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 749b664..cdd130f 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -95,11 +95,10 @@ struct LoopVectorize : public LoopPass { // Check the function attribues to find out if this function should be // optimized for size. 
Function *F = L->getHeader()->getParent(); - bool OptForSize = - F->getFnAttributes().hasAttribute(Attributes::OptimizeForSize); + Attributes::AttrVal SzAttr= Attributes::OptimizeForSize; + bool OptForSize = F->getFnAttributes().hasAttribute(SzAttr); - unsigned VF = CM.selectVectorizationFactor(OptForSize, - VectorizationFactor); + unsigned VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor); if (VF == 1) { DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); -- cgit v1.1 From 807dad62a0d4f1a1bbcb76fdc447634f76308252 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Thu, 13 Dec 2012 00:21:03 +0000 Subject: Teach the cost model about the optimization in r169904: Truncation of induction variables costs the same as scalar trunc. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170051 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index cdd130f..475bea1 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -106,8 +106,7 @@ struct LoopVectorize : public LoopPass { } DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<< - L->getHeader()->getParent()->getParent()->getModuleIdentifier()<< - "\n"); + F->getParent()->getModuleIdentifier()<<"\n"); // If we decided that it is *legal* to vectorizer the loop then do it. 
InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF); @@ -1849,6 +1848,15 @@ LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { return NoInduction; } +bool LoopVectorizationLegality::isInductionVariable(const Value *V) { + Value *In0 = const_cast<Value*>(V); + PHINode *PN = dyn_cast_or_null<PHINode>(In0); + if (!PN) + return false; + + return Inductions.count(PN); +} + bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { assert(TheLoop->contains(BB) && "Unknown block used"); @@ -2110,6 +2118,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { + // We optimize the truncation of induction variable. + // The cost of these is the same as the scalar operation. + if (I->getOpcode() == Instruction::Trunc && + Legal->isInductionVariable(I->getOperand(0))) + return VTTI->getCastInstrCost(I->getOpcode(), I->getType(), + I->getOperand(0)->getType()); + Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF); return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); } -- cgit v1.1 From 9ad73e93a51c473a90e010a489a9c7c221112030 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Thu, 13 Dec 2012 23:11:54 +0000 Subject: Enable the Loop Vectorizer by default for O2 and O3. Disable if-conversion by default. I plan to revert this patch later today. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170157 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 475bea1..d143f91 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -44,7 +44,7 @@ VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, cl::desc("Sets the SIMD width.
Zero is autoselect.")); static cl::opt<bool> -EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, +EnableIfConversion("enable-if-conversion", cl::init(false), cl::Hidden, cl::desc("Enable if-conversion during vectorization.")); namespace { -- cgit v1.1 From 0ef0e2e6d0a45cdbc792eee9d76f0a4b7cda5c8f Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Tue, 18 Dec 2012 18:40:20 +0000 Subject: LoopVectorize: Emit reductions as log2(vectorsize) shuffles + vector ops instead of scalar operations. For example on x86 with SSE4.2 a <8 x i8> add reduction becomes movdqa %xmm0, %xmm1 movhlps %xmm1, %xmm1 ## xmm1 = xmm1[1,1] paddw %xmm0, %xmm1 pshufd $1, %xmm1, %xmm0 ## xmm0 = xmm1[1,0,0,0] paddw %xmm1, %xmm0 phaddw %xmm0, %xmm0 pextrb $0, %xmm0, %edx instead of pextrb $2, %xmm0, %esi pextrb $0, %xmm0, %edx addb %sil, %dl pextrb $4, %xmm0, %esi addb %dl, %sil pextrb $6, %xmm0, %edx addb %sil, %dl pextrb $8, %xmm0, %esi addb %dl, %sil pextrb $10, %xmm0, %edi pextrb $14, %xmm0, %edx addb %sil, %dil pextrb $12, %xmm0, %esi addb %dil, %sil addb %sil, %dl git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170439 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 43 +++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 12 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index d143f91..e3c76bb 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -817,34 +817,53 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { NewPhi->addIncoming(VectorStart, LoopBypassBlock); NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody); - // Extract the first scalar. - Value *Scalar0 = - Builder.CreateExtractElement(NewPhi, Builder.getInt32(0)); - // Extract and reduce the remaining vector elements.
- for (unsigned i=1; i < VF; ++i) { - Value *Scalar1 = - Builder.CreateExtractElement(NewPhi, Builder.getInt32(i)); + // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles + // and vector ops, reducing the set of values being computed by half each + // round. + assert(isPowerOf2_32(VF) && + "Reduction emission only supported for pow2 vectors!"); + Value *TmpVec = NewPhi; + SmallVector ShuffleMask(VF, 0); + for (unsigned i = VF; i != 1; i >>= 1) { + // Move the upper half of the vector to the lower half. + for (unsigned j = 0; j != i/2; ++j) + ShuffleMask[j] = Builder.getInt32(i/2 + j); + + // Fill the rest of the mask with undef. + std::fill(&ShuffleMask[i/2], ShuffleMask.end(), + UndefValue::get(Builder.getInt32Ty())); + + Value *Shuf = + Builder.CreateShuffleVector(TmpVec, + UndefValue::get(TmpVec->getType()), + ConstantVector::get(ShuffleMask), + "rdx.shuf"); + + // Emit the operation on the shuffled value. switch (RdxDesc.Kind) { case LoopVectorizationLegality::IntegerAdd: - Scalar0 = Builder.CreateAdd(Scalar0, Scalar1, "add.rdx"); + TmpVec = Builder.CreateAdd(TmpVec, Shuf, "add.rdx"); break; case LoopVectorizationLegality::IntegerMult: - Scalar0 = Builder.CreateMul(Scalar0, Scalar1, "mul.rdx"); + TmpVec = Builder.CreateMul(TmpVec, Shuf, "mul.rdx"); break; case LoopVectorizationLegality::IntegerOr: - Scalar0 = Builder.CreateOr(Scalar0, Scalar1, "or.rdx"); + TmpVec = Builder.CreateOr(TmpVec, Shuf, "or.rdx"); break; case LoopVectorizationLegality::IntegerAnd: - Scalar0 = Builder.CreateAnd(Scalar0, Scalar1, "and.rdx"); + TmpVec = Builder.CreateAnd(TmpVec, Shuf, "and.rdx"); break; case LoopVectorizationLegality::IntegerXor: - Scalar0 = Builder.CreateXor(Scalar0, Scalar1, "xor.rdx"); + TmpVec = Builder.CreateXor(TmpVec, Shuf, "xor.rdx"); break; default: llvm_unreachable("Unknown reduction operation"); } } + // The result is in the first element of the vector. 
+ Value *Scalar0 = Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); + // Now, we need to fix the users of the reduction variable // inside and outside of the scalar remainder loop. // We know that the loop is in LCSSA form. We need to update the -- cgit v1.1 From 034b94b17006f51722886b0f2283fb6fb19aca1f Mon Sep 17 00:00:00 2001 From: Bill Wendling Date: Wed, 19 Dec 2012 07:18:57 +0000 Subject: Rename the 'Attributes' class to 'Attribute'. It's going to represent a single attribute in the future. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170502 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index e3c76bb..ee544b3 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -95,7 +95,7 @@ struct LoopVectorize : public LoopPass { // Check the function attribues to find out if this function should be // optimized for size. Function *F = L->getHeader()->getParent(); - Attributes::AttrVal SzAttr= Attributes::OptimizeForSize; + Attribute::AttrVal SzAttr= Attribute::OptimizeForSize; bool OptForSize = F->getFnAttributes().hasAttribute(SzAttr); unsigned VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor); -- cgit v1.1 From d3071bb042303ed239e5c770fd4026bfc339fcd0 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Thu, 20 Dec 2012 02:00:02 +0000 Subject: Loop Vectorizer: Enable if-conversion. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170632 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index ee544b3..ddb6f84 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -44,7 +44,7 @@ VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, cl::desc("Sets the SIMD width. Zero is autoselect.")); static cl::opt<bool> -EnableIfConversion("enable-if-conversion", cl::init(false), cl::Hidden, +EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, cl::desc("Enable if-conversion during vectorization.")); namespace { -- cgit v1.1 From d5d46ace89d2b7ec1a54610925bc0f13bb394335 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Thu, 20 Dec 2012 17:42:53 +0000 Subject: Loop Vectorizer: turn-off if-conversion. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170708 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index ddb6f84..ee544b3 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -44,7 +44,7 @@ VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, cl::desc("Sets the SIMD width.
Zero is autoselect.")); static cl::opt<bool> -EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, +EnableIfConversion("enable-if-conversion", cl::init(false), cl::Hidden, cl::desc("Enable if-conversion during vectorization.")); namespace { -- cgit v1.1 From 8386acd7348003d13a2db3c4dd3274653d8ffe10 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Thu, 20 Dec 2012 20:24:40 +0000 Subject: LoopVectorize: Fix a bug in the scalarization of instructions. Before if-conversion we could check if a value is loop invariant if it was declared inside the basic block. Now that loops have multiple blocks this check is incorrect. This fixes External/SPEC/CINT95/099_go/099_go git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170756 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index ee544b3..827c13f 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -284,7 +284,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) { // If the src is an instruction that appeared earlier in the basic block // then it should already be vectorized. - if (SrcInst && SrcInst->getParent() == Instr->getParent()) { + if (SrcInst && OrigLoop->contains(SrcInst)) { assert(WidenMap.count(SrcInst) && "Source operand is unavailable"); // The parameter is a vector value from earlier. Params.push_back(WidenMap[SrcInst]); -- cgit v1.1 From 55306bdea5d2d53be39f3ac59fadf5220ee6b5d0 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Fri, 21 Dec 2012 00:07:35 +0000 Subject: Fix a bug in the code that checks if we can vectorize loops while using dynamic memory bound checks.
Before the fix we were able to vectorize this loop from the Livermore Loops benchmark: for ( k=1 ; k(*I); - assert(ST && "Bad StoreInst"); + StoreInst *ST = cast(*I); Value* Ptr = ST->getPointerOperand(); if (isUniform(Ptr)) { @@ -1609,8 +1608,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { } for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) { - LoadInst *LD = dyn_cast(*I); - assert(LD && "Bad LoadInst"); + LoadInst *LD = cast(*I); Value* Ptr = LD->getPointerOperand(); // If we did *not* see this pointer before, insert it to the // read list. If we *did* see it before, then it is already in @@ -1633,13 +1631,13 @@ bool LoopVectorizationLegality::canVectorizeMemory() { // Find pointers with computable bounds. We are going to use this information // to place a runtime bound check. - bool RT = true; + bool CanDoRT = true; for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) if (hasComputableBounds(*I)) { PtrRtCheck.insert(SE, TheLoop, *I); DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n"); } else { - RT = false; + CanDoRT = false; break; } for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) @@ -1647,23 +1645,23 @@ bool LoopVectorizationLegality::canVectorizeMemory() { PtrRtCheck.insert(SE, TheLoop, *I); DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n"); } else { - RT = false; + CanDoRT = false; break; } // Check that we did not collect too many pointers or found a // unsizeable pointer. 
- if (!RT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) { + if (!CanDoRT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) { PtrRtCheck.reset(); - RT = false; + CanDoRT = false; } - PtrRtCheck.Need = RT; - - if (RT) { + if (CanDoRT) { DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n"); } + bool NeedRTCheck = false; + // Now that the pointers are in two lists (Reads and ReadWrites), we // can check that there are no conflicts between each of the writes and // between the writes to the reads. @@ -1678,12 +1676,12 @@ bool LoopVectorizationLegality::canVectorizeMemory() { it != e; ++it) { if (!isIdentifiedObject(*it)) { DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n"); - return RT; + NeedRTCheck = true; } if (!WriteObjects.insert(*it)) { DEBUG(dbgs() << "LV: Found a possible write-write reorder:" << **it <<"\n"); - return RT; + return false; } } TempObjects.clear(); @@ -1696,20 +1694,27 @@ bool LoopVectorizationLegality::canVectorizeMemory() { it != e; ++it) { if (!isIdentifiedObject(*it)) { DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n"); - return RT; + NeedRTCheck = true; } if (WriteObjects.count(*it)) { DEBUG(dbgs() << "LV: Found a possible read/write reorder:" << **it <<"\n"); - return RT; + return false; } } TempObjects.clear(); } - // It is safe to vectorize and we don't need any runtime checks. - DEBUG(dbgs() << "LV: We don't need a runtime memory check.\n"); - PtrRtCheck.reset(); + PtrRtCheck.Need = NeedRTCheck; + if (NeedRTCheck && !CanDoRT) { + DEBUG(dbgs() << "LV: We can't vectorize because we can't find " << + "the array bounds.\n"); + PtrRtCheck.reset(); + return false; + } + + DEBUG(dbgs() << "LV: We "<< (NeedRTCheck ? "" : "don't") << + " need a runtime memory check.\n"); return true; } -- cgit v1.1 From ebf395d39ee81d1635783688dbc49ff86c24e7e6 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Fri, 21 Dec 2012 04:47:54 +0000 Subject: Enable if-conversion. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170841 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 4a90d78..f533235 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -44,7 +44,7 @@ VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, cl::desc("Sets the SIMD width. Zero is autoselect.")); static cl::opt<bool> -EnableIfConversion("enable-if-conversion", cl::init(false), cl::Hidden, +EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, cl::desc("Enable if-conversion during vectorization.")); namespace { -- cgit v1.1 From 38b06020dbd804f01ee3802779a52c05cffdf87d Mon Sep 17 00:00:00 2001 From: Roman Divacky Date: Fri, 21 Dec 2012 17:06:44 +0000 Subject: Remove duplicate includes.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170902 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 1 - 1 file changed, 1 deletion(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index f533235..6f8c65a 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -15,7 +15,6 @@ #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" -#include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/Verifier.h" -- cgit v1.1 From 629fb82419d9bfff6ae475363bcce66192dfcc8e Mon Sep 17 00:00:00 2001 From: Bill Wendling Date: Sat, 22 Dec 2012 00:37:52 +0000 Subject: Change 'AttrVal' to 'AttrKind' to better reflect that it's a kind of attribute instead of the value of the attribute. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170972 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 6f8c65a..f5ff79c 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -94,7 +94,7 @@ struct LoopVectorize : public LoopPass { // Check the function attribues to find out if this function should be // optimized for size. 
Function *F = L->getHeader()->getParent(); - Attribute::AttrVal SzAttr= Attribute::OptimizeForSize; + Attribute::AttrKind SzAttr= Attribute::OptimizeForSize; bool OptForSize = F->getFnAttributes().hasAttribute(SzAttr); unsigned VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor); -- cgit v1.1 From d54fed27865dcbc69932e1e6c372bb5a932e662a Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Sun, 23 Dec 2012 07:23:55 +0000 Subject: Loop Vectorizer: Update the cost model of scatter/gather operations and make them more expensive. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170995 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 42 +++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 13 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index f5ff79c..5b1db0b 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2080,17 +2080,23 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { VectorTy = ToVectorTy(ValTy, VF); if (VF == 1) - return VTTI->getMemoryOpCost(I->getOpcode(), ValTy, + return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(), SI->getPointerAddressSpace()); // Scalarized stores. if (!Legal->isConsecutivePtr(SI->getPointerOperand())) { unsigned Cost = 0; - unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, - ValTy); - // The cost of extracting from the value vector. - Cost += VF * (ExtCost); + + // The cost of extracting from the value vector and pointer vector. + Type *PtrTy = ToVectorTy(I->getOperand(0)->getType(), VF); + for (unsigned i = 0; i < VF; ++i) { + Cost += VTTI->getVectorInstrCost(Instruction::ExtractElement, + VectorTy, i); + Cost += VTTI->getVectorInstrCost(Instruction::ExtractElement, + PtrTy, i); + } + // The cost of the scalar stores. 
Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), @@ -2107,16 +2113,25 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { LoadInst *LI = cast(I); if (VF == 1) - return VTTI->getMemoryOpCost(I->getOpcode(), RetTy, + return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(), LI->getPointerAddressSpace()); // Scalarized loads. if (!Legal->isConsecutivePtr(LI->getPointerOperand())) { unsigned Cost = 0; - unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy); - // The cost of inserting the loaded value into the result vector. - Cost += VF * (InCost); + Type *PtrTy = ToVectorTy(I->getOperand(0)->getType(), VF); + + // The cost of extracting from the pointer vector. + for (unsigned i = 0; i < VF; ++i) + Cost += VTTI->getVectorInstrCost(Instruction::ExtractElement, + PtrTy, i); + + // The cost of inserting data to the result vector. + for (unsigned i = 0; i < VF; ++i) + Cost += VTTI->getVectorInstrCost(Instruction::InsertElement, + VectorTy, i); + // The cost of the scalar stores. Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), RetTy->getScalarType(), @@ -2169,18 +2184,19 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { bool IsVoid = RetTy->isVoidTy(); unsigned InsCost = (IsVoid ? 0 : - VTTI->getInstrCost(Instruction::InsertElement, + VTTI->getVectorInstrCost(Instruction::InsertElement, VectorTy)); - unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, + unsigned ExtCost = VTTI->getVectorInstrCost(Instruction::ExtractElement, VectorTy); // The cost of inserting the results plus extracting each one of the // operands. Cost += VF * (InsCost + ExtCost * I->getNumOperands()); - // The cost of executing VF copies of the scalar instruction. - Cost += VF * VTTI->getInstrCost(I->getOpcode(), RetTy); + // The cost of executing VF copies of the scalar instruction. This opcode + // is unknown. Assume that it is the same as 'mul'. 
+ Cost += VF * VTTI->getArithmeticInstrCost(Instruction::Mul, VectorTy); return Cost; } }// end of switch. -- cgit v1.1 From 417872ed08128e8885b4c7fbaeb2d735c150ea57 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Sun, 23 Dec 2012 13:19:18 +0000 Subject: LoopVectorize: For scalars and void types there is no need to compute vector insert/extract costs. Fixes an assert during the build of oggenc in the test suite. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171000 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 5b1db0b..ddb7f26 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2181,18 +2181,16 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { // elements, times the vector width. unsigned Cost = 0; - bool IsVoid = RetTy->isVoidTy(); - - unsigned InsCost = (IsVoid ? 0 : - VTTI->getVectorInstrCost(Instruction::InsertElement, - VectorTy)); - - unsigned ExtCost = VTTI->getVectorInstrCost(Instruction::ExtractElement, - VectorTy); - - // The cost of inserting the results plus extracting each one of the - // operands. - Cost += VF * (InsCost + ExtCost * I->getNumOperands()); + if (RetTy->isVoidTy() || VF != 1) { + unsigned InsCost = VTTI->getVectorInstrCost(Instruction::InsertElement, + VectorTy); + unsigned ExtCost = VTTI->getVectorInstrCost(Instruction::ExtractElement, + VectorTy); + + // The cost of inserting the results plus extracting each one of the + // operands. + Cost += VF * (InsCost + ExtCost * I->getNumOperands()); + } // The cost of executing VF copies of the scalar instruction. This opcode // is unknown. Assume that it is the same as 'mul'. 
-- cgit v1.1 From a1acf55738a9d6c0376c7d36b27897817685e157 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Sun, 23 Dec 2012 13:21:41 +0000 Subject: LoopVectorize: Fix accidentally inverted condition. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171001 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index ddb7f26..1d78fac 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2181,7 +2181,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { // elements, times the vector width. unsigned Cost = 0; - if (RetTy->isVoidTy() || VF != 1) { + if (!RetTy->isVoidTy() && VF != 1) { unsigned InsCost = VTTI->getVectorInstrCost(Instruction::InsertElement, VectorTy); unsigned ExtCost = VTTI->getVectorInstrCost(Instruction::ExtractElement, VectorTy); -- cgit v1.1 From 470ea9b72f87f2ce4bb09fc6f9829211a090652a Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Mon, 24 Dec 2012 01:22:06 +0000 Subject: LoopVectorizer: Fix an endless loop in the code that looks for reductions. The bug was in the code that detects PHIs in if-then-else block sequence. PR14701.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171008 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 1d78fac..20bcf86 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1737,10 +1737,9 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, Instruction *ExitInstruction = 0; // Iter is our iterator. We start with the PHI node and scan for all of the - // users of this instruction. All users must be instructions which can be + // users of this instruction. All users must be instructions that can be // used as reduction variables (such as ADD). We may have a single - // out-of-block user. They cycle must end with the original PHI. - // Also, we can't have multiple block-local users. + // out-of-block user. The cycle must end with the original PHI. Instruction *Iter = Phi; while (true) { // If the instruction has no users then this is a broken @@ -1752,9 +1751,9 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, if (!isReductionInstr(Iter, Kind)) return false; - // Did we find a user inside this block ? + // Did we find a user inside this loop already ? bool FoundInBlockUser = false; - // Did we reach the initial PHI node ? + // Did we reach the initial PHI node already ? bool FoundStartPHI = false; // For each of the *users* of iter. @@ -1779,8 +1778,10 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, // We allow in-loop PHINodes which are not the original reduction PHI // node. If this PHI is the only user of Iter (happens in IF w/ no ELSE // structure) then don't skip this PHI. 
- if (isa(U) && U->getParent() != TheLoop->getHeader() && - TheLoop->contains(U) && Iter->getNumUses() > 1) + if (isa(Iter) && isa(U) && + U->getParent() != TheLoop->getHeader() && + TheLoop->contains(U) && + Iter->getNumUses() > 1) continue; // We can't have multiple inside users. -- cgit v1.1 From 9e5329d77e590f757dbd8384f418e44df9dbf91a Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Mon, 24 Dec 2012 09:14:18 +0000 Subject: LoopVectorizer: When checking for vectorizable types, also check the StoreInst operands. PR14705. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171023 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 20bcf86..d571903 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1464,13 +1464,20 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { return false; } - // We do not re-vectorize vectors. + // Check that the instruction return type is vectorizable. if (!VectorType::isValidElementType(it->getType()) && !it->getType()->isVoidTy()) { DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n"); return false; } + // Check that the stored type is vectorizable. + if (StoreInst *ST = dyn_cast(it)) { + Type *T = ST->getValueOperand()->getType(); + if (!VectorType::isValidElementType(T)) + return false; + } + // Reduction instructions are allowed to have exit users. // All other instructions must not have external users. 
if (!AllowedExit.count(it)) -- cgit v1.1 From 1d59f5fa53cac23b6debc1d7214451c65b0399a7 Mon Sep 17 00:00:00 2001 From: Hal Finkel Date: Tue, 25 Dec 2012 23:21:29 +0000 Subject: LoopVectorize: Enable vectorization of the fmuladd intrinsic git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171076 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 1 + 1 file changed, 1 insertion(+) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index d571903..b8b934a 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -711,6 +711,7 @@ isTriviallyVectorizableIntrinsic(Instruction *Inst) { case Intrinsic::nearbyint: case Intrinsic::pow: case Intrinsic::fma: + case Intrinsic::fmuladd: return true; default: return false; -- cgit v1.1 From 13eb1e7817be11ea84be6571dce827a77bc9640b Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Wed, 26 Dec 2012 19:08:17 +0000 Subject: LoopVectorizer: Optimize the vectorization of consecutive memory access when the iteration step is -1 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171114 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 85 ++++++++++++++++++++++-------- 1 file changed, 63 insertions(+), 22 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index b8b934a..d64295c 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -202,7 +202,7 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, bool Negate) { return Builder.CreateAdd(Val, Cv, "induction"); } -bool LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { +int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { assert(Ptr->getType()->isPointerTy() && 
"Unexpected non ptr"); // If this value is a pointer induction variable we know it is consecutive. @@ -210,12 +210,12 @@ bool LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { if (Phi && Inductions.count(Phi)) { InductionInfo II = Inductions[Phi]; if (PtrInduction == II.IK) - return true; + return 1; } GetElementPtrInst *Gep = dyn_cast_or_null(Ptr); if (!Gep) - return false; + return 0; unsigned NumOperands = Gep->getNumOperands(); Value *LastIndex = Gep->getOperand(NumOperands - 1); @@ -223,7 +223,7 @@ bool LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { // Check that all of the gep indices are uniform except for the last. for (unsigned i = 0; i < NumOperands - 1; ++i) if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) - return false; + return 0; // We can emit wide load/stores only if the last index is the induction // variable. @@ -234,10 +234,12 @@ bool LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { // The memory is consecutive because the last index is consecutive // and all other indices are loop invariant. if (Step->isOne()) - return true; + return 1; + if (Step->isAllOnesValue()) + return -1; } - return false; + return 0; } bool LoopVectorizationLegality::isUniform(Value *V) { @@ -263,6 +265,17 @@ InnerLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) { return ConstantVector::getSplat(VF, ConstantInt::get(ScalarTy, Val, true)); } +Value *InnerLoopVectorizer::reverseVector(Value *Vec) { + assert(Vec->getType()->isVectorTy() && "Invalid type"); + SmallVector ShuffleMask; + for (unsigned i = 0; i < VF; ++i) + ShuffleMask.push_back(Builder.getInt32(VF - i - 1)); + + return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), + ConstantVector::get(ShuffleMask), + "reverse"); +} + void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); // Holds vector parameters or scalars, in case of uniform vals. 
@@ -941,8 +954,7 @@ Value *InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { void InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, BasicBlock *BB, PhiVector *PV) { - Constant *Zero = - ConstantInt::get(IntegerType::getInt32Ty(BB->getContext()), 0); + Constant *Zero = Builder.getInt32(0); // For each instruction in the old loop. for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { @@ -1142,14 +1154,15 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, assert(!Legal->isUniform(Ptr) && "We do not allow storing to uniform addresses"); - GetElementPtrInst *Gep = dyn_cast(Ptr); - // This store does not use GEPs. - if (!Legal->isConsecutivePtr(Ptr)) { + int Stride = Legal->isConsecutivePtr(Ptr); + bool Reverse = Stride < 0; + if (Stride == 0) { scalarizeInstruction(it); break; } + GetElementPtrInst *Gep = dyn_cast(Ptr); if (Gep) { // The last index does not have to be the induction. It can be // consecutive and be a function of the index. For example A[I+1]; @@ -1166,8 +1179,16 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, assert(isa(Ptr) && "Invalid induction ptr"); Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero); } + + // If the address is consecutive but reversed, then the + // wide load needs to start at the last vector element. 
+ if (Reverse) + Ptr = Builder.CreateGEP(Ptr, Builder.getInt32(1 - VF)); + Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo()); Value *Val = getVectorValue(SI->getValueOperand()); + if (Reverse) + Val = reverseVector(Val); Builder.CreateStore(Val, Ptr)->setAlignment(Alignment); break; } @@ -1177,16 +1198,17 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, Type *RetTy = VectorType::get(LI->getType(), VF); Value *Ptr = LI->getPointerOperand(); unsigned Alignment = LI->getAlignment(); - GetElementPtrInst *Gep = dyn_cast(Ptr); // If the pointer is loop invariant or if it is non consecutive, // scalarize the load. - bool Con = Legal->isConsecutivePtr(Ptr); - if (Legal->isUniform(Ptr) || !Con) { + int Stride = Legal->isConsecutivePtr(Ptr); + bool Reverse = Stride < 0; + if (Legal->isUniform(Ptr) || Stride == 0) { scalarizeInstruction(it); break; } + GetElementPtrInst *Gep = dyn_cast(Ptr); if (Gep) { // The last index does not have to be the induction. It can be // consecutive and be a function of the index. For example A[I+1]; @@ -1203,12 +1225,17 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, assert(isa(Ptr) && "Invalid induction ptr"); Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero); } + // If the address is consecutive but reversed, then the + // wide load needs to start at the last vector element. + if (Reverse) + Ptr = Builder.CreateGEP(Ptr, Builder.getInt32(1 - VF)); Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo()); LI = Builder.CreateLoad(Ptr); LI->setAlignment(Alignment); + // Use this vector value for all users of the load. - WidenMap[it] = LI; + WidenMap[it] = Reverse ? 
reverseVector(LI) : LI; break; } case Instruction::ZExt: @@ -1625,7 +1652,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { // If the address of i is unknown (for example A[B[i]]) then we may // read a few words, modify, and write a few words, and some of the // words may be written to the same address. - if (Seen.insert(Ptr) || !isConsecutivePtr(Ptr)) + if (Seen.insert(Ptr) || 0 == isConsecutivePtr(Ptr)) Reads.push_back(Ptr); } @@ -2094,7 +2121,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { SI->getPointerAddressSpace()); // Scalarized stores. - if (!Legal->isConsecutivePtr(SI->getPointerOperand())) { + int Stride = Legal->isConsecutivePtr(SI->getPointerOperand()); + bool Reverse = Stride < 0; + if (0 == Stride) { unsigned Cost = 0; // The cost of extracting from the value vector and pointer vector. @@ -2115,8 +2144,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { } // Wide stores. - return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(), - SI->getPointerAddressSpace()); + unsigned Cost = VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, + SI->getAlignment(), + SI->getPointerAddressSpace()); + if (Reverse) + Cost += VTTI->getShuffleCost(VectorTargetTransformInfo::Reverse, + VectorTy, 0); + return Cost; } case Instruction::Load: { LoadInst *LI = cast(I); @@ -2127,7 +2161,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { LI->getPointerAddressSpace()); // Scalarized loads. - if (!Legal->isConsecutivePtr(LI->getPointerOperand())) { + int Stride = Legal->isConsecutivePtr(LI->getPointerOperand()); + bool Reverse = Stride < 0; + if (0 == Stride) { unsigned Cost = 0; Type *PtrTy = ToVectorTy(I->getOperand(0)->getType(), VF); @@ -2150,8 +2186,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { } // Wide loads. 
- return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(), - LI->getPointerAddressSpace()); + unsigned Cost = VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, + LI->getAlignment(), + LI->getPointerAddressSpace()); + if (Reverse) + Cost += VTTI->getShuffleCost(VectorTargetTransformInfo::Reverse, + VectorTy, 0); + return Cost; } case Instruction::ZExt: case Instruction::SExt: -- cgit v1.1 From 5dd839430c1dbce6cd35dc44f68718a1fc69bfba Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Wed, 26 Dec 2012 23:30:53 +0000 Subject: If all of the write objects are identified then we can vectorize the loop even if the read objects are unidentified. PR14719. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171124 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index d64295c..7fb9bba 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1704,6 +1704,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { // Check that the read-writes do not conflict with other read-write // pointers. 
+ bool AllWritesIdentified = true; for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) { GetUnderlyingObjects(*I, TempObjects, DL); for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end(); @@ -1711,6 +1712,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { if (!isIdentifiedObject(*it)) { DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n"); NeedRTCheck = true; + AllWritesIdentified = false; } if (!WriteObjects.insert(*it)) { DEBUG(dbgs() << "LV: Found a possible write-write reorder:" @@ -1726,7 +1728,9 @@ bool LoopVectorizationLegality::canVectorizeMemory() { GetUnderlyingObjects(*I, TempObjects, DL); for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end(); it != e; ++it) { - if (!isIdentifiedObject(*it)) { + // If all of the writes are identified then we don't care if the read + // pointer is identified or not. + if (!AllWritesIdentified && !isIdentifiedObject(*it)) { DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n"); NeedRTCheck = true; } -- cgit v1.1 From db2367512e87dbd7b93c3250ef30c9df5e40cb43 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Sun, 30 Dec 2012 07:47:00 +0000 Subject: LoopVectorizer: Fix a bug in the code that updates the loop exiting block. LCSSA PHIs may have undef values. The vectorizer updates values that are used by outside users such as PHIs. The bug happened because undefs are not loop values. This patch handles these PHIs. 
PR14725 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171251 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 7fb9bba..653c111 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -909,6 +909,19 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0); (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr); }// end of for each redux variable. + + // The Loop exit block may have single value PHI nodes where the incoming + // value is 'undef'. While vectorizing we only handled real values that + // were defined inside the loop. Here we handle the 'undef case'. + // See PR14725. + for (BasicBlock::iterator LEI = LoopExitBlock->begin(), + LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { + PHINode *LCSSAPhi = dyn_cast(LEI); + if (!LCSSAPhi) continue; + if (LCSSAPhi->getNumIncomingValues() == 1) + LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()), + LoopMiddleBlock); + } } Value *InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { -- cgit v1.1 From 831737d329a727f53a1fb0572f7b7a8127208881 Mon Sep 17 00:00:00 2001 From: Bill Wendling Date: Sun, 30 Dec 2012 10:32:01 +0000 Subject: Remove the Function::getFnAttributes method in favor of using the AttributeSet directly. This is in preparation for removing the use of the 'Attribute' class as a collection of attributes. That will shift to the AttributeSet class instead. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171253 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 653c111..adf9081 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -95,7 +95,8 @@ struct LoopVectorize : public LoopPass { // optimized for size. Function *F = L->getHeader()->getParent(); Attribute::AttrKind SzAttr= Attribute::OptimizeForSize; - bool OptForSize = F->getFnAttributes().hasAttribute(SzAttr); + bool OptForSize = + F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, SzAttr); unsigned VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor); -- cgit v1.1 From 6c3074958370bf25dc6e4e4b757f0c083e245dbe Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Tue, 1 Jan 2013 19:55:16 +0000 Subject: Add IRBuilder::CreateVectorSplat and use it to simplify code. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171349 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index adf9081..aadc134 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -150,11 +150,6 @@ LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE, } Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { - // Create the types. - LLVMContext &C = V->getContext(); - Type *VTy = VectorType::get(V->getType(), VF); - Type *I32 = IntegerType::getInt32Ty(C); - // Save the current insertion location. 
Instruction *Loc = Builder.GetInsertPoint(); @@ -167,14 +162,8 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { if (Invariant) Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); - Constant *Zero = ConstantInt::get(I32, 0); - Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF)); - Value *UndefVal = UndefValue::get(VTy); - // Insert the value into a new vector. - Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero); // Broadcast the scalar into all locations in the vector. - Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros, - "broadcast"); + Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); // Restore the builder insertion point. if (Invariant) -- cgit v1.1 From 0b8c9a80f20772c3793201ab5b251d3520b9cea3 Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Wed, 2 Jan 2013 11:36:10 +0000 Subject: Move all of the header files which are involved in modelling the LLVM IR into their new header subdirectory: include/llvm/IR. This matches the directory structure of lib, and begins to correct a long standing point of file layout clutter in LLVM. There are still more header files to move here, but I wanted to handle them in separate commits to make tracking what files make sense at each layer easier. The only really questionable files here are the target intrinsic tablegen files. But that's a battle I'd rather not fight today. I've updated both CMake and Makefile build systems (I think, and my tests think, but I may have missed something). I've also re-sorted the includes throughout the project. I'll be committing updates to Clang, DragonEgg, and Polly momentarily. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171366 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index aadc134..f200452 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -18,14 +18,16 @@ #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/Verifier.h" -#include "llvm/Constants.h" -#include "llvm/DataLayout.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Function.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/LLVMContext.h" -#include "llvm/Module.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -35,8 +37,6 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Vectorize.h" -#include "llvm/Type.h" -#include "llvm/Value.h" static cl::opt VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, -- cgit v1.1 From 00a6bcaeb4a74b930bc2b21f8021c3358d507ecf Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Wed, 2 Jan 2013 23:54:43 +0000 Subject: Avoid vectorization when the function has the "noimplicitfloat" attribute.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171429 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index f200452..9b1d398 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -94,9 +94,17 @@ struct LoopVectorize : public LoopPass { // Check the function attribues to find out if this function should be // optimized for size. Function *F = L->getHeader()->getParent(); - Attribute::AttrKind SzAttr= Attribute::OptimizeForSize; - bool OptForSize = - F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, SzAttr); + Attribute::AttrKind SzAttr = Attribute::OptimizeForSize; + Attribute::AttrKind FlAttr = Attribute::NoImplicitFloat; + unsigned FnIndex = AttributeSet::FunctionIndex; + bool OptForSize = F->getAttributes().hasAttribute(FnIndex, SzAttr); + bool NoFloat = F->getAttributes().hasAttribute(FnIndex, FlAttr); + + if (NoFloat) { + DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat" + "attribute is used.\n"); + return false; + } unsigned VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor); -- cgit v1.1 From e4159491a7d94f87f99fb99a15c76d5d7b26851c Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Thu, 3 Jan 2013 00:52:27 +0000 Subject: LoopVectorizer: Add support for loop-unrolling during vectorization for increasing the ILP. At the moment this feature is disabled by default and this commit should not cause any functional changes. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171436 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 422 ++++++++++++++++++----------- 1 file changed, 267 insertions(+), 155 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 9b1d398..8feea93 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -42,6 +42,11 @@ static cl::opt VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, cl::desc("Sets the SIMD width. Zero is autoselect.")); +static cl::opt +VectorizationUnroll("force-vector-unroll", cl::init(1), cl::Hidden, + cl::desc("Sets the vectorization unroll count. " + "Zero is autoselect.")); + static cl::opt EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, cl::desc("Enable if-conversion during vectorization.")); @@ -117,7 +122,7 @@ struct LoopVectorize : public LoopPass { F->getParent()->getModuleIdentifier()<<"\n"); // If we decided that it is *legal* to vectorizer the loop then do it. - InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF); + InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF, VectorizationUnroll); LB.vectorize(&LVL); DEBUG(verifyFunction(*L->getHeader()->getParent())); @@ -180,7 +185,8 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { return Shuf; } -Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, bool Negate) { +Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, unsigned StartIdx, + bool Negate) { assert(Val->getType()->isVectorTy() && "Must be a vector"); assert(Val->getType()->getScalarType()->isIntegerTy() && "Elem must be an integer"); @@ -191,8 +197,10 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, bool Negate) { SmallVector Indices; // Create a vector of consecutive numbers from zero to VF. 
- for (int i = 0; i < VLen; ++i) - Indices.push_back(ConstantInt::get(ITy, Negate ? (-i): i )); + for (int i = 0; i < VLen; ++i) { + int Idx = Negate ? (-i): i; + Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx)); + } // Add the consecutive indices to the vector value. Constant *Cv = ConstantVector::get(Indices); @@ -244,18 +252,20 @@ bool LoopVectorizationLegality::isUniform(Value *V) { return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop)); } -Value *InnerLoopVectorizer::getVectorValue(Value *V) { +InnerLoopVectorizer::VectorParts& +InnerLoopVectorizer::getVectorValue(Value *V) { assert(V != Induction && "The new induction variable should not be used."); assert(!V->getType()->isVectorTy() && "Can't widen a vector"); - // If we saved a vectorized copy of V, use it. - Value *&MapEntry = WidenMap[V]; - if (MapEntry) - return MapEntry; - // Broadcast V and save the value for future uses. + // If we have this scalar in the map, return it. + if (WidenMap.has(V)) + return WidenMap.get(V); + + // If this scalar is unknown, assume that it is a constant or that it is + // loop invariant. Broadcast V and save the value for future uses. Value *B = getBroadcastInstrs(V); - MapEntry = B; - return B; + WidenMap.splat(V, B); + return WidenMap.get(V); } Constant* @@ -277,7 +287,7 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) { void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); // Holds vector parameters or scalars, in case of uniform vals. - SmallVector Params; + SmallVector Params; // Find all of the vectorized parameters. for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { @@ -295,12 +305,14 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) { // If the src is an instruction that appeared earlier in the basic block // then it should already be vectorized. 
if (SrcInst && OrigLoop->contains(SrcInst)) { - assert(WidenMap.count(SrcInst) && "Source operand is unavailable"); + assert(WidenMap.has(SrcInst) && "Source operand is unavailable"); // The parameter is a vector value from earlier. - Params.push_back(WidenMap[SrcInst]); + Params.push_back(WidenMap.get(SrcInst)); } else { // The parameter is a scalar from outside the loop. Maybe even a constant. - Params.push_back(SrcOp); + VectorParts Scalars; + Scalars.append(UF, SrcOp); + Params.push_back(Scalars); } } @@ -309,39 +321,38 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) { // Does this instruction return a value ? bool IsVoidRetTy = Instr->getType()->isVoidTy(); - Value *VecResults = 0; - // If we have a return value, create an empty vector. We place the scalarized - // instructions in this vector. - if (!IsVoidRetTy) - VecResults = UndefValue::get(VectorType::get(Instr->getType(), VF)); + Value *UndefVec = IsVoidRetTy ? 0 : + UndefValue::get(VectorType::get(Instr->getType(), VF)); + // Create a new entry in the WidenMap and initialize it to Undef or Null. + VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); // For each scalar that we create: - for (unsigned i = 0; i < VF; ++i) { - Instruction *Cloned = Instr->clone(); - if (!IsVoidRetTy) - Cloned->setName(Instr->getName() + ".cloned"); - // Replace the operands of the cloned instrucions with extracted scalars. - for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { - Value *Op = Params[op]; - // Param is a vector. Need to extract the right lane. 
- if (Op->getType()->isVectorTy()) - Op = Builder.CreateExtractElement(Op, Builder.getInt32(i)); - Cloned->setOperand(op, Op); - } + for (unsigned Width = 0; Width < VF; ++Width) { + // For each vector unroll 'part': + for (unsigned Part = 0; Part < UF; ++Part) { + Instruction *Cloned = Instr->clone(); + if (!IsVoidRetTy) + Cloned->setName(Instr->getName() + ".cloned"); + // Replace the operands of the cloned instrucions with extracted scalars. + for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { + Value *Op = Params[op][Part]; + // Param is a vector. Need to extract the right lane. + if (Op->getType()->isVectorTy()) + Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width)); + Cloned->setOperand(op, Op); + } - // Place the cloned scalar in the new loop. - Builder.Insert(Cloned); + // Place the cloned scalar in the new loop. + Builder.Insert(Cloned); - // If the original scalar returns a value we need to place it in a vector - // so that future users will be able to use it. - if (!IsVoidRetTy) - VecResults = Builder.CreateInsertElement(VecResults, Cloned, - Builder.getInt32(i)); + // If the original scalar returns a value we need to place it in a vector + // so that future users will be able to use it. + if (!IsVoidRetTy) + VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned, + Builder.getInt32(Width)); + } } - - if (!IsVoidRetTy) - WidenMap[Instr] = VecResults; } Value* @@ -503,7 +514,9 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // Generate the induction variable. Induction = Builder.CreatePHI(IdxTy, 2, "index"); - Constant *Step = ConstantInt::get(IdxTy, VF); + // The loop step is equal to the vectorization factor (num of SIMD elements) + // times the unroll factor (num of SIMD instructions). + Constant *Step = ConstantInt::get(IdxTy, VF * UF); // We may need to extend the index in case there is a type mismatch. // We know that the count starts at zero and does not overflow. 
@@ -521,8 +534,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // Now we need to generate the expression for N - (N % VF), which is // the part that the vectorized body will execute. - Constant *CIVF = ConstantInt::get(IdxTy, VF); - Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc); + Value *R = BinaryOperator::CreateURem(Count, Step, "n.mod.vf", Loc); Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc); Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx, "end.idx.rnd.down", Loc); @@ -775,7 +787,6 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end(); it != e; ++it) { PHINode *RdxPhi = *it; - PHINode *VecRdxPhi = dyn_cast(WidenMap[RdxPhi]); assert(RdxPhi && "Unable to recover vectorized PHI"); // Find the reduction variable descriptor. @@ -791,8 +802,8 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { Builder.SetInsertPoint(LoopBypassBlock->getTerminator()); // This is the vector-clone of the value that leaves the loop. - Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr); - Type *VecTy = VectorExit->getType(); + VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr); + Type *VecTy = VectorExit[0]->getType(); // Find the reduction identity variable. Zero for addition, or, xor, // one for multiplication, -1 for And. @@ -811,10 +822,17 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // Reductions do not have to start at zero. They can start with // any loop invariant values. 
- VecRdxPhi->addIncoming(VectorStart, VecPreheader); - Value *Val = - getVectorValue(RdxPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch())); - VecRdxPhi->addIncoming(Val, LoopVectorBody); + VectorParts &VecRdxPhi = WidenMap.get(RdxPhi); + BasicBlock *Latch = OrigLoop->getLoopLatch(); + Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch); + VectorParts &Val = getVectorValue(LoopVal); + for (unsigned part = 0; part < UF; ++part) { + // Make sure to add the reduction stat value only to the + // first unroll part. + Value *StartVal = (part == 0) ? VectorStart : Identity; + cast(VecRdxPhi[part])->addIncoming(StartVal, VecPreheader); + cast(VecRdxPhi[part])->addIncoming(Val[part], LoopVectorBody); + } // Before each round, move the insertion point right between // the PHIs and the values we are going to write. @@ -822,18 +840,54 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // instructions. Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); - // This PHINode contains the vectorized reduction variable, or - // the initial value vector, if we bypass the vector loop. - PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); - NewPhi->addIncoming(VectorStart, LoopBypassBlock); - NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody); + VectorParts RdxParts; + for (unsigned part = 0; part < UF; ++part) { + // This PHINode contains the vectorized reduction variable, or + // the initial value vector, if we bypass the vector loop. + VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr); + PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); + Value *StartVal = (part == 0) ? VectorStart : Identity; + NewPhi->addIncoming(StartVal, LoopBypassBlock); + NewPhi->addIncoming(RdxExitVal[part], LoopVectorBody); + RdxParts.push_back(NewPhi); + } + + // Reduce all of the unrolled parts into a single vector. 
+ Value *ReducedPartRdx = RdxParts[0]; + for (unsigned part = 1; part < UF; ++part) { + switch (RdxDesc.Kind) { + case LoopVectorizationLegality::IntegerAdd: + ReducedPartRdx = + Builder.CreateAdd(RdxParts[part], ReducedPartRdx, "add.rdx"); + break; + case LoopVectorizationLegality::IntegerMult: + ReducedPartRdx = + Builder.CreateMul(RdxParts[part], ReducedPartRdx, "mul.rdx"); + break; + case LoopVectorizationLegality::IntegerOr: + ReducedPartRdx = + Builder.CreateOr(RdxParts[part], ReducedPartRdx, "or.rdx"); + break; + case LoopVectorizationLegality::IntegerAnd: + ReducedPartRdx = + Builder.CreateAnd(RdxParts[part], ReducedPartRdx, "and.rdx"); + break; + case LoopVectorizationLegality::IntegerXor: + ReducedPartRdx = + Builder.CreateXor(RdxParts[part], ReducedPartRdx, "xor.rdx"); + break; + default: + llvm_unreachable("Unknown reduction operation"); + } + } + // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles // and vector ops, reducing the set of values being computed by half each // round. assert(isPowerOf2_32(VF) && "Reduction emission only supported for pow2 vectors!"); - Value *TmpVec = NewPhi; + Value *TmpVec = ReducedPartRdx; SmallVector ShuffleMask(VF, 0); for (unsigned i = VF; i != 1; i >>= 1) { // Move the upper half of the vector to the lower half. @@ -922,27 +976,34 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { } } -Value *InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { +InnerLoopVectorizer::VectorParts +InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) && "Invalid edge"); - Value *SrcMask = createBlockInMask(Src); + VectorParts SrcMask = createBlockInMask(Src); // The terminator has to be a branch inst! 
BranchInst *BI = dyn_cast(Src->getTerminator()); assert(BI && "Unexpected terminator found"); - Value *EdgeMask = SrcMask; if (BI->isConditional()) { - EdgeMask = getVectorValue(BI->getCondition()); + VectorParts EdgeMask = getVectorValue(BI->getCondition()); + if (BI->getSuccessor(0) != Dst) - EdgeMask = Builder.CreateNot(EdgeMask); + for (unsigned part = 0; part < UF; ++part) + EdgeMask[part] = Builder.CreateNot(EdgeMask[part]); + + for (unsigned part = 0; part < UF; ++part) + EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]); + return EdgeMask; } - return Builder.CreateAnd(EdgeMask, SrcMask); + return SrcMask; } -Value *InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { +InnerLoopVectorizer::VectorParts +InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); // Loop incoming mask is all-one. @@ -953,11 +1014,14 @@ Value *InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { // This is the block mask. We OR all incoming edges, and with zero. Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0); - Value *BlockMask = getVectorValue(Zero); + VectorParts BlockMask = getVectorValue(Zero); // For each pred: - for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) - BlockMask = Builder.CreateOr(BlockMask, createEdgeMask(*it, BB)); + for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) { + VectorParts EM = createEdgeMask(*it, BB); + for (unsigned part = 0; part < UF; ++part) + BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]); + } return BlockMask; } @@ -969,6 +1033,7 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, // For each instruction in the old loop. 
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + VectorParts &Entry = WidenMap.get(it); switch (it->getOpcode()) { case Instruction::Br: // Nothing to do for PHIs and BR, since we already took care of the @@ -978,11 +1043,12 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, PHINode* P = cast(it); // Handle reduction variables: if (Legal->getReductionVars()->count(P)) { - // This is phase one of vectorizing PHIs. - Type *VecTy = VectorType::get(it->getType(), VF); - WidenMap[it] = - PHINode::Create(VecTy, 2, "vec.phi", - LoopVectorBody->getFirstInsertionPt()); + for (unsigned part = 0; part < UF; ++part) { + // This is phase one of vectorizing PHIs. + Type *VecTy = VectorType::get(it->getType(), VF); + Entry[part] = PHINode::Create(VecTy, 2, "vec.phi", + LoopVectorBody-> getFirstInsertionPt()); + } PV->push_back(P); continue; } @@ -996,12 +1062,15 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, // At this point we generate the predication tree. There may be // duplications since this is a simple recursive scan, but future // optimizations will clean it up. - Value *Cond = createEdgeMask(P->getIncomingBlock(0), P->getParent()); - WidenMap[P] = - Builder.CreateSelect(Cond, - getVectorValue(P->getIncomingValue(0)), - getVectorValue(P->getIncomingValue(1)), - "predphi"); + VectorParts Cond = createEdgeMask(P->getIncomingBlock(0), + P->getParent()); + + for (unsigned part = 0; part < UF; ++part) { + VectorParts &In0 = getVectorValue(P->getIncomingValue(0)); + VectorParts &In1 = getVectorValue(P->getIncomingValue(1)); + Entry[part] = Builder.CreateSelect(Cond[part], In0[part], In1[part], + "predphi"); + } continue; } @@ -1021,8 +1090,8 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, Value *Broadcasted = getBroadcastInstrs(Induction); // After broadcasting the induction variable we need to make the // vector consecutive by adding 0, 1, 2 ... 
- Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted); - WidenMap[OldInduction] = ConsecutiveInduction; + for (unsigned part = 0; part < UF; ++part) + Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false); continue; } case LoopVectorizationLegality::ReverseIntInduction: @@ -1054,9 +1123,8 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, Value *Broadcasted = getBroadcastInstrs(ReverseInd); // After broadcasting the induction variable we need to make the // vector consecutive by adding ... -3, -2, -1, 0. - Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted, - true); - WidenMap[it] = ConsecutiveInduction; + for (unsigned part = 0; part < UF; ++part) + Entry[part] = getConsecutiveVector(Broadcasted, -VF * part, true); continue; } @@ -1065,19 +1133,21 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, // This is the vector of results. Notice that we don't generate // vector geps because scalar geps result in better code. 
- Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); - for (unsigned int i = 0; i < VF; ++i) { - Constant *Idx = ConstantInt::get(Induction->getType(), i); - Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, - "gep.idx"); - Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, - "next.gep"); - VecVal = Builder.CreateInsertElement(VecVal, SclrGep, - Builder.getInt32(i), - "insert.gep"); + for (unsigned part = 0; part < UF; ++part) { + Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); + for (unsigned int i = 0; i < VF; ++i) { + Constant *Idx = ConstantInt::get(Induction->getType(), + i + part * VF); + Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, + "gep.idx"); + Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, + "next.gep"); + VecVal = Builder.CreateInsertElement(VecVal, SclrGep, + Builder.getInt32(i), + "insert.gep"); + } + Entry[part] = VecVal; } - - WidenMap[it] = VecVal; continue; } @@ -1103,41 +1173,48 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, case Instruction::Xor: { // Just widen binops. BinaryOperator *BinOp = dyn_cast(it); - Value *A = getVectorValue(it->getOperand(0)); - Value *B = getVectorValue(it->getOperand(1)); + VectorParts &A = getVectorValue(it->getOperand(0)); + VectorParts &B = getVectorValue(it->getOperand(1)); // Use this vector value for all users of the original instruction. - Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B); - WidenMap[it] = V; - - // Update the NSW, NUW and Exact flags. - BinaryOperator *VecOp = cast(V); - if (isa(BinOp)) { - VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap()); - VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap()); + for (unsigned Part = 0; Part < UF; ++Part) { + Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]); + + // Update the NSW, NUW and Exact flags. 
+ BinaryOperator *VecOp = cast(V); + if (isa(BinOp)) { + VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap()); + VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap()); + } + if (isa(VecOp)) + VecOp->setIsExact(BinOp->isExact()); + + Entry[Part] = V; } - if (isa(VecOp)) - VecOp->setIsExact(BinOp->isExact()); break; } case Instruction::Select: { // Widen selects. // If the selector is loop invariant we can create a select // instruction with a scalar condition. Otherwise, use vector-select. - Value *Cond = it->getOperand(0); - bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop); + bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)), + OrigLoop); // The condition can be loop invariant but still defined inside the // loop. This means that we can't just use the original 'cond' value. // We have to take the 'vectorized' value and pick the first lane. // Instcombine will make this a no-op. - Cond = getVectorValue(Cond); - if (InvariantCond) - Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0)); - - Value *Op0 = getVectorValue(it->getOperand(1)); - Value *Op1 = getVectorValue(it->getOperand(2)); - WidenMap[it] = Builder.CreateSelect(Cond, Op0, Op1); + VectorParts &Cond = getVectorValue(it->getOperand(0)); + VectorParts &Op0 = getVectorValue(it->getOperand(1)); + VectorParts &Op1 = getVectorValue(it->getOperand(2)); + Value *ScalarCond = Builder.CreateExtractElement(Cond[0], + Builder.getInt32(0)); + for (unsigned Part = 0; Part < UF; ++Part) { + Entry[Part] = Builder.CreateSelect( + InvariantCond ? ScalarCond : Cond[Part], + Op0[Part], + Op1[Part]); + } break; } @@ -1146,12 +1223,16 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, // Widen compares. Generate vector compares. 
bool FCmp = (it->getOpcode() == Instruction::FCmp); CmpInst *Cmp = dyn_cast(it); - Value *A = getVectorValue(it->getOperand(0)); - Value *B = getVectorValue(it->getOperand(1)); - if (FCmp) - WidenMap[it] = Builder.CreateFCmp(Cmp->getPredicate(), A, B); - else - WidenMap[it] = Builder.CreateICmp(Cmp->getPredicate(), A, B); + VectorParts &A = getVectorValue(it->getOperand(0)); + VectorParts &B = getVectorValue(it->getOperand(1)); + for (unsigned Part = 0; Part < UF; ++Part) { + Value *C = 0; + if (FCmp) + C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]); + else + C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]); + Entry[Part] = C; + } break; } @@ -1173,12 +1254,17 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, break; } + // Handle consecutive stores. + GetElementPtrInst *Gep = dyn_cast(Ptr); if (Gep) { // The last index does not have to be the induction. It can be // consecutive and be a function of the index. For example A[I+1]; unsigned NumOperands = Gep->getNumOperands(); - Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1)); + + Value *LastGepOperand = Gep->getOperand(NumOperands - 1); + VectorParts &GEPParts = getVectorValue(LastGepOperand); + Value *LastIndex = GEPParts[0]; LastIndex = Builder.CreateExtractElement(LastIndex, Zero); // Create the new GEP with the new induction variable. @@ -1188,19 +1274,28 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, } else { // Use the induction element ptr. assert(isa(Ptr) && "Invalid induction ptr"); - Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero); + VectorParts &PtrVal = getVectorValue(Ptr); + Ptr = Builder.CreateExtractElement(PtrVal[0], Zero); } - // If the address is consecutive but reversed, then the - // wide load needs to start at the last vector element. 
- if (Reverse) - Ptr = Builder.CreateGEP(Ptr, Builder.getInt32(1 - VF)); + VectorParts &StoredVal = getVectorValue(SI->getValueOperand()); + for (unsigned Part = 0; Part < UF; ++Part) { + // Calculate the pointer for the specific unroll-part. + Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); + + if (Reverse) { + // If we store to reverse consecutive memory locations then we need + // to reverse the order of elements in the stored value. + StoredVal[Part] = reverseVector(StoredVal[Part]); + // If the address is consecutive but reversed, then the + // wide store needs to start at the last vector element. + PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); + PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); + } - Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo()); - Value *Val = getVectorValue(SI->getValueOperand()); - if (Reverse) - Val = reverseVector(Val); - Builder.CreateStore(Val, Ptr)->setAlignment(Alignment); + Value *VecPtr = Builder.CreateBitCast(PartPtr, StTy->getPointerTo()); + Builder.CreateStore(StoredVal[Part], VecPtr)->setAlignment(Alignment); + } break; } case Instruction::Load: { @@ -1224,7 +1319,10 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, // The last index does not have to be the induction. It can be // consecutive and be a function of the index. For example A[I+1]; unsigned NumOperands = Gep->getNumOperands(); - Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1)); + + Value *LastGepOperand = Gep->getOperand(NumOperands - 1); + VectorParts &GEPParts = getVectorValue(LastGepOperand); + Value *LastIndex = GEPParts[0]; LastIndex = Builder.CreateExtractElement(LastIndex, Zero); // Create the new GEP with the new induction variable. @@ -1234,19 +1332,26 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, } else { // Use the induction element ptr. 
assert(isa(Ptr) && "Invalid induction ptr"); - Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero); + VectorParts &PtrVal = getVectorValue(Ptr); + Ptr = Builder.CreateExtractElement(PtrVal[0], Zero); } - // If the address is consecutive but reversed, then the - // wide load needs to start at the last vector element. - if (Reverse) - Ptr = Builder.CreateGEP(Ptr, Builder.getInt32(1 - VF)); - Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo()); - LI = Builder.CreateLoad(Ptr); - LI->setAlignment(Alignment); + for (unsigned Part = 0; Part < UF; ++Part) { + // Calculate the pointer for the specific unroll-part. + Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); - // Use this vector value for all users of the load. - WidenMap[it] = Reverse ? reverseVector(LI) : LI; + if (Reverse) { + // If the address is consecutive but reversed, then the + // wide store needs to start at the last vector element. + PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); + PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); + } + + Value *VecPtr = Builder.CreateBitCast(PartPtr, RetTy->getPointerTo()); + Value *LI = Builder.CreateLoad(VecPtr, "wide.load"); + cast(LI)->setAlignment(Alignment); + Entry[Part] = Reverse ? reverseVector(LI) : LI; + } break; } case Instruction::ZExt: @@ -1271,13 +1376,16 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction, CI->getType()); Value *Broadcasted = getBroadcastInstrs(ScalarCast); - WidenMap[it] = getConsecutiveVector(Broadcasted); + for (unsigned Part = 0; Part < UF; ++Part) + Entry[Part] = getConsecutiveVector(Broadcasted, VF * Part, false); break; } /// Vectorize casts. 
- Value *A = getVectorValue(it->getOperand(0)); Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF); - WidenMap[it] = Builder.CreateCast(CI->getOpcode(), A, DestTy); + + VectorParts &A = getVectorValue(it->getOperand(0)); + for (unsigned Part = 0; Part < UF; ++Part) + Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy); break; } @@ -1286,12 +1394,16 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, Module *M = BB->getParent()->getParent(); IntrinsicInst *II = cast(it); Intrinsic::ID ID = II->getIntrinsicID(); - SmallVector Args; - for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i) - Args.push_back(getVectorValue(II->getArgOperand(i))); - Type *Tys[] = { VectorType::get(II->getType()->getScalarType(), VF) }; - Function *F = Intrinsic::getDeclaration(M, ID, Tys); - WidenMap[it] = Builder.CreateCall(F, Args); + for (unsigned Part = 0; Part < UF; ++Part) { + SmallVector Args; + for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i) { + VectorParts &Arg = getVectorValue(II->getArgOperand(i)); + Args.push_back(Arg[Part]); + } + Type *Tys[] = { VectorType::get(II->getType()->getScalarType(), VF) }; + Function *F = Intrinsic::getDeclaration(M, ID, Tys); + Entry[Part] = Builder.CreateCall(F, Args); + } break; } -- cgit v1.1 From e503319874f57ab4a0354521b03a71cf8e07b866 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Fri, 4 Jan 2013 17:48:25 +0000 Subject: LoopVectorizer: 1. Add code to estimate register pressure. 2. Add code to select the unroll factor based on register pressure. 3. Add bits to TargetTransformInfo to provide the number of registers. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171469 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 166 ++++++++++++++++++++++++++++- 1 file changed, 162 insertions(+), 4 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 8feea93..0f84fe0 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7,6 +7,7 @@ // //===----------------------------------------------------------------------===// #include "LoopVectorize.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" @@ -43,7 +44,7 @@ VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, cl::desc("Sets the SIMD width. Zero is autoselect.")); static cl::opt -VectorizationUnroll("force-vector-unroll", cl::init(1), cl::Hidden, +VectorizationUnroll("force-vector-unroll", cl::init(0), cl::Hidden, cl::desc("Sets the vectorization unroll count. " "Zero is autoselect.")); @@ -94,7 +95,7 @@ struct LoopVectorize : public LoopPass { if (TTI) VTTI = TTI->getVectorTargetTransformInfo(); // Use the cost model. - LoopVectorizationCostModel CM(L, SE, &LVL, VTTI); + LoopVectorizationCostModel CM(L, SE, LI, &LVL, VTTI); // Check the function attribues to find out if this function should be // optimized for size. 
@@ -112,6 +113,7 @@ struct LoopVectorize : public LoopPass { } unsigned VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor); + unsigned UF = CM.selectUnrollFactor(OptForSize, VectorizationUnroll); if (VF == 1) { DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); @@ -120,9 +122,10 @@ struct LoopVectorize : public LoopPass { DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<< F->getParent()->getModuleIdentifier()<<"\n"); + DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n"); // If we decided that it is *legal* to vectorizer the loop then do it. - InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF, VectorizationUnroll); + InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF, UF); LB.vectorize(&LVL); DEBUG(verifyFunction(*L->getHeader()->getParent())); @@ -2082,7 +2085,7 @@ bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) { unsigned LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, - unsigned UserVF) { + unsigned UserVF) { if (OptForSize && Legal->getRuntimePointerCheck()->Need) { DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n"); return 1; @@ -2148,6 +2151,161 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, return Width; } +unsigned +LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, + unsigned UserUF) { + // Use the user preference, unless 'auto' is selected. + if (UserUF != 0) + return UserUF; + + // When we optimize for size we don't unroll. + if (OptForSize) + return 1; + + unsigned TargetVectorRegisters = VTTI->getNumberOfRegisters(true); + DEBUG(dbgs() << "LV: The target has " << TargetVectorRegisters << + " vector registers\n"); + + LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage(); + // We divide by these constants so assume that we have at least one + // instruction that uses at least one register. 
+ R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); + R.NumInstructions = std::max(R.NumInstructions, 1U); + + // We calculate the unroll factor using the following formula. + // Subtract the number of loop invariants from the number of available + // registers. These registers are used by all of the unrolled instances. + // Next, divide the remaining registers by the number of registers that is + // required by the loop, in order to estimate how many parallel instances + // fit without causing spills. + unsigned UF = (TargetVectorRegisters - R.LoopInvariantRegs) / R.MaxLocalUsers; + + // We don't want to unroll the loops to the point where they do not fit into + // the decoded cache. Assume that we only allow 32 IR instructions. + UF = std::min(UF, (32 / R.NumInstructions)); + + // Clamp the unroll factor ranges to reasonable factors. + if (UF > MaxUnrollSize) + UF = MaxUnrollSize; + else if (UF < 1) + UF = 1; + + return UF; +} + +LoopVectorizationCostModel::RegisterUsage +LoopVectorizationCostModel::calculateRegisterUsage() { + // This function calculates the register usage by measuring the highest number + // of values that are alive at a single location. Obviously, this is a very + // rough estimation. We scan the loop in a topological order in order and + // assign a number to each instruction. We use RPO to ensure that defs are + // met before their users. We assume that each instruction that has in-loop + // users starts an interval. We record every time that an in-loop value is + // used, so we have a list of the first and last occurrences of each + // instruction. Next, we transpose this data structure into a multi map that + // holds the list of intervals that *end* at a specific location. This multi + // map allows us to perform a linear search. We scan the instructions linearly + // and record each time that a new interval starts, by placing it in a set. + // If we find this value in the multi-map then we remove it from the set. 
+ // The max register usage is the maximum size of the set. + // We also search for instructions that are defined outside the loop, but are + // used inside the loop. We need this number separately from the max-interval + // usage number because when we unroll, loop-invariant values do not take + // more register. + LoopBlocksDFS DFS(TheLoop); + DFS.perform(LI); + + RegisterUsage R; + R.NumInstructions = 0; + + // Each 'key' in the map opens a new interval. The values + // of the map are the index of the 'last seen' usage of the + // instruction that is the key. + typedef DenseMap IntervalMap; + // Maps instruction to its index. + DenseMap IdxToInstr; + // Marks the end of each interval. + IntervalMap EndPoint; + // Saves the list of instruction indices that are used in the loop. + SmallSet Ends; + // Saves the list of values that are used in the loop but are + // defined outside the loop, such as arguments and constants. + SmallPtrSet LoopInvariants; + + unsigned Index = 0; + for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), + be = DFS.endRPO(); bb != be; ++bb) { + R.NumInstructions += (*bb)->size(); + for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; + ++it) { + Instruction *I = it; + IdxToInstr[Index++] = I; + + // Save the end location of each USE. + for (unsigned i = 0; i < I->getNumOperands(); ++i) { + Value *U = I->getOperand(i); + Instruction *Instr = dyn_cast(U); + + // Ignore non-instruction values such as arguments, constants, etc. + if (!Instr) continue; + + // If this instruction is outside the loop then record it and continue. + if (!TheLoop->contains(Instr)) { + LoopInvariants.insert(Instr); + continue; + } + + // Overwrite previous end points. + EndPoint[Instr] = Index; + Ends.insert(Instr); + } + } + } + + // Saves the list of intervals that end with the index in 'key'. + typedef SmallVector InstrList; + DenseMap TransposeEnds; + + // Transpose the EndPoints to a list of values that end at each index. 
+ for (IntervalMap::iterator it = EndPoint.begin(), e = EndPoint.end(); + it != e; ++it) + TransposeEnds[it->second].push_back(it->first); + + SmallSet OpenIntervals; + unsigned MaxUsage = 0; + + + DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); + for (unsigned int i = 0; i < Index; ++i) { + Instruction *I = IdxToInstr[i]; + // Ignore instructions that are never used within the loop. + if (!Ends.count(I)) continue; + + // Remove all of the instructions that end at this location. + InstrList &List = TransposeEnds[i]; + for (unsigned int i=0, e = List.size(); i < e; ++i) + OpenIntervals.erase(List[i]); + + // Count the number of live interals. + MaxUsage = std::max(MaxUsage, OpenIntervals.size()); + + DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " << + OpenIntervals.size() <<"\n"); + + // Add the current instruction to the list of open intervals. + OpenIntervals.insert(I); + } + + unsigned Invariant = LoopInvariants.size(); + DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << " \n"); + DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << " \n"); + DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << " \n"); + + R.LoopInvariantRegs = Invariant; + R.MaxLocalUsers = MaxUsage; + return R; +} + unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { unsigned Cost = 0; -- cgit v1.1 From 608467d1e1b318220307143d3af81f31cd7a18f8 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Fri, 4 Jan 2013 21:08:44 +0000 Subject: Fix a warning git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171525 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 0f84fe0..4e50898 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp 
@@ -2283,8 +2283,8 @@ LoopVectorizationCostModel::calculateRegisterUsage() { // Remove all of the instructions that end at this location. InstrList &List = TransposeEnds[i]; - for (unsigned int i=0, e = List.size(); i < e; ++i) - OpenIntervals.erase(List[i]); + for (unsigned int j=0, e = List.size(); j < e; ++j) + OpenIntervals.erase(List[j]); // Count the number of live interals. MaxUsage = std::max(MaxUsage, OpenIntervals.size()); -- cgit v1.1 From 5767d919569f7b4be51c0a13bff24f722625d9f3 Mon Sep 17 00:00:00 2001 From: Paul Redmond Date: Fri, 4 Jan 2013 22:10:16 +0000 Subject: Do not vectorize loops with subtraction reductions Since subtraction does not commute the loop vectorizer incorrectly vectorizes reductions such as x = A[i] - x. Disabling for now. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171537 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 1 - 1 file changed, 1 deletion(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 4e50898..5e2d797 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1986,7 +1986,6 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I, // possibly. return true; case Instruction::Add: - case Instruction::Sub: return Kind == IntegerAdd; case Instruction::Mul: return Kind == IntegerMult; -- cgit v1.1 From d5b92c389133c5d587e4094af553ec345ed40045 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Sat, 5 Jan 2013 01:15:47 +0000 Subject: iLoopVectorize: Non commutative operators can be used as reduction variables as long as the reduction chain is used in the LHS. PR14803. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171583 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 5e2d797..af2e846 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1912,10 +1912,6 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, if (Iter->use_empty()) return false; - // Any reduction instr must be of one of the allowed kinds. - if (!isReductionInstr(Iter, Kind)) - return false; - // Did we find a user inside this loop already ? bool FoundInBlockUser = false; // Did we reach the initial PHI node already ? @@ -1953,6 +1949,16 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, if (FoundInBlockUser) return false; FoundInBlockUser = true; + + // Any reduction instr must be of one of the allowed kinds. + if (!isReductionInstr(U, Kind)) + return false; + + // Reductions of instructions such as Div, and Sub is only + // possible if the LHS is the reduction variable. + if (!U->isCommutative() && U->getOperand(0) != Iter) + return false; + Iter = U; } @@ -1985,8 +1991,11 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I, case Instruction::PHI: // possibly. return true; + case Instruction::Sub: case Instruction::Add: return Kind == IntegerAdd; + case Instruction::SDiv: + case Instruction::UDiv: case Instruction::Mul: return Kind == IntegerMult; case Instruction::And: -- cgit v1.1 From be73c7b903647221fbcaae302d31e90f53583040 Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Sat, 5 Jan 2013 10:16:02 +0000 Subject: Switch the loop vectorizer from VTTI to just use TTI directly. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171620 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 94 ++++++++++++++---------------- 1 file changed, 45 insertions(+), 49 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index af2e846..1f46b91 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -90,12 +90,8 @@ struct LoopVectorize : public LoopPass { return false; } - // Select the preffered vectorization factor. - const VectorTargetTransformInfo *VTTI = 0; - if (TTI) - VTTI = TTI->getVectorTargetTransformInfo(); // Use the cost model. - LoopVectorizationCostModel CM(L, SE, LI, &LVL, VTTI); + LoopVectorizationCostModel CM(L, SE, LI, &LVL, TTI); // Check the function attribues to find out if this function should be // optimized for size. @@ -2134,7 +2130,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, return UserVF; } - if (!VTTI) { + if (!TTI) { DEBUG(dbgs() << "LV: No vector target information. Not vectorizing. \n"); return 1; } @@ -2170,7 +2166,7 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, if (OptForSize) return 1; - unsigned TargetVectorRegisters = VTTI->getNumberOfRegisters(true); + unsigned TargetVectorRegisters = TTI->getNumberOfRegisters(true); DEBUG(dbgs() << "LV: The target has " << TargetVectorRegisters << " vector registers\n"); @@ -2345,7 +2341,7 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { - assert(VTTI && "Invalid vector target transformation info"); + assert(TTI && "Invalid vector target transformation info"); // If we know that this instruction will remain uniform, check the cost of // the scalar version. 
@@ -2363,7 +2359,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { // generate vector geps. return 0; case Instruction::Br: { - return VTTI->getCFInstrCost(I->getOpcode()); + return TTI->getCFInstrCost(I->getOpcode()); } case Instruction::PHI: //TODO: IF-converted IFs become selects. @@ -2386,7 +2382,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { case Instruction::And: case Instruction::Or: case Instruction::Xor: - return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy); + return TTI->getArithmeticInstrCost(I->getOpcode(), VectorTy); case Instruction::Select: { SelectInst *SI = cast(I); const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); @@ -2395,13 +2391,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { if (ScalarCond) CondTy = VectorType::get(CondTy, VF); - return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy); + return TTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy); } case Instruction::ICmp: case Instruction::FCmp: { Type *ValTy = I->getOperand(0)->getType(); VectorTy = ToVectorTy(ValTy, VF); - return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy); + return TTI->getCmpSelInstrCost(I->getOpcode(), VectorTy); } case Instruction::Store: { StoreInst *SI = cast(I); @@ -2409,7 +2405,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { VectorTy = ToVectorTy(ValTy, VF); if (VF == 1) - return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, + return TTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(), SI->getPointerAddressSpace()); @@ -2422,36 +2418,36 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { // The cost of extracting from the value vector and pointer vector. 
Type *PtrTy = ToVectorTy(I->getOperand(0)->getType(), VF); for (unsigned i = 0; i < VF; ++i) { - Cost += VTTI->getVectorInstrCost(Instruction::ExtractElement, - VectorTy, i); - Cost += VTTI->getVectorInstrCost(Instruction::ExtractElement, - PtrTy, i); + Cost += TTI->getVectorInstrCost(Instruction::ExtractElement, + VectorTy, i); + Cost += TTI->getVectorInstrCost(Instruction::ExtractElement, + PtrTy, i); } // The cost of the scalar stores. - Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), - ValTy->getScalarType(), + Cost += VF * TTI->getMemoryOpCost(I->getOpcode(), + ValTy->getScalarType(), SI->getAlignment(), SI->getPointerAddressSpace()); return Cost; } // Wide stores. - unsigned Cost = VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, - SI->getAlignment(), - SI->getPointerAddressSpace()); + unsigned Cost = TTI->getMemoryOpCost(I->getOpcode(), VectorTy, + SI->getAlignment(), + SI->getPointerAddressSpace()); if (Reverse) - Cost += VTTI->getShuffleCost(VectorTargetTransformInfo::Reverse, - VectorTy, 0); + Cost += TTI->getShuffleCost(TargetTransformInfo::Reverse, + VectorTy, 0); return Cost; } case Instruction::Load: { LoadInst *LI = cast(I); if (VF == 1) - return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, - LI->getAlignment(), - LI->getPointerAddressSpace()); + return TTI->getMemoryOpCost(I->getOpcode(), VectorTy, + LI->getAlignment(), + LI->getPointerAddressSpace()); // Scalarized loads. int Stride = Legal->isConsecutivePtr(LI->getPointerOperand()); @@ -2462,29 +2458,29 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { // The cost of extracting from the pointer vector. for (unsigned i = 0; i < VF; ++i) - Cost += VTTI->getVectorInstrCost(Instruction::ExtractElement, - PtrTy, i); + Cost += TTI->getVectorInstrCost(Instruction::ExtractElement, + PtrTy, i); // The cost of inserting data to the result vector. 
for (unsigned i = 0; i < VF; ++i) - Cost += VTTI->getVectorInstrCost(Instruction::InsertElement, - VectorTy, i); + Cost += TTI->getVectorInstrCost(Instruction::InsertElement, + VectorTy, i); // The cost of the scalar stores. - Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), - RetTy->getScalarType(), - LI->getAlignment(), - LI->getPointerAddressSpace()); + Cost += VF * TTI->getMemoryOpCost(I->getOpcode(), + RetTy->getScalarType(), + LI->getAlignment(), + LI->getPointerAddressSpace()); return Cost; } // Wide loads. - unsigned Cost = VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, - LI->getAlignment(), - LI->getPointerAddressSpace()); + unsigned Cost = TTI->getMemoryOpCost(I->getOpcode(), VectorTy, + LI->getAlignment(), + LI->getPointerAddressSpace()); if (Reverse) - Cost += VTTI->getShuffleCost(VectorTargetTransformInfo::Reverse, - VectorTy, 0); + Cost += TTI->getShuffleCost(TargetTransformInfo::Reverse, + VectorTy, 0); return Cost; } case Instruction::ZExt: @@ -2503,11 +2499,11 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { // The cost of these is the same as the scalar operation. 
if (I->getOpcode() == Instruction::Trunc && Legal->isInductionVariable(I->getOperand(0))) - return VTTI->getCastInstrCost(I->getOpcode(), I->getType(), - I->getOperand(0)->getType()); + return TTI->getCastInstrCost(I->getOpcode(), I->getType(), + I->getOperand(0)->getType()); Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF); - return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); + return TTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); } case Instruction::Call: { assert(isTriviallyVectorizableIntrinsic(I)); @@ -2516,7 +2512,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { SmallVector Tys; for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i) Tys.push_back(ToVectorTy(II->getArgOperand(i)->getType(), VF)); - return VTTI->getIntrinsicInstrCost(II->getIntrinsicID(), RetTy, Tys); + return TTI->getIntrinsicInstrCost(II->getIntrinsicID(), RetTy, Tys); } default: { // We are scalarizing the instruction. Return the cost of the scalar @@ -2525,10 +2521,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { unsigned Cost = 0; if (!RetTy->isVoidTy() && VF != 1) { - unsigned InsCost = VTTI->getVectorInstrCost(Instruction::InsertElement, - VectorTy); - unsigned ExtCost = VTTI->getVectorInstrCost(Instruction::ExtractElement, - VectorTy); + unsigned InsCost = TTI->getVectorInstrCost(Instruction::InsertElement, + VectorTy); + unsigned ExtCost = TTI->getVectorInstrCost(Instruction::ExtractElement, + VectorTy); // The cost of inserting the results plus extracting each one of the // operands. @@ -2537,7 +2533,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { // The cost of executing VF copies of the scalar instruction. This opcode // is unknown. Assume that it is the same as 'mul'. 
- Cost += VF * VTTI->getArithmeticInstrCost(Instruction::Mul, VectorTy); + Cost += VF * TTI->getArithmeticInstrCost(Instruction::Mul, VectorTy); return Cost; } }// end of switch. -- cgit v1.1 From be04929f7fd76a921540e9901f24563e51dc1219 Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Mon, 7 Jan 2013 03:08:10 +0000 Subject: Move TargetTransformInfo to live under the Analysis library. This no longer would violate any dependency layering and it is in fact an analysis. =] git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171686 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 1f46b91..6a26bef 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -17,6 +17,7 @@ #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/Verifier.h" #include "llvm/IR/Constants.h" @@ -33,7 +34,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/TargetTransformInfo.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" -- cgit v1.1 From bb00800ff46e7a2a628d0a6741a7f0422c74c198 Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Mon, 7 Jan 2013 03:20:02 +0000 Subject: Fix the enumerator names for ShuffleKind to match the coding standards, and make its comments doxygen comments.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171688 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 6a26bef..17d9eb1 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2437,7 +2437,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { SI->getAlignment(), SI->getPointerAddressSpace()); if (Reverse) - Cost += TTI->getShuffleCost(TargetTransformInfo::Reverse, + Cost += TTI->getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); return Cost; } @@ -2479,7 +2479,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { LI->getAlignment(), LI->getPointerAddressSpace()); if (Reverse) - Cost += TTI->getShuffleCost(TargetTransformInfo::Reverse, + Cost += TTI->getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); return Cost; } -- cgit v1.1 From f3252b12e02b1fcf01abf0a79b761c53de5985d0 Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Mon, 7 Jan 2013 10:44:06 +0000 Subject: Merge the unused header file for LoopVectorizer into the source file. This makes the loop vectorizer match the pattern followed by roughly all other passes. =] Notably, this header file was broken in several regards: it contained a using namespace directive, global #define's that aren't globally appropriate, and global constants defined directly in the header file. As a side benefit, lots of the types in this file become internal, which will cause the optimizer to chew on this pass more effectively.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171723 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 522 ++++++++++++++++++++++++++++- 1 file changed, 519 insertions(+), 3 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 17d9eb1..d51114e 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6,8 +6,51 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -#include "LoopVectorize.h" +// +// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops +// and generates target-independent LLVM-IR. Legalization of the IR is done +// in the codegen. However, the vectorizer uses (will use) the codegen +// interfaces to generate IR that is likely to result in an optimal binary. +// +// The loop vectorizer combines consecutive loop iterations into a single +// 'wide' iteration. After this transformation the index is incremented +// by the SIMD vector width, and not by one. +// +// This pass has four parts: +// 1. The main loop pass that drives the different parts. +// 2. LoopVectorizationLegality - A unit that checks for the legality +// of the vectorization. +// 3. InnerLoopVectorizer - A unit that performs the actual +// widening of instructions. +// 4. LoopVectorizationCostModel - A unit that checks for the profitability +// of vectorization. It decides on the optimal vector width, which +// can be one, if vectorization is not profitable. +// +//===----------------------------------------------------------------------===// +// +// The reduction-variable vectorization is based on the paper: +// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. +// +// Variable uniformity checks are inspired by: +// Karrenberg, R. and Hack, S.
Whole Function Vectorization. +// +// Other ideas/concepts are from: +// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. +// +// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of +// Vectorizing Compilers. +// +//===----------------------------------------------------------------------===// + +#define LV_NAME "loop-vectorize" +#define DEBUG_TYPE LV_NAME + +#include "llvm/Transforms/Vectorize.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" @@ -15,6 +58,7 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -24,6 +68,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" @@ -37,7 +82,10 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Vectorize.h" +#include +#include + +using namespace llvm; static cl::opt VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, @@ -52,8 +100,476 @@ static cl::opt EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, cl::desc("Enable if-conversion during vectorization.")); +/// We don't vectorize loops with a known constant trip count below this number. 
+static const unsigned TinyTripCountThreshold = 16; + +/// When performing a runtime memory check, do not check more than this +/// number of pointers. Notice that the check is quadratic! +static const unsigned RuntimeMemoryCheckThreshold = 4; + +/// This is the highest vector width that we try to generate. +static const unsigned MaxVectorSize = 8; + +/// This is the highest Unroll Factor. +static const unsigned MaxUnrollSize = 4; + namespace { +// Forward declarations. +class LoopVectorizationLegality; +class LoopVectorizationCostModel; + +/// InnerLoopVectorizer vectorizes loops which contain only one basic +/// block to a specified vectorization factor (VF). +/// This class performs the widening of scalars into vectors, or multiple +/// scalars. This class also implements the following features: +/// * It inserts an epilogue loop for handling loops that don't have iteration +/// counts that are known to be a multiple of the vectorization factor. +/// * It handles the code generation for reduction variables. +/// * Scalarization (implementation using scalars) of un-vectorizable +/// instructions. +/// InnerLoopVectorizer does not perform any vectorization-legality +/// checks, and relies on the caller to check for the different legality +/// aspects. The InnerLoopVectorizer relies on the +/// LoopVectorizationLegality class to provide information about the induction +/// and reduction variables that were found to a given vectorization factor. +class InnerLoopVectorizer { +public: + InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI, + DominatorTree *DT, DataLayout *DL, unsigned VecWidth, + unsigned UnrollFactor) + : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), VF(VecWidth), + UF(UnrollFactor), Builder(SE->getContext()), Induction(0), + OldInduction(0), WidenMap(UnrollFactor) {} + + // Perform the actual loop widening (vectorization). + void vectorize(LoopVectorizationLegality *Legal) { + // Create a new empty loop. 
Unlink the old loop and connect the new one. + createEmptyLoop(Legal); + // Widen each instruction in the old loop to a new one in the new loop. + // Use the Legality module to find the induction and reduction variables. + vectorizeLoop(Legal); + // Register the new loop and update the analysis passes. + updateAnalysis(); + } + +private: + /// A small list of PHINodes. + typedef SmallVector PhiVector; + /// When we unroll loops we have multiple vector values for each scalar. + /// This data structure holds the unrolled and vectorized values that + /// originated from one scalar instruction. + typedef SmallVector VectorParts; + + /// Add code that checks at runtime if the accessed arrays overlap. + /// Returns the comparator value or NULL if no check is needed. + Value *addRuntimeCheck(LoopVectorizationLegality *Legal, + Instruction *Loc); + /// Create an empty loop, based on the loop ranges of the old loop. + void createEmptyLoop(LoopVectorizationLegality *Legal); + /// Copy and widen the instructions from the old loop. + void vectorizeLoop(LoopVectorizationLegality *Legal); + + /// A helper function that computes the predicate of the block BB, assuming + /// that the header block of the loop is set to True. It returns the *entry* + /// mask for the block BB. + VectorParts createBlockInMask(BasicBlock *BB); + /// A helper function that computes the predicate of the edge between SRC + /// and DST. + VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst); + + /// A helper function to vectorize a single BB within the innermost loop. + void vectorizeBlockInLoop(LoopVectorizationLegality *Legal, BasicBlock *BB, + PhiVector *PV); + + /// Insert the new loop to the loop hierarchy and pass manager + /// and update the analysis passes. + void updateAnalysis(); + + /// This instruction is un-vectorizable. Implement it as a sequence + /// of scalars. + void scalarizeInstruction(Instruction *Instr); + + /// Create a broadcast instruction. 
This method generates a broadcast + /// instruction (shuffle) for loop invariant values and for the induction + /// value. If this is the induction variable then we extend it to N, N+1, ... + /// this is needed because each iteration in the loop corresponds to a SIMD + /// element. + Value *getBroadcastInstrs(Value *V); + + /// This function adds 0, 1, 2 ... to each vector element, starting at zero. + /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...). + /// The sequence starts at StartIdx. + Value *getConsecutiveVector(Value* Val, unsigned StartIdx, bool Negate); + + /// When we go over instructions in the basic block we rely on previous + /// values within the current basic block or on loop invariant values. + /// When we widen (vectorize) values we place them in the map. If the values + /// are not within the map, they have to be loop invariant, so we simply + /// broadcast them into a vector. + VectorParts &getVectorValue(Value *V); + + /// Get a uniform vector of constant integers. We use this to get + /// vectors of ones and zeros for the reduction code. + Constant* getUniformVector(unsigned Val, Type* ScalarTy); + + /// Generate a shuffle sequence that will reverse the vector Vec. + Value *reverseVector(Value *Vec); + + /// This is a helper class that holds the vectorizer state. It maps scalar + /// instructions to vector instructions. When the code is 'unrolled' then + /// a single scalar value is mapped to multiple vector parts. The parts + /// are stored in the VectorParts type. + struct ValueMap { + /// C'tor. UnrollFactor controls the number of vectors ('parts') that + /// are mapped. + ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {} + + /// \return True if 'Key' is saved in the Value Map. + bool has(Value *Key) { return MapStoreage.count(Key); } + + /// Initializes a new entry in the map. Sets all of the vector parts to the + /// same value in 'Val'. + /// \return A reference to a vector with splat values.
+ VectorParts &splat(Value *Key, Value *Val) { + MapStoreage[Key].clear(); + MapStoreage[Key].append(UF, Val); + return MapStoreage[Key]; + } + + ///\return A reference to the value that is stored at 'Key'. + VectorParts &get(Value *Key) { + if (!has(Key)) + MapStoreage[Key].resize(UF); + return MapStoreage[Key]; + } + + /// The unroll factor. Each entry in the map stores this number of vector + /// elements. + unsigned UF; + + /// Map storage. We use std::map and not DenseMap because insertions to a + /// dense map invalidates its iterators. + std::map MapStoreage; + }; + + /// The original loop. + Loop *OrigLoop; + /// Scev analysis to use. + ScalarEvolution *SE; + /// Loop Info. + LoopInfo *LI; + /// Dominator Tree. + DominatorTree *DT; + /// Data Layout. + DataLayout *DL; + /// The vectorization SIMD factor to use. Each vector will have this many + /// vector elements. + unsigned VF; + /// The vectorization unroll factor to use. Each scalar is vectorized to this + /// many different vector instructions. + unsigned UF; + + /// The builder that we use + IRBuilder<> Builder; + + // --- Vectorization state --- + + /// The vector-loop preheader. + BasicBlock *LoopVectorPreHeader; + /// The scalar-loop preheader. + BasicBlock *LoopScalarPreHeader; + /// Middle Block between the vector and the scalar. + BasicBlock *LoopMiddleBlock; + ///The ExitBlock of the scalar loop. + BasicBlock *LoopExitBlock; + ///The vector loop body. + BasicBlock *LoopVectorBody; + ///The scalar loop body. + BasicBlock *LoopScalarBody; + ///The first bypass block. + BasicBlock *LoopBypassBlock; + + /// The new Induction variable which was added to the new block. + PHINode *Induction; + /// The induction variable of the old basic block. + PHINode *OldInduction; + /// Maps scalars to widened vectors. + ValueMap WidenMap; +}; + +/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and +/// to what vectorization factor. 
+/// This class does not look at the profitability of vectorization, only the +/// legality. This class has two main kinds of checks: +/// * Memory checks - The code in canVectorizeMemory checks if vectorization +/// will change the order of memory accesses in a way that will change the +/// correctness of the program. +/// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory +/// checks for a number of different conditions, such as the availability of a +/// single induction variable, that all types are supported and vectorize-able, +/// etc. This code reflects the capabilities of InnerLoopVectorizer. +/// This class is also used by InnerLoopVectorizer for identifying +/// induction variable and the different reduction variables. +class LoopVectorizationLegality { +public: + LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL, + DominatorTree *DT) + : TheLoop(L), SE(SE), DL(DL), DT(DT), Induction(0) {} + + /// This enum represents the kinds of reductions that we support. + enum ReductionKind { + NoReduction, ///< Not a reduction. + IntegerAdd, ///< Sum of numbers. + IntegerMult, ///< Product of numbers. + IntegerOr, ///< Bitwise or logical OR of numbers. + IntegerAnd, ///< Bitwise or logical AND of numbers. + IntegerXor ///< Bitwise or logical XOR of numbers. + }; + + /// This enum represents the kinds of inductions that we support. + enum InductionKind { + NoInduction, ///< Not an induction variable. + IntInduction, ///< Integer induction variable. Step = 1. + ReverseIntInduction, ///< Reverse int induction variable. Step = -1. + PtrInduction ///< Pointer induction variable. Step = sizeof(elem). + }; + + /// This POD struct holds information about reduction variables. 
+ struct ReductionDescriptor { + ReductionDescriptor() : StartValue(0), LoopExitInstr(0), Kind(NoReduction) { + } + + ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K) + : StartValue(Start), LoopExitInstr(Exit), Kind(K) {} + + // The starting value of the reduction. + // It does not have to be zero! + Value *StartValue; + // The instruction whose value is used outside the loop. + Instruction *LoopExitInstr; + // The kind of the reduction. + ReductionKind Kind; + }; + + // This POD struct holds information about the memory runtime legality + // check that a group of pointers do not overlap. + struct RuntimePointerCheck { + RuntimePointerCheck() : Need(false) {} + + /// Reset the state of the pointer runtime information. + void reset() { + Need = false; + Pointers.clear(); + Starts.clear(); + Ends.clear(); + } + + /// Insert a pointer and calculate the start and end SCEVs. + void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr); + + /// This flag indicates if we need to add the runtime check. + bool Need; + /// Holds the pointers that we need to check. + SmallVector Pointers; + /// Holds the pointer value at the beginning of the loop. + SmallVector Starts; + /// Holds the pointer value at the end of the loop. + SmallVector Ends; + }; + + /// A POD for saving information about induction variables. + struct InductionInfo { + InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {} + InductionInfo() : StartValue(0), IK(NoInduction) {} + /// Start value. + Value *StartValue; + /// Induction kind. + InductionKind IK; + }; + + /// ReductionList contains the reduction descriptors for all + /// of the reductions that were found in the loop. + typedef DenseMap ReductionList; + + /// InductionList saves induction variables and maps them to the + /// induction descriptor. + typedef MapVector InductionList; + + /// Returns true if it is legal to vectorize this loop.
+ /// This does not mean that it is profitable to vectorize this + /// loop, only that it is legal to do so. + bool canVectorize(); + + /// Returns the Induction variable. + PHINode *getInduction() { return Induction; } + + /// Returns the reduction variables found in the loop. + ReductionList *getReductionVars() { return &Reductions; } + + /// Returns the induction variables found in the loop. + InductionList *getInductionVars() { return &Inductions; } + + /// Returns True if V is an induction variable in this loop. + bool isInductionVariable(const Value *V); + + /// Return true if the block BB needs to be predicated in order for the loop + /// to be vectorized. + bool blockNeedsPredication(BasicBlock *BB); + + /// Check if this pointer is consecutive when vectorizing. This happens + /// when the last index of the GEP is the induction variable, or that the + /// pointer itself is an induction variable. + /// This check allows us to vectorize A[idx] into a wide load/store. + /// Returns: + /// 0 - Stride is unknown or non consecutive. + /// 1 - Address is consecutive. + /// -1 - Address is consecutive, and decreasing. + int isConsecutivePtr(Value *Ptr); + + /// Returns true if the value V is uniform within the loop. + bool isUniform(Value *V); + + /// Returns true if this instruction will remain scalar after vectorization. + bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); } + + /// Returns the information that we collected about runtime memory check. + RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; } +private: + /// Check if a single basic block loop is vectorizable. + /// At this point we know that this is a loop with a constant trip count + /// and we only need to check individual instructions. + bool canVectorizeInstrs(); + + /// When we vectorize loops we may change the order in which + /// we read and write from memory. 
This method checks if it is + /// legal to vectorize the code, considering only memory constraints. + /// Returns true if the loop is vectorizable. + bool canVectorizeMemory(); + + /// Return true if we can vectorize this loop using the IF-conversion + /// transformation. + bool canVectorizeWithIfConvert(); + + /// Collect the variables that need to stay uniform after vectorization. + void collectLoopUniforms(); + + /// Return true if all of the instructions in the block can be speculatively + /// executed. + bool blockCanBePredicated(BasicBlock *BB); + + /// Returns True, if 'Phi' is the kind of reduction variable for type + /// 'Kind'. If this is a reduction variable, it adds it to ReductionList. + bool AddReductionVar(PHINode *Phi, ReductionKind Kind); + /// Returns true if the instruction I can be a reduction variable of type + /// 'Kind'. + bool isReductionInstr(Instruction *I, ReductionKind Kind); + /// Returns the induction kind of Phi. This function may return NoInduction + /// if the PHI is not an induction variable. + InductionKind isInductionVariable(PHINode *Phi); + /// Return true if we can compute the address bounds of Ptr within the loop. + bool hasComputableBounds(Value *Ptr); + + /// The loop that we evaluate. + Loop *TheLoop; + /// Scev analysis. + ScalarEvolution *SE; + /// DataLayout analysis. + DataLayout *DL; + // Dominators. + DominatorTree *DT; + + // --- vectorization state --- // + + /// Holds the integer induction variable. This is the counter of the + /// loop. + PHINode *Induction; + /// Holds the reduction variables. + ReductionList Reductions; + /// Holds all of the induction variables that we found in the loop. + /// Notice that inductions don't need to start at zero and that induction + /// variables can be pointers. + InductionList Inductions; + + /// Allowed outside users. This holds the reduction + /// vars which can be accessed from outside the loop. 
+ SmallPtrSet AllowedExit; + /// This set holds the variables which are known to be uniform after + /// vectorization. + SmallPtrSet Uniforms; + /// We need to check that all of the pointers in this list are disjoint + /// at runtime. + RuntimePointerCheck PtrRtCheck; +}; + +/// LoopVectorizationCostModel - estimates the expected speedups due to +/// vectorization. +/// In many cases vectorization is not profitable. This can happen because of +/// a number of reasons. In this class we mainly attempt to predict the +/// expected speedup/slowdowns due to the supported instruction set. We use the +/// TargetTransformInfo to query the different backends for the cost of +/// different operations. +class LoopVectorizationCostModel { +public: + LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI, + LoopVectorizationLegality *Legal, + const TargetTransformInfo *TTI) + : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI) {} + + /// \return The most profitable vectorization factor. + /// This method checks every power of two up to VF. If UserVF is not ZERO + /// then this vectorization factor will be selected if vectorization is + /// possible. + unsigned selectVectorizationFactor(bool OptForSize, unsigned UserVF); + + + /// \return The most profitable unroll factor. + /// If UserUF is non-zero then this method finds the best unroll-factor + /// based on register pressure and other parameters. + unsigned selectUnrollFactor(bool OptForSize, unsigned UserUF); + + /// \brief A struct that represents some properties of the register usage + /// of a loop. + struct RegisterUsage { + /// Holds the number of loop invariant values that are used in the loop. + unsigned LoopInvariantRegs; + /// Holds the maximum number of concurrent live intervals in the loop. + unsigned MaxLocalUsers; + /// Holds the number of instructions in the loop. + unsigned NumInstructions; + }; + + /// \return information about the register usage of the loop. 
+ RegisterUsage calculateRegisterUsage(); + +private: + /// Returns the expected execution cost. The unit of the cost does + /// not matter because we use the 'cost' units to compare different + /// vector widths. The cost that is returned is *not* normalized by + /// the factor width. + unsigned expectedCost(unsigned VF); + + /// Returns the execution time cost of an instruction for a given vector + /// width. Vector width of one means scalar. + unsigned getInstructionCost(Instruction *I, unsigned VF); + + /// A helper function for converting Scalar types to vector types. + /// If the incoming type is void, we return void. If the VF is 1, we return + /// the scalar type. + static Type* ToVectorTy(Type *Scalar, unsigned VF); + + /// The loop that we evaluate. + Loop *TheLoop; + /// Scev analysis. + ScalarEvolution *SE; + /// Loop Info analysis. + LoopInfo *LI; + /// Vectorization legality. + LoopVectorizationLegality *Legal; + /// Vector target information. + const TargetTransformInfo *TTI; +}; + /// The LoopVectorize Pass. struct LoopVectorize : public LoopPass { /// Pass identification, replacement for typeid @@ -141,7 +657,7 @@ struct LoopVectorize : public LoopPass { }; -}// namespace +} // end anonymous namespace //===----------------------------------------------------------------------===// // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and -- cgit v1.1 From 1cbeaeb1944f6fd1ab0997201a47ea6c23e9a979 Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Mon, 7 Jan 2013 11:12:29 +0000 Subject: Simplify LoopVectorize to require target transform info and rely on it being present. Make a member of one of the helper classes a reference as part of this. Reformatting goodness brought to you by clang-format. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171726 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 98 +++++++++++++----------------- 1 file changed, 43 insertions(+), 55 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index d51114e..2c1af1d 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -513,7 +513,7 @@ class LoopVectorizationCostModel { public: LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI, LoopVectorizationLegality *Legal, - const TargetTransformInfo *TTI) + const TargetTransformInfo &TTI) : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI) {} /// \return The most profitable vectorization factor. @@ -567,7 +567,7 @@ private: /// Vectorization legality. LoopVectorizationLegality *Legal; /// Vector target information. - const TargetTransformInfo *TTI; + const TargetTransformInfo &TTI; }; /// The LoopVectorize Pass. @@ -593,7 +593,7 @@ struct LoopVectorize : public LoopPass { SE = &getAnalysis(); DL = getAnalysisIfAvailable(); LI = &getAnalysis(); - TTI = getAnalysisIfAvailable(); + TTI = &getAnalysis(); DT = &getAnalysis(); DEBUG(dbgs() << "LV: Checking a loop in \"" << @@ -607,7 +607,7 @@ struct LoopVectorize : public LoopPass { } // Use the cost model. - LoopVectorizationCostModel CM(L, SE, LI, &LVL, TTI); + LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI); // Check the function attribues to find out if this function should be // optimized for size. 
@@ -648,9 +648,10 @@ struct LoopVectorize : public LoopPass { LoopPass::getAnalysisUsage(AU); AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); + AU.addRequired(); AU.addRequired(); AU.addRequired(); - AU.addRequired(); + AU.addRequired(); AU.addPreserved(); AU.addPreserved(); } @@ -2646,11 +2647,6 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, return UserVF; } - if (!TTI) { - DEBUG(dbgs() << "LV: No vector target information. Not vectorizing. \n"); - return 1; - } - float Cost = expectedCost(1); unsigned Width = 1; DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n"); @@ -2682,7 +2678,7 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, if (OptForSize) return 1; - unsigned TargetVectorRegisters = TTI->getNumberOfRegisters(true); + unsigned TargetVectorRegisters = TTI.getNumberOfRegisters(true); DEBUG(dbgs() << "LV: The target has " << TargetVectorRegisters << " vector registers\n"); @@ -2857,8 +2853,6 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { - assert(TTI && "Invalid vector target transformation info"); - // If we know that this instruction will remain uniform, check the cost of // the scalar version. if (Legal->isUniformAfterVectorization(I)) @@ -2875,7 +2869,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { // generate vector geps. return 0; case Instruction::Br: { - return TTI->getCFInstrCost(I->getOpcode()); + return TTI.getCFInstrCost(I->getOpcode()); } case Instruction::PHI: //TODO: IF-converted IFs become selects. 
@@ -2898,7 +2892,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { case Instruction::And: case Instruction::Or: case Instruction::Xor: - return TTI->getArithmeticInstrCost(I->getOpcode(), VectorTy); + return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy); case Instruction::Select: { SelectInst *SI = cast(I); const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); @@ -2907,13 +2901,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { if (ScalarCond) CondTy = VectorType::get(CondTy, VF); - return TTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy); + return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy); } case Instruction::ICmp: case Instruction::FCmp: { Type *ValTy = I->getOperand(0)->getType(); VectorTy = ToVectorTy(ValTy, VF); - return TTI->getCmpSelInstrCost(I->getOpcode(), VectorTy); + return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy); } case Instruction::Store: { StoreInst *SI = cast(I); @@ -2921,7 +2915,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { VectorTy = ToVectorTy(ValTy, VF); if (VF == 1) - return TTI->getMemoryOpCost(I->getOpcode(), VectorTy, + return TTI.getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(), SI->getPointerAddressSpace()); @@ -2934,26 +2928,24 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { // The cost of extracting from the value vector and pointer vector. Type *PtrTy = ToVectorTy(I->getOperand(0)->getType(), VF); for (unsigned i = 0; i < VF; ++i) { - Cost += TTI->getVectorInstrCost(Instruction::ExtractElement, - VectorTy, i); - Cost += TTI->getVectorInstrCost(Instruction::ExtractElement, - PtrTy, i); + Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, + i); + Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i); } // The cost of the scalar stores. 
- Cost += VF * TTI->getMemoryOpCost(I->getOpcode(), - ValTy->getScalarType(), - SI->getAlignment(), - SI->getPointerAddressSpace()); + Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), + SI->getAlignment(), + SI->getPointerAddressSpace()); return Cost; } // Wide stores. - unsigned Cost = TTI->getMemoryOpCost(I->getOpcode(), VectorTy, - SI->getAlignment(), - SI->getPointerAddressSpace()); + unsigned Cost = TTI.getMemoryOpCost(I->getOpcode(), VectorTy, + SI->getAlignment(), + SI->getPointerAddressSpace()); if (Reverse) - Cost += TTI->getShuffleCost(TargetTransformInfo::SK_Reverse, + Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); return Cost; } @@ -2961,9 +2953,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { LoadInst *LI = cast(I); if (VF == 1) - return TTI->getMemoryOpCost(I->getOpcode(), VectorTy, - LI->getAlignment(), - LI->getPointerAddressSpace()); + return TTI.getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(), + LI->getPointerAddressSpace()); // Scalarized loads. int Stride = Legal->isConsecutivePtr(LI->getPointerOperand()); @@ -2974,29 +2965,25 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { // The cost of extracting from the pointer vector. for (unsigned i = 0; i < VF; ++i) - Cost += TTI->getVectorInstrCost(Instruction::ExtractElement, - PtrTy, i); + Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i); // The cost of inserting data to the result vector. for (unsigned i = 0; i < VF; ++i) - Cost += TTI->getVectorInstrCost(Instruction::InsertElement, - VectorTy, i); + Cost += TTI.getVectorInstrCost(Instruction::InsertElement, VectorTy, i); // The cost of the scalar stores. 
- Cost += VF * TTI->getMemoryOpCost(I->getOpcode(), - RetTy->getScalarType(), - LI->getAlignment(), - LI->getPointerAddressSpace()); + Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), RetTy->getScalarType(), + LI->getAlignment(), + LI->getPointerAddressSpace()); return Cost; } // Wide loads. - unsigned Cost = TTI->getMemoryOpCost(I->getOpcode(), VectorTy, - LI->getAlignment(), - LI->getPointerAddressSpace()); + unsigned Cost = TTI.getMemoryOpCost(I->getOpcode(), VectorTy, + LI->getAlignment(), + LI->getPointerAddressSpace()); if (Reverse) - Cost += TTI->getShuffleCost(TargetTransformInfo::SK_Reverse, - VectorTy, 0); + Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); return Cost; } case Instruction::ZExt: @@ -3015,11 +3002,11 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { // The cost of these is the same as the scalar operation. if (I->getOpcode() == Instruction::Trunc && Legal->isInductionVariable(I->getOperand(0))) - return TTI->getCastInstrCost(I->getOpcode(), I->getType(), - I->getOperand(0)->getType()); + return TTI.getCastInstrCost(I->getOpcode(), I->getType(), + I->getOperand(0)->getType()); Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF); - return TTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); + return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); } case Instruction::Call: { assert(isTriviallyVectorizableIntrinsic(I)); @@ -3028,7 +3015,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { SmallVector Tys; for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i) Tys.push_back(ToVectorTy(II->getArgOperand(i)->getType(), VF)); - return TTI->getIntrinsicInstrCost(II->getIntrinsicID(), RetTy, Tys); + return TTI.getIntrinsicInstrCost(II->getIntrinsicID(), RetTy, Tys); } default: { // We are scalarizing the instruction. 
Return the cost of the scalar @@ -3037,10 +3024,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { unsigned Cost = 0; if (!RetTy->isVoidTy() && VF != 1) { - unsigned InsCost = TTI->getVectorInstrCost(Instruction::InsertElement, - VectorTy); - unsigned ExtCost = TTI->getVectorInstrCost(Instruction::ExtractElement, - VectorTy); + unsigned InsCost = TTI.getVectorInstrCost(Instruction::InsertElement, + VectorTy); + unsigned ExtCost = TTI.getVectorInstrCost(Instruction::ExtractElement, + VectorTy); // The cost of inserting the results plus extracting each one of the // operands. @@ -3049,7 +3036,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { // The cost of executing VF copies of the scalar instruction. This opcode // is unknown. Assume that it is the same as 'mul'. - Cost += VF * TTI->getArithmeticInstrCost(Instruction::Mul, VectorTy); + Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy); return Cost; } }// end of switch. @@ -3065,6 +3052,7 @@ char LoopVectorize::ID = 0; static const char lv_name[] = "Loop Vectorization"; INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) -- cgit v1.1 From 9a6c6a373629fb5a3cc5afd08aafa51339df95df Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Mon, 7 Jan 2013 21:54:51 +0000 Subject: LoopVectorizer: When we vectorize and widen loops we process many elements at once. This is a good thing, except for small loops. On small loops the post-loop that handles scalars (and runs slower) can take more time to execute than the rest of the loop. This patch disables widening of loops with a small static trip count. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171798 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 2c1af1d..b266d9d 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -101,7 +101,13 @@ EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, cl::desc("Enable if-conversion during vectorization.")); /// We don't vectorize loops with a known constant trip count below this number. -static const unsigned TinyTripCountThreshold = 16; +static const unsigned TinyTripCountVectorThreshold = 16; + +/// We don't unroll loops with a known constant trip count below this number. +static const unsigned TinyTripCountUnrollThreshold = 128; + +/// We don't unroll loops that are larger than this threshold. +static const unsigned MaxLoopSizeThreshold = 32; /// When performing a runtime memory check, do not check more than this /// number of pointers. Notice that the check is quadratic! @@ -2016,7 +2022,7 @@ bool LoopVectorizationLegality::canVectorize() { // Do not loop-vectorize loops with a tiny trip count. unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch); - if (TC > 0u && TC < TinyTripCountThreshold) { + if (TC > 0u && TC < TinyTripCountVectorThreshold) { DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " << "This loop is not worth vectorizing.\n"); return false; @@ -2678,6 +2684,12 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, if (OptForSize) return 1; + // Do not unroll loops with a relatively small trip count. 
+ unsigned TC = SE->getSmallConstantTripCount(TheLoop, + TheLoop->getLoopLatch()); + if (TC > 1 && TC < TinyTripCountUnrollThreshold) + return 1; + unsigned TargetVectorRegisters = TTI.getNumberOfRegisters(true); DEBUG(dbgs() << "LV: The target has " << TargetVectorRegisters << " vector registers\n"); @@ -2698,7 +2710,7 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, // We don't want to unroll the loops to the point where they do not fit into // the decoded cache. Assume that we only allow 32 IR instructions. - UF = std::min(UF, (32 / R.NumInstructions)); + UF = std::min(UF, (MaxLoopSizeThreshold / R.NumInstructions)); // Clamp the unroll factor ranges to reasonable factors. if (UF > MaxUnrollSize) -- cgit v1.1 From 111e5fe7e089e0ffe73873848315ea5358120dfa Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Mon, 7 Jan 2013 23:13:00 +0000 Subject: LoopVectorizer: Add support for floating point reductions git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171812 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 149 ++++++++++++++++++----------- 1 file changed, 91 insertions(+), 58 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index b266d9d..cb6609f 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -215,10 +215,6 @@ private: /// broadcast them into a vector. VectorParts &getVectorValue(Value *V); - /// Get a uniform vector of constant integers. We use this to get - /// vectors of ones and zeros for the reduction code. - Constant* getUniformVector(unsigned Val, Type* ScalarTy); - /// Generate a shuffle sequence that will reverse the vector Vec. Value *reverseVector(Value *Vec); @@ -325,12 +321,14 @@ public: /// This enum represents the kinds of reductions that we support. enum ReductionKind { - NoReduction, ///< Not a reduction. 
- IntegerAdd, ///< Sum of numbers. - IntegerMult, ///< Product of numbers. - IntegerOr, ///< Bitwise or logical OR of numbers. - IntegerAnd, ///< Bitwise or logical AND of numbers. - IntegerXor ///< Bitwise or logical XOR of numbers. + RK_NoReduction, ///< Not a reduction. + RK_IntegerAdd, ///< Sum of integers. + RK_IntegerMult, ///< Product of integers. + RK_IntegerOr, ///< Bitwise or logical OR of numbers. + RK_IntegerAnd, ///< Bitwise or logical AND of numbers. + RK_IntegerXor, ///< Bitwise or logical XOR of numbers. + RK_FloatAdd, ///< Sum of floats. + RK_FloatMult ///< Product of floats. }; /// This enum represents the kinds of inductions that we support. @@ -343,8 +341,8 @@ public: /// This POD struct holds information about reduction variables. struct ReductionDescriptor { - ReductionDescriptor() : StartValue(0), LoopExitInstr(0), Kind(NoReduction) { - } + ReductionDescriptor() : StartValue(0), LoopExitInstr(0), + Kind(RK_NoReduction) {} ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K) : StartValue(Start), LoopExitInstr(Exit), Kind(K) {} @@ -790,11 +788,6 @@ InnerLoopVectorizer::getVectorValue(Value *V) { return WidenMap.get(V); } -Constant* -InnerLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) { - return ConstantVector::getSplat(VF, ConstantInt::get(ScalarTy, Val, true)); -} - Value *InnerLoopVectorizer::reverseVector(Value *Vec) { assert(Vec->getType()->isVectorTy() && "Invalid type"); SmallVector ShuffleMask; @@ -1215,20 +1208,26 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { /// This function returns the identity element (or neutral element) for /// the operation K. 
-static unsigned -getReductionIdentity(LoopVectorizationLegality::ReductionKind K) { +static Constant* +getReductionIdentity(LoopVectorizationLegality::ReductionKind K, Type *Tp) { switch (K) { - case LoopVectorizationLegality::IntegerXor: - case LoopVectorizationLegality::IntegerAdd: - case LoopVectorizationLegality::IntegerOr: + case LoopVectorizationLegality:: RK_IntegerXor: + case LoopVectorizationLegality:: RK_IntegerAdd: + case LoopVectorizationLegality:: RK_IntegerOr: // Adding, Xoring, Oring zero to a number does not change it. - return 0; - case LoopVectorizationLegality::IntegerMult: + return ConstantInt::get(Tp, 0); + case LoopVectorizationLegality:: RK_IntegerMult: // Multiplying a number by 1 does not change it. - return 1; - case LoopVectorizationLegality::IntegerAnd: + return ConstantInt::get(Tp, 1); + case LoopVectorizationLegality:: RK_IntegerAnd: // AND-ing a number with an all-1 value does not change it. - return -1; + return ConstantInt::get(Tp, -1, true); + case LoopVectorizationLegality:: RK_FloatMult: + // Multiplying a number by 1 does not change it. + return ConstantFP::get(Tp, 1.0L); + case LoopVectorizationLegality:: RK_FloatAdd: + // Adding zero to a number does not change it. + return ConstantFP::get(Tp, 0.0L); default: llvm_unreachable("Unknown reduction kind"); } @@ -1329,8 +1328,8 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // Find the reduction identity variable. Zero for addition, or, xor, // one for multiplication, -1 for And. - Constant *Identity = getUniformVector(getReductionIdentity(RdxDesc.Kind), - VecTy->getScalarType()); + Constant *Iden = getReductionIdentity(RdxDesc.Kind, VecTy->getScalarType()); + Constant *Identity = ConstantVector::getSplat(VF, Iden); // This vector is the Identity vector where the first element is the // incoming scalar reduction. 
@@ -1378,26 +1377,34 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { Value *ReducedPartRdx = RdxParts[0]; for (unsigned part = 1; part < UF; ++part) { switch (RdxDesc.Kind) { - case LoopVectorizationLegality::IntegerAdd: + case LoopVectorizationLegality::RK_IntegerAdd: ReducedPartRdx = Builder.CreateAdd(RdxParts[part], ReducedPartRdx, "add.rdx"); break; - case LoopVectorizationLegality::IntegerMult: + case LoopVectorizationLegality::RK_IntegerMult: ReducedPartRdx = Builder.CreateMul(RdxParts[part], ReducedPartRdx, "mul.rdx"); break; - case LoopVectorizationLegality::IntegerOr: + case LoopVectorizationLegality::RK_IntegerOr: ReducedPartRdx = Builder.CreateOr(RdxParts[part], ReducedPartRdx, "or.rdx"); break; - case LoopVectorizationLegality::IntegerAnd: + case LoopVectorizationLegality::RK_IntegerAnd: ReducedPartRdx = Builder.CreateAnd(RdxParts[part], ReducedPartRdx, "and.rdx"); break; - case LoopVectorizationLegality::IntegerXor: + case LoopVectorizationLegality::RK_IntegerXor: ReducedPartRdx = Builder.CreateXor(RdxParts[part], ReducedPartRdx, "xor.rdx"); break; + case LoopVectorizationLegality::RK_FloatMult: + ReducedPartRdx = + Builder.CreateFMul(RdxParts[part], ReducedPartRdx, "fmul.rdx"); + break; + case LoopVectorizationLegality::RK_FloatAdd: + ReducedPartRdx = + Builder.CreateFAdd(RdxParts[part], ReducedPartRdx, "fadd.rdx"); + break; default: llvm_unreachable("Unknown reduction operation"); } @@ -1428,21 +1435,27 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // Emit the operation on the shuffled value. 
switch (RdxDesc.Kind) { - case LoopVectorizationLegality::IntegerAdd: + case LoopVectorizationLegality::RK_IntegerAdd: TmpVec = Builder.CreateAdd(TmpVec, Shuf, "add.rdx"); break; - case LoopVectorizationLegality::IntegerMult: + case LoopVectorizationLegality::RK_IntegerMult: TmpVec = Builder.CreateMul(TmpVec, Shuf, "mul.rdx"); break; - case LoopVectorizationLegality::IntegerOr: + case LoopVectorizationLegality::RK_IntegerOr: TmpVec = Builder.CreateOr(TmpVec, Shuf, "or.rdx"); break; - case LoopVectorizationLegality::IntegerAnd: + case LoopVectorizationLegality::RK_IntegerAnd: TmpVec = Builder.CreateAnd(TmpVec, Shuf, "and.rdx"); break; - case LoopVectorizationLegality::IntegerXor: + case LoopVectorizationLegality::RK_IntegerXor: TmpVec = Builder.CreateXor(TmpVec, Shuf, "xor.rdx"); break; + case LoopVectorizationLegality::RK_FloatMult: + TmpVec = Builder.CreateFMul(TmpVec, Shuf, "fmul.rdx"); + break; + case LoopVectorizationLegality::RK_FloatAdd: + TmpVec = Builder.CreateFAdd(TmpVec, Shuf, "fadd.rdx"); + break; default: llvm_unreachable("Unknown reduction operation"); } @@ -2074,6 +2087,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Check that this PHI type is allowed. 
if (!Phi->getType()->isIntegerTy() && + !Phi->getType()->isFloatingPointTy() && !Phi->getType()->isPointerTy()) { DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); return false; @@ -2105,26 +2119,34 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { continue; } - if (AddReductionVar(Phi, IntegerAdd)) { + if (AddReductionVar(Phi, RK_IntegerAdd)) { DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n"); continue; } - if (AddReductionVar(Phi, IntegerMult)) { + if (AddReductionVar(Phi, RK_IntegerMult)) { DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n"); continue; } - if (AddReductionVar(Phi, IntegerOr)) { + if (AddReductionVar(Phi, RK_IntegerOr)) { DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n"); continue; } - if (AddReductionVar(Phi, IntegerAnd)) { + if (AddReductionVar(Phi, RK_IntegerAnd)) { DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n"); continue; } - if (AddReductionVar(Phi, IntegerXor)) { + if (AddReductionVar(Phi, RK_IntegerXor)) { DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n"); continue; } + if (AddReductionVar(Phi, RK_FloatMult)) { + DEBUG(dbgs() << "LV: Found an FMult reduction PHI."<< *Phi <<"\n"); + continue; + } + if (AddReductionVar(Phi, RK_FloatAdd)) { + DEBUG(dbgs() << "LV: Found an FAdd reduction PHI."<< *Phi <<"\n"); + continue; + } DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n"); return false; @@ -2419,6 +2441,8 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, // This includes users of the reduction, variables (which form a cycle // which ends in the phi node). Instruction *ExitInstruction = 0; + // Indicates that we found a binary operation in our scan. + bool FoundBinOp = false; // Iter is our iterator. We start with the PHI node and scan for all of the // users of this instruction. 
All users must be instructions that can be @@ -2436,6 +2460,9 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, // Did we reach the initial PHI node already ? bool FoundStartPHI = false; + // Is this a bin op ? + FoundBinOp |= !isa(Iter); + // For each of the *users* of iter. for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end(); it != e; ++it) { @@ -2475,7 +2502,7 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, // Reductions of instructions such as Div, and Sub is only // possible if the LHS is the reduction variable. - if (!U->isCommutative() && U->getOperand(0) != Iter) + if (!U->isCommutative() && !isa(U) && U->getOperand(0) != Iter) return false; Iter = U; @@ -2484,46 +2511,52 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, // We found a reduction var if we have reached the original // phi node and we only have a single instruction with out-of-loop // users. - if (FoundStartPHI && ExitInstruction) { + if (FoundStartPHI) { // This instruction is allowed to have out-of-loop users. AllowedExit.insert(ExitInstruction); // Save the description of this reduction variable. ReductionDescriptor RD(RdxStart, ExitInstruction, Kind); Reductions[Phi] = RD; - return true; + // We've ended the cycle. This is a reduction variable if we have an + // outside user and it has a binary op. + return FoundBinOp && ExitInstruction; } - - // If we've reached the start PHI but did not find an outside user then - // this is dead code. Abort. - if (FoundStartPHI) - return false; } } bool LoopVectorizationLegality::isReductionInstr(Instruction *I, ReductionKind Kind) { + bool FP = I->getType()->isFloatingPointTy(); + bool FastMath = (FP && I->isCommutative() && I->isAssociative()); + switch (I->getOpcode()) { default: return false; case Instruction::PHI: + if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd)) + return false; // possibly. 
return true; case Instruction::Sub: case Instruction::Add: - return Kind == IntegerAdd; + return Kind == RK_IntegerAdd; case Instruction::SDiv: case Instruction::UDiv: case Instruction::Mul: - return Kind == IntegerMult; + return Kind == RK_IntegerMult; case Instruction::And: - return Kind == IntegerAnd; + return Kind == RK_IntegerAnd; case Instruction::Or: - return Kind == IntegerOr; + return Kind == RK_IntegerOr; case Instruction::Xor: - return Kind == IntegerXor; - } + return Kind == RK_IntegerXor; + case Instruction::FMul: + return Kind == RK_FloatMult && FastMath; + case Instruction::FAdd: + return Kind == RK_FloatAdd && FastMath; + } } LoopVectorizationLegality::InductionKind -- cgit v1.1 From aae3d6fb53e1c472d9e25641a15a68c3c72c7129 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Tue, 8 Jan 2013 17:23:17 +0000 Subject: Rename the enum members to match the LLVM coding style. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171868 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 50 +++++++++++++++--------------- 1 file changed, 25 insertions(+), 25 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index cb6609f..f37a0d8 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -333,10 +333,10 @@ public: /// This enum represents the kinds of inductions that we support. enum InductionKind { - NoInduction, ///< Not an induction variable. - IntInduction, ///< Integer induction variable. Step = 1. - ReverseIntInduction, ///< Reverse int induction variable. Step = -1. - PtrInduction ///< Pointer induction variable. Step = sizeof(elem). + IK_NoInduction, ///< Not an induction variable. + IK_IntInduction, ///< Integer induction variable. Step = 1. + IK_ReverseIntInduction, ///< Reverse int induction variable. Step = -1. 
+ IK_PtrInduction ///< Pointer induction variable. Step = sizeof(elem). }; /// This POD struct holds information about reduction variables. @@ -385,7 +385,7 @@ public: /// A POD for saving information about induction variables. struct InductionInfo { InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {} - InductionInfo() : StartValue(0), IK(NoInduction) {} + InductionInfo() : StartValue(0), IK(IK_NoInduction) {} /// Start value. Value *StartValue; /// Induction kind. @@ -735,7 +735,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { PHINode *Phi = dyn_cast_or_null(Ptr); if (Phi && Inductions.count(Phi)) { InductionInfo II = Inductions[Phi]; - if (PtrInduction == II.IK) + if (IK_PtrInduction == II.IK) return 1; } @@ -1089,9 +1089,9 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { MiddleBlock->getTerminator()); Value *EndValue = 0; switch (II.IK) { - case LoopVectorizationLegality::NoInduction: + case LoopVectorizationLegality::IK_NoInduction: llvm_unreachable("Unknown induction"); - case LoopVectorizationLegality::IntInduction: { + case LoopVectorizationLegality::IK_IntInduction: { // Handle the integer induction counter: assert(OrigPhi->getType()->isIntegerTy() && "Invalid type"); assert(OrigPhi == OldInduction && "Unknown integer PHI"); @@ -1101,7 +1101,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { ResumeIndex = ResumeVal; break; } - case LoopVectorizationLegality::ReverseIntInduction: { + case LoopVectorizationLegality::IK_ReverseIntInduction: { // Convert the CountRoundDown variable to the PHI size. 
unsigned CRDSize = CountRoundDown->getType()->getScalarSizeInBits(); unsigned IISize = II.StartValue->getType()->getScalarSizeInBits(); @@ -1119,7 +1119,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { BypassBlock->getTerminator()); break; } - case LoopVectorizationLegality::PtrInduction: { + case LoopVectorizationLegality::IK_PtrInduction: { // For pointer induction variables, calculate the offset using // the end index. EndValue = GetElementPtrInst::Create(II.StartValue, CountRoundDown, @@ -1618,9 +1618,9 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, Legal->getInductionVars()->lookup(P); switch (II.IK) { - case LoopVectorizationLegality::NoInduction: + case LoopVectorizationLegality::IK_NoInduction: llvm_unreachable("Unknown induction"); - case LoopVectorizationLegality::IntInduction: { + case LoopVectorizationLegality::IK_IntInduction: { assert(P == OldInduction && "Unexpected PHI"); Value *Broadcasted = getBroadcastInstrs(Induction); // After broadcasting the induction variable we need to make the @@ -1629,8 +1629,8 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false); continue; } - case LoopVectorizationLegality::ReverseIntInduction: - case LoopVectorizationLegality::PtrInduction: + case LoopVectorizationLegality::IK_ReverseIntInduction: + case LoopVectorizationLegality::IK_PtrInduction: // Handle reverse integer and pointer inductions. Value *StartIdx = 0; // If we have a single integer induction variable then use it. @@ -1647,7 +1647,7 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, "normalized.idx"); // Handle the reverse integer induction variable case. 
- if (LoopVectorizationLegality::ReverseIntInduction == II.IK) { + if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) { IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType()); Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy, "resize.norm.idx"); @@ -2104,9 +2104,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Check if this is an induction variable. InductionKind IK = isInductionVariable(Phi); - if (NoInduction != IK) { + if (IK_NoInduction != IK) { // Int inductions are special because we only allow one IV. - if (IK == IntInduction) { + if (IK == IK_IntInduction) { if (Induction) { DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n"); return false; @@ -2564,37 +2564,37 @@ LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { Type *PhiTy = Phi->getType(); // We only handle integer and pointer inductions variables. if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy()) - return NoInduction; + return IK_NoInduction; // Check that the PHI is consecutive and starts at zero. const SCEV *PhiScev = SE->getSCEV(Phi); const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev); if (!AR) { DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n"); - return NoInduction; + return IK_NoInduction; } const SCEV *Step = AR->getStepRecurrence(*SE); // Integer inductions need to have a stride of one. if (PhiTy->isIntegerTy()) { if (Step->isOne()) - return IntInduction; + return IK_IntInduction; if (Step->isAllOnesValue()) - return ReverseIntInduction; - return NoInduction; + return IK_ReverseIntInduction; + return IK_NoInduction; } // Calculate the pointer stride and check if it is consecutive. 
const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); if (!C) - return NoInduction; + return IK_NoInduction; assert(PhiTy->isPointerTy() && "The PHI must be a pointer"); uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType()); if (C->getValue()->equalsInt(Size)) - return PtrInduction; + return IK_PtrInduction; - return NoInduction; + return IK_NoInduction; } bool LoopVectorizationLegality::isInductionVariable(const Value *V) { -- cgit v1.1 From 8327474e4b2c69d5eca4d62a01b89508705d0b40 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Tue, 8 Jan 2013 17:37:45 +0000 Subject: Code cleanup: refactor the switch statements in the generation of reduction variables into an IR builder call. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171871 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 87 ++++++++++-------------------- 1 file changed, 28 insertions(+), 59 deletions(-) (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index f37a0d8..9c82cb8 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1263,6 +1263,29 @@ isTriviallyVectorizableIntrinsic(Instruction *Inst) { return false; } +/// This function translates the reduction kind to an LLVM binary operator. 
+static Instruction::BinaryOps +getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) { + switch (Kind) { + case LoopVectorizationLegality::RK_IntegerAdd: + return Instruction::Add; + case LoopVectorizationLegality::RK_IntegerMult: + return Instruction::Mul; + case LoopVectorizationLegality::RK_IntegerOr: + return Instruction::Or; + case LoopVectorizationLegality::RK_IntegerAnd: + return Instruction::And; + case LoopVectorizationLegality::RK_IntegerXor: + return Instruction::Xor; + case LoopVectorizationLegality::RK_FloatMult: + return Instruction::FMul; + case LoopVectorizationLegality::RK_FloatAdd: + return Instruction::FAdd; + default: + llvm_unreachable("Unknown reduction operation"); + } +} + void InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { //===------------------------------------------------===// @@ -1376,40 +1399,10 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // Reduce all of the unrolled parts into a single vector. 
Value *ReducedPartRdx = RdxParts[0]; for (unsigned part = 1; part < UF; ++part) { - switch (RdxDesc.Kind) { - case LoopVectorizationLegality::RK_IntegerAdd: - ReducedPartRdx = - Builder.CreateAdd(RdxParts[part], ReducedPartRdx, "add.rdx"); - break; - case LoopVectorizationLegality::RK_IntegerMult: - ReducedPartRdx = - Builder.CreateMul(RdxParts[part], ReducedPartRdx, "mul.rdx"); - break; - case LoopVectorizationLegality::RK_IntegerOr: - ReducedPartRdx = - Builder.CreateOr(RdxParts[part], ReducedPartRdx, "or.rdx"); - break; - case LoopVectorizationLegality::RK_IntegerAnd: - ReducedPartRdx = - Builder.CreateAnd(RdxParts[part], ReducedPartRdx, "and.rdx"); - break; - case LoopVectorizationLegality::RK_IntegerXor: - ReducedPartRdx = - Builder.CreateXor(RdxParts[part], ReducedPartRdx, "xor.rdx"); - break; - case LoopVectorizationLegality::RK_FloatMult: - ReducedPartRdx = - Builder.CreateFMul(RdxParts[part], ReducedPartRdx, "fmul.rdx"); - break; - case LoopVectorizationLegality::RK_FloatAdd: - ReducedPartRdx = - Builder.CreateFAdd(RdxParts[part], ReducedPartRdx, "fadd.rdx"); - break; - default: - llvm_unreachable("Unknown reduction operation"); - } + Instruction::BinaryOps Op = getReductionBinOp(RdxDesc.Kind); + ReducedPartRdx = Builder.CreateBinOp(Op, RdxParts[part], ReducedPartRdx, + "bin.rdx"); } - // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles // and vector ops, reducing the set of values being computed by half each @@ -1433,32 +1426,8 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { ConstantVector::get(ShuffleMask), "rdx.shuf"); - // Emit the operation on the shuffled value. 
- switch (RdxDesc.Kind) { - case LoopVectorizationLegality::RK_IntegerAdd: - TmpVec = Builder.CreateAdd(TmpVec, Shuf, "add.rdx"); - break; - case LoopVectorizationLegality::RK_IntegerMult: - TmpVec = Builder.CreateMul(TmpVec, Shuf, "mul.rdx"); - break; - case LoopVectorizationLegality::RK_IntegerOr: - TmpVec = Builder.CreateOr(TmpVec, Shuf, "or.rdx"); - break; - case LoopVectorizationLegality::RK_IntegerAnd: - TmpVec = Builder.CreateAnd(TmpVec, Shuf, "and.rdx"); - break; - case LoopVectorizationLegality::RK_IntegerXor: - TmpVec = Builder.CreateXor(TmpVec, Shuf, "xor.rdx"); - break; - case LoopVectorizationLegality::RK_FloatMult: - TmpVec = Builder.CreateFMul(TmpVec, Shuf, "fmul.rdx"); - break; - case LoopVectorizationLegality::RK_FloatAdd: - TmpVec = Builder.CreateFAdd(TmpVec, Shuf, "fadd.rdx"); - break; - default: - llvm_unreachable("Unknown reduction operation"); - } + Instruction::BinaryOps Op = getReductionBinOp(RdxDesc.Kind); + TmpVec = Builder.CreateBinOp(Op, TmpVec, Shuf, "bin.rdx"); } // The result is in the first element of the vector. -- cgit v1.1